Task 1

In [157]:
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [100]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

In [101]:

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [102]:
prompt = "Today I believe we can finally"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

outputs = model.generate(input_ids, num_beams=1, do_sample=False, max_length=30, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
generated_text

['Today I believe we can finally get to the point where we can make a difference in the lives of the people of the United States of America.\n']

In [103]:
encodings = tokenizer(generated_text, return_tensors="pt").input_ids

In [104]:
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(6.2287)

In [106]:
prompt = "With a heavy heart"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs1 = model.generate(input_ids, num_beams=1, do_sample=False, max_length=30, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.batch_decode(outputs1, skip_special_tokens=True)
generated_text

["With a heavy heart, I am going to take a break from the game and focus on the next game. I'm going to focus on the next"]

In [107]:
encodings = tokenizer(generated_text, return_tensors="pt").input_ids

In [108]:
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(6.2029)

In [110]:
prompt = "Standing on the edge"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs2 = model.generate(input_ids, num_beams=1, do_sample=False, max_length=30, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.batch_decode(outputs2, skip_special_tokens=True)
generated_text

['Standing on the edge of the cliff, the two men were able to get out of the way of the two men\'s pursuers.\n\n"']

In [111]:
encodings = tokenizer(generated_text, return_tensors="pt").input_ids

In [112]:
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(6.5713)

In [113]:
prompt = "Surrounded by strangers"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs3 = model.generate(input_ids, num_beams=1, do_sample=False, max_length=30, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.batch_decode(outputs3, skip_special_tokens=True)

In [114]:
encodings = tokenizer(generated_text, return_tensors="pt").input_ids

In [115]:
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(9.0802)

In [116]:
prompt = "Caught in a dilemma,"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs4 = model.generate(input_ids, num_beams=1, do_sample=False, max_length=30, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.batch_decode(outputs4, skip_special_tokens=True)
generated_text

['Caught in a dilemma, the two men are forced to fight for their lives.\n\nThe film is set in the same year as the first']

In [117]:
encodings = tokenizer(generated_text, return_tensors="pt").input_ids

In [118]:
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(10.4501)

In [120]:
prompt = "At the crossroads"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs5 = model.generate(input_ids, num_beams=1, do_sample=False, max_length=30, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.batch_decode(outputs5, skip_special_tokens=True)
generated_text

['At the crossroads of the two worlds, the two worlds are not the same. The two worlds are not the same. The two worlds are not']

In [121]:
encodings = tokenizer(generated_text, return_tensors="pt").input_ids

In [122]:
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nlls = []
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    nlls.append(neg_log_likelihood)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

ppl = torch.exp(torch.stack(nlls).mean())
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(5.3548)

Task 2

In [148]:
from datasets import load_dataset
dataset = load_dataset('cnn_dailymail', '2.0.0')


Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [149]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [129]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
# tokenizer = AutoTokenizer.from_pretrained("gniemiec/t5-small-finetuned-xsum")
# model = AutoModelForSeq2SeqLM.from_pretrained("gniemiec/t5-small-finetuned-xsum")

In [146]:
test_dataset = dataset["test"]

sentence_list = []
reference_list = []
# Iterate through elements in the test dataset
counter = 0
for item in test_dataset:
  if counter == 50:
    break
  # Access and process each item in the test dataset

  sentence_list.append(item["article"])
  reference_list.append(item["highlights"])
  counter += 1

In [138]:
len(sentence_list[14])

1619

In [139]:
max_length = tokenizer.model_max_length
print("Maximum sequence length for this BART model:", max_length)

Maximum sequence length for this BART model: 1024


In [153]:
prediction_list = []
for i in range(len(sentence_list)):
  input_ids = tokenizer(sentence_list[i], return_tensors="pt", max_length=1024, truncation=True).input_ids
  outputs1 = model.generate(input_ids, num_beams=1, do_sample=False, max_length=30)
  print(tokenizer.batch_decode(outputs1, skip_special_tokens=True)[0])
  prediction_list.append(tokenizer.batch_decode(outputs1, skip_special_tokens=True)[0])


The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories.
A dog apparently hit by a car and buried in a field survives. The dog, now named Theia, was found emaciated
Mohammad Javad Zarif is the Iranian foreign minister. He has been John Kerry's opposite number in securing a breakthrough in nuclear discussions
The five were exposed to Ebola in Sierra Leone in March. None developed the deadly virus. They are clinicians for Partners in Health, a
The student admitted to hanging the noose, Duke University says. The student is no longer on campus and will face student conduct review.
Trey Moses, a star basketball player, asked Ellie Meredith, a freshman with Down syndrome, to be his prom date. Photos of
Amnesty International says governments are using the threat of terrorism to advance executions. The organization's annual report catalogs the use of state-
Andrew Getty, 47, was

In [145]:
for i in range(len(sentence_list)):
  print(sentence_list[i][:1024])

(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, sa

In [147]:
for i in range(len(sentence_list)):
  print(reference_list[i])

Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .
Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field . "She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .
Mohammad Javad Zarif has spent more time with John Kerry than any other foreign minister . He once participated in a takeover of the Iranian Consulate in San Francisco . The Iranian foreign minister tweets in English .
17 Americans were exposed to the Ebola virus while in Sierra Leone in March . Another person was diagnosed with the disease and taken to hospital in Maryland . National Institutes of Health says the patient is in fair condition after weeks of treatment .
Student is no longer on Duke University campus and will face disciplinary rev

In [155]:
reference_list[0]
prediction_list[0]

'The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories.'

Task 3

In [166]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=4ad5c10ae676c24a83f4104d0afd9ce7dd99350ed09b2add14d567a8543e2458
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [170]:
import evaluate

rouge = evaluate.load("rouge")
results = rouge.compute(predictions=prediction_list, references=reference_list, use_aggregator=False)
print(results)

{'rouge1': [0.44827586206896547, 0.4375, 0.48275862068965525, 0.4242424242424242, 0.6461538461538461, 0.35555555555555557, 0.49350649350649356, 0.15384615384615383, 0.47826086956521735, 0.46511627906976744, 0.4150943396226416, 0.3404255319148936, 0.2181818181818182, 0.14705882352941177, 0.11428571428571428, 0.29411764705882354, 0.4137931034482759, 0.18666666666666665, 0.5652173913043478, 0.23076923076923078, 0.16666666666666669, 0.25806451612903225, 0.34375000000000006, 0.631578947368421, 0.5098039215686274, 0.2388059701492537, 0.3111111111111111, 0.2926829268292683, 0.20833333333333331, 0.4489795918367347, 0.2337662337662338, 0.18181818181818182, 0.35000000000000003, 0.2916666666666667, 0.3103448275862069, 0.27272727272727276, 0.3508771929824562, 0.2903225806451613, 0.2333333333333333, 0.17777777777777778, 0.2857142857142857, 0.08450704225352111, 0.6956521739130435, 0.47457627118644063, 0.4186046511627907, 0.13043478260869565, 0.8799999999999999, 0.23076923076923075, 0.218181818181818

In [172]:
for item in results["rouge1"]:
  print(item)

0.44827586206896547
0.4375
0.48275862068965525
0.4242424242424242
0.6461538461538461
0.35555555555555557
0.49350649350649356
0.15384615384615383
0.47826086956521735
0.46511627906976744
0.4150943396226416
0.3404255319148936
0.2181818181818182
0.14705882352941177
0.11428571428571428
0.29411764705882354
0.4137931034482759
0.18666666666666665
0.5652173913043478
0.23076923076923078
0.16666666666666669
0.25806451612903225
0.34375000000000006
0.631578947368421
0.5098039215686274
0.2388059701492537
0.3111111111111111
0.2926829268292683
0.20833333333333331
0.4489795918367347
0.2337662337662338
0.18181818181818182
0.35000000000000003
0.2916666666666667
0.3103448275862069
0.27272727272727276
0.3508771929824562
0.2903225806451613
0.2333333333333333
0.17777777777777778
0.2857142857142857
0.08450704225352111
0.6956521739130435
0.47457627118644063
0.4186046511627907
0.13043478260869565
0.8799999999999999
0.23076923076923075
0.21818181818181817
0.3333333333333333


In [177]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m584.4 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [178]:
bert = evaluate.load("bertscore")
results = bert.compute(predictions=prediction_list, references=reference_list, lang="en")
print(results)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.9087958335876465, 0.9049921631813049, 0.9053109288215637, 0.8992224931716919, 0.9483097195625305, 0.8947829604148865, 0.9070981740951538, 0.8673754334449768, 0.903397262096405, 0.9113286137580872, 0.8709957599639893, 0.8874913454055786, 0.8703935742378235, 0.8557463884353638, 0.7901948690414429, 0.8925226926803589, 0.9354259967803955, 0.87926185131073, 0.9455327987670898, 0.8720071315765381, 0.8646916747093201, 0.8817465305328369, 0.8519949316978455, 0.9565417766571045, 0.9128168225288391, 0.8740154504776001, 0.8750764727592468, 0.8552663922309875, 0.8639880418777466, 0.8988494873046875, 0.9024897813796997, 0.8413506150245667, 0.9011999368667603, 0.8449734449386597, 0.898521900177002, 0.8988035917282104, 0.8994429707527161, 0.9110718369483948, 0.8837074041366577, 0.8368695974349976, 0.8867510557174683, 0.828952968120575, 0.9364964365959167, 0.9428212642669678, 0.8772562742233276, 0.8639482855796814, 0.9715203046798706, 0.8689782619476318, 0.8742637634277344, 0.89457458

In [179]:
for item in results["precision"]:
  print(item)

0.9087958335876465
0.9049921631813049
0.9053109288215637
0.8992224931716919
0.9483097195625305
0.8947829604148865
0.9070981740951538
0.8673754334449768
0.903397262096405
0.9113286137580872
0.8709957599639893
0.8874913454055786
0.8703935742378235
0.8557463884353638
0.7901948690414429
0.8925226926803589
0.9354259967803955
0.87926185131073
0.9455327987670898
0.8720071315765381
0.8646916747093201
0.8817465305328369
0.8519949316978455
0.9565417766571045
0.9128168225288391
0.8740154504776001
0.8750764727592468
0.8552663922309875
0.8639880418777466
0.8988494873046875
0.9024897813796997
0.8413506150245667
0.9011999368667603
0.8449734449386597
0.898521900177002
0.8988035917282104
0.8994429707527161
0.9110718369483948
0.8837074041366577
0.8368695974349976
0.8867510557174683
0.828952968120575
0.9364964365959167
0.9428212642669678
0.8772562742233276
0.8639482855796814
0.9715203046798706
0.8689782619476318
0.8742637634277344
0.8945745825767517
