Task 1

In [99]:
!pip install datasets



In [100]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

In [101]:

# Load the tokenizer and causal language model used throughout Task 1.
gpt2_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(gpt2_checkpoint)
model = AutoModelForCausalLM.from_pretrained(gpt2_checkpoint)

# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [102]:
# Greedy continuation (single beam, no sampling) of the prompt, capped at 30 tokens.
prompt = "Today I believe we can finally"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

greedy_kwargs = dict(
    num_beams=1,
    do_sample=False,
    max_length=30,
    pad_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(input_ids, **greedy_kwargs)

# Decode back to text (special tokens dropped) and display it as the cell result.
generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
generated_text

['Today I believe we can finally get to the point where we can make a difference in the lives of the people of the United States of America.\n']

In [103]:
# Re-tokenize the decoded generation; these token ids are what the perplexity cell scores.
tokenized_generation = tokenizer(generated_text, return_tensors="pt")
encodings = tokenized_generation.input_ids

In [104]:
# Sliding-window perplexity of `encodings` under `model`.
# Each window holds at most `max_length` tokens and starts `stride` tokens after the
# previous one; only tokens not already scored by an earlier window contribute to the loss.
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nll_sum = 0.0  # running sum of per-token negative log-likelihoods
n_tokens = 0  # number of tokens that contributed to nll_sum
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # context-only positions are excluded from the loss

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

    # outputs.loss is the MEAN cross-entropy over the scored labels. The model shifts the
    # labels left by one internally, so the scored labels are target_ids[:, 1:] != -100.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    # Weight each window by its token count: a plain mean of per-window means would let a
    # short final window count as much as a full one.
    nll_sum += outputs.loss * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity is the exponential of the average per-token negative log-likelihood.
ppl = torch.exp(nll_sum / n_tokens)
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(6.2287)

In [106]:
# Greedy continuation (single beam, no sampling) of the prompt, capped at 30 tokens.
prompt = "With a heavy heart"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

greedy_kwargs = dict(
    num_beams=1,
    do_sample=False,
    max_length=30,
    pad_token_id=tokenizer.eos_token_id,
)
outputs1 = model.generate(input_ids, **greedy_kwargs)

# Decode back to text (special tokens dropped) and display it as the cell result.
generated_text = tokenizer.batch_decode(outputs1, skip_special_tokens=True)
generated_text

["With a heavy heart, I am going to take a break from the game and focus on the next game. I'm going to focus on the next"]

In [107]:
# Re-tokenize the decoded generation; these token ids are what the perplexity cell scores.
tokenized_generation = tokenizer(generated_text, return_tensors="pt")
encodings = tokenized_generation.input_ids

In [108]:
# Sliding-window perplexity of `encodings` under `model`.
# Each window holds at most `max_length` tokens and starts `stride` tokens after the
# previous one; only tokens not already scored by an earlier window contribute to the loss.
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nll_sum = 0.0  # running sum of per-token negative log-likelihoods
n_tokens = 0  # number of tokens that contributed to nll_sum
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # context-only positions are excluded from the loss

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

    # outputs.loss is the MEAN cross-entropy over the scored labels. The model shifts the
    # labels left by one internally, so the scored labels are target_ids[:, 1:] != -100.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    # Weight each window by its token count: a plain mean of per-window means would let a
    # short final window count as much as a full one.
    nll_sum += outputs.loss * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity is the exponential of the average per-token negative log-likelihood.
ppl = torch.exp(nll_sum / n_tokens)
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(6.2029)

In [110]:
# Greedy continuation (single beam, no sampling) of the prompt, capped at 30 tokens.
prompt = "Standing on the edge"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

greedy_kwargs = dict(
    num_beams=1,
    do_sample=False,
    max_length=30,
    pad_token_id=tokenizer.eos_token_id,
)
outputs2 = model.generate(input_ids, **greedy_kwargs)

# Decode back to text (special tokens dropped) and display it as the cell result.
generated_text = tokenizer.batch_decode(outputs2, skip_special_tokens=True)
generated_text

['Standing on the edge of the cliff, the two men were able to get out of the way of the two men\'s pursuers.\n\n"']

In [111]:
# Re-tokenize the decoded generation; these token ids are what the perplexity cell scores.
tokenized_generation = tokenizer(generated_text, return_tensors="pt")
encodings = tokenized_generation.input_ids

In [112]:
# Sliding-window perplexity of `encodings` under `model`.
# Each window holds at most `max_length` tokens and starts `stride` tokens after the
# previous one; only tokens not already scored by an earlier window contribute to the loss.
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nll_sum = 0.0  # running sum of per-token negative log-likelihoods
n_tokens = 0  # number of tokens that contributed to nll_sum
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # context-only positions are excluded from the loss

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

    # outputs.loss is the MEAN cross-entropy over the scored labels. The model shifts the
    # labels left by one internally, so the scored labels are target_ids[:, 1:] != -100.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    # Weight each window by its token count: a plain mean of per-window means would let a
    # short final window count as much as a full one.
    nll_sum += outputs.loss * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity is the exponential of the average per-token negative log-likelihood.
ppl = torch.exp(nll_sum / n_tokens)
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(6.5713)

In [113]:
# Greedy continuation (single beam, no sampling) of the prompt, capped at 30 tokens.
prompt = "Surrounded by strangers"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs3 = model.generate(input_ids, num_beams=1, do_sample=False, max_length=30, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.batch_decode(outputs3, skip_special_tokens=True)
# Display the continuation, as the other prompt cells do, so the text behind the
# perplexity score computed from it is visible.
generated_text

In [114]:
# Re-tokenize the decoded generation; these token ids are what the perplexity cell scores.
tokenized_generation = tokenizer(generated_text, return_tensors="pt")
encodings = tokenized_generation.input_ids

In [115]:
# Sliding-window perplexity of `encodings` under `model`.
# Each window holds at most `max_length` tokens and starts `stride` tokens after the
# previous one; only tokens not already scored by an earlier window contribute to the loss.
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nll_sum = 0.0  # running sum of per-token negative log-likelihoods
n_tokens = 0  # number of tokens that contributed to nll_sum
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # context-only positions are excluded from the loss

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

    # outputs.loss is the MEAN cross-entropy over the scored labels. The model shifts the
    # labels left by one internally, so the scored labels are target_ids[:, 1:] != -100.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    # Weight each window by its token count: a plain mean of per-window means would let a
    # short final window count as much as a full one.
    nll_sum += outputs.loss * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity is the exponential of the average per-token negative log-likelihood.
ppl = torch.exp(nll_sum / n_tokens)
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(9.0802)

In [116]:
# Greedy continuation (single beam, no sampling) of the prompt, capped at 30 tokens.
prompt = "Caught in a dilemma,"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

greedy_kwargs = dict(
    num_beams=1,
    do_sample=False,
    max_length=30,
    pad_token_id=tokenizer.eos_token_id,
)
outputs4 = model.generate(input_ids, **greedy_kwargs)

# Decode back to text (special tokens dropped) and display it as the cell result.
generated_text = tokenizer.batch_decode(outputs4, skip_special_tokens=True)
generated_text

['Caught in a dilemma, the two men are forced to fight for their lives.\n\nThe film is set in the same year as the first']

In [117]:
# Re-tokenize the decoded generation; these token ids are what the perplexity cell scores.
tokenized_generation = tokenizer(generated_text, return_tensors="pt")
encodings = tokenized_generation.input_ids

In [118]:
# Sliding-window perplexity of `encodings` under `model`.
# Each window holds at most `max_length` tokens and starts `stride` tokens after the
# previous one; only tokens not already scored by an earlier window contribute to the loss.
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nll_sum = 0.0  # running sum of per-token negative log-likelihoods
n_tokens = 0  # number of tokens that contributed to nll_sum
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # context-only positions are excluded from the loss

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

    # outputs.loss is the MEAN cross-entropy over the scored labels. The model shifts the
    # labels left by one internally, so the scored labels are target_ids[:, 1:] != -100.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    # Weight each window by its token count: a plain mean of per-window means would let a
    # short final window count as much as a full one.
    nll_sum += outputs.loss * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity is the exponential of the average per-token negative log-likelihood.
ppl = torch.exp(nll_sum / n_tokens)
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(10.4501)

In [120]:
# Greedy continuation (single beam, no sampling) of the prompt, capped at 30 tokens.
prompt = "At the crossroads"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

greedy_kwargs = dict(
    num_beams=1,
    do_sample=False,
    max_length=30,
    pad_token_id=tokenizer.eos_token_id,
)
outputs5 = model.generate(input_ids, **greedy_kwargs)

# Decode back to text (special tokens dropped) and display it as the cell result.
generated_text = tokenizer.batch_decode(outputs5, skip_special_tokens=True)
generated_text

['At the crossroads of the two worlds, the two worlds are not the same. The two worlds are not the same. The two worlds are not']

In [121]:
# Re-tokenize the decoded generation; these token ids are what the perplexity cell scores.
tokenized_generation = tokenizer(generated_text, return_tensors="pt")
encodings = tokenized_generation.input_ids

In [122]:
# Sliding-window perplexity of `encodings` under `model`.
# Each window holds at most `max_length` tokens and starts `stride` tokens after the
# previous one; only tokens not already scored by an earlier window contribute to the loss.
device = "cpu"
max_length = model.config.n_positions
stride = 512
seq_len = encodings.size(1)

nll_sum = 0.0  # running sum of per-token negative log-likelihoods
n_tokens = 0  # number of tokens that contributed to nll_sum
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # context-only positions are excluded from the loss

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

    # outputs.loss is the MEAN cross-entropy over the scored labels. The model shifts the
    # labels left by one internally, so the scored labels are target_ids[:, 1:] != -100.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    # Weight each window by its token count: a plain mean of per-window means would let a
    # short final window count as much as a full one.
    nll_sum += outputs.loss * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

# Perplexity is the exponential of the average per-token negative log-likelihood.
ppl = torch.exp(nll_sum / n_tokens)
ppl

  0%|          | 0/1 [00:00<?, ?it/s]


tensor(5.3548)

Task 2

In [127]:
# Fetch the CNN/DailyMail corpus, configuration "1.0.0"; no split is requested here,
# so every available split is loaded.
from datasets import load_dataset

dataset = load_dataset(path="cnn_dailymail", name="1.0.0")


In [128]:
# Display the loaded dataset object to inspect its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [129]:
# Load the summarization tokenizer and seq2seq model for Task 2.
# NOTE: this rebinds `tokenizer` and `model`, replacing the GPT-2 objects from Task 1.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

summarizer_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(summarizer_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(summarizer_checkpoint)
# Alternative checkpoint (kept disabled): "gniemiec/t5-small-finetuned-xsum"

In [131]:
# Collect the raw text of the first NUM_ARTICLES articles of the test split.
NUM_ARTICLES = 50
test_dataset = dataset["test"]

# Slicing a Dataset returns a dict of columns, so indexing "article" yields a plain list
# of strings (simply fewer items if the split has less than NUM_ARTICLES rows).
sentence_list = test_dataset[:NUM_ARTICLES]["article"]

In [138]:
# Length in characters (not tokens) of the article at index 14 of sentence_list.
len(sentence_list[14])

1619

In [139]:
# Read the maximum input length the currently loaded tokenizer reports.
# NOTE: this rebinds `max_length`, which the Task 1 perplexity cells set from
# model.config.n_positions.
max_length = tokenizer.model_max_length
print("Maximum sequence length for this BART model:", max_length)

Maximum sequence length for this BART model: 1024


In [142]:
# Summarize every collected article with greedy decoding and print each summary.
for article in sentence_list:
    # Truncate to the tokenizer's own limit rather than repeating the literal 1024.
    article_ids = tokenizer(
        article,
        return_tensors="pt",
        max_length=tokenizer.model_max_length,
        truncation=True,
    ).input_ids
    # max_length=30 caps the generated summary length, so a summary can stop mid-sentence.
    summary_ids = model.generate(article_ids, num_beams=1, do_sample=False, max_length=30)
    print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))


['The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories.']
['A dog apparently hit by a car and buried in a field survives. The dog, now named Theia, was found emaciated']
["Mohammad Javad Zarif is the Iranian foreign minister. He has been John Kerry's opposite number in securing a breakthrough in nuclear discussions"]
['The five were exposed to Ebola in Sierra Leone in March. None developed the deadly virus. They are clinicians for Partners in Health, a']
['The student admitted to hanging the noose, Duke University says. The student is no longer on campus and will face student conduct review.']
['Trey Moses, a star basketball player, asked Ellie Meredith, a freshman with Down syndrome, to be his prom date. Photos of']
["Amnesty International says governments are using the threat of terrorism to advance executions. The organization's annual report catalogs the use of st

In [145]:
# Preview every collected article; the slice keeps the first 1024 characters (not tokens).
for article in sentence_list:
    print(article[:1024])

(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's ceremony, sa