In [1]:
! pip install rouge_score --quiet
! pip install sacrebleu --quiet

[0m

In [2]:
import numpy as np
import pandas as pd

In [3]:
from datasets import load_dataset

# Select the '3.0.0' configuration by its config name (second positional
# argument) rather than through a `version=` keyword override, which resolves
# to a config called "default" (see the download log below).
dataset = load_dataset('cnn_dailymail', '3.0.0')

Downloading builder script:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/default to /root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/default/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [5]:
# Bind each split of the DatasetDict to its own top-level name.
train, validation, test = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [6]:
# Names of the columns in the training split; the bare expression on the last
# line makes the notebook display them.
column_names = train.column_names
column_names

['article', 'highlights', 'id']

In [7]:
# let's take a look at some of the entries in the train dataset

# Slice rows first: `train[:3]` reads just three records, whereas
# `train['article'][:3]` builds the entire 287,113-row column before slicing.
train_head = train[:3]
text, summary = train_head['article'], train_head['highlights']

for tx, summ in zip(text, summary):
    print(f"Article (excerpt of 500 characters) {tx[:500]}\n")
    print(f"Summary: {summ}")
    print("")

Article (excerpt of 500 characters) It's official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of che

Summary: Syrian official: Obama climbed to the top of the tree, "doesn't know how to get down"
Obama sends a letter to the heads of the House and Senate .
Obama to seek congressional approval on military action against Syria .
Aim is to determine whether CW were used, not by whom, says U.N. spokesman .

Article (excerpt of 500 characters) (CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaic

In [8]:
# same for the validation set

# Slice rows first: `validation[:3]` reads just three records, whereas
# `validation['article'][:3]` builds the entire 13,368-row column before slicing.
validation_head = validation[:3]
text, summary = validation_head['article'], validation_head['highlights']

for tx, summ in zip(text, summary):
    print(f"Article (excerpt of 500 characters) {tx[:500]}\n")
    print(f"Summary: {summ}")
    print("")

Article (excerpt of 500 characters) (CNN)Singer-songwriter David Crosby hit a jogger with his car Sunday evening, a spokesman said. The accident happened in Santa Ynez, California, near where Crosby lives. Crosby was driving at approximately 50 mph when he struck the jogger, according to California Highway Patrol Spokesman Don Clotworthy. The posted speed limit was 55. The jogger suffered multiple fractures, and was airlifted to a hospital in Santa Barbara, Clotworthy said. His injuries are not believed to be life threatening. "Mr

Summary: Accident happens in Santa Ynez, California, near where Crosby lives .
The jogger suffered multiple fractures; his injuries are not believed to be life-threatening .

Article (excerpt of 500 characters) (CNN)Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members singing a racist chant. SAE's national chapter suspended the students, but University of Oklahoma President David Boren took it a step further, saying the univer

## Evaluating different transformers on the task

In [9]:
# Running example for the model comparison: the training article at index 1,
# cut to its first 2000 characters.
sample_text = train[1]['article'][:2000]
sample_text

'(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men\'s 4x100m relay. The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds. The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover. The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles. The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital. "I\'m proud of myself and I\'ll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intention to carry on until the 2016 Rio Olympics. Victory was never se

### GPT-2

In [10]:
from transformers import pipeline, set_seed

# Fix the seed before generating so the sampled continuation can be reproduced.
set_seed(42)
pipe = pipeline('text-generation', model = 'gpt2')

# GPT-2 is used as a plain text generator here: appending "TL;DR:" to the
# article prompts it to continue with a summary.
query = sample_text + "\nTL;DR:\n"
output = pipe(query, max_length = 512, clean_up_tokenization_spaces = True)

# Drop the first len(query) characters so only the continuation is kept
# (assumes the generated text starts with the prompt itself -- TODO confirm).
gpt2_output = output[0]['generated_text'][len(query):]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

2023-01-26 21:27:02.480328: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [11]:
gpt2_output

"Usain Bolt triumphed after a difficult final, but then he got into trouble when Canada made him try to block. It's the last match he's held all of last year with the men, who were later picked in the third and fourth rounds of the men's 100m and 200m and at Rio-Gold.\nUPDATE - November 8 - Team USA won in 36.36,"

### T5

In [12]:
# Summarization pipeline backed by the 't5-base' checkpoint; rebinds `pipe`.
pipe = pipeline('summarization', model = 't5-base')
output = pipe(sample_text)

# Keep the summary string of the first result.
t5_output = output[0]['summary_text']

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [13]:
t5_output

'usain bolt wins third gold medal at world championships . anchors Jamaica to victory in 4x100m relay . anchored by shelly-Ann Fraser-Pryce who completes triple . world champions the united states finish second .'

### PEGASUS

In [14]:
# Summarization pipeline backed by the PEGASUS checkpoint fine-tuned on
# CNN/DailyMail (per the checkpoint name); rebinds `pipe`.
pipe = pipeline('summarization', model = 'google/pegasus-cnn_dailymail')
output = pipe(sample_text)

# Rewrite each " .<n>" in the summary as "." followed by a newline
# (presumably "<n>" is the model's line-break marker -- TODO confirm).
pegasus_output = output[0]['summary_text'].replace(" .<n>", ".\n")

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

In [15]:
pegasus_output

"Usain Bolt wins third gold of world championships.\nAnchors Jamaica to victory in men's 4x100m relay.\nEighth gold at the championships for Bolt.\nJamaica also win women's 4x100m relay ."

### BART

In [16]:
# Summarization pipeline backed by the 'facebook/bart-large-cnn' checkpoint;
# rebinds `pipe`.
pipe = pipeline('summarization', model = 'facebook/bart-large-cnn')

output = pipe(sample_text)
# Keep the summary string of the first result.
bart_output = output[0]['summary_text']

Downloading:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [17]:
bart_output

"Usain Bolt wins his third gold of the world championships in Moscow. Bolt anchors Jamaica to victory in the men's 4x100m relay. The 26-year-old has now won eight gold medals at world championships. Jamaica's women also win gold in the relay, beating France in the process."

Let's compare the summaries of the sample text produced by the different models.

In [18]:
# Print the reference highlights first, then each model's summary in the order
# GPT-2, T5, PEGASUS, BART, every entry followed by one blank line.
ground_truth = train[1]['highlights']
print(ground_truth, end="\n\n")

for output in (gpt2_output, t5_output, pegasus_output, bart_output):
    print(output, end="\n\n")

Usain Bolt wins third gold of world championship .
Anchors Jamaica to 4x100m relay victory .
Eighth gold at the championships for Bolt .
Jamaica double up in women's 4x100m relay .

Usain Bolt triumphed after a difficult final, but then he got into trouble when Canada made him try to block. It's the last match he's held all of last year with the men, who were later picked in the third and fourth rounds of the men's 100m and 200m and at Rio-Gold.
UPDATE - November 8 - Team USA won in 36.36,

usain bolt wins third gold medal at world championships . anchors Jamaica to victory in 4x100m relay . anchored by shelly-Ann Fraser-Pryce who completes triple . world champions the united states finish second .

Usain Bolt wins third gold of world championships.
Anchors Jamaica to victory in men's 4x100m relay.
Eighth gold at the championships for Bolt.
Jamaica also win women's 4x100m relay .

Usain Bolt wins his third gold of the world championships in Moscow. Bolt anchors Jamaica to victory in th

Based on the summaries above, I can say that the PEGASUS, BART and T5 summaries were all quite good, with a preference for the summary from the BART model.

## Metrics and Finetuning

Let's understand the BLEU metric and compute it for an example

In [19]:
from datasets import load_metric

# Metric objects reused by the cells below: sacreBLEU for the toy BLEU examples,
# ROUGE for scoring the summaries.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm against the
# installed version before upgrading.
bleu_metric = load_metric('sacrebleu')
rouge_metric = load_metric('rouge')

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [20]:
# Toy example 1: a degenerate prediction that only repeats "the".
bleu_metric.add(prediction='the the the the the', reference=['the cat is on the mat'])
# 'floor' smoothing with value 0 -- presumably leaves zero n-gram counts at
# zero, i.e. effectively no smoothing (TODO confirm against sacreBLEU docs).
results = bleu_metric.compute(smooth_method='floor', smooth_value=0)
# Round the per-order precisions to two decimals for display only.
results['precisions'] = [np.round(p, 2) for p in results['precisions']]

results

{'score': 0.0,
 'counts': [2, 0, 0, 0],
 'totals': [5, 4, 3, 2],
 'precisions': [40.0, 0.0, 0.0, 0.0],
 'bp': 0.8187307530779819,
 'sys_len': 5,
 'ref_len': 6}

In [21]:
# Toy example 2: the prediction matches the reference except for one missing "the".
bleu_metric.add(prediction='the cat is on mat', reference=['the cat is on the mat'])
# Same smoothing settings as the previous example so the two scores are comparable.
results = bleu_metric.compute(smooth_method='floor', smooth_value=0)
# Round the per-order precisions to two decimals for display only.
results['precisions'] = [np.round(p, 2) for p in results['precisions']]

results

{'score': 57.89300674674101,
 'counts': [5, 3, 2, 1],
 'totals': [5, 4, 3, 2],
 'precisions': [100.0, 75.0, 66.67, 50.0],
 'bp': 0.8187307530779819,
 'sys_len': 5,
 'ref_len': 6}

Let's calculate the ROUGE score for the summaries we had from the different models above.

In [22]:
# Score each model's summary of the sample article against the reference
# highlights and collect the mid F-measure of every ROUGE variant in a table.
reference = ground_truth
summaries = {
    'GPT2': gpt2_output,
    'T5': t5_output,
    'PEGASUS': pegasus_output,
    'BART': bart_output,
}

rouge_names = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
records = []

for model_name, model_summary in summaries.items():
    # One add/compute round per model, so each score covers a single pair.
    rouge_metric.add(prediction=model_summary, reference=reference)
    score = rouge_metric.compute()
    rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
    records.append(rouge_dict)

pd.DataFrame.from_records(records, index=summaries.keys())

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
GPT2,0.212766,0.021739,0.12766,0.148936
T5,0.52459,0.237288,0.42623,0.459016
PEGASUS,0.866667,0.655172,0.8,0.833333
BART,0.582278,0.207792,0.455696,0.506329


As the dataframe above shows, PEGASUS has the highest ROUGE scores, followed by BART.

## Evaluating PEGASUS on Test dataset

In [23]:
from tqdm import tqdm
import torch

# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def chunks(list_of_elements, batch_size):
    """Lazily split a sliceable sequence into consecutive batches.

    Args:
        list_of_elements: Any object supporting ``len()`` and slicing.
        batch_size: Maximum number of elements per batch.

    Yields:
        Slices of ``list_of_elements`` holding ``batch_size`` items each; the
        final slice is shorter when the length is not an exact multiple.
    """
    batch_starts = range(0, len(list_of_elements), batch_size)
    yield from (
        list_of_elements[start: start + batch_size] for start in batch_starts
    )
        
def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, 
                               batch_size=16, column_text = 'article', column_summary = 'highlights'):
    """Generate a summary for every article in ``dataset`` and score them.

    Args:
        dataset: Object where ``dataset[column_text]`` and
            ``dataset[column_summary]`` each return a sliceable sequence of
            strings, aligned row by row.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; called on a list of strings
            and used to decode the generated token ids.
        batch_size: Number of articles passed to one ``generate`` call.
        column_text: Column holding the documents to summarise.
        column_summary: Column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns once every batch has been added.
    """
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    # Send inputs to the device the model itself reports; only fall back to the
    # notebook-level `device` for model objects that lack a `.device` attribute.
    target_device = model.device if hasattr(model, 'device') else device

    # zip() has no length, so pass `total` explicitly; otherwise tqdm can only
    # show a running count without percentage or ETA.
    batch_pairs = zip(article_batches, target_batches)
    for article_batch, target_batch in tqdm(batch_pairs, total=len(article_batches)):
        # Every batch is truncated/padded to exactly 1024 tokens.
        # NOTE(review): padding to the longest item in the batch would likely be
        # cheaper; kept as-is so scores stay comparable with earlier runs.
        inputs = tokenizer(article_batch, max_length = 1024, 
                           truncation = True, padding = 'max_length', return_tensors = 'pt')
        # NOTE(review): `length_penalty` presumably only matters for beam search
        # and is inert with num_beams = 1 -- confirm before tuning it.
        summaries = model.generate(inputs['input_ids'].to(target_device), 
                                   attention_mask = inputs['attention_mask'].to(target_device), 
                                   length_penalty = 0.8, num_beams = 1, max_length = 128)

        decoded_summaries = tokenizer.batch_decode(summaries, skip_special_tokens = True,
                                                   clean_up_tokenization_spaces = True)
        # Replace the "<n>" markers left in the decoded text with spaces.
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions = decoded_summaries, references = target_batch)

    score = metric.compute()
    return score

In [24]:
# Small evaluation sample: shuffle the test split with a fixed seed, then keep
# the first 100 rows.
test_sampled = dataset["test"].shuffle(seed=42).select(range(100))

In [25]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the tokenizer and seq2seq model directly (not through `pipeline`) so
# tokenization and generation arguments can be set per batch in the evaluation.
model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Move the model to the device chosen earlier in the notebook.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

The cell below takes quite a long time to run, so it is left unrun in this notebook.

In [None]:
# Score PEGASUS on the sampled test rows, eight articles per generate call.
score = evaluate_summaries_pegasus(test_sampled, rouge_metric, model, tokenizer, batch_size=8)
# Keep the mid F-measure of each ROUGE variant and show it as a one-row table.
rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=["pegasus"])

12it [13:58, 65.96s/it]