In [1]:
import torch
from datasets import load_dataset

import pandas as pd
import gc
from transformers import AutoTokenizer, T5ForConditionalGeneration

import evaluate
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize (used when computing ROUGE) needs the Punkt sentence-tokenizer data.
# NLTK >= 3.8.2 looks up the "punkt_tab" resource instead of the pickled "punkt"
# one, so fetch both to keep the notebook runnable on old and new NLTK releases.
nltk.download("punkt")
nltk.download("punkt_tab")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\youse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load version 3.0.0 of the CNN/DailyMail dataset.
full_dataset = load_dataset(
    "cnn_dailymail", "3.0.0"
)  # Note: no cache_dir is passed here, so the library's default cache location applies.

# Use a small sample of the data during this lab, for speed.
# Only training articles containing "CNN" within their first 25 characters are kept;
# the shuffle uses a fixed seed (42) before the first `sample_size` rows are selected.
sample_size = 100
sample = (
    full_dataset["train"]
    .filter(lambda r: "CNN" in r["article"][:25])
    .shuffle(seed=42)
    .select(range(sample_size))
)
sample

Downloading data: 100%|██████████| 313M/313M [00:22<00:00, 13.7MB/s] 
Downloading data: 100%|██████████| 304M/304M [00:21<00:00, 14.1MB/s] 
Downloading data: 100%|██████████| 155M/155M [00:11<00:00, 14.0MB/s] 
Downloading data: 100%|██████████| 34.7M/34.7M [00:03<00:00, 11.2MB/s]
Downloading data: 100%|██████████| 30.0M/30.0M [00:02<00:00, 11.9MB/s]
Generating train split: 100%|██████████| 287113/287113 [00:03<00:00, 85132.32 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 95727.57 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 88069.42 examples/s]
Filter: 100%|██████████| 287113/287113 [00:02<00:00, 96653.98 examples/s]


Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 100
})

In [3]:
# Render the sampled rows (article, highlights, id) as a pandas DataFrame.
display(sample.to_pandas())

Unnamed: 0,article,highlights,id
0,(CNN) -- A magnitude 6.7 earthquake rattled Pa...,Papua New Guinea is on the so-called Ring of F...,8093dba7bc2260c26f18939826909ef27549c758
1,(CNN) -- Pakistan took big steps towards level...,Australia collapse to 88 all out on opening da...,67d626156f971d0bf55e5f2a48e1ed965eb622a6
2,(CNN) -- Federal prosecutors are pushing to fo...,Jared Loughner is refusing the government's re...,0d02fb8f0d406db956b128a5c1cc7bf3f13860a6
3,"Centennial, Colorado (CNN) -- McKayla Hicks sa...",Shooting victim McKayla Hicks went to hearing ...,39aee887c6d34bd311c826142b14037e6f2639ee
4,(CNN) -- Double-amputee sprinter Oscar Pistori...,Oscar Pistorius to become first double-amputee...,cc83ecdf08f0b598c3b97b3e2819c7e0ae7ca4f2
...,...,...,...
95,(CNN) -- Samuel Eto'o netted a superb hat-tric...,Samuel Eto'o scored a hat-trick as Inter Milan...,6c1924f5852b6980a0835877d3f9591a00c70f37
96,Washington (CNN) -- President Barack Obama's r...,Obama raised almost $30 million less than Romn...,0a5691b8fe654b6b2cdace5ab87aff2ee4c23577
97,(CNN) -- Violence swept across Syria on Friday...,NEW: U.N. Secretary-General Ban Ki-moon joins ...,2cc6e4db9758192ac467bbd7424782e4c92206c1
98,(CNN) -- New HIV infections have fallen worldw...,New infections in sub-Saharan Africa 15 percen...,acb2148184f83ecb516ad19a1b0a0e1bc5047237


In [4]:
# Look at the first sampled row: the full article text and its reference summary.
first_example = sample[0]
example_article = first_example["article"]
example_summary = first_example["highlights"]
print(f"Article:\n{example_article}\n")
print(f"Summary:\n{example_summary}")

Article:

Summary:
Papua New Guinea is on the so-called Ring of Fire .
It's on an arc of fault lines that is prone to frequent earthquakes .


In [5]:
def batch_generator(data: list, batch_size: int):
    """
    Creates batches of size `batch_size` from a list.

    :param data: Sequence to split; it is only measured with `len` and sliced.
    :param batch_size: Maximum number of items per batch. Must be positive.
    :return: Generator yielding consecutive slices of `data`. The last slice may be
        shorter than `batch_size`; nothing is yielded when `data` is empty.
    :raises ValueError: When iteration starts and `batch_size` is not positive.
    """
    # A zero or negative batch size would never advance through `data`,
    # so reject it instead of looping forever.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    for start in range(0, len(data), batch_size):
        # Slicing clamps at the end of `data`, so the final batch is simply shorter.
        yield data[start : start + batch_size]

In [6]:
def summarize_with_t5(
    model_checkpoint: str, articles: list, batch_size: int = 8
) -> list:
    """
    Compute summaries using a T5 model.
    This is similar to a `pipeline` for a T5 model but does tokenization manually.

    The model runs on "cuda:0" when CUDA is available and on the CPU otherwise.
    The model and tokenizer are released before returning, including when
    tokenization or generation raises.

    :param model_checkpoint: Name for a model checkpoint in Hugging Face, such as "t5-small" or "t5-base"
    :param articles: List of strings, where each string represents one article.
    :param batch_size: Number of articles tokenized and passed to `generate` per call.
    :return: List of strings, where each string represents one article's generated summary
    """
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=1024)

    def perform_inference(batch: list) -> list:
        # Pad to the longest article in the batch and truncate to 1024 tokens.
        inputs = tokenizer(
            batch, max_length=1024, return_tensors="pt", padding=True, truncation=True
        )

        # Beam search with 2 beams; generated summaries are capped at 40 tokens.
        summary_ids = model.generate(
            inputs.input_ids.to(device),
            attention_mask=inputs.attention_mask.to(device),
            num_beams=2,
            min_length=0,
            max_length=40,
        )
        return tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    res = []
    try:
        # Every article gets the "summarize: " task prefix before tokenization.
        summary_articles = ["summarize: " + article for article in articles]
        for batch in batch_generator(summary_articles, batch_size=batch_size):
            res += perform_inference(batch)

            # Release per-batch memory before the next batch is processed.
            torch.cuda.empty_cache()
            gc.collect()
    finally:
        # clean up -- done in `finally` so a failed run does not leave the model
        # and tokenizer referenced by this frame (e.g. via a stored traceback).
        del tokenizer
        del model
        torch.cuda.empty_cache()
        gc.collect()
    return res

In [7]:
# Summarize every sampled article with the "t5-small" checkpoint,
# using summarize_with_t5's default batch size.
t5_small_summaries = summarize_with_t5("t5-small", sample["article"])

config.json: 100%|██████████| 1.21k/1.21k [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
model.safetensors: 100%|██████████| 242M/242M [00:08<00:00, 28.2MB/s] 
generation_config.json: 100%|██████████| 147/147 [00:00<?, ?B/s] 
tokenizer_config.json: 100%|██████████| 2.32k/2.32k [00:00<?, ?B/s]
spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.83MB/s]
tokenizer.json: 100%|██████████| 1.39M/1.39M [00:01<00:00, 1.06MB/s]


In [8]:
# The dataset's "highlights" column is used as the reference summaries
# that the generated summaries are compared against below.
reference_summaries = sample["highlights"]

In [9]:
# Show generated and reference summaries side by side.
summary_comparison = pd.DataFrame(
    {"generated": t5_small_summaries, "reference": reference_summaries}
)
display(summary_comparison)

Unnamed: 0,generated,reference
0,a magnitude 6.7 earthquake rattles Papua new G...,Papua New Guinea is on the so-called Ring of F...
1,the two-Test cricket series is being played in...,Australia collapse to 88 all out on opening da...
2,federal prosecutors want jared Lee Loughner to...,Jared Loughner is refusing the government's re...
3,"new: ""he tried to kill people,"" a 17-year-old ...",Shooting victim McKayla Hicks went to hearing ...
4,double-amputee sprinter Oscar Pistorius will c...,Oscar Pistorius to become first double-amputee...
...,...,...
95,holders Inter Milan thrash Werder Bremen 4-0 i...,Samuel Eto'o scored a hat-trick as Inter Milan...
96,president's re-election campaign raises $71 mi...,Obama raised almost $30 million less than Romn...
97,"at least 75 people were killed in protests, an...",NEW: U.N. Secretary-General Ban Ki-moon joins ...
98,new infections have fallen by 17 percent in th...,New infections in sub-Saharan Africa 15 percen...


In [10]:
# Exact-match accuracy: the fraction of generated summaries that are
# character-for-character identical to their reference summary.
num_references = len(reference_summaries)
exact_matches = sum(
    1.0
    for idx in range(num_references)
    if t5_small_summaries[idx] == reference_summaries[idx]
)
accuracy = exact_matches / num_references

print(f"Achieved accuracy {accuracy}!")

Achieved accuracy 0.0!


In [12]:
# Load the ROUGE metric through the `evaluate` library.
# compute_rouge_score reads this module-level `rouge_score` object.
rouge_score = evaluate.load("rouge")

In [13]:
def compute_rouge_score(generated: list, reference: list) -> dict:
    """
    Score a batch of model summaries against ground-truth summaries with ROUGE.

    Thin wrapper around the Hugging Face `rouge_score` metric, which expects
    the sentences of each summary to be separated by newlines.

    :param generated: Summaries (list of strings) produced by the model
    :param reference: Ground-truth summaries (list of strings) for comparison
    :return: The result of `rouge_score.compute` (stemming enabled) on the reformatted summaries
    """

    def one_sentence_per_line(summaries: list) -> list:
        # Strip surrounding whitespace, split into sentences, re-join with newlines.
        reformatted = []
        for text in summaries:
            sentences = sent_tokenize(text.strip())
            reformatted.append("\n".join(sentences))
        return reformatted

    return rouge_score.compute(
        predictions=one_sentence_per_line(generated),
        references=one_sentence_per_line(reference),
        use_stemmer=True,
    )

In [14]:
# ROUGE scores of the t5-small summaries against the reference highlights.
compute_rouge_score(t5_small_summaries, reference_summaries)

{'rouge1': 0.3099475501450091,
 'rouge2': 0.1063625707267443,
 'rougeL': 0.22168703371543252,
 'rougeLsum': 0.2818316064759516}

In [15]:
# Sanity check: scoring the references against themselves.
# The recorded output is 1.0 for every ROUGE variant.
compute_rouge_score(reference_summaries, reference_summaries)

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}

In [16]:
# Sanity check: one empty prediction per reference.
# The recorded output is 0.0 for every ROUGE variant.
compute_rouge_score(
    generated=["" for _ in range(len(reference_summaries))],
    reference=reference_summaries,
)

{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}

In [17]:
# Stemming disabled: the prediction differs from the reference only in
# "beat"/"beating" and "record"/"records", and the recorded scores are below 1.0.
rouge_score.compute(
    predictions=["Large language models beat world record"],
    references=["Large language models beating world records"],
    use_stemmer=False,
)

{'rouge1': 0.6666666666666666,
 'rouge2': 0.4000000000000001,
 'rougeL': 0.6666666666666666,
 'rougeLsum': 0.6666666666666666}

In [18]:
# Stemming enabled on a prediction/reference pair that differs only in
# "beat"/"beating" and "record"/"records": the recorded scores are all 1.0.
rouge_score.compute(
    predictions=["Large language models beat world record"],
    references=["Large language models beating world records"],
    use_stemmer=True,
)

{'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}

In [19]:
# Six-word prediction against a one-word reference: only "Large" is shared,
# so there is no bigram overlap (rouge2 is 0.0 in the recorded output).
rouge_score.compute(
    predictions=["Large language models beat world record"],
    references=["Large"],
    use_stemmer=True,
)

{'rouge1': 0.2857142857142857,
 'rouge2': 0.0,
 'rougeL': 0.2857142857142857,
 'rougeLsum': 0.2857142857142857}

In [20]:
# One-word prediction against a six-word reference: only "Large" is shared,
# so there is no bigram overlap (rouge2 is 0.0 in the recorded output).
rouge_score.compute(
    predictions=["Large"],
    references=["Large language models beat world record"],
    use_stemmer=True,
)

{'rouge1': 0.2857142857142857,
 'rouge2': 0.0,
 'rougeL': 0.2857142857142857,
 'rougeLsum': 0.2857142857142857}

In [21]:
# Two-word prediction that matches the first two words of the reference.
rouge_score.compute(
    predictions=["Large language"],
    references=["Large language models beat world record"],
    use_stemmer=True,
)

{'rouge1': 0.5, 'rouge2': 0.33333333333333337, 'rougeL': 0.5, 'rougeLsum': 0.5}

In [22]:
# Same words as the reference in a different order: in the recorded output
# rouge1 stays at 1.0 while rouge2 and rougeL drop below 1.0.
rouge_score.compute(
    predictions=["Models beat large language world record"],
    references=["Large language models beat world record"],
    use_stemmer=True,
)

{'rouge1': 1.0,
 'rouge2': 0.6,
 'rougeL': 0.6666666666666666,
 'rougeLsum': 0.6666666666666666}

In [29]:
from evaluate import load

# Score a single prediction/reference pair with the BERTScore metric.
bertscore = load("bertscore")
predictions = ["Large language models beat world record"]
references = ["Large language models beating world records"]
results = bertscore.compute(
    predictions=predictions,
    references=references,
    lang="en",
)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Display the dict returned by bertscore.compute.
results

{'precision': [0.9795935750007629],
 'recall': [0.9795935750007629],
 'f1': [0.9795935750007629],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.36.2)'}