In [None]:
import nltk  # Import the Natural Language Toolkit library
nltk.download("punkt")  # Download the 'punkt' tokenizer models from NLTK
from nltk.tokenize import sent_tokenize  # Import the sentence tokenizer from NLTK

from transformers import pipeline, set_seed  # Import the pipeline and set_seed functions from the transformers library


from sumy.parsers.plaintext import PlaintextParser  # Import PlainTextParser from the sumy package
from sumy.nlp.tokenizers import Tokenizer  # Import Tokenizer from the sumy package
from sumy.summarizers.text_rank import TextRankSummarizer  # Import TextRankSummarizer from the sumy package

from datasets import load_dataset, load_metric  # Import functions to load datasets and metrics from the datasets library


import pandas as pd  # Import the pandas library to work with tables


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
dataset = load_dataset('ccdv/cnn_dailymail', '3.0.0')  # Load the CNN/DailyMail dataset
print(f"Features: {dataset['train'].column_names}")  # Print the column names of the training dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


  0%|          | 0/3 [00:00<?, ?it/s]

Features: ['article', 'highlights', 'id']


In [None]:
sample_text = dataset["train"][1]["article"][:2000]  # Get the first 2000 characters of the second article in the training dataset

# We'll collect the generated summaries of each model in a dictionary
summaries = {}  # Initialize an empty dictionary to store summaries


In [None]:
def three_sentence_summary(text):  # Define a function to generate a three-sentence summary
    return "\n".join(sent_tokenize(text)[:3])  # Return the first three sentences of the text


In [None]:
summaries["baseline"] = three_sentence_summary(sample_text)  # Generate a baseline summary and store it in the dictionary

In [None]:
summaries["baseline"]

"(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.\nThe fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.\nThe U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover."

In [None]:
# GPT2 summary
set_seed(42)  # Set the random seed for reproducibility
pipe = pipeline("text-generation", model="gpt2-xl")  # Initialize a text generation pipeline with the GPT-2 XL model
query = sample_text + "\nTL;DR:\n"  # Create a query for the GPT-2 model
pipe_out = pipe(query, max_new_tokens=1000, clean_up_tokenization_spaces=True)  # Generate text using the GPT-2 model
summaries["gpt2"] = "\n".join(  # Store the generated summary in the dictionary
    sent_tokenize(pipe_out[0]["generated_text"][len(query) :]))  # Tokenize the generated text into sentences
del pipe
del pipe_out
del query

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [1]:
# DeepSeek Summary
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # Define the model name for the DeepSeek model
set_seed(42)  # Set the random seed for reproducibility
pipe = pipeline("text-generation", model=model_name)  # Initialize a text generation pipeline with the DeepSeek model
query = sample_text + "\nTL;DR:\n"  # Create a query for the DeepSeek model
pipe_out = pipe(query, max_new_tokens=1000, clean_up_tokenization_spaces=True)  # Generate text using the DeepSeek model
summaries["DeepSeek"] = "\n".join(  # Store the generated summary in the dictionary
    sent_tokenize(pipe_out[0]["generated_text"][len(query) :]))  # Tokenize the generated text into sentences
del pipe
del pipe_out
del query

NameError: name 'set_seed' is not defined

In [None]:
#BART summary
pipe = pipeline("summarization", model="facebook/bart-large-cnn")  # Initialize a summarization pipeline with the BART model
pipe_out = pipe(sample_text)  # Generate a summary using the BART model
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))  # Store the generated summary in the dictionary
del pipe
del pipe_out


Device set to use cpu


NameError: name 'query' is not defined

In [None]:
summaries["DeepSeek"]

"In the men's 4x100 relay, Usain Bolt secured his third gold medal in Moscow, defeating Justin Gatlin, resulting in Jamaica's victory.\nThe U.S. team took second, while Canada and Britain finished third and fourth, respectively.\nThe relay was won by Bolt with Ashmeade providing the baton, and Bolt successfully took control of the baton from Gatlin.\nThe individual performances in the sprint events also contributed to Jamaica's dominance.\n</think>\n\nIn the men's 4x100m relay, Usain Bolt secured his third gold medal in Moscow, defeating Justin Gatlin, resulting in Jamaica's victory.\nThe U.S. team took second, while Canada and Britain finished third and fourth, respectively.\nThe relay was won by Bolt with Ashmeade providing the baton, and Bolt successfully took control of the baton from Gatlin.\nThe individual performances in the sprint events also contributed to Jamaica's dominance."

In [None]:
parser = PlaintextParser.from_string(sample_text, tokenizer = Tokenizer("english"))  # Use PlainTextParser to parse the sample text
summarizer = TextRankSummarizer()  # Initialize the TextRank summarizer

# Collect the summary sentences in a list
summary_sentences = []  # Initialize an empty list to store summary sentences
for sentence in summarizer(parser.document, 5):  # Generate summary sentences using the TextRank summarizer
    summary_sentences.append(str(sentence))  # Append each summary sentence to the list

# Join the sentences to form a single summary string
summaries["sumy"] = "\n".join(summary_sentences)  # Store the generated summary in the dictionary

# Print the summary (optional)
print(summaries["sumy"])  # Print the summary generated by the TextRank summarizer


(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.
The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.
The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles.
The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital.
Defending champions, the United States, were initially back in the bronze medal position after losing time on the second handover between Alexandria Anderson and English Gardner, but promoted to silver when France were subsequently disqualified for an illegal handover.


Comparing summaries

In [None]:
print("ORIGINAL TEXT")
for sentence in sent_tokenize(sample_text):
    print(sentence)
print("")

print("HUMAN SUMMARY")
print(dataset["train"][1]["highlights"])
print("")

for model_name in summaries:
    print(model_name.upper())
    print(summaries[model_name])
    print("")

ORIGINAL TAXT
(CNN) -- Usain Bolt rounded off the world championships Sunday by claiming his third gold in Moscow as he anchored Jamaica to victory in the men's 4x100m relay.
The fastest man in the world charged clear of United States rival Justin Gatlin as the Jamaican quartet of Nesta Carter, Kemar Bailey-Cole, Nickel Ashmeade and Bolt won in 37.36 seconds.
The U.S finished second in 37.56 seconds with Canada taking the bronze after Britain were disqualified for a faulty handover.
The 26-year-old Bolt has now collected eight gold medals at world championships, equaling the record held by American trio Carl Lewis, Michael Johnson and Allyson Felix, not to mention the small matter of six Olympic titles.
The relay triumph followed individual successes in the 100 and 200 meters in the Russian capital.
"I'm proud of myself and I'll continue to work to dominate for as long as possible," Bolt said, having previously expressed his intention to carry on until the 2016 Rio Olympics.
Victory wa

In [None]:
rouge_metric = load_metric("rouge")  # Load the ROUGE metric

In [None]:
reference = dataset["train"][1]["highlights"]  # Get the reference summary from the dataset
records = []  # Initialize an empty list to store ROUGE scores
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]  # Define the names of the ROUGE metrics

for model_name in summaries:  # Iterate over the generated summaries
    rouge_metric.add(prediction=summaries[model_name], reference=reference)  # Add the prediction and reference to the ROUGE metric
    score = rouge_metric.compute()  # Compute the ROUGE scores
    rouge_dict = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)  # Create a dictionary of ROUGE scores
    records.append(rouge_dict)  # Append the ROUGE scores to the list

pd.DataFrame.from_records(records, index=summaries.keys())  # Create a DataFrame from the ROUGE scores


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
baseline,0.303571,0.090909,0.214286,0.232143
gpt2,0.212121,0.0,0.121212,0.212121
DeepSeek,0.195402,0.046512,0.149425,0.172414
bart,0.582278,0.207792,0.455696,0.506329
sumy,0.218579,0.055249,0.131148,0.185792
