In [None]:
# Pin NumPy below 2.0 and restart the kernel so the downgraded build is the one
# that later cells import. The install + restart only happens when the installed
# NumPy is missing or >= 2.0, so once the pin is in place this cell is a no-op
# and "Restart & Run All" can continue past it.
import os
import subprocess
import sys
from importlib.metadata import PackageNotFoundError, version

try:
    numpy_major = int(version("numpy").split(".")[0])
except PackageNotFoundError:
    numpy_major = None  # NumPy is not installed at all; install it below.

if numpy_major is None or numpy_major >= 2:
    # Use the kernel's own interpreter so pip targets the kernel's environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy<2.0"])
    # Signal 9 (SIGKILL on POSIX) terminates this kernel process immediately.
    # NOTE(review): this relies on the hosting runtime (e.g. Colab) starting a
    # fresh kernel afterwards -- confirm for other environments.
    os.kill(os.getpid(), 9)




In [1]:
# Install correct versions of datasets and fsspec
!pip install -q datasets==2.14.0 fsspec==2023.6.0

from datasets import load_dataset

# Load a small subset (100 samples) from CNN/DailyMail for quick testing
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:50]")

multidoc_test = []
for i in range(0, 50, 10):
    docs = " ".join(dataset[i + j]["article"] for j in range(10))
    summary = dataset[i]["highlights"]
    multidoc_test.append({"documents": docs, "summary": summary})

print(f"✅ Loaded {len(multidoc_test)} multi-document samples.")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Loaded 5 multi-document samples.


In [2]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch

# Load the PEGASUS tokenizer and model, placing the model on GPU when available.
model_name = "google/pegasus-multi_news"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Options passed to the tokenizer call: PyTorch tensors, truncated to 1024 tokens.
_encode_options = dict(
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=1024,
)

# Options passed to model.generate (beam search with length constraints).
_generate_options = dict(
    num_beams=4,
    max_length=256,
    min_length=32,
    length_penalty=2.0,
    early_stopping=True,
)


def _summarize(text):
    """Tokenize `text`, run generation, and return the first decoded sequence."""
    encoded = tokenizer(text, **_encode_options).to(model.device)
    output_ids = model.generate(**encoded, **_generate_options)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


# Pair every reference summary with the model's generated summary.
generated_summaries = []
for sample in multidoc_test:
    generated_text = _summarize(sample["documents"])
    generated_summaries.append(
        {"reference": sample["summary"], "generated": generated_text}
    )

print("✅ Done! Generated", len(generated_summaries), "summaries.")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-multi_news and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Done! Generated 5 summaries.


In [5]:
# STEP 3: Evaluate ROUGE Scores
#
# Scores each generated summary against its reference with ROUGE-1/2/L
# (F-measure, stemming enabled) and reports the per-metric average, plus the
# average time spent on scoring one sample.

from rouge_score import rouge_scorer
import time

if not generated_summaries:
    # Fail with a clear message instead of a ZeroDivisionError further down.
    raise ValueError("generated_summaries is empty; run the generation cell first.")

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
# perf_counter is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

for sample in generated_summaries:
    # RougeScorer.score takes (target, prediction), i.e. reference first.
    scores = scorer.score(sample["reference"], sample["generated"])
    rouge1_scores.append(scores["rouge1"].fmeasure)
    rouge2_scores.append(scores["rouge2"].fmeasure)
    rougeL_scores.append(scores["rougeL"].fmeasure)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
avg_time_per_sample = elapsed_time / len(generated_summaries)

# Final averaged scores
print("📊 ROUGE Evaluation on", len(generated_summaries), "samples")
for label, values in (
    ("ROUGE-1", rouge1_scores),
    ("ROUGE-2", rouge2_scores),
    ("ROUGE-L", rougeL_scores),
):
    print(f"{label}: {sum(values) / len(values):.4f}")
# Only the scoring loop is timed here (not model inference), so say so.
print(f"\n⏱️ Avg ROUGE scoring time per sample: {avg_time_per_sample:.2f} sec")


📊 ROUGE Evaluation on 5 samples
ROUGE-1: 0.1748
ROUGE-2: 0.0556
ROUGE-L: 0.1250

⏱️ Avg time per sample: 0.01 sec
