In [2]:
!pip install transformers>=4.36.0 datasets==2.14.0 rouge-score fsspec==2023.6.0 --quiet

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from rouge_score import rouge_scorer
import time


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:
# Load CNN/DailyMail test articles and simulate multi-document summarization (MDS)
# by concatenating consecutive articles into a single input per sample.
NUM_ARTICLES = 100     # number of test articles pulled from the split
DOCS_PER_SAMPLE = 10   # number of consecutive articles merged into one MDS sample

dataset = load_dataset("cnn_dailymail", "3.0.0", split=f"test[:{NUM_ARTICLES}]")

# Group into multi-document samples. Only complete groups are built, and the
# bound comes from the dataset's actual length so a short split cannot raise
# an IndexError.
multidoc_test = []
for start in range(0, len(dataset) - DOCS_PER_SAMPLE + 1, DOCS_PER_SAMPLE):
    docs = " ".join(dataset[start + offset]["article"] for offset in range(DOCS_PER_SAMPLE))
    # NOTE(review): the reference is the highlights of the FIRST article in the
    # group only; the other articles' highlights are not part of the reference.
    summary = dataset[start]["highlights"]
    multidoc_test.append({"documents": docs, "summary": summary})

print(f"✅ Loaded {len(multidoc_test)} multi-document samples.")


✅ Loaded 10 multi-document samples.


In [4]:
def compute_rouge_scores(references, predictions):
    """Average ROUGE-1 / ROUGE-2 / ROUGE-L F-measures over paired summaries.

    Args:
        references: Iterable of reference summary strings.
        predictions: Iterable of generated summary strings, paired positionally
            with ``references``. Pairing uses ``zip``, so surplus items on the
            longer side are ignored.

    Returns:
        dict with keys "ROUGE-1", "ROUGE-2" and "ROUGE-L", each mapped to the
        mean F-measure across all pairs. All three values are 0.0 when there
        are no pairs to score.
    """
    # Output label -> metric name understood by rouge_scorer.
    metric_keys = {"ROUGE-1": "rouge1", "ROUGE-2": "rouge2", "ROUGE-L": "rougeL"}

    pairs = list(zip(references, predictions))
    if not pairs:
        # Nothing to average; return zeros instead of dividing by zero.
        return {label: 0.0 for label in metric_keys}

    scorer = rouge_scorer.RougeScorer(list(metric_keys.values()), use_stemmer=True)
    # RougeScorer.score takes the reference (target) first, then the prediction.
    scores = [scorer.score(ref, pred) for ref, pred in pairs]

    return {
        label: sum(s[key].fmeasure for s in scores) / len(scores)
        for label, key in metric_keys.items()
    }


In [5]:
def run_t5_inference(model_name, mds_data):
    """Summarize every sample with a T5 checkpoint and report ROUGE and latency.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``
            (e.g. "t5-base").
        mds_data: Sized collection of dicts with a "documents" string (the
            text to summarize) and a "summary" string (the reference).

    Returns:
        Tuple ``(rouge_scores, avg_time_per_sample)``: the dict returned by
        ``compute_rouge_scores`` and the mean seconds per sample spent in the
        tokenize -> generate -> decode loop (model loading is excluded).

    Raises:
        ValueError: If ``mds_data`` is empty. Raised before the model is
            loaded, so no weights are fetched for a run that cannot produce
            an average.
    """
    if len(mds_data) == 0:
        raise ValueError("mds_data is empty: nothing to summarize or average over.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    print(f"✅ {model_name} loaded on {device}")

    references = []
    predictions = []

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    for item in mds_data:
        input_text = "summarize: " + item["documents"]
        # The input is cut off at 1024 tokens, so any text beyond that limit
        # never reaches the model.
        # NOTE(review): the tokenizer warns that these checkpoints' default
        # limit is 512 — confirm that 1024 is intended.
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=1024,
        ).to(device)

        summary_ids = model.generate(
            **inputs,
            num_beams=4,
            max_length=256,
            min_length=32,
            length_penalty=2.0,
            early_stopping=True,
        )
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        predictions.append(summary)
        references.append(item["summary"])
    end_time = time.perf_counter()

    rouge_scores = compute_rouge_scores(references, predictions)
    avg_time_per_sample = (end_time - start_time) / len(mds_data)

    return rouge_scores, avg_time_per_sample


In [6]:
def _report(label, rouge, seconds):
    """Print the ROUGE scores and the average per-sample latency for one model."""
    print(f"📊 ROUGE ({label}):", rouge)
    print(f"⏱️ Avg Time/sample ({label}):", round(seconds, 4), "sec")


# T5-Base: run the smaller checkpoint first.
print("\n🔹 Running T5-Base")
scores_base, time_base = run_t5_inference("t5-base", multidoc_test)
_report("T5-Base", scores_base, time_base)

# T5-Large: same data and generation settings, larger checkpoint.
print("\n🔹 Running T5-Large")
scores_large, time_large = run_t5_inference("t5-large", multidoc_test)
_report("T5-Large", scores_large, time_large)



🔹 Running T5-Base




spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ t5-base loaded on cuda
📊 ROUGE (T5-Base): {'ROUGE-1': 0.36808773776260567, 'ROUGE-2': 0.1682702932217026, 'ROUGE-L': 0.2922502866508864}
⏱️ Avg Time/sample (T5-Base): 1.5225 sec

🔹 Running T5-Large


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✅ t5-large loaded on cuda
📊 ROUGE (T5-Large): {'ROUGE-1': 0.3666637238987361, 'ROUGE-2': 0.15719075955622336, 'ROUGE-L': 0.26758998668050543}
⏱️ Avg Time/sample (T5-Large): 3.1691 sec
