In [12]:
# Final package fix — clean uninstall + reinstall compatible versions
!pip uninstall -y gcsfs
!pip install --upgrade fsspec==2023.6.0


Found existing installation: gcsfs 2025.3.2
Uninstalling gcsfs-2025.3.2:
  Successfully uninstalled gcsfs-2025.3.2
Collecting fsspec==2023.6.0
  Downloading fsspec-2023.6.0-py3-none-any.whl.metadata (6.7 kB)
Downloading fsspec-2023.6.0-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires gcsfs!=2025.5.0,>=2023.3.0, which is not installed.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3

In [None]:
import os
os.kill(os.getpid(), 9)


In [2]:
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset
import torch
import numpy as np

# Use a dummy topic extractor for simplicity (normally you'd use LDA or KeyBERT etc.)
def extract_keywords(text, top_k=5):
    words = list(set(text.lower().split()))
    return words[:top_k]  # crude keyword extractor


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load pre-trained BART model
topic_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
topic_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")


In [4]:
def generate_topic_guided_summary(documents, model, tokenizer, top_k=5, device="cpu"):
    joined_docs = " ".join(documents)
    topic_keywords = extract_keywords(joined_docs, top_k=top_k)
    prompt = "summarize with focus on: " + ", ".join(topic_keywords) + " </s> " + joined_docs

    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, min_length=30, max_length=150, length_penalty=2.0)

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [1]:
from datasets import load_dataset

raw_dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
subset = raw_dataset.select(range(50))  # Select first 50 samples

# Form 5 multi-document samples of 10 docs each
multidoc_test = []
for i in range(0, 50, 10):
    docs = subset[i:i+10]
    articles = docs["article"]
    reference_summary = " ".join(docs["highlights"])
    multidoc_test.append({
        "documents": articles,
        "summary": reference_summary
    })

print("✅ Created", len(multidoc_test), "multi-document samples.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

✅ Created 5 multi-document samples.


In [5]:
!pip install rouge-score




In [6]:
from rouge_score import rouge_scorer

def compute_rouge_scores(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {"rouge1": [], "rouge2": [], "rougeL": []}

    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores["rouge1"].append(score["rouge1"].fmeasure)
        scores["rouge2"].append(score["rouge2"].fmeasure)
        scores["rougeL"].append(score["rougeL"].fmeasure)

    # Average scores
    avg_scores = {k: sum(v) / len(v) for k, v in scores.items()}
    return avg_scores


In [10]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
model.to("cuda" if torch.cuda.is_available() else "cpu")

generated_topic_guided = []

for item in multidoc_test:
    summary = generate_topic_guided_summary(
        item["documents"], model=model, tokenizer=tokenizer, top_k=7
    )
    generated_topic_guided.append({
        "reference": item["summary"],
        "generated": summary
    })

print("✅ Generated", len(generated_topic_guided), "summaries using topic-guided BART.")



✅ Generated 5 summaries using topic-guided BART.


In [11]:
from rouge_score import rouge_scorer
import numpy as np

def compute_rouge_scores(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = {"rouge1": [], "rouge2": [], "rougeL": []}

    for pred, ref in zip(predictions, references):
        result = scorer.score(ref, pred)
        scores["rouge1"].append(result["rouge1"].fmeasure)
        scores["rouge2"].append(result["rouge2"].fmeasure)
        scores["rougeL"].append(result["rougeL"].fmeasure)

    # Average scores
    return {k: np.mean(v) for k, v in scores.items()}

# Run evaluation
preds = [x["generated"] for x in generated_topic_guided]
refs = [x["reference"] for x in generated_topic_guided]
rouge_scores = compute_rouge_scores(preds, refs)

# Print results
print("🔍 ROUGE-1:", rouge_scores["rouge1"])
print("🔍 ROUGE-2:", rouge_scores["rouge2"])
print("🔍 ROUGE-L:", rouge_scores["rougeL"])


🔍 ROUGE-1: 0.1236122816163372
🔍 ROUGE-2: 0.044526540151861825
🔍 ROUGE-L: 0.08448216441889086
