In [3]:
!pip uninstall -y gcsfs
!pip install --upgrade fsspec==2023.6.0
!pip install spacy rouge-score datasets
!python -m spacy download en_core_web_sm


Found existing installation: gcsfs 2025.3.2
Uninstalling gcsfs-2025.3.2:
  Successfully uninstalled gcsfs-2025.3.2
Collecting fsspec==2023.6.0
  Downloading fsspec-2023.6.0-py3-none-any.whl.metadata (6.7 kB)
Downloading fsspec-2023.6.0-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires gcsfs!=2025.5.0,>=2023.3.0, which is not installed.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3

In [None]:
import os

# Hard-restart the runtime by sending signal 9 to this kernel's own process.
# NOTE(review): presumably done so the package changes from the install cell take
# effect in a fresh interpreter — confirm. Every cell below must be re-run afterwards.
kernel_pid = os.getpid()
os.kill(kernel_pid, 9)


In [1]:
# Load the CNN/DailyMail test split and group its first articles into
# multi-document samples (documents + concatenated reference highlights).
from datasets import load_dataset

# Sizes used to build the evaluation set; change them here only.
NUM_ARTICLES = 50       # articles taken from the head of the test split
DOCS_PER_SAMPLE = 10    # articles merged into one multi-document sample

raw_dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")
subset = raw_dataset.select(range(NUM_ARTICLES))

# Each sample holds DOCS_PER_SAMPLE consecutive articles; its reference summary is
# the space-joined highlights of those same articles.
multidoc_test = []
for start in range(0, NUM_ARTICLES, DOCS_PER_SAMPLE):
    # Slicing the dataset is indexed by column name below ("article", "highlights").
    docs = subset[start:start + DOCS_PER_SAMPLE]
    documents = docs["article"]
    reference_summary = " ".join(docs["highlights"])
    multidoc_test.append({
        "documents": documents,
        "summary": reference_summary
    })

print("✅ Created", len(multidoc_test), "multi-document samples.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

✅ Created 5 multi-document samples.


In [2]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

# Sentence encoder: tokenizer and model are loaded from the same pretrained
# checkpoint, named once so the two cannot drift apart.
BERT_CHECKPOINT = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = BertModel.from_pretrained(BERT_CHECKPOINT)

class HierarchicalEncoder(nn.Module):
    """Scores the sentences of one document from their embeddings.

    A bidirectional LSTM runs over the sequence of sentence embeddings and a
    linear layer maps every LSTM state to a single scalar score.

    Args:
        sentence_hidden_size: Width of each incoming sentence embedding.
        document_hidden_size: Hidden size of the LSTM, per direction.
    """

    def __init__(self, sentence_hidden_size=768, document_hidden_size=512):
        super().__init__()
        # The module-level `bert` is registered as a submodule; `forward` does not
        # call it, but it does follow this module on `.to(device)`.
        self.sentence_encoder = bert
        self.doc_encoder = nn.LSTM(
            input_size=sentence_hidden_size,
            hidden_size=document_hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        # Both LSTM directions are concatenated, hence twice the hidden size.
        self.decoder = nn.Linear(2 * document_hidden_size, 1)

    def forward(self, sentence_embeddings):
        """Return one score per row of ``sentence_embeddings``.

        Args:
            sentence_embeddings: Un-batched tensor of sentence embeddings; a
                leading batch dimension of size 1 is added here.

        Returns:
            Tensor of scores with the added batch dimension removed again.
        """
        batched = sentence_embeddings.unsqueeze(0)  # add batch dim
        lstm_states, _ = self.doc_encoder(batched)
        per_sentence = self.decoder(lstm_states).squeeze(-1)  # drop trailing size-1 dim
        return per_sentence.squeeze(0)  # drop the batch dim added above


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
import spacy
# Load the small English spaCy pipeline once and reuse it for every document.
# The model is fetched by the `python -m spacy download en_core_web_sm` step in
# the install cell at the top of the notebook.
nlp = spacy.load("en_core_web_sm")

def spacy_sent_tokenize(text):
    """Split ``text`` into sentences using the module-level spaCy pipeline ``nlp``.

    Args:
        text: Raw text to segment.

    Returns:
        List of sentence strings with surrounding whitespace removed; sentences
        that are empty after stripping are dropped.
    """
    sentences = []
    for span in nlp(text).sents:
        cleaned = span.text.strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences


In [4]:
def generate_hierarchical_summary(docs, model, tokenizer, device="cpu", top_k=5):
    """Build an extractive summary from the best-scoring sentences of ``docs``.

    Every document is split with ``spacy_sent_tokenize``; all sentences are encoded
    by ``model`` in one batch and each sentence is scored by the mean of its
    first-token hidden state. The selected sentences are emitted in their
    original order, joined by single spaces.

    Args:
        docs: Iterable of document strings.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable that encodes a list of sentences and returns an object
            supporting ``.to(device)``.
        device: Device the encoded inputs are moved to. Only the inputs are moved;
            ``model`` must already live on this device.
        top_k: Maximum number of sentences to keep. If fewer sentences are
            available, all of them are returned.

    Returns:
        The summary string, or ``""`` when there are no sentences or ``top_k``
        is not positive.
    """
    all_sentences = [sentence for doc in docs for sentence in spacy_sent_tokenize(doc)]

    # topk() raises when asked for more elements than exist, so clamp the request
    # to the number of available sentences.
    k = min(top_k, len(all_sentences))
    if k <= 0:
        return ""

    inputs = tokenizer(all_sentences, padding=True, truncation=True, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :]  # CLS tokens

    # One scalar per sentence: the mean over the embedding dimension.
    sentence_scores = embeddings.mean(dim=1)
    topk_indices = sentence_scores.topk(k).indices
    # Restore document order so the summary reads in the original sequence.
    selected = sorted(topk_indices.tolist())

    return " ".join(all_sentences[i] for i in selected)


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Place the sentence encoder on the target device explicitly and keep it in
# inference mode. Previously `bert` only reached the device as a side effect of
# `HierarchicalEncoder().to(device)` (it is registered there as a submodule).
bert.to(device)
bert.eval()

# NOTE(review): `hier_model` is instantiated but never used below — sentences are
# scored directly from `bert`'s output by generate_hierarchical_summary.
hier_model = HierarchicalEncoder().to(device)
generated_hierarchical = []

# Produce one extractive summary per multi-document sample, paired with its reference.
for item in multidoc_test:
    summary = generate_hierarchical_summary(
        item["documents"], model=bert, tokenizer=tokenizer, device=device, top_k=5
    )
    generated_hierarchical.append({
        "reference": item["summary"],
        "generated": summary
    })

print("✅ Generated", len(generated_hierarchical), "summaries using Hierarchical Transformer.")


✅ Generated 5 summaries using Hierarchical Transformer.


In [6]:
from rouge_score import rouge_scorer

# Evaluate the generated summaries: ROUGE-1 / ROUGE-2 / ROUGE-L F-measure with stemming.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

# Per-sample F-measures, one list per ROUGE variant.
r1, r2, rl = [], [], []

for item in generated_hierarchical:
    # The reference is passed first, the generated summary second.
    scores = scorer.score(item["reference"], item["generated"])
    for bucket, rouge_type in ((r1, "rouge1"), (r2, "rouge2"), (rl, "rougeL")):
        bucket.append(scores[rouge_type].fmeasure)

# Report the plain average over all samples for each variant.
for label, values in (("🔍 ROUGE-1:", r1), ("🔍 ROUGE-2:", r2), ("🔍 ROUGE-L:", rl)):
    print(label, sum(values)/len(values))


🔍 ROUGE-1: 0.20021640330953666
🔍 ROUGE-2: 0.03017643594449842
🔍 ROUGE-L: 0.11530348561202135
