In [None]:
!pip install rouge-score sentence-transformers


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-many

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer, set_seed

# Hugging Face checkpoint identifiers for the two pretrained models
BART_CHECKPOINT = "facebook/bart-large-cnn"
EMBEDDER_CHECKPOINT = 'all-MiniLM-L6-v2'

# The summarization model and its tokenizer are loaded from the same checkpoint
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

# Sentence encoder that produces the embeddings
embedder = SentenceTransformer(EMBEDDER_CHECKPOINT)

# Toy corpus: each document is a list of sentences on a single topic.
market_doc = [
    "The stock market crashed on Monday.",
    "Investors were worried about inflation.",
    "Tech stocks led the losses.",
    "Analysts predict a slow recovery.",
    "Some investors see buying opportunities.",
]
hurricane_doc = [
    "The hurricane caused severe flooding.",
    "Thousands were evacuated from coastal areas.",
    "Emergency crews rescued stranded residents.",
    "Roads and bridges were damaged.",
    "Authorities declared a state of emergency.",
]

# Documents in a fixed order: market first, hurricane second.
documents = [market_doc, hurricane_doc]

# Generate one summary per document with BART.
# Decoding samples tokens (do_sample=True), so seed the RNGs first; without
# a fixed seed every Restart & Run All would produce different summaries
# and therefore different ROUGE / similarity numbers downstream.
set_seed(42)

generated_summaries = []
for doc in documents:
    # The model consumes each document as a single string of joined sentences.
    input_text = " ".join(doc)
    # Inputs longer than 1024 tokens are truncated.
    inputs = tokenizer([input_text], max_length=1024, return_tensors="pt", truncation=True)
    summary_ids = bart_model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly rather than letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        num_beams=4,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        max_length=60,
        min_length=20,
        early_stopping=True
    )
    # The batch holds a single sequence; decode it back to plain text.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    generated_summaries.append(summary)

# Show each generated summary under a 1-based heading
for idx, text in enumerate(generated_summaries, start=1):
    print(f"\nGenerated Summary {idx}:\n{text}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Generated Summary 1:
The stock market crashed on Monday. Tech stocks led the losses. Analysts predict a slow recovery. Some investors see buying opportunities.

Generated Summary 2:
The hurricane caused severe flooding. Thousands were evacuated from coastal areas. Roads and bridges were damaged.


In [None]:
# ROUGE-1 / ROUGE-2 / ROUGE-L for every generated summary.
# Each summary is scored against its own source document, joined into one string.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
for i, summary in enumerate(generated_summaries):
    reference = " ".join(documents[i])
    score = scorer.score(reference, summary)
    print(f"\nDocument {i+1} ROUGE Scores:")
    # One result per requested ROUGE variant, keyed by the variant name.
    for metric, result in score.items():
        print(f"{metric.upper()} - Precision: {result.precision:.3f}, "
              f"Recall: {result.recall:.3f}, F1: {result.fmeasure:.3f}")

# Cosine similarity between each summary's embedding and the centroid
# (mean) of its document's sentence embeddings.
print("\nCentroid vs Summary Similarity (cosine):")
for i, doc in enumerate(documents):
    # One embedding per sentence; averaging over axis 0 gives the centroid.
    doc_embed = embedder.encode(doc)
    centroid = doc_embed.mean(axis=0)
    summary_embed = embedder.encode([generated_summaries[i]])[0]
    # cosine_similarity expects 2-D inputs, so reshape each vector to one row.
    sim = cosine_similarity(centroid.reshape(1, -1), summary_embed.reshape(1, -1))[0, 0]
    print(f"Document {i+1}: {sim:.4f}")



Document 1 ROUGE Scores:
ROUGE1 - Precision: 1.000, Recall: 0.808, F1: 0.894
ROUGE2 - Precision: 0.950, Recall: 0.760, F1: 0.844
ROUGEL - Precision: 1.000, Recall: 0.808, F1: 0.894

Document 2 ROUGE Scores:
ROUGE1 - Precision: 1.000, Recall: 0.593, F1: 0.744
ROUGE2 - Precision: 0.933, Recall: 0.538, F1: 0.683
ROUGEL - Precision: 1.000, Recall: 0.593, F1: 0.744

Centroid vs Summary Similarity (cosine):
Document 1: 0.8201
Document 2: 0.8352
