In [2]:
!pip install sumy transformers datasets evaluate bert-score --quiet

# 📚 Imports
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset
from tqdm import tqdm
import evaluate

# --- Trained BART checkpoint -------------------------------------------
# Identifier handed to `from_pretrained` for both the tokenizer and the
# model; edit it to evaluate a different checkpoint.
model_path = "bart_arxiv_30k_1024_model"
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

# --- Evaluation data: the first 100 rows of the test split -------------
dataset = load_dataset("ccdv/arxiv-summarization")
test_data = dataset["test"].select(range(100))

# --- Metric objects (loaded in the order rouge, meteor, bertscore) -----
rouge, meteor, bertscore = (
    evaluate.load(metric_name)
    for metric_name in ("rouge", "meteor", "bertscore")
)

# --- Per-sample score accumulators -------------------------------------
# One independent list per reported number; the evaluation loop appends a
# single value to each list for every sample it scores.
rouge1s = []
rouge2s = []
rougel = []
rougelsum = []
meteors = []
bert_precisions = []
bert_recalls = []
bert_f1s = []

# ✅ Hybrid summarization + scoring for every sample in `test_data`
#
# For each sample: TextRank picks 5 sentences from the article, BART rewrites
# that extract into the final summary, and the summary is scored against the
# reference abstract with ROUGE, METEOR and BERTScore. One value per metric is
# appended to the corresponding accumulator list.

# Neither object depends on the sample being processed, so build them once
# rather than on every iteration.
sentence_tokenizer = Tokenizer("english")
text_rank = TextRankSummarizer()

for sample in tqdm(test_data, desc="Running Hybrid Summarization"):
    article = sample["article"]
    reference = sample["abstract"]

    # --- Extractive Step ---
    parser = PlaintextParser.from_string(article, sentence_tokenizer)
    top_sentences = text_rank(parser.document, 5)
    extractive_summary = " ".join(str(sent) for sent in top_sentences)

    # --- Abstractive Step (BART) ---
    # The extract is truncated to at most 768 tokens before generation.
    inputs = tokenizer(
        extractive_summary,
        return_tensors="pt",
        truncation=True,
        max_length=768,
    )
    summary_ids = model.generate(
        **inputs,
        max_length=128,
        num_beams=4,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        early_stopping=True,
    )
    # Only the first returned sequence is decoded and scored.
    pred = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # --- Evaluation ---
    # Each metric is computed on this single (prediction, reference) pair.
    r = rouge.compute(predictions=[pred], references=[reference])
    rouge1s.append(r["rouge1"])
    rouge2s.append(r["rouge2"])
    rougel.append(r["rougeL"])
    rougelsum.append(r["rougeLsum"])

    m = meteor.compute(predictions=[pred], references=[reference])
    meteors.append(m["meteor"])

    # BERTScore returns one-element lists here because a single pair is
    # passed in; index 0 extracts the scalar.
    b = bertscore.compute(predictions=[pred], references=[reference], lang="en")
    bert_precisions.append(b["precision"][0])
    bert_recalls.append(b["recall"][0])
    bert_f1s.append(b["f1"][0])

# ✅ Final Average Metrics
def _mean(values):
    """Return the arithmetic mean of *values*, or NaN when it is empty.

    Dividing by the actual number of collected scores (instead of a fixed
    100) keeps the averages correct if the sample count changes or the
    evaluation loop stops early.
    """
    return sum(values) / len(values) if values else float("nan")


# Number of samples that were actually scored; with the default setup this
# is 100, so the header line is unchanged.
num_samples = len(rouge1s)

print(f"\n📊 Final Average Scores (Hybrid Extractive + Abstractive on {num_samples} Samples):")
print(f"ROUGE-1:     {_mean(rouge1s):.4f}")
print(f"ROUGE-2:     {_mean(rouge2s):.4f}")
print(f"ROUGE-L:     {_mean(rougel):.4f}")
print(f"ROUGE-Lsum:  {_mean(rougelsum):.4f}")
print(f"METEOR:      {_mean(meteors):.4f}")
print(f"BERT Precision: {_mean(bert_precisions):.4f}")
print(f"BERT Recall:    {_mean(bert_recalls):.4f}")
print(f"BERT F1 Score:  {_mean(bert_f1s):.4f}")


[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Running Hybrid Summarization:   0%|          | 0/100 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Running Hybrid Summarization: 100%|██████████| 100/100 [13:55<00:00,  8.36s/it]


📊 Final Average Scores (Hybrid Extractive + Abstractive on 100 Samples):
ROUGE-1:     0.3285
ROUGE-2:     0.0895
ROUGE-L:     0.1907
ROUGE-Lsum:  0.2796
METEOR:      0.1946
BERT Precision: 0.8456
BERT Recall:    0.8276
BERT F1 Score:  0.8363



