#### Environment Setup & Imports

In [1]:
import os
# Set ahead of `import torch`; presumably so the flag is already in the
# environment when CUDA is first initialised -- TODO confirm it is still needed,
# since this notebook runs on the CPU.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # For easier debugging (no effect on CPU)

import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import pandas as pd
import evaluate
from tqdm.auto import tqdm
import html
import re
import random
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including library warnings that may point at real problems in later cells.
warnings.filterwarnings("ignore")


#### Configuration

In [2]:
# --- Paths -------------------------------------------------------------
# Directory handed to `from_pretrained` for both the tokenizer and the model.
MODEL_DIR = "../../Models/pegasus_english_finetuned/final_model"
# CSV file holding the test split that is evaluated in this notebook.
RAW_TEST_PATH = "../../Data/Processed/english_test_cleaned.csv"

# --- CSV column names ----------------------------------------------------
ARTICLE_COL, SUMMARY_COL = "Article", "Summary"

# --- Tokenization / generation limits --------------------------------------
MAX_INPUT_LENGTH = 1024    # tokenizer `max_length` (inputs are truncated to this)
MAX_SUMMARY_LENGTH = 100   # `max_length` passed to `model.generate`
BEAM_SIZE = 4              # `num_beams` passed to `model.generate`

# --- Evaluation subset -----------------------------------------------------
SAMPLE_SIZE = 300          # number of clean & valid samples to evaluate
MIN_ARTICLE_WORDS = 10     # minimum whitespace-separated words per article
MIN_SUMMARY_WORDS = 5      # minimum whitespace-separated words per summary


#### Load Model and Tokenizer, and Set Up Device (CPU for stability)

In [3]:
# Inference in this notebook is pinned to the CPU.
device = torch.device("cpu")
print(f"Using device: {device}")

# Tokenizer and weights are both restored from the same directory.
tokenizer = PegasusTokenizer.from_pretrained(MODEL_DIR)
model = PegasusForConditionalGeneration.from_pretrained(MODEL_DIR)
model.to(device)   # nn.Module.to moves the parameters in place
model.eval()       # switch to evaluation mode (e.g. disables dropout)

# Sanity check: the tokenizer must not be able to emit IDs the model config
# does not account for.
assert model.config.vocab_size == tokenizer.vocab_size, "Tokenizer and model vocab size mismatch!"
print(f"Loaded model and tokenizer with vocab size: {tokenizer.vocab_size}")


Using device: cpu
Loaded model and tokenizer with vocab size: 96103


#### Load Raw Test Data and Initial Cleaning

In [4]:
# Read the test split from RAW_TEST_PATH into a DataFrame and report its size.
# `test_df` is reassigned by the cleaning and filtering steps later in this cell.
test_df = pd.read_csv(RAW_TEST_PATH)
print(f"Total raw test samples loaded: {len(test_df)}")

# Patterns are compiled once here rather than on every clean_text() call.
_LAYOUT_WHITESPACE_RE = re.compile(r'[\t\n\r\x0b\x0c]')   # tab, newlines, VT, FF
_ZERO_WIDTH_RE = re.compile(r'[\u200B-\u200D\uFEFF]')      # zero-width characters
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x1F\x7F-\x9F]')    # remaining control chars
# '#' is not listed: it is always replaced by a space before this check runs.
_DISALLOWED_RE = re.compile(r'[$@%&*{}<>\\^~`|]')
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def clean_text(text):
    """Clean and normalize input text.

    Steps, in order: decode HTML entities, turn layout whitespace into
    spaces, strip zero-width and control characters, replace '#' with a
    space, reject the text if it contains a disallowed special character,
    and finally collapse whitespace runs and trim.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is not a string or
        contains any of the characters ``$ @ % & * { } < > \\ ^ ~ ` |``
        after HTML unescaping.

    NOTE(review): the early reject is all-or-nothing, so a single '%' or
    '$' discards the whole row. It likely accounts for most rows dropped
    by the cleaning step -- confirm this filter is intended.
    """
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)                      # decode HTML entities
    # Tabs, vertical tabs and form feeds are converted to spaces together
    # with newlines. Otherwise the control-character pass below deletes them
    # and fuses the neighbouring words ("a\tb" -> "ab").
    text = _LAYOUT_WHITESPACE_RE.sub(' ', text)
    text = _ZERO_WIDTH_RE.sub('', text)
    text = _CONTROL_CHARS_RE.sub('', text)
    text = text.replace('#', ' ')                   # remove '#' symbol
    # Early reject: drop rows with these disallowed special characters
    if _DISALLOWED_RE.search(text):
        return ""
    return _WHITESPACE_RUN_RE.sub(' ', text).strip()  # normalize spaces

# Clean articles and summaries.
# clean_text() already maps non-string values (e.g. NaN from empty CSV cells)
# to "", so it is applied directly. Casting with astype(str) first would turn
# NaN into the literal text "nan" and let it pass the empty-text filter below.
test_df[ARTICLE_COL] = test_df[ARTICLE_COL].apply(clean_text)
test_df[SUMMARY_COL] = test_df[SUMMARY_COL].apply(clean_text)

# Drop empty articles or summaries after cleaning
test_df = test_df[(test_df[ARTICLE_COL] != "") & (test_df[SUMMARY_COL] != "")]
print(f"Samples after cleaning and dropping empty texts: {len(test_df)}")

# Filter by word length thresholds.
# Word counts are kept in standalone Series instead of temporary columns, so
# nothing is assigned onto a filtered slice and no inplace drop is needed.
article_word_counts = test_df[ARTICLE_COL].map(lambda x: len(x.split()))
summary_word_counts = test_df[SUMMARY_COL].map(lambda x: len(x.split()))
test_df = test_df[
    (article_word_counts >= MIN_ARTICLE_WORDS) &
    (summary_word_counts >= MIN_SUMMARY_WORDS)
].reset_index(drop=True)

print(f"Samples after applying min word length filter: {len(test_df)}")


Total raw test samples loaded: 2889
Samples after cleaning and dropping empty texts: 503
Samples after applying min word length filter: 503


#### Validate Token IDs and Randomly Select 300 Valid Samples

In [5]:
# Fixed seed so the evaluation subset, and therefore the reported ROUGE,
# is the same after every Restart & Run All.
SAMPLE_SEED = 42

valid_indices = []

print("Filtering and validating samples by token ID range...")

# Keep only rows whose tokenized article stays inside the tokenizer's ID range.
for idx, article_text in tqdm(test_df[ARTICLE_COL].items(), total=len(test_df)):
    inputs = tokenizer(
        article_text,
        return_tensors="pt",
        truncation=True,
        padding="longest",
        max_length=MAX_INPUT_LENGTH,
    )
    max_id = inputs.input_ids.max().item()
    min_id = inputs.input_ids.min().item()

    if min_id < 0 or max_id >= tokenizer.vocab_size:
        continue  # skip invalid samples

    valid_indices.append(idx)

print(f"Total valid samples found: {len(valid_indices)}")

if len(valid_indices) < SAMPLE_SIZE:
    print(f"Warning: Only {len(valid_indices)} valid samples found, less than desired {SAMPLE_SIZE}")

# Random sampling for a representative subset. A dedicated, seeded generator
# makes the draw reproducible without touching the global `random` state.
sampler = random.Random(SAMPLE_SEED)
sampled_indices = sampler.sample(valid_indices, min(SAMPLE_SIZE, len(valid_indices)))
eval_df = test_df.loc[sampled_indices].reset_index(drop=True)

print(f"Randomly selected {len(eval_df)} valid samples for evaluation.")


Filtering and validating samples by token ID range...


  0%|          | 0/503 [00:00<?, ?it/s]

Total valid samples found: 503
Randomly selected 300 valid samples for evaluation.


#### Inference on Clean Subset

In [None]:
results = []
failed_indices = []  # eval_df row labels whose generation raised an exception

print("Starting inference on selected clean samples...")

# Summarize one article at a time with beam search and collect
# (article, reference, generated) records.
for idx, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
    article_text = row[ARTICLE_COL]
    reference_summary = row[SUMMARY_COL]

    try:
        inputs = tokenizer(
            article_text,
            return_tensors="pt",
            truncation=True,
            padding="longest",
            max_length=MAX_INPUT_LENGTH,
        ).to(device)

        with torch.no_grad():  # no gradients are needed for generation
            summary_ids = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_length=MAX_SUMMARY_LENGTH,
                num_beams=BEAM_SIZE,
                early_stopping=True,
            )
        generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    except Exception as e:
        # Best-effort: keep the run going, but record the failure so it can be
        # reported once the loop has finished.
        print(f"Error at index {idx}: {e}")
        failed_indices.append(idx)
        generated_summary = ""

    results.append({
        "article": article_text,
        "reference": reference_summary,
        "generated": generated_summary
    })

# Failed rows are saved with an empty "generated" field. Report them explicitly
# so they are not overlooked among the progress output.
if failed_indices:
    print(
        f"Warning: generation failed for {len(failed_indices)} of {len(eval_df)} samples "
        f"(indices: {failed_indices}); their 'generated' field is empty."
    )

results_df = pd.DataFrame(results)
results_df.to_csv("pegasus_clean_filtered_eval_results.csv", index=False)
print("Saved inference results to 'pegasus_clean_filtered_eval_results.csv'")


#### Compute and Display ROUGE Metrics

In [7]:
# Load the ROUGE metric through the `evaluate` library.
rouge = evaluate.load("rouge")

# Generated and reference summaries, aligned row by row.
predictions, references = (
    results_df[column].tolist() for column in ("generated", "reference")
)

rouge_scores = rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True,
)

print("ROUGE Scores on clean, randomly selected evaluation subset:")
print(f"ROUGE-1 F1: {rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2 F1: {rouge_scores['rouge2']:.4f}")
print(f"ROUGE-L F1: {rouge_scores['rougeL']:.4f}")


ROUGE Scores on clean, randomly selected evaluation subset:
ROUGE-1 F1: 0.2296
ROUGE-2 F1: 0.1440
ROUGE-L F1: 0.1996
