In [None]:
import json
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK data.
# word_tokenize() needs the Punkt sentence-tokenizer data. Older NLTK releases
# load it from the 'punkt' resource, newer ones look up 'punkt_tab' instead,
# so checking only for 'punkt' can succeed here and still fail later inside
# word_tokenize(). Make sure both are present; each is fetched only if missing.
for _punkt_resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_punkt_resource}')
    except LookupError:
        nltk.download(_punkt_resource)

# Load JSONL files
def load_jsonl(file_path):
    """Read a JSON Lines file into a DataFrame.

    Every non-blank line is parsed as one JSON document and becomes one row,
    in file order. Blank or whitespace-only lines (for example a trailing
    empty line at the end of the file) are skipped instead of being handed
    to the JSON parser.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file. A leading byte-order
            mark is tolerated.

    Returns:
        pandas.DataFrame built from the parsed records (empty if the file
        contains no records).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message includes the offending line number and the file path.
        OSError: If the file cannot be opened.
    """
    records = []
    # 'utf-8-sig' decodes plain UTF-8 unchanged but strips a BOM if present;
    # with plain 'utf-8' the BOM would stay glued to the first record and
    # make json.loads reject it.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for line_no, raw_line in enumerate(f, start=1):
            text = raw_line.strip()
            if not text:
                continue
            try:
                records.append(json.loads(text))
            except json.JSONDecodeError as err:
                # Re-raise the same exception type, but say where in the file
                # the bad record is.
                raise json.JSONDecodeError(
                    f"{err.msg} (line {line_no} of {file_path})", err.doc, err.pos
                ) from err
    return pd.DataFrame(records)

# Load your files
print("Loading JSONL files...")
model_summaries = load_jsonl('test_summaries.jsonl')  # Change to your file path
gold_summaries = load_jsonl('test_ref_summ.jsonl')    # Change to your file path

print(f"✓ Loaded {len(model_summaries)} model summaries")
print(f"✓ Loaded {len(gold_summaries)} gold summaries")

# Ensure both files have the same IDs and are aligned.
# Sorting each frame on its own only lines the rows up when both files contain
# exactly the same IDs; otherwise summaries would be scored against the wrong
# reference. So: keep only IDs present in both files, sort, then verify that
# the two ID columns really match row for row.
if 'ID' in model_summaries.columns and 'ID' in gold_summaries.columns:
    shared_ids = set(model_summaries['ID']) & set(gold_summaries['ID'])
    n_model_loaded = len(model_summaries)
    n_gold_loaded = len(gold_summaries)

    model_summaries = model_summaries[model_summaries['ID'].isin(shared_ids)]
    gold_summaries = gold_summaries[gold_summaries['ID'].isin(shared_ids)]

    n_model_dropped = n_model_loaded - len(model_summaries)
    n_gold_dropped = n_gold_loaded - len(gold_summaries)
    if n_model_dropped or n_gold_dropped:
        print(f"⚠ Ignoring {n_model_dropped} model and {n_gold_dropped} gold "
              f"summaries whose ID has no counterpart in the other file")

    model_summaries = model_summaries.sort_values('ID').reset_index(drop=True)
    gold_summaries = gold_summaries.sort_values('ID').reset_index(drop=True)

    # After restricting to shared IDs the columns can only differ if an ID is
    # repeated a different number of times in the two files.
    if model_summaries['ID'].tolist() != gold_summaries['ID'].tolist():
        raise ValueError(
            "Cannot pair model and gold summaries one-to-one: "
            "the 'ID' columns contain duplicate IDs"
        )
    print("✓ Aligned summaries by ID")
elif len(model_summaries) != len(gold_summaries):
    # Without IDs the rows are paired by position, which is only meaningful
    # when both files have the same number of rows.
    raise ValueError(
        f"Cannot pair summaries by position: {len(model_summaries)} model "
        f"vs {len(gold_summaries)} gold summaries and no shared 'ID' column"
    )

# Extract summary texts (adjust column names as needed).
# A missing/null summary is treated as an empty string so that it is scored
# (as a miss) instead of crashing the tokenizers further down.
model_texts = model_summaries['Summary'].fillna('').astype(str).tolist()  # Change 'Summary' to your column name
gold_texts = gold_summaries['reference_summary'].fillna('').astype(str).tolist()    # Change 'reference_summary' to your column name

if not model_texts:
    raise ValueError("No summary pairs to evaluate")

# Banner announcing the scoring pass.
print(f"\n{'='*70}")
print("Evaluating summaries...")
print(f"{'='*70}")

# ROUGE-2 / ROUGE-L scorer with stemming enabled, and the BLEU smoothing
# function (method1) passed to every sentence_bleu call below.
scorer = rouge_scorer.RougeScorer(['rouge2', 'rougeL'], use_stemmer=True)
smoothing = SmoothingFunction().method1

# One entry per scored pair, in the order of model_texts / gold_texts.
rouge2_scores, rougeL_scores, bleu_scores = [], [], []

total_pairs = len(model_texts)
for done, (candidate, target) in enumerate(zip(model_texts, gold_texts), start=1):
    # ROUGE F-measures of the model summary against its reference
    # (the reference is passed first).
    pair_rouge = scorer.score(target, candidate)
    rouge2_scores.append(pair_rouge['rouge2'].fmeasure)
    rougeL_scores.append(pair_rouge['rougeL'].fmeasure)

    # Sentence-level BLEU on lower-cased word tokens, single reference.
    bleu_scores.append(
        sentence_bleu(
            [word_tokenize(target.lower())],
            word_tokenize(candidate.lower()),
            smoothing_function=smoothing,
        )
    )

    # Progress line after every 100th pair.
    if done % 100 == 0:
        print(f"Processed {done}/{total_pairs} summaries...")

print(f"✓ Evaluation complete!")

# Mean of each metric over all scored pairs; the overall figure is the plain
# average of the three metric means.
avg_rouge2, avg_rougeL, avg_bleu = (
    np.mean(metric_scores)
    for metric_scores in (rouge2_scores, rougeL_scores, bleu_scores)
)
avg_overall = (avg_rouge2 + avg_rougeL + avg_bleu) / 3

# Spread of each metric, using np.std with its default settings.
std_rouge2, std_rougeL, std_bleu = (
    np.std(metric_scores)
    for metric_scores in (rouge2_scores, rougeL_scores, bleu_scores)
)

# Console report: header, then mean (± std), min/max range and quartiles for
# each metric. The prefixes carry the column padding of the report layout.
print(f"\n{'='*70}")
print("EVALUATION RESULTS")
print(f"{'='*70}")

print(f"\n Average Scores:")
for prefix, mean, spread in (
    ("  • ROUGE-2:      ", avg_rouge2, std_rouge2),
    ("  • ROUGE-L:      ", avg_rougeL, std_rougeL),
    ("  • BLEU:         ", avg_bleu, std_bleu),
):
    print(f"{prefix}{mean:.4f} (±{spread:.4f})")
print(f"  • Overall Avg:  {avg_overall:.4f}")

print(f"\n Score Ranges:")
for prefix, metric_scores in (
    ("  • ROUGE-2:  ", rouge2_scores),
    ("  • ROUGE-L:  ", rougeL_scores),
    ("  • BLEU:     ", bleu_scores),
):
    print(f"{prefix}[{min(metric_scores):.4f} - {max(metric_scores):.4f}]")

print(f"\n Percentiles:")
for prefix, metric_scores in (
    ("  • ROUGE-2:  ", rouge2_scores),
    ("  • ROUGE-L:  ", rougeL_scores),
    ("  • BLEU:     ", bleu_scores),
):
    # All three quartiles of one metric in a single call.
    q25, q50, q75 = np.percentile(metric_scores, [25, 50, 75])
    print(f"{prefix}25th={q25:.4f}, 50th={q50:.4f}, 75th={q75:.4f}")

# Per-summary results table: one row per scored pair.
# Rows are labelled with the model file's IDs when it has an 'ID' column,
# otherwise with their position in model_texts.
if 'ID' in model_summaries.columns:
    row_ids = model_summaries['ID']
else:
    row_ids = range(len(model_texts))

results_df = pd.DataFrame({
    'ID': row_ids,
    'ROUGE-2': rouge2_scores,
    'ROUGE-L': rougeL_scores,
    'BLEU': bleu_scores,
})
# Unweighted mean of the three metrics for each summary.
results_df['Average'] = (
    results_df['ROUGE-2'] + results_df['ROUGE-L'] + results_df['BLEU']
) / 3

# Write the per-summary table to disk (no index column).
results_df.to_csv('evaluation_results.csv', index=False)
print(f"\n Detailed results saved to: evaluation_results.csv")

# Save summary statistics
# One row per metric plus an "Overall Average" row. The overall row describes
# the per-summary average of the three metrics, so its Std/Min/Max/percentile
# cells hold real statistics rather than placeholder zeros (a 0 in the CSV
# would read as a measured value).
per_summary_avg = [
    (r2 + rl + b) / 3
    for r2, rl, b in zip(rouge2_scores, rougeL_scores, bleu_scores)
]

stat_inputs = [
    # (label, mean, std, scores the remaining statistics are taken from)
    ('ROUGE-2', avg_rouge2, std_rouge2, rouge2_scores),
    ('ROUGE-L', avg_rougeL, std_rougeL, rougeL_scores),
    ('BLEU', avg_bleu, std_bleu, bleu_scores),
    ('Overall Average', avg_overall, np.std(per_summary_avg), per_summary_avg),
]

stat_rows = []
for metric, mean, std, metric_scores in stat_inputs:
    # Quartiles of one metric in a single call.
    q25, q50, q75 = np.percentile(metric_scores, [25, 50, 75])
    stat_rows.append({
        'Metric': metric,
        'Mean': mean,
        'Std': std,
        'Min': min(metric_scores),
        'Max': max(metric_scores),
        '25th': q25,
        '50th': q50,
        '75th': q75,
    })

# Explicit column list keeps the CSV column order fixed.
summary_stats = pd.DataFrame(
    stat_rows,
    columns=['Metric', 'Mean', 'Std', 'Min', 'Max', '25th', '50th', '75th'],
)

summary_stats.to_csv('summary_statistics.csv', index=False)
print(f" Summary statistics saved to: summary_statistics.csv")

# Find best and worst performing summaries
# The five highest- and five lowest-scoring rows of results_df by 'Average'.
top5 = results_df.nlargest(5, 'Average')
bottom5 = results_df.nsmallest(5, 'Average')

# Columns are selected in the order they are unpacked below.
ranking_columns = ['ID', 'Average', 'ROUGE-2', 'ROUGE-L', 'BLEU']

for title, ranked in (
    ("TOP 5 BEST SUMMARIES (by average score):", top5),
    ("TOP 5 WORST SUMMARIES (by average score):", bottom5),
):
    print(f"\n{'='*70}")
    print(title)
    print(f"{'='*70}")
    # itertuples() keeps each column's own dtype. DataFrame.iterrows() builds
    # one Series per row, which upcasts integer IDs to float next to the
    # float scores and would print them as e.g. "ID: 7.0".
    for row_id, avg, r2, rl, bleu_value in ranked[ranking_columns].itertuples(index=False, name=None):
        print(f"ID: {row_id}, Avg: {avg:.4f}, "
              f"R-2: {r2:.4f}, R-L: {rl:.4f}, BLEU: {bleu_value:.4f}")

print(f"\n{'='*70}")
print("✓ Evaluation complete!")
print(f"{'='*70}")