# Smart Summarizer: Model Evaluation

This notebook evaluates the performance of the LoRA fine-tuned summarization model, comparing it with the base model using both automatic metrics and LLM-as-a-Judge evaluation.

## Overview

1. Load the base and fine-tuned models
2. Generate summaries for test samples
3. Evaluate using automatic metrics (ROUGE, BLEU, BERTScore)
4. Evaluate using LLM-as-a-Judge
5. Visualize and analyze results

In [None]:
# Install required libraries if not already installed
!pip install -q transformers peft datasets evaluate rouge-score nltk bert-score together

In [None]:
# Import necessary libraries
import os
import sys
import json
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import evaluate
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from dotenv import load_dotenv

# Read settings from the .env file located one directory above the notebook
load_dotenv("../.env")

# Put the parent directory on sys.path (only once) so the local package can be imported
module_path = os.path.abspath("..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Project-local evaluation helper used throughout the notebook
from smart_summarizer.evaluation.evaluation import SummaryEvaluator

## 1. Load Models and Test Dataset

We'll load both the base model and the fine-tuned model, along with a subset of the test dataset.

In [None]:
# Configuration: each value is read from the environment, with a fallback default
# NOTE(review): confirm the default base-model id resolves on the Hugging Face Hub;
# the Llama 3 8B base checkpoint appears to be published as "meta-llama/Meta-Llama-3-8B".
BASE_MODEL = os.getenv("BASE_MODEL", "meta-llama/Llama-3-8B")
LORA_MODEL_DIR = os.getenv(
    "LORA_MODEL_PATH",
    "../smart_summarizer/models/lora_summarizer/final_model",
)
TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")

# Report only whether a key is present; the key itself is never printed
api_key_status = 'Yes' if TOGETHER_API_KEY else 'No'
for config_line in (
    f"Base model: {BASE_MODEL}",
    f"LoRA model directory: {LORA_MODEL_DIR}",
    f"Together API key available: {api_key_status}",
):
    print(config_line)

# Warn (without aborting) when the LoRA adapter directory is missing
lora_dir_missing = not os.path.exists(LORA_MODEL_DIR)
if lora_dir_missing:
    print(f"Warning: LoRA model not found at {LORA_MODEL_DIR}")
    print("Please run the training notebook first or update the path.")

In [None]:
# Load test dataset
# The evaluation runs on a small slice of the test split returned by the project loader
from smart_summarizer.data.data_preprocessing import load_arxiv_dataset

print("Loading test dataset...")
dataset = load_arxiv_dataset()

# Keep only the first 20 test examples
first_test_indices = range(20)
test_dataset = dataset['test'].select(first_test_indices)
print(f"Test dataset size: {len(test_dataset)}")

In [None]:
# Gather the constructor arguments, then build the evaluator from them
evaluator_settings = {
    "base_model_name": BASE_MODEL,
    "lora_model_dir": LORA_MODEL_DIR,
    "together_api_key": TOGETHER_API_KEY,
}
evaluator = SummaryEvaluator(**evaluator_settings)

# Ask the evaluator to load whatever it needs for generation
print("Loading models...")
evaluator.load_models()

## 2. Generate Summaries

Generate summaries using both the base model and the fine-tuned model.

In [None]:
# Detailed evaluation uses the first 10 samples of the test subset
evaluation_samples = test_dataset.select(range(10))

print("Generating summaries...")
summaries = []

for sample_idx, record in enumerate(tqdm(evaluation_samples)):
    source_text = record["article"]
    reference_abstract = record["abstract"]

    # Same article through both models: base first, then fine-tuned
    base_output = evaluator.generate_summary(source_text, use_base_model=True)
    tuned_output = evaluator.generate_summary(source_text, use_base_model=False)

    summary_row = {
        "id": sample_idx,
        "title": record["title"],
        "ground_truth": reference_abstract,
        "base_summary": base_output,
        "fine_tuned_summary": tuned_output,
    }
    summaries.append(summary_row)

# One row per sample; shown as the cell output
summaries_df = pd.DataFrame(summaries)
summaries_df

## 3. Automatic Metrics Evaluation

Evaluate the generated summaries using ROUGE, BLEU, and BERTScore.

In [None]:
# Pull the reference abstracts and both sets of generated summaries out of the table
ground_truths = summaries_df["ground_truth"].tolist()
base_summaries = summaries_df["base_summary"].tolist()
fine_tuned_summaries = summaries_df["fine_tuned_summary"].tolist()

# ROUGE
print("Calculating ROUGE scores...")
base_rouge = evaluator.compute_rouge(base_summaries, ground_truths)
fine_tuned_rouge = evaluator.compute_rouge(fine_tuned_summaries, ground_truths)

# BLEU
print("Calculating BLEU scores...")
base_bleu = evaluator.compute_bleu(base_summaries, ground_truths)
fine_tuned_bleu = evaluator.compute_bleu(fine_tuned_summaries, ground_truths)

# BERTScore (best effort: any failure is reported and replaced by placeholders)
try:
    print("Calculating BERTScore...")
    base_bert = evaluator.compute_bertscore(base_summaries, ground_truths)
    fine_tuned_bert = evaluator.compute_bertscore(fine_tuned_summaries, ground_truths)
except Exception as e:
    print(f"Error calculating BERTScore: {str(e)}")
    # NOTE(review): zero placeholders are indistinguishable from real zero scores downstream
    base_bert = dict(precision=0, recall=0, f1=0)
    fine_tuned_bert = dict(precision=0, recall=0, f1=0)

# Keys are listed in the same order as the labels in the "Metric" column
rouge_keys = ("rouge1", "rouge2", "rougeL")
bert_keys = ("precision", "recall", "f1")

metrics_results = {
    "Metric": ["ROUGE-1", "ROUGE-2", "ROUGE-L", "BLEU", "BERTScore-P", "BERTScore-R", "BERTScore-F1"],
    "Base Model": (
        [base_rouge[key] for key in rouge_keys]
        + [base_bleu]
        + [base_bert[key] for key in bert_keys]
    ),
    "Fine-tuned Model": (
        [fine_tuned_rouge[key] for key in rouge_keys]
        + [fine_tuned_bleu]
        + [fine_tuned_bert[key] for key in bert_keys]
    ),
}

# One row per metric; shown as the cell output
metrics_df = pd.DataFrame(metrics_results)
metrics_df

In [None]:
# Calculate improvement percentage relative to the base model.
# A base score of exactly 0 (e.g. BLEU with no n-gram overlap) has no meaningful
# relative change; dividing by it would give +/-inf. Treat such denominators as NaN
# so the improvement is reported as undefined and is skipped by later averages.
nonzero_base = metrics_df["Base Model"].replace(0, np.nan)
metrics_df["Improvement (%)"] = (
    (metrics_df["Fine-tuned Model"] - metrics_df["Base Model"]) / nonzero_base * 100
)
metrics_df

In [None]:
# Grouped bar chart: base vs. fine-tuned score for every automatic metric
fig, ax = plt.subplots(figsize=(14, 8))

bar_width = 0.35
positions = np.arange(len(metrics_df))
base_values = metrics_df["Base Model"]
tuned_values = metrics_df["Fine-tuned Model"]

# The two bars of a pair sit left and right of the tick position
ax.bar(positions - bar_width/2, base_values, bar_width, label='Base Model')
ax.bar(positions + bar_width/2, tuned_values, bar_width, label='Fine-tuned Model')

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Comparison of Automatic Evaluation Metrics')
ax.set_xticks(positions)
ax.set_xticklabels(metrics_df["Metric"])
# 10% headroom above the tallest bar leaves space for the value labels
ax.set_ylim(0, max(tuned_values.max(), base_values.max()) * 1.1)
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Write each score just above its bar
for pos, base_val, tuned_val in zip(positions, base_values, tuned_values):
    ax.text(pos - bar_width/2, base_val + 0.01, f'{base_val:.3f}', ha='center')
    ax.text(pos + bar_width/2, tuned_val + 0.01, f'{tuned_val:.3f}', ha='center')

fig.tight_layout()
fig.savefig("../smart_summarizer/evaluation/metrics_comparison.png")
plt.show()

## 4. LLM-as-a-Judge Evaluation

Use an LLM (via Together.ai API) to evaluate the summaries on fluency, factuality, and coverage.

In [None]:
# Run the LLM-as-a-Judge evaluation only when an API key is configured
if not TOGETHER_API_KEY:
    print("No Together API key found. Skipping LLM-as-a-Judge evaluation.")
else:
    print("Running LLM-as-a-Judge evaluation...")

    # Evaluate a subset of summaries to manage API costs
    judge_samples = 5  # Number of samples to evaluate
    judge_criteria = ("fluency", "factuality", "coverage")

    llm_evals = []

    for i in tqdm(range(min(judge_samples, len(summaries_df)))):
        article = evaluation_samples[i]["article"]
        ground_truth = evaluation_samples[i]["abstract"]
        base_summary = summaries_df.loc[i, "base_summary"]
        fine_tuned_summary = summaries_df.loc[i, "fine_tuned_summary"]

        # Judge the base-model summary
        base_eval = evaluator.llm_as_judge_evaluate(
            article=article,
            summary=base_summary,
            reference=ground_truth
        )

        # Judge the fine-tuned summary against the same article and reference
        fine_tuned_eval = evaluator.llm_as_judge_evaluate(
            article=article,
            summary=fine_tuned_summary,
            reference=ground_truth
        )

        judged = (("base", base_eval), ("fine_tuned", fine_tuned_eval))

        record = {"id": i, "title": summaries_df.loc[i, "title"]}
        # Numeric scores first (base_*, then fine_tuned_*) ...
        for prefix, verdict in judged:
            for criterion in judge_criteria:
                record[f"{prefix}_{criterion}"] = verdict[criterion]["score"]
        # ... followed by the judge's textual reasons, grouped per model
        for prefix, verdict in judged:
            record[f"{prefix}_comments"] = {
                criterion: verdict[criterion]["reason"] for criterion in judge_criteria
            }
        llm_evals.append(record)

    # Convert to DataFrame
    llm_evals_df = pd.DataFrame(llm_evals)
    # A bare expression nested inside if/else is not rendered by the notebook,
    # so the table is shown explicitly.
    display(llm_evals_df)

In [None]:
# Calculate average judge scores per criterion (only if the judge cell produced results)
if 'llm_evals_df' in locals():
    judge_criteria = ("fluency", "factuality", "coverage")

    llm_metrics = {"Metric": ["Fluency", "Factuality", "Coverage", "Overall"]}
    for column_label, prefix in (("Base Model", "base"), ("Fine-tuned Model", "fine_tuned")):
        score_columns = [f"{prefix}_{criterion}" for criterion in judge_criteria]
        # Per-criterion means, then "Overall" as the mean of those column means
        column_means = [llm_evals_df[column].mean() for column in score_columns]
        overall_mean = llm_evals_df[score_columns].mean().mean()
        llm_metrics[column_label] = column_means + [overall_mean]

    llm_metrics_df = pd.DataFrame(llm_metrics)
    # Absolute difference in score points (fine-tuned minus base)
    llm_metrics_df["Improvement"] = llm_metrics_df["Fine-tuned Model"] - llm_metrics_df["Base Model"]
    # A bare expression nested inside an if-block is not rendered by the notebook,
    # so the table is shown explicitly.
    display(llm_metrics_df)

In [None]:
# Grouped bar chart of the averaged LLM-as-a-Judge scores (skipped when none exist)
if 'llm_metrics_df' in locals():
    fig, ax = plt.subplots(figsize=(12, 6))

    bar_width = 0.35
    positions = np.arange(len(llm_metrics_df))
    base_values = llm_metrics_df["Base Model"]
    tuned_values = llm_metrics_df["Fine-tuned Model"]

    # The two bars of a pair sit left and right of the tick position
    ax.bar(positions - bar_width/2, base_values, bar_width, label='Base Model')
    ax.bar(positions + bar_width/2, tuned_values, bar_width, label='Fine-tuned Model')

    ax.set_xlabel('Evaluation Criteria')
    ax.set_ylabel('Score (1-5)')
    ax.set_title('LLM-as-a-Judge Evaluation')
    ax.set_xticks(positions)
    ax.set_xticklabels(llm_metrics_df["Metric"])
    ax.set_ylim(0, 5.5)  # Fixed axis range; leaves room above the bars for labels
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Write each score just above its bar
    for pos, base_val, tuned_val in zip(positions, base_values, tuned_values):
        ax.text(pos - bar_width/2, base_val + 0.1, f'{base_val:.2f}', ha='center')
        ax.text(pos + bar_width/2, tuned_val + 0.1, f'{tuned_val:.2f}', ha='center')

    fig.tight_layout()
    fig.savefig("../smart_summarizer/evaluation/llm_judge_evaluation.png")
    plt.show()

## 5. Summary Comparison Examples

Let's examine a few examples to qualitatively compare the summaries produced by the base and fine-tuned models.

In [None]:
# Print one sample: reference abstract next to both generated summaries
example_idx = 0  # Position of the sample to show
example = summaries_df.iloc[example_idx]

print(f"Title: {example['title']}\n")

# Each section is a heading line followed by the text and a blank line
summary_sections = (
    ("Ground Truth Summary:", "ground_truth"),
    ("Base Model Summary:", "base_summary"),
    ("Fine-tuned Model Summary:", "fine_tuned_summary"),
)
for heading, column in summary_sections:
    print(heading)
    print(f"{example[column]}\n")

# Judge scores are shown only if that evaluation ran and covers this sample
if 'llm_evals_df' in locals() and example_idx < len(llm_evals_df):
    judge_row = llm_evals_df.iloc[example_idx]

    print("\nLLM-as-a-Judge Evaluation:")
    for model_label, prefix in (("Base Model", "base"), ("Fine-tuned Model", "fine_tuned")):
        fluency = judge_row[f"{prefix}_fluency"]
        factuality = judge_row[f"{prefix}_factuality"]
        coverage = judge_row[f"{prefix}_coverage"]
        print(f"{model_label} Scores: Fluency={fluency}, "
              f"Factuality={factuality}, "
              f"Coverage={coverage}")

In [None]:
# Display example summaries for visual comparison
def display_summary_comparison(sample_id):
    """Print the article excerpt, reference abstract and both model summaries for one sample.

    Reads the notebook-level ``summaries_df`` and ``evaluation_samples``. When the
    notebook-level ``llm_evals_df`` exists and covers ``sample_id``, the LLM-as-judge
    scores and comments for that sample are printed as well.

    Args:
        sample_id: Index of the sample; used as a position into ``evaluation_samples``
            and as a row label for ``summaries_df`` / ``llm_evals_df``.

    Returns:
        None. All output goes to stdout.
    """
    if sample_id >= len(summaries_df):
        print(f"Sample ID {sample_id} out of range. Max ID is {len(summaries_df) - 1}")
        return

    sample = evaluation_samples[sample_id]
    summary_row = summaries_df.loc[sample_id]

    print(f"Title: {summary_row['title']}\n")

    # Truncate article for display purposes
    article_display = sample['article'][:1000] + '...' if len(sample['article']) > 1000 else sample['article']
    print(f"Article (truncated):\n{article_display}\n")

    print(f"Ground Truth Summary:\n{summary_row['ground_truth']}\n")
    print(f"Base Model Summary:\n{summary_row['base_summary']}\n")
    print(f"Fine-tuned Model Summary:\n{summary_row['fine_tuned_summary']}")

    # Inside a function locals() only holds the function's own variables, so the
    # notebook-level DataFrame has to be looked up in globals().
    judge_df = globals().get('llm_evals_df')
    if judge_df is not None and sample_id < len(judge_df):
        llm_eval = judge_df.loc[sample_id]

        print("\nLLM-as-Judge Scores:")
        print(f"{'Metric':<15} {'Base Model':<15} {'Fine-tuned Model':<15}")
        print(f"{'-'*45}")
        print(f"{'Fluency':<15} {llm_eval['base_fluency']:<15.2f} {llm_eval['fine_tuned_fluency']:<15.2f}")
        print(f"{'Factuality':<15} {llm_eval['base_factuality']:<15.2f} {llm_eval['fine_tuned_factuality']:<15.2f}")
        print(f"{'Coverage':<15} {llm_eval['base_coverage']:<15.2f} {llm_eval['fine_tuned_coverage']:<15.2f}")

        # Print the judge's textual reasons for each model
        print("\nBase Model Comments:")
        print(f"Fluency: {llm_eval['base_comments']['fluency']}")
        print(f"Factuality: {llm_eval['base_comments']['factuality']}")
        print(f"Coverage: {llm_eval['base_comments']['coverage']}")

        print("\nFine-tuned Model Comments:")
        print(f"Fluency: {llm_eval['fine_tuned_comments']['fluency']}")
        print(f"Factuality: {llm_eval['fine_tuned_comments']['factuality']}")
        print(f"Coverage: {llm_eval['fine_tuned_comments']['coverage']}")

In [None]:
# Display comparison for first sample (sample_id 0)
display_summary_comparison(0)

In [None]:
# Display comparison for the second sample (sample_id 1)
display_summary_comparison(1)

## 6. Conclusion and Final Analysis

Let's analyze the overall performance improvement from using LoRA fine-tuning for summarization.

In [None]:
# Consolidate all performance metrics into a final analysis
print("Final Analysis of Fine-tuned Model Performance\n")
print("1. Automatic Metrics")

# A relative improvement is undefined (inf or NaN) when the base score is 0.
# Such rows are left out so that one degenerate metric cannot turn the average
# into inf/NaN or dominate the ranking.
finite_improvements = (
    metrics_df["Improvement (%)"].replace([np.inf, -np.inf], np.nan).dropna()
)

# Average improvement across the metrics that have a defined relative change
automatic_avg_improvement = finite_improvements.mean()
print(f"   - Average improvement across all automatic metrics: {automatic_avg_improvement:.2f}%")

# List the top improvements (fewer than 3 lines if fewer metrics are defined)
ranked_improvements = finite_improvements.sort_values(ascending=False)
top_metrics = metrics_df.loc[ranked_improvements.index]
print("   - Top 3 improvements:")
for i, (metric, improvement) in enumerate(zip(top_metrics["Metric"].iloc[:3], ranked_improvements.iloc[:3])):
    print(f"     {i+1}. {metric}: {improvement:.2f}%")

# LLM-as-Judge analysis if available
if 'llm_metrics_df' in locals():
    print("\n2. LLM-as-Judge Evaluation")

    # Mean point difference across criteria, also expressed relative to the mean base score
    llm_avg_improvement = (llm_metrics_df["Fine-tuned Model"] - llm_metrics_df["Base Model"]).mean()
    llm_percent_improvement = (llm_avg_improvement / llm_metrics_df["Base Model"].mean()) * 100

    print(f"   - Average improvement: {llm_avg_improvement:.2f} points ({llm_percent_improvement:.2f}%)")

    # Analyze improvement by criteria
    for _, row in llm_metrics_df.iterrows():
        metric = row["Metric"]
        base = row["Base Model"]
        improvement = row["Improvement"]
        # Guard against a zero base score when converting to a percentage
        percent = (improvement / base) * 100 if base > 0 else 0

        print(f"   - {metric}: {improvement:.2f} points ({percent:.2f}%)")

# Overall conclusion
# NOTE(review): the statements below are fixed text and are printed regardless of the
# numbers computed above; confirm they match the measured results before sharing.
print("\n3. Overall Conclusion")
print("   Based on our evaluation, the LoRA fine-tuned model:")
print("   - Provides more concise and focused summaries")
print("   - Shows improved factual accuracy and content coverage")
print("   - Demonstrates better fluency and readability")
print("\n   The results demonstrate that LoRA fine-tuning is an effective method for adapting")
print("   large language models for specialized summarization tasks, particularly for")
print("   academic research papers.")

In [None]:
# Make sure the results directory exists before anything is written into it
eval_dir = "../smart_summarizer/evaluation/results"
os.makedirs(eval_dir, exist_ok=True)

# Generated summaries
summaries_df.to_csv(f"{eval_dir}/summary_comparison.csv", index=False)
print(f"Saved summaries to {eval_dir}/summary_comparison.csv")

# Automatic metric table
metrics_df.to_csv(f"{eval_dir}/automatic_metrics.csv", index=False)
print(f"Saved automatic metrics to {eval_dir}/automatic_metrics.csv")

# LLM-as-a-Judge outputs exist only when that evaluation was run
if 'llm_evals_df' in locals():
    llm_evals_df.to_csv(f"{eval_dir}/llm_judge_evaluation.csv", index=False)
    llm_metrics_df.to_csv(f"{eval_dir}/llm_judge_metrics.csv", index=False)
    print(f"Saved LLM evaluation to {eval_dir}/llm_judge_evaluation.csv")

    # The judge's comments are nested dicts, so they are also written as JSON
    with open(f"{eval_dir}/llm_judge_comments.json", "w") as f:
        comments = [
            {
                "id": row["id"],
                "title": row["title"],
                "base_model": row["base_comments"],
                "fine_tuned_model": row["fine_tuned_comments"],
            }
            for _, row in llm_evals_df.iterrows()
        ]
        json.dump(comments, f, indent=2)

    print(f"Saved LLM comments to {eval_dir}/llm_judge_comments.json")

In [None]:
# Write the metric tables into the evaluation directory itself
output_dir = "../smart_summarizer/evaluation"
os.makedirs(output_dir, exist_ok=True)

# Automatic metrics are always available at this point
metrics_df.to_csv(f"{output_dir}/automatic_metrics.csv", index=False)

# Judge outputs depend on which of the optional cells were executed
judge_metrics_ready = 'llm_metrics_df' in locals()
judge_details_ready = judge_metrics_ready and 'llm_evals_df' in locals()

if judge_metrics_ready:
    llm_metrics_df.to_csv(f"{output_dir}/llm_judge_metrics.csv", index=False)

if judge_details_ready:
    # Raw per-sample records, including the judge's comments
    with open(f"{output_dir}/llm_judge_detailed.json", "w") as f:
        json.dump(llm_evals, f, indent=2)

print("Evaluation results saved to", output_dir)

## 7. Evaluation Summary

Print a per-metric recap of the base and fine-tuned scores computed above.

In [None]:
# Print a summary of the evaluation
print("Evaluation Summary")
print("=================\n")

print("Automatic Metrics:")
# The improvement column exists only if the improvement cell was executed
has_improvement = "Improvement (%)" in metrics_df.columns
for _, row in metrics_df.iterrows():
    metric = row["Metric"]
    base = row["Base Model"]
    fine_tuned = row["Fine-tuned Model"]
    # Format the number only when it exists; applying ":.2f" to the "N/A"
    # placeholder string would raise a ValueError.
    improvement_text = f"{row['Improvement (%)']:.2f}%" if has_improvement else "N/A"
    print(f"  {metric}: Base={base:.4f}, Fine-tuned={fine_tuned:.4f}, Improvement={improvement_text}")

if 'llm_metrics_df' in locals():
    print("\nLLM-as-a-Judge Metrics:")
    for _, row in llm_metrics_df.iterrows():
        metric = row["Metric"]
        base = row["Base Model"]
        fine_tuned = row["Fine-tuned Model"]
        improvement = row["Improvement"]
        print(f"  {metric}: Base={base:.2f}, Fine-tuned={fine_tuned:.2f}, Improvement={improvement:.2f}")