# 04 - Model Evaluation

This notebook performs comprehensive evaluation of the fine-tuned FLAN-T5 model. We will:

1. Load the trained model and test dataset
2. Generate predictions on the test set
3. Compute automatic metrics (ROUGE, BLEU, BERTScore)
4. Visualize results and analyze error patterns
5. Set up a human evaluation framework

---

## Evaluation Metrics

| Metric | Purpose | Good Range |
|--------|---------|------------|
| ROUGE-L | Longest common subsequence overlap | > 0.15 |
| BLEU-4 | 1- to 4-gram precision (sentence-level, averaged over the test set) | > 0.05 |
| BERTScore | Semantic similarity | > 0.60 |

## Setup and Imports

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_from_disk

from rouge_score import rouge_scorer
from bert_score import score as bert_score
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

# Quietly download NLTK tokenizer data; nltk.word_tokenize is called later in
# the BLEU section. Presumably 'punkt_tab' serves newer NLTK releases and
# 'punkt' older ones — TODO confirm against the installed NLTK version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

## Configuration

In [None]:
# Checkpoint, processed-data, and results locations (relative to the working directory).
MODEL_PATH, DATA_PATH, OUTPUT_PATH = (
    Path("../models/flan-t5-socratic/final"),
    Path("../datasets/processed"),
    Path("../evaluation_results"),
)
# Create the results folder up front so later cells can write into it.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# Keyword arguments forwarded verbatim to model.generate().
# do_sample=True means decoding draws random samples, so output is stochastic.
GENERATION_CONFIG = dict(
    max_length=80,
    num_beams=4,
    do_sample=True,
    top_k=5,
    top_p=0.6,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)

## Load Model and Data

In [None]:
# Restore the tokenizer and model from the checkpoint directory.
checkpoint_dir = str(MODEL_PATH)
tokenizer = T5Tokenizer.from_pretrained(checkpoint_dir)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir).to(DEVICE)
# Switch to evaluation mode before generating.
model.eval()

param_count = model.num_parameters()
print(f"Model loaded: {param_count:,} parameters")

In [None]:
# Read the formatted test set, report its size, and preview the first rows.
test_parquet_path = DATA_PATH / "test_formatted.parquet"
test_formatted = pd.read_parquet(test_parquet_path)
print(f"Test samples: {len(test_formatted)}")
test_formatted.head()

## Generate Predictions

In [None]:
def generate_question(context, model, tokenizer, device, config):
    """Produce one Socratic question for ``context``.

    Args:
        context: Text inserted into the instruction prompt.
        model: Object exposing ``generate(**inputs, **config)``.
        tokenizer: Callable that encodes the prompt into a mapping of tensors
            and provides ``decode`` for the generated ids.
        device: Device the encoded tensors are moved to before generation.
        config: Keyword arguments forwarded to ``model.generate``.

    Returns:
        The decoded text of the first returned sequence, with every
        ``"[Question]"`` marker removed and surrounding whitespace stripped.
    """
    prompt = f"Generate a Socratic question for this context: {context}"
    # Prompts longer than 400 tokens are truncated by the tokenizer.
    encoded = tokenizer(prompt, return_tensors="pt", max_length=400, truncation=True)
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        sequences = model.generate(**model_inputs, **config)

    text = tokenizer.decode(sequences[0], skip_special_tokens=True)
    return text.replace("[Question]", "").strip()

In [None]:
# GENERATION_CONFIG enables sampling (do_sample=True), so decoding draws from
# torch's RNG. Seed it right before the loop so that re-running this cell
# (or Restart & Run All) reproduces the same predictions and metrics on the
# same hardware/software setup.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

predictions = []
references = []

# Which column holds the reference is the same for every row; decide it once.
has_original_target = 'original_target' in test_formatted.columns

print("Generating predictions...")
for _, row in tqdm(test_formatted.iterrows(), total=len(test_formatted)):
    pred = generate_question(
        row['original_input'],
        model,
        tokenizer,
        DEVICE,
        GENERATION_CONFIG
    )
    predictions.append(pred)
    # Fall back to the training target with its "[Question] " prefix removed
    # when no untouched reference column is available.
    if has_original_target:
        ref = row['original_target']
    else:
        ref = row['target_text'].replace("[Question] ", "")
    references.append(ref)

# Keep predictions and references alongside their inputs for later analysis.
test_formatted['prediction'] = predictions
test_formatted['reference'] = references
print(f"Generated {len(predictions)} predictions.")

## ROUGE Scores

In [None]:
# Per-example ROUGE F-measures (with stemming) of each prediction against its
# reference, followed by the mean of each variant.
rouge_keys = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=True)

# scorer.score takes the reference first and the prediction second.
pair_scores = [scorer.score(ref, pred) for pred, ref in zip(predictions, references)]
rouge_scores = {
    key: [pair[key].fmeasure for pair in pair_scores]
    for key in rouge_keys
}

rouge_results = {
    label: np.mean(rouge_scores[key])
    for label, key in (('ROUGE-1', 'rouge1'), ('ROUGE-2', 'rouge2'), ('ROUGE-L', 'rougeL'))
}

print("ROUGE Scores:")
for metric_name, mean_value in rouge_results.items():
    print(f"  {metric_name}: {mean_value:.4f}")

## BLEU Scores

In [None]:
# Sentence-level BLEU with uniform 1- to 4-gram weights for every
# (prediction, reference) pair on lower-cased NLTK tokens, then the mean.
smoothing = SmoothingFunction().method1
uniform_weights = (0.25, 0.25, 0.25, 0.25)

bleu_scores = [
    sentence_bleu(
        [nltk.word_tokenize(ref.lower())],
        nltk.word_tokenize(pred.lower()),
        weights=uniform_weights,
        smoothing_function=smoothing,
    )
    for pred, ref in zip(predictions, references)
]

bleu_mean = np.mean(bleu_scores)
print(f"BLEU-4 Score: {bleu_mean:.4f}")

## BERTScore

BERTScore measures semantic similarity using contextual embeddings. It is more meaningful than BLEU for tasks with many valid paraphrases.

In [None]:
print("Computing BERTScore (this may take a few minutes)...")

# Predictions are passed as candidates, references as the ground truth.
P, R, F1 = bert_score(predictions, references, lang="en", verbose=True, device=DEVICE)

# Collapse the per-example tensors into plain Python floats.
bertscore_results = {
    label: tensor.mean().item()
    for label, tensor in (('Precision', P), ('Recall', R), ('F1', F1))
}

print("\nBERTScore Results:")
for label, value in bertscore_results.items():
    print(f"  {label}: {value:.4f}")

## Summary Table

In [None]:
# (metric label, measured value, target threshold) for each headline metric.
summary_spec = [
    ("ROUGE-1", rouge_results['ROUGE-1'], "> 0.20"),
    ("ROUGE-2", rouge_results['ROUGE-2'], "> 0.05"),
    ("ROUGE-L", rouge_results['ROUGE-L'], "> 0.15"),
    ("BLEU-4", bleu_mean, "> 0.05"),
    ("BERTScore F1", bertscore_results['F1'], "> 0.60"),
]
# Format the values to four decimals for display.
summary_data = [[label, f"{value:.4f}", target] for label, value, target in summary_spec]

print("\nEvaluation Summary")
print("=" * 50)
print(tabulate(summary_data, headers=["Metric", "Score", "Target"], tablefmt="grid"))

## Score Distributions

In [None]:
# One histogram panel per metric, each with a dashed red line at its mean.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (title, per-example values, mean, bar color) for each panel, left to right.
panels = [
    ('ROUGE-L Distribution', rouge_scores['rougeL'], np.mean(rouge_scores['rougeL']), 'steelblue'),
    ('BLEU-4 Distribution', bleu_scores, np.mean(bleu_scores), 'coral'),
    ('BERTScore F1 Distribution', F1.numpy(), F1.mean().item(), 'seagreen'),
]

for ax, (title, values, mean_value, bar_color) in zip(axes, panels):
    ax.hist(values, bins=30, color=bar_color, alpha=0.7, edgecolor='black')
    ax.axvline(mean_value, color='red', linestyle='--', label=f"Mean: {mean_value:.3f}")
    ax.set_title(title)
    ax.set_xlabel('Score')
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.savefig(OUTPUT_PATH / "score_distributions.png", dpi=150)
plt.show()

## Sample Predictions Analysis

In [None]:
# Attach the per-example scores to the test frame so rows can be ranked by
# metric and exported together with their predictions.
per_example_scores = {
    'rougeL': rouge_scores['rougeL'],
    'bleu': bleu_scores,
    'bertscore_f1': F1.numpy(),
}
for column, values in per_example_scores.items():
    test_formatted[column] = values

In [None]:
print("HIGH SCORING EXAMPLES (Top 5 by BERTScore)")
print("=" * 70)

# The five rows with the highest BERTScore F1.
top_samples = test_formatted.nlargest(5, 'bertscore_f1')

for _, sample in top_samples.iterrows():
    # One print call per example; the context is cut to its first 150 characters.
    print(
        f"\nContext: {sample['original_input'][:150]}...",
        f"Reference: {sample['reference']}",
        f"Prediction: {sample['prediction']}",
        f"Scores - ROUGE-L: {sample['rougeL']:.3f}, BERTScore: {sample['bertscore_f1']:.3f}",
        "-" * 70,
        sep="\n",
    )

In [None]:
print("\nLOW SCORING EXAMPLES (Bottom 5 by BERTScore)")
print("=" * 70)

# The five rows with the lowest BERTScore F1.
bottom_samples = test_formatted.nsmallest(5, 'bertscore_f1')

for _, sample in bottom_samples.iterrows():
    # One print call per example; the context is cut to its first 150 characters.
    print(
        f"\nContext: {sample['original_input'][:150]}...",
        f"Reference: {sample['reference']}",
        f"Prediction: {sample['prediction']}",
        f"Scores - ROUGE-L: {sample['rougeL']:.3f}, BERTScore: {sample['bertscore_f1']:.3f}",
        "-" * 70,
        sep="\n",
    )

## Human Evaluation Framework

For comprehensive evaluation, sample predictions should be rated by humans on three criteria:

1. **Fluency** (1-5): Is the question grammatically correct and natural?
2. **Relevance** (1-5): Is the question relevant to the given context?
3. **Socratic Quality** (Binary): Is the question genuinely thought-provoking and unanswerable from the context alone?

In [None]:
# Draw a fixed-seed random subset of predictions for manual rating.
# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling without replacement), so cap the request at the test-set size.
HUMAN_EVAL_SAMPLE_SIZE = 50
n_human_eval = min(HUMAN_EVAL_SAMPLE_SIZE, len(test_formatted))

human_eval_sample = test_formatted.sample(n_human_eval, random_state=42)[
    ['original_input', 'reference', 'prediction', 'rougeL', 'bertscore_f1']
].copy()

# Empty rating columns for the annotators to fill in.
human_eval_sample['fluency'] = None
human_eval_sample['relevance'] = None
human_eval_sample['is_socratic'] = None

# Drop the original row labels so the exported sheet is numbered 0..n-1.
human_eval_sample = human_eval_sample.reset_index(drop=True)
human_eval_sample.to_csv(OUTPUT_PATH / "human_evaluation_samples.csv", index=False)

print(f"Saved {len(human_eval_sample)} samples for human evaluation.")
print(f"File: {OUTPUT_PATH / 'human_evaluation_samples.csv'}")

## Save Full Results

In [None]:
# `json` is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place.

# Aggregate metrics together with the generation settings that produced them,
# so the saved numbers can be traced back to a decoding configuration.
evaluation_results = {
    "test_samples": len(test_formatted),
    "rouge": rouge_results,
    "bleu4": float(bleu_mean),
    "bertscore": bertscore_results,
    "generation_config": GENERATION_CONFIG
}

with open(OUTPUT_PATH / "evaluation_metrics.json", "w", encoding="utf-8") as f:
    json.dump(evaluation_results, f, indent=2)

# Per-example predictions, references, and scores.
test_formatted.to_csv(OUTPUT_PATH / "test_predictions.csv", index=False)

# The last two files listed were written by earlier cells.
print("\nSaved:")
print(f"  - {OUTPUT_PATH / 'evaluation_metrics.json'}")
print(f"  - {OUTPUT_PATH / 'test_predictions.csv'}")
print(f"  - {OUTPUT_PATH / 'score_distributions.png'}")
print(f"  - {OUTPUT_PATH / 'human_evaluation_samples.csv'}")

## Correlation Analysis

In [None]:
# Pairwise correlation between the three per-example metrics, under display names.
metrics_df = test_formatted[['rougeL', 'bleu', 'bertscore_f1']].rename(
    columns={'rougeL': 'ROUGE-L', 'bleu': 'BLEU-4', 'bertscore_f1': 'BERTScore'}
)
correlation = metrics_df.corr()

# Annotated heatmap with the colour scale centred on zero.
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, fmt='.3f')
plt.title('Metric Correlations')
plt.tight_layout()
plt.savefig(OUTPUT_PATH / "metric_correlations.png", dpi=150)
plt.show()

## Evaluation Complete!

**Key Takeaways:**

1. BERTScore is more reliable than BLEU for Socratic questions (many valid paraphrases)
2. Low ROUGE/BLEU with high BERTScore indicates semantically similar but differently worded questions
3. Human evaluation is essential for assessing true "Socratic" quality

---

**Next Step**: Proceed to `05_keybert_extraction.ipynb` to set up keyphrase extraction for the concept map.