# 04 - Model Evaluation

This notebook evaluates the fine-tuned FLAN-T5 model using automated metrics and manual evaluation.

## Objectives
- Generate questions on the test set
- Calculate BLEU, ROUGE, and BERTScore metrics
- Compare to EACL 2023 paper baselines
- Perform manual evaluation on 50 samples
- Analyze errors by question type

## Paper Baselines (FLAN-T5-base)
| Metric | Paper Score | Target |
|--------|-------------|--------|
| BLEU-1 | 0.172 | > 0.15 |
| ROUGE-L | 0.211 | > 0.18 |
| BERTScore | 0.632 | > 0.60 |

## 1. Setup and Imports

In [None]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import json
from datetime import datetime

from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_from_disk

from rouge_score import rouge_scorer
from bert_score import score as bert_score
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Tokenizer data for nltk.word_tokenize (used in the BLEU section).
# NLTK >= 3.8.2 loads the 'punkt_tab' resource instead of the pickled 'punkt'
# models, so both are requested to keep the notebook runnable on old and new
# NLTK releases. nltk.download does not raise by default when a resource id is
# unknown to the installed version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Seed NumPy and PyTorch so any stochastic step is reproducible across runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to PyTorch.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

## 2. Load Model and Data

In [None]:
# Input artifacts (fine-tuned checkpoint, tokenized dataset, clean test CSV)
# and the directory that receives every file written by this notebook.
MODEL_PATH = Path("../backend/model_artifacts/soqg_flan_t5/final")
DATASET_PATH = Path("../datasets/processed/soqg_tokenized")
CLEAN_DATA_PATH = Path("../datasets/processed/test_clean.csv")
OUTPUT_DIR = Path("../experiments/evaluation")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the checkpoint, move it to the selected device and switch to eval mode.
model = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)
model = model.to(device)
model.eval()

# The tokenizer is read from the same directory as the model weights.
tokenizer = T5Tokenizer.from_pretrained(MODEL_PATH)

print(f"Model loaded from {MODEL_PATH}")

In [None]:
# Tokenized splits saved to disk, plus the CSV holding the test rows that the
# generation loop iterates over.
dataset = load_from_disk(DATASET_PATH)
test_clean = pd.read_csv(CLEAN_DATA_PATH)

# Report both sizes side by side so a mismatch between the two sources is visible.
n_tokenized_test = len(dataset['test'])
n_clean_test = len(test_clean)
print(f"Test set size: {n_tokenized_test}")
print(f"Clean test CSV size: {n_clean_test}")

## 3. Generation Function

In [None]:
def generate_question(input_text, max_length=128, num_beams=4):
    """Generate one question for ``input_text`` with the loaded model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        input_text: Prompt passed to the tokenizer; it is truncated to at most
            512 tokens.
        max_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: Number of beams forwarded to ``model.generate``.

    Returns:
        The first sequence returned by ``model.generate``, decoded to text
        with special tokens removed.
    """
    encoded = tokenizer(
        input_text,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    generation_kwargs = dict(
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        generated_ids = model.generate(**encoded, **generation_kwargs)

    first_sequence = generated_ids[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

## 4. Generate on Test Set

Generation runs one example at a time with beam search, so this may take 30-60 minutes depending on the GPU.

In [None]:
# Parallel lists with one entry per test row; the metric cells below read them.
predictions = []
references = []
question_types = []
inputs_list = []

progress = tqdm(test_clean.iterrows(), total=len(test_clean), desc="Generating")
for _, sample in progress:
    # Read every field before generating so a missing column fails fast.
    prompt, gold_question, category = (
        sample['formatted_input'],
        sample['target'],
        sample['question_type'],
    )

    generated = generate_question(prompt)

    predictions.append(generated)
    references.append(gold_question)
    question_types.append(category)
    inputs_list.append(prompt)

print(f"\nGenerated {len(predictions)} questions")

In [None]:
# Collect the parallel lists into one frame: one row per generated question.
generation_columns = {
    'input': inputs_list,
    'reference': references,
    'prediction': predictions,
    'question_type': question_types,
}
results_df = pd.DataFrame(generation_columns)

# Persist the raw generations before any metric columns are added.
results_path = OUTPUT_DIR / "generation_results.csv"
results_df.to_csv(results_path, index=False)
print(f"Results saved to {results_path}")

## 5. Calculate BLEU Scores

In [None]:
smoother = SmoothingFunction()
smoothing = smoother.method1

# n-gram weight vectors for BLEU-1, BLEU-2 and BLEU-4, in that order.
unigram_weights = (1, 0, 0, 0)
bigram_weights = (0.5, 0.5, 0, 0)
fourgram_weights = (0.25, 0.25, 0.25, 0.25)

bleu_1_scores, bleu_2_scores, bleu_4_scores = [], [], []

for pred, ref in tqdm(zip(predictions, references), total=len(predictions), desc="BLEU"):
    # Case-insensitive comparison on word-level tokens.
    hypothesis = nltk.word_tokenize(pred.lower())
    # sentence_bleu takes a list of references; here there is exactly one.
    reference_set = [nltk.word_tokenize(ref.lower())]

    # Compute all three scores first, then record them together.
    sentence_scores = [
        sentence_bleu(reference_set, hypothesis, weights=w, smoothing_function=smoothing)
        for w in (unigram_weights, bigram_weights, fourgram_weights)
    ]
    for bucket, value in zip((bleu_1_scores, bleu_2_scores, bleu_4_scores), sentence_scores):
        bucket.append(value)

# Attach the per-example scores to the results frame.
for column, values in (
    ('bleu_1', bleu_1_scores),
    ('bleu_2', bleu_2_scores),
    ('bleu_4', bleu_4_scores),
):
    results_df[column] = values

# Reported numbers are means of sentence-level BLEU over the test set.
print(f"\nBLEU Scores:")
print(f"  BLEU-1: {np.mean(bleu_1_scores):.4f} (Paper: 0.172)")
print(f"  BLEU-2: {np.mean(bleu_2_scores):.4f}")
print(f"  BLEU-4: {np.mean(bleu_4_scores):.4f}")

## 6. Calculate ROUGE Scores

In [None]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

for pred, ref in tqdm(zip(predictions, references), total=len(predictions), desc="ROUGE"):
    # The reference is passed first, the prediction second.
    pair_scores = scorer.score(ref, pred)
    # Keep only the F-measure of each ROUGE variant.
    rouge1_scores.append(pair_scores['rouge1'].fmeasure)
    rouge2_scores.append(pair_scores['rouge2'].fmeasure)
    rougeL_scores.append(pair_scores['rougeL'].fmeasure)

# Attach the per-example scores to the results frame.
for column, values in (
    ('rouge_1', rouge1_scores),
    ('rouge_2', rouge2_scores),
    ('rouge_L', rougeL_scores),
):
    results_df[column] = values

print(f"\nROUGE Scores:")
print(f"  ROUGE-1: {np.mean(rouge1_scores):.4f}")
print(f"  ROUGE-2: {np.mean(rouge2_scores):.4f}")
print(f"  ROUGE-L: {np.mean(rougeL_scores):.4f} (Paper: 0.211)")

## 7. Calculate BERTScore

In [None]:
# Precision, recall and F1 tensors for the (prediction, reference) pairs.
P, R, F1 = bert_score(predictions, references, lang="en", verbose=True)

# Store the per-example values next to the other metrics.
for column, values in (
    ('bertscore_p', P),
    ('bertscore_r', R),
    ('bertscore_f1', F1),
):
    results_df[column] = values.numpy()

print(f"\nBERTScore:")
print(f"  Precision: {P.mean():.4f}")
print(f"  Recall: {R.mean():.4f}")
print(f"  F1: {F1.mean():.4f} (Paper: 0.632)")

## 8. Metrics by Question Type

In [None]:
# Mean of the three headline metrics per question type, plus the number of
# examples in each group (counted on the 'input' column).
type_metrics = results_df.groupby('question_type').agg(
    bleu_1=('bleu_1', 'mean'),
    rouge_L=('rouge_L', 'mean'),
    bertscore_f1=('bertscore_f1', 'mean'),
    count=('input', 'count'),
)

print("\nMetrics by Question Type:")
print(type_metrics.round(4).to_markdown())

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))

# One group of three bars per question type.
x = np.arange(len(type_metrics))
width = 0.25

# (horizontal offset, metric column, legend label, bar colour)
bar_specs = (
    (-width, 'bleu_1', 'BLEU-1', '#3B82F6'),
    (0, 'rouge_L', 'ROUGE-L', '#10B981'),
    (width, 'bertscore_f1', 'BERTScore', '#8B5CF6'),
)
for offset, column, label, color in bar_specs:
    ax.bar(x + offset, type_metrics[column], width, label=label, color=color)

ax.set(ylabel='Score', title='Evaluation Metrics by Question Type')
ax.set_xticks(x)
ax.set_xticklabels(type_metrics.index, rotation=45, ha='right')
ax.legend()
ax.set_ylim(0, 1)

# Save the figure before showing it.
plt.tight_layout()
plt.savefig(OUTPUT_DIR / "metrics_by_type.png", dpi=150)
plt.show()

## 9. Sample Outputs for Manual Review

In [None]:
# Rows drawn per question type, and how many of the drawn rows are printed.
SAMPLES_PER_TYPE = 10
SAMPLES_SHOWN = 3

# The header reports the number of rows printed per type, not the draw size.
print(f"Sample Outputs (up to {SAMPLES_SHOWN} per type):")
print("="*80)

for q_type in results_df['question_type'].unique():
    print(f"\n{'='*80}")
    print(f"TYPE: {q_type.upper()}")
    print('='*80)

    # Filter once per type; sampling is capped at the group size so small
    # groups do not raise.
    type_rows = results_df[results_df['question_type'] == q_type]
    type_samples = type_rows.sample(
        min(SAMPLES_PER_TYPE, len(type_rows)),
        random_state=SEED
    )

    for idx, row in type_samples.head(SAMPLES_SHOWN).iterrows():
        # Last piece after splitting on at most two colons, cut to 150 chars
        # (presumably drops the prompt prefix -- confirm against the input format).
        context = row['input'].split(':', 2)[-1][:150]
        print(f"\nContext: {context}...")
        print(f"Reference: {row['reference']}")
        print(f"Generated: {row['prediction']}")
        print(f"BLEU-1: {row['bleu_1']:.3f}, ROUGE-L: {row['rouge_L']:.3f}")

## 10. Manual Evaluation Form

Export up to 10 samples per question type (50 in total if there are five question types) for manual evaluation, following the paper's methodology.

In [None]:
samples_per_type = 10


def _sample_for_type(q_type):
    """Draw up to ``samples_per_type`` rows of ``results_df`` for one question type."""
    type_data = results_df[results_df['question_type'] == q_type]
    return type_data.sample(min(samples_per_type, len(type_data)), random_state=SEED)


# One sampled frame per question type, in order of first appearance.
manual_eval_samples = [
    _sample_for_type(q_type) for q_type in results_df['question_type'].unique()
]
manual_eval_df = pd.concat(manual_eval_samples, ignore_index=True)

# Empty columns for the annotators to fill in.
blank_columns = ['fluency', 'relevance', 'answerability', 'notes']
for blank_column in blank_columns:
    manual_eval_df[blank_column] = ""

# Only the text fields and the blank rating columns are exported; the
# automatic metric columns are left out of the form.
eval_columns = ['question_type', 'input', 'reference', 'prediction', *blank_columns]
form_path = OUTPUT_DIR / "manual_evaluation_form.csv"
manual_eval_df[eval_columns].to_csv(form_path, index=False)

print(f"Manual evaluation form saved with {len(manual_eval_df)} samples")
print(f"\nEvaluation Criteria (from paper):")
print("  Fluency (1-5): Is the question grammatically correct and natural?")
print("  Relevance (1-5): Is the question relevant to the context?")
print("  Answerability (0/1): Can the question be answered from the context? (0 = good for Socratic)")

## 11. Error Analysis

In [None]:
# Examples whose BLEU-1 is below 0.05; show at most five of them.
low_bleu_pool = results_df[results_df['bleu_1'] < 0.05]
n_low_bleu_shown = min(5, len(low_bleu_pool))
low_bleu = low_bleu_pool.sample(n_low_bleu_shown, random_state=SEED)

print("Low BLEU Examples (potential issues):")
print("="*80)
for _, example in low_bleu.iterrows():
    print(f"\nType: {example['question_type']}")
    print(f"Reference: {example['reference']}")
    print(f"Generated: {example['prediction']}")
    print(f"BLEU-1: {example['bleu_1']:.4f}")

In [None]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


results_df['pred_length'] = results_df['prediction'].apply(_word_count)
results_df['ref_length'] = results_df['reference'].apply(_word_count)

# Flag predictions with fewer than 3 words or more than 50 words.
is_too_short = results_df['pred_length'] < 3
is_too_long = results_df['pred_length'] > 50
short_preds = results_df[is_too_short]
long_preds = results_df[is_too_long]

avg_pred_length = results_df['pred_length'].mean()
avg_ref_length = results_df['ref_length'].mean()

print(f"\nLength Analysis:")
print(f"  Very short predictions (< 3 words): {len(short_preds)}")
print(f"  Very long predictions (> 50 words): {len(long_preds)}")
print(f"  Avg prediction length: {avg_pred_length:.1f} words")
print(f"  Avg reference length: {avg_ref_length:.1f} words")

## 12. Summary Report

In [None]:
# Overall means, computed once and reused for the JSON file and the printout.
bleu_1_mean = np.mean(bleu_1_scores)
bleu_2_mean = np.mean(bleu_2_scores)
bleu_4_mean = np.mean(bleu_4_scores)
rouge_1_mean = np.mean(rouge1_scores)
rouge_2_mean = np.mean(rouge2_scores)
rouge_L_mean = np.mean(rougeL_scores)
bertscore_f1_mean = F1.mean()

summary = {
    "timestamp": datetime.now().isoformat(),
    "test_samples": len(results_df),
    # Cast to built-in floats so the values serialize as plain JSON numbers.
    "metrics": {
        "bleu_1": float(bleu_1_mean),
        "bleu_2": float(bleu_2_mean),
        "bleu_4": float(bleu_4_mean),
        "rouge_1": float(rouge_1_mean),
        "rouge_2": float(rouge_2_mean),
        "rouge_L": float(rouge_L_mean),
        "bertscore_f1": float(bertscore_f1_mean)
    },
    "paper_baselines": {
        "bleu_1": 0.172,
        "rouge_L": 0.211,
        "bertscore_f1": 0.632
    },
    "metrics_by_type": type_metrics.to_dict()
}

summary_path = OUTPUT_DIR / "evaluation_summary.json"
with open(summary_path, "w") as f:
    json.dump(summary, f, indent=2)

# A check mark means the metric reached its target (0.15 / 0.18 / 0.60).
bleu_mark = '✓' if bleu_1_mean >= 0.15 else '✗'
rouge_mark = '✓' if rouge_L_mean >= 0.18 else '✗'
bertscore_mark = '✓' if bertscore_f1_mean >= 0.60 else '✗'

banner = "="*60
print(banner)
print("EVALUATION SUMMARY")
print(banner)
print(f"\nTest samples: {len(results_df)}")
print(f"\nOverall Metrics vs Paper Baselines:")
print(f"  BLEU-1:     {bleu_1_mean:.4f} (Paper: 0.172) {bleu_mark}")
print(f"  ROUGE-L:    {rouge_L_mean:.4f} (Paper: 0.211) {rouge_mark}")
print(f"  BERTScore:  {bertscore_f1_mean:.4f} (Paper: 0.632) {bertscore_mark}")
print(f"\nResults saved to {OUTPUT_DIR}")

## 13. Next Steps

1. **Complete manual evaluation** using `manual_evaluation_form.csv`
2. **Calculate inter-annotator agreement** if there are multiple evaluators
3. **Analyze failure cases** to identify improvement areas
4. **Proceed to vector store setup** (05_keyphrase_extraction.ipynb)

### For Report
Include:
- Table comparing your metrics to paper baselines
- Metrics breakdown by question type chart
- Error analysis examples
- Manual evaluation results (after completion)