# 04 — Evaluation

The trained adapter is evaluated on the SocratiQ held-out test set (10,573 examples) using three complementary automatic metrics: ROUGE-1/2/L (Lin, 2004) for n-gram overlap; BLEU-4 via sacrebleu (Post, 2018) for n-gram precision; and BERTScore F1 (Zhang et al., 2020) for contextual semantic similarity. Predictions are generated with deterministic beam search (num_beams = 4, do_sample = False), and every metric is aggregated over the full test set, to ensure reproducibility and direct comparability with Ang et al. (2023, Table 3). A per-question-type ROUGE breakdown is also reported — an analysis not presented in the original paper.

In [None]:
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_from_disk

from rouge_score import rouge_scorer
from bert_score import score as bert_score
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

# Fetch the NLTK 'punkt' / 'punkt_tab' resources quietly (no-op if already cached).
# NOTE(review): the visible cells tokenise with str.split() before calling
# sentence_bleu, so nothing here appears to need punkt — confirm whether these
# downloads are still required.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

In [None]:

# MODEL_PATH points to the LoRA adapter produced by 03_training.
# MODEL_NAME must match the base model used during training (flan-t5-small).
from pathlib import Path
import torch

# ── Locations ────────────────────────────────────────────────────────────────
# Adapter directory, processed-data directory, and the folder every artefact
# of this notebook is written to (created on first run).
MODEL_PATH = Path("../models/flan-t5-socratic-lora/adapter")
DATA_PATH = Path("../datasets/processed")
OUTPUT_PATH = Path("../evaluation_results")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

# ── Device selection: CUDA first, then Apple MPS, otherwise CPU ──────────────
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

# Base checkpoint the adapter was trained on (flan-t5-small, not base).
MODEL_NAME = "google/flan-t5-small"
print(f"Using device: {DEVICE}")

# ── Generation config — EVALUATION (deterministic) ───────────────────────────
#
# Beam search without sampling, so repeated runs decode the same text and the
# ROUGE figures stay comparable to Ang et al. (2023, Table 3).
EVAL_GENERATION_CONFIG = dict(
    max_length=80,
    num_beams=4,
    do_sample=False,    # deterministic — required for reproducible ROUGE
    early_stopping=True,
)

# ── Generation config — DEMO / FRONTEND (diverse outputs) ────────────────────
# Sampling-based settings; not used anywhere in the evaluation cells.
DEMO_GENERATION_CONFIG = dict(
    max_length=80,
    num_beams=2,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)

print("Eval generation config (deterministic):", EVAL_GENERATION_CONFIG)


In [None]:

# Load sequence: tokenizer → base model → resize → adapter.
# The base model must be resized to the extended vocabulary (32,101 tokens)
# before the adapter is loaded to avoid a shape mismatch error.
from transformers import T5Tokenizer, T5ForConditionalGeneration
from peft import PeftModel

MODEL_NAME = "google/flan-t5-small"   # must match what was used in training

# The tokenizer saved next to the adapter defines the vocabulary size the
# base model has to be resized to before the adapter weights are attached.
adapter_dir = str(MODEL_PATH)

print("Step 1: Loading tokenizer from adapter directory...")
tokenizer = T5Tokenizer.from_pretrained(adapter_dir)
vocab_size = len(tokenizer)
print(f"  Vocabulary size: {vocab_size}")

print(f"\nStep 2: Loading base model ({MODEL_NAME})...")
base_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

print(f"\nStep 3: Resizing base model embeddings to {vocab_size}...")
base_model.resize_token_embeddings(vocab_size)

print("\nStep 4: Loading LoRA adapter on top of resized base model...")
model = PeftModel.from_pretrained(base_model, adapter_dir).to(DEVICE)
model.eval()  # inference mode for evaluation

total_params = model.num_parameters()
print(f"\n✓ Model loaded: {total_params:,} total parameters")


In [None]:
# Load the formatted test split from DATA_PATH. Later cells read the
# `input_text`, `original_input` and `target_text` / `original_target`
# columns from this frame.
test_formatted = pd.read_parquet(DATA_PATH / "test_formatted.parquet")
print(f"Test samples: {len(test_formatted)}")
# Last expression of the cell: the notebook renders the first rows as a preview.
test_formatted.head()

In [None]:

# ── Generate Predictions ──────────────────────────────────────────────────────
#
# The input prompt at evaluation must match the training format exactly:
#   "Generate a Socratic question for this context: {question_type}: {context}"
#
# test_formatted already has input_text in this format (set in 02_preprocessing).
# We use EVAL_GENERATION_CONFIG (beam search, no sampling) throughout.

def generate_question(input_text: str) -> str:
    """Decode one question for ``input_text`` with the evaluation settings.

    The prompt is tokenised (truncated to 400 tokens), moved to ``DEVICE`` and
    passed to ``model.generate`` together with ``EVAL_GENERATION_CONFIG``.
    The first returned sequence is decoded without special tokens; every
    occurrence of the literal ``[Question]`` marker is removed and surrounding
    whitespace is stripped before the text is returned.
    """
    encoded = tokenizer(
        input_text,
        truncation=True,
        max_length=400,
        return_tensors="pt",
    )

    # Move each tensor of the encoding onto the device the model lives on.
    model_inputs = {}
    for field, tensor in encoded.items():
        model_inputs[field] = tensor.to(DEVICE)

    # Gradients are never needed for generation.
    with torch.no_grad():
        sequences = model.generate(**model_inputs, **EVAL_GENERATION_CONFIG)

    decoded = tokenizer.decode(sequences[0], skip_special_tokens=True)
    without_marker = decoded.replace("[Question]", "")
    return without_marker.strip()


In [None]:
# Generate one prediction per test row and collect the matching reference.
#
# The model is prompted with `input_text` — the fully formatted prompt
# ("Generate a Socratic question for this context: {question_type}: {context}")
# that matches the training format. Feeding the raw `original_input` instead
# would evaluate the model on a prompt it was not trained on.
predictions = []
references = []

# Column presence is the same for every row, so decide once which column
# supplies the reference text instead of re-checking inside the loop.
has_original_target = 'original_target' in test_formatted.columns

print("Generating predictions...")
for idx, row in tqdm(test_formatted.iterrows(), total=len(test_formatted)):
    predictions.append(generate_question(row['input_text']))
    if has_original_target:
        ref = row['original_target']
    else:
        # Fall back to the formatted target, dropping its "[Question] " prefix.
        ref = row['target_text'].replace("[Question] ", "")
    references.append(ref)

# Keep predictions/references next to their source rows for later analysis.
test_formatted['prediction'] = predictions
test_formatted['reference'] = references
print(f"Generated {len(predictions)} predictions.")

In [None]:

# ROUGE-1/2/L over the whole test set via a single `evaluate` call, for
# comparison with Ang et al. (2023, Table 3).
#
# NOTE(review): `evaluate`'s "rouge" metric appears to score each
# (prediction, reference) pair individually and then aggregate the F-measures
# (default use_aggregator=True), i.e. an aggregate of per-example scores rather
# than a pooled-count corpus score — confirm against the metric card before
# describing the numbers as "corpus-level" in the write-up.
# NOTE(review): depending on the installed version the aggregation may use
# bootstrap resampling, which would make the last decimals vary between runs
# unless NumPy's RNG is seeded — TODO confirm.
import evaluate as hf_evaluate

rouge_metric = hf_evaluate.load("rouge")

# Remove any "[Question]" marker and surrounding whitespace from both sides so
# the marker itself never counts as an overlapping token.
clean_preds = [p.replace("[Question]", "").strip() for p in predictions]
clean_refs  = [r.replace("[Question]", "").strip() for r in references]

# One call over all prediction/reference pairs; stemming enabled.
rouge_results = rouge_metric.compute(
    predictions=clean_preds,
    references=clean_refs,
    use_stemmer=True,
)

print("Corpus-level ROUGE Scores")
print("=" * 40)
# Only the three variants reported in the comparison table are printed.
for k in ["rouge1", "rouge2", "rougeL"]:
    print(f"  {k.upper():<10}: {rouge_results[k]:.4f}")


In [None]:

# BLEU via the `evaluate` wrapper around sacrebleu, computed over the cleaned
# predictions/references. sacrebleu reports on a 0–100 scale; the value is
# divided by 100 so it sits on the same 0–1 scale as ROUGE.
bleu_metric = hf_evaluate.load("sacrebleu")

# sacrebleu expects list-of-lists: one list of references per prediction.
wrapped_refs = [[single_ref] for single_ref in clean_refs]
bleu_result = bleu_metric.compute(predictions=clean_preds, references=wrapped_refs)

raw_bleu = bleu_result["score"]
bleu4 = raw_bleu / 100.0   # normalise 0-100 → 0-1

print(f"BLEU-4 (sacrebleu, normalised): {bleu4:.4f}")
print(f"BLEU-4 (sacrebleu, raw 0-100) : {raw_bleu:.2f}")


BERTScore (Zhang et al., 2020) computes token-level similarity using contextual embeddings from a pre-trained language model, making it more sensitive to semantic equivalence than n-gram metrics. For Socratic question generation — where many valid paraphrases exist — BERTScore F1 is a more meaningful signal than BLEU or ROUGE alone.

In [None]:
print("Computing BERTScore (this may take a few minutes)...")

# Score the same cleaned strings that ROUGE and BLEU are computed on: any
# "[Question]" marker is removed and whitespace stripped. Cleaning is done
# here (rather than reusing variables from another cell) so this cell only
# depends on `predictions` / `references`.
bert_preds = [p.replace("[Question]", "").strip() for p in predictions]
bert_refs = [r.replace("[Question]", "").strip() for r in references]

# P, R and F1 hold one value per prediction/reference pair.
P, R, F1 = bert_score(
    bert_preds,
    bert_refs,
    lang="en",
    verbose=True,
    device=DEVICE
)

# Test-set summary: the mean of the per-example values.
bertscore_results = {
    'Precision': P.mean().item(),
    'Recall': R.mean().item(),
    'F1': F1.mean().item()
}

print("\nBERTScore Results:")
for metric, value in bertscore_results.items():
    print(f"  {metric}: {value:.4f}")

In [None]:

# Reference rows are quoted from Ang et al. (2023, Table 3) — SocratiQ test
# set, beam search. Check every number against the paper before citing it in
# the dissertation.
#
# NOTE: the paper's T5-base / FLAN-T5-base rows are fully fine-tuned models
# (100% of parameters); this notebook evaluates a LoRA adapter (~1.4% of
# parameters), so a moderate gap is expected and is itself a contribution.
paper_models = [
    ("GPT-3 (zero-shot, paper)",   0.2100, 0.0420, 0.1980, 0.0120),
    ("T5-base (paper, full FT)",   0.3876, 0.1712, 0.3657, 0.0721),
    ("T5-large (paper, full FT)",  0.4051, 0.1832, 0.3818, 0.0798),
    ("FLAN-T5-base (paper, full FT)", 0.4143, 0.1897, 0.3901, 0.0831),
]

# Our row, in the same (name, ROUGE-1, ROUGE-2, ROUGE-L, BLEU-4) layout.
our_scores = (
    "FLAN-T5-small + LoRA (ours)",
    rouge_results["rouge1"],
    rouge_results["rouge2"],
    rouge_results["rougeL"],
    bleu4,
)

# One template for every table row keeps the columns aligned.
row_template = "{:<40} {:>8.4f} {:>8.4f} {:>8.4f} {:>8.4f}"
heavy_rule = "=" * 80

print(heavy_rule)
print("{:<40} {:>8} {:>8} {:>8} {:>8}".format("Model", "ROUGE-1", "ROUGE-2", "ROUGE-L", "BLEU-4"))
print(heavy_rule)
for paper_row in paper_models:
    print(row_template.format(*paper_row))
print("-" * 80)
# Highlight our model
print(row_template.format(*our_scores) + "  ← SocraticPath")
print(heavy_rule)

# Gap analysis against the paper's FLAN-T5-base row (ours minus paper).
flan_r1, flan_r2, flan_rl = 0.4143, 0.1897, 0.3901
print(f"\nGap vs FLAN-T5-base (paper):")
for label, key, baseline in (
    ("ROUGE-1", "rouge1", flan_r1),
    ("ROUGE-2", "rouge2", flan_r2),
    ("ROUGE-L", "rougeL", flan_rl),
):
    print(f"  {label}: {rouge_results[key] - baseline:+.4f}")
print("\nNote: LoRA uses ~1.4% of parameters vs 100% for full fine-tuning.")
print("A small gap is expected and is itself a contribution (parameter efficiency).")


In [None]:

# ── Per-Question-Type ROUGE Breakdown ────────────────────────────────────────
#
# SocratiQ has 5 question types.  Breaking ROUGE down by type shows which
# categories the model handles well and which need improvement — a stronger
# analysis than aggregate scores alone, and a clear dissertation finding.
#
# This breakdown is novel — the SOQG paper only reports aggregate scores.

import pandas as pd

# Instruction prefix that may precede "{type}: {text}" in the prompt column.
_PROMPT_PREFIX = "Generate a Socratic question for this context:"


def extract_question_type(input_text: str) -> str:
    """Return the question-type label embedded in a dataset input string.

    Two layouts are recognised:

    * ``"Generate a Socratic question for this context: {type}: {text}"``
    * the raw column ``"{type}: {text}"``

    The instruction prefix is removed when present, and the label is whatever
    precedes the *first* remaining colon. Splitting on the first colon only
    (instead of on every colon) keeps the result correct when the context
    text itself contains colons.

    Args:
        input_text: Prompt or raw input string.

    Returns:
        The stripped question-type label, or ``"unknown"`` when no colon
        separates a label from the text or the label is empty.
    """
    text = input_text.strip()
    if text.startswith(_PROMPT_PREFIX):
        text = text[len(_PROMPT_PREFIX):]

    label, separator, _ = text.partition(":")
    if not separator:
        # No "{type}:" part at all.
        return "unknown"
    return label.strip() or "unknown"

# Tag every test row with the question type parsed from its prompt.
test_formatted["question_type"] = [
    extract_question_type(prompt) for prompt in test_formatted["input_text"]
]


def _strip_question_tag(texts):
    """Drop the "[Question]" marker and surrounding whitespace from each text."""
    return [text.replace("[Question]", "").strip() for text in texts]


table_rule = "=" * 65
rouge_keys = ("rouge1", "rouge2", "rougeL")

print("Per-Question-Type ROUGE-L Breakdown")
print(table_rule)
print(f"{'Question Type':<35} {'Count':>6} {'ROUGE-1':>8} {'ROUGE-2':>8} {'ROUGE-L':>8}")
print(table_rule)

# ROUGE is recomputed separately on the rows of each question type.
type_results = {}
for q_type, group in test_formatted.groupby("question_type"):
    type_preds = _strip_question_tag(group["prediction"])
    type_refs = _strip_question_tag(group["reference"])
    if len(type_preds) == 0:
        continue
    res = rouge_metric.compute(
        predictions=type_preds,
        references=type_refs,
        use_stemmer=True,
    )
    type_results[q_type] = res
    score_cells = " ".join(f"{res[key]:>8.4f}" for key in rouge_keys)
    print(f"{q_type:<35} {len(group):>6} {score_cells}")

print(table_rule)
# Closing row repeats the whole-test-set figures for reference.
overall_cells = " ".join(f"{rouge_results[key]:>8.4f}" for key in rouge_keys)
print(f"{'Overall (corpus)':<35} {len(test_formatted):>6} {overall_cells}")


In [None]:
# ── Per-Sample Score Distributions ──────────────────────────────────────────
#
# The corpus-level rouge_results dict (computed in cell 12) gives a single
# aggregate value.  For distribution plots we compute per-sample ROUGE-L using
# rouge_score directly.  This is intentionally separate from the corpus-level
# computation so the two methods remain independent.

from rouge_score import rouge_scorer as rs_lib
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

scorer = rs_lib.RougeScorer(["rougeL"], use_stemmer=True)
sf = SmoothingFunction().method1


def _score_pair(pred, ref):
    """Return (ROUGE-L F-measure, smoothed sentence BLEU) for one pair.

    BLEU is computed on whitespace tokens and is 0.0 for an empty prediction.
    """
    rouge_l = scorer.score(ref, pred)["rougeL"].fmeasure
    hyp_tokens = pred.split()
    if not hyp_tokens:
        return rouge_l, 0.0
    bleu = sentence_bleu([ref.split()], hyp_tokens, smoothing_function=sf)
    return rouge_l, bleu


pair_scores = [_score_pair(pred, ref) for pred, ref in zip(clean_preds, clean_refs)]

per_sample_rougeL = np.array([rouge_l for rouge_l, _ in pair_scores])
per_sample_bleu   = np.array([bleu for _, bleu in pair_scores])
bertscore_f1_np   = F1.numpy()

import matplotlib.pyplot as plt

# One histogram per metric, each with a dashed line at its mean.
panels = (
    (per_sample_rougeL, "steelblue", "ROUGE-L Distribution (per sample)"),
    (per_sample_bleu, "coral", "BLEU-4 Distribution (per sample)"),
    (bertscore_f1_np, "seagreen", "BERTScore F1 Distribution"),
)

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (values, colour, title) in zip(axes, panels):
    mean_value = values.mean()
    ax.hist(values, bins=30, color=colour, alpha=0.7, edgecolor="black")
    ax.axvline(mean_value, color="red", linestyle="--",
               label=f"Mean: {mean_value:.3f}")
    ax.set_title(title)
    ax.set_xlabel("Score")
    ax.set_ylabel("Frequency")
    ax.legend()

figure_path = OUTPUT_PATH / "score_distributions.png"
plt.tight_layout()
plt.savefig(figure_path, dpi=150)
plt.show()
print(f"Saved: {figure_path}")

# Attach per-sample scores to the DataFrame so later cells can rank examples.
test_formatted['rougeL'] = per_sample_rougeL
test_formatted['bleu']   = per_sample_bleu
test_formatted['bertscore_f1'] = bertscore_f1_np

In [None]:
# Show the five test rows with the highest per-sample BERTScore F1.
print("HIGH SCORING EXAMPLES (Top 5 by BERTScore)")
print("=" * 70)

top_samples = test_formatted.nlargest(5, 'bertscore_f1')

for _, sample in top_samples.iterrows():
    # Context is cut to its first 150 characters to keep the listing compact.
    print(
        f"\nContext: {sample['original_input'][:150]}...",
        f"Reference: {sample['reference']}",
        f"Prediction: {sample['prediction']}",
        f"Scores - ROUGE-L: {sample['rougeL']:.3f}, BERTScore: {sample['bertscore_f1']:.3f}",
        "-" * 70,
        sep="\n",
    )

In [None]:
# Show the five test rows with the lowest per-sample BERTScore F1.
print("\nLOW SCORING EXAMPLES (Bottom 5 by BERTScore)")
print("=" * 70)

bottom_samples = test_formatted.nsmallest(5, 'bertscore_f1')

for _, sample in bottom_samples.iterrows():
    # Context is cut to its first 150 characters to keep the listing compact.
    print(
        f"\nContext: {sample['original_input'][:150]}...",
        f"Reference: {sample['reference']}",
        f"Prediction: {sample['prediction']}",
        f"Scores - ROUGE-L: {sample['rougeL']:.3f}, BERTScore: {sample['bertscore_f1']:.3f}",
        "-" * 70,
        sep="\n",
    )

## Human Evaluation

Automatic metrics cannot assess whether a question is genuinely Socratic — i.e., whether it challenges unstated assumptions and resists a direct answer from the given context. The 50-item sample (`human_evaluation_samples.csv`) is formatted for manual rating on three criteria: *fluency* (1–5 Likert), *relevance* (1–5 Likert), and *Socratic quality* (binary). Inter-rater reliability can be quantified with Krippendorff's alpha.

In [None]:
# Draw a fixed-seed random sample for manual rating and export it as CSV.
# The size is capped at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (e.g. on a debugging subset).
sample_size = min(50, len(test_formatted))

human_eval_sample = test_formatted.sample(sample_size, random_state=42)[
    ['original_input', 'reference', 'prediction', 'rougeL', 'bertscore_f1']
].copy()

# Empty rating columns for the annotators to fill in.
human_eval_sample['fluency'] = None
human_eval_sample['relevance'] = None
human_eval_sample['is_socratic'] = None

# Drop the original row index so the exported sheet is numbered from 0.
human_eval_sample = human_eval_sample.reset_index(drop=True)
human_eval_sample.to_csv(OUTPUT_PATH / "human_evaluation_samples.csv", index=False)

print(f"Saved {len(human_eval_sample)} samples for human evaluation.")
print(f"File: {OUTPUT_PATH / 'human_evaluation_samples.csv'}")

In [None]:
import json

# Bundle the aggregate metrics together with the decoding settings that
# produced them, then persist the bundle and the per-row predictions.
evaluation_results = {
    "test_samples": len(test_formatted),
    "rouge": rouge_results,
    "bleu4": float(bleu4),  # normalised sacrebleu score from the BLEU cell
    "bertscore": bertscore_results,
    "generation_config": EVAL_GENERATION_CONFIG
}

metrics_path = OUTPUT_PATH / "evaluation_metrics.json"
with metrics_path.open("w") as metrics_file:
    json.dump(evaluation_results, metrics_file, indent=2)

predictions_path = OUTPUT_PATH / "test_predictions.csv"
test_formatted.to_csv(predictions_path, index=False)

# List every artefact this notebook writes into OUTPUT_PATH.
print("\nSaved:")
for artefact in (
    "evaluation_metrics.json",
    "test_predictions.csv",
    "score_distributions.png",
    "human_evaluation_samples.csv",
):
    print(f"  - {OUTPUT_PATH / artefact}")

In [None]:
# Pairwise correlation between the three per-sample metrics, drawn as an
# annotated heatmap and saved next to the other evaluation artefacts.
display_names = {
    'rougeL': 'ROUGE-L',
    'bleu': 'BLEU-4',
    'bertscore_f1': 'BERTScore',
}
metrics_df = test_formatted[list(display_names)].rename(columns=display_names)

plt.figure(figsize=(8, 6))
correlation = metrics_df.corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, fmt='.3f')
plt.title('Metric Correlations')
plt.tight_layout()
plt.savefig(OUTPUT_PATH / "metric_correlations.png", dpi=150)
plt.show()

Evaluation results are saved to `evaluation_results/`. The JSON file (`evaluation_metrics.json`) records corpus-level ROUGE, BLEU-4, and BERTScore F1 alongside the generation configuration, making the results reproducible from the same checkpoint. The per-question-type ROUGE breakdown provides a novel analysis not reported in Ang et al. (2023).