# 03 — Evaluation

Evaluates all model variants and compares against paper baselines.

**Metrics computed:**
- BLEU-1/2/3/4 (word-level, correct tokenisation)
- BLEU-1/2/3/4 (char-level, paper's original method — for direct comparison)
- Token F1
- METEOR
- ROUGE-L
- Perplexity (T5 models only)

**Models evaluated:**
- `t5:baseline`, `t5:topic`, `t5:topic2x` — auto-discovered from `models/`
- Ollama models — auto-discovered from a running local Ollama server
- Gemini models — configured in `config/pipeline.yaml`

> Prerequisite: run `01_data_generation.ipynb` first (the MixKhanQ `data.csv` must exist).

## Setup

In [None]:
# Environment bootstrap for this notebook.
# On Colab: clone the repo, install dependencies, fetch NLTK resources, mount
# Google Drive and restore previously saved artefacts from it.
# Everywhere: resolve the project root, chdir into it and put it on sys.path.
import sys
from pathlib import Path

# True when the google.colab module is already loaded in this interpreter.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    REPO_URL = "https://github.com/YOUR_ORG/YOUR_REPO.git"  # TODO: set your URL
    # The lines starting with `!` / `%` are IPython shell escapes and magics,
    # not plain Python: clone into /content/ai4ed-qg, cd there, then install
    # the packages this notebook needs.
    !git clone {REPO_URL} /content/ai4ed-qg -q
    %cd /content/ai4ed-qg
    !pip install -q torch transformers datasets sentence-transformers \
                    evaluate rouge_score nltk sentencepiece pyyaml \
                    tqdm pandas python-dotenv google-genai

    # Download the NLTK resources quietly (no progress output).
    import nltk
    for res in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
        nltk.download(res, quiet=True)

    # Mount Google Drive; DRIVE_DIR is the folder used for persisted artefacts.
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_DIR = Path('/content/drive/MyDrive/ai4ed_qg')

    # Restore models and data from Drive
    import shutil
    for subdir in ('models', 'data/training'):
        # The Drive copy is looked up under the LAST path component only, so
        # 'data/training' is read from DRIVE_DIR / 'training'.
        # NOTE(review): presumably matches how the earlier notebooks save to
        # Drive — confirm against them.
        src = DRIVE_DIR / subdir.split('/')[-1]
        dst = Path('/content/ai4ed-qg') / subdir
        # Skip silently when nothing has been saved to Drive for this subdir.
        if src.exists():
            # Make sure the parent (e.g. data/) exists before copying into it.
            dst.parent.mkdir(parents=True, exist_ok=True)
            # dirs_exist_ok=True copies into an already existing destination
            # directory instead of raising.
            shutil.copytree(src, dst, dirs_exist_ok=True)
            print(f"Restored {subdir}/ from Drive")
else:
    # No Drive when running outside Colab.
    DRIVE_DIR = None

import os
# Project root: the fixed clone location on Colab, otherwise the current
# working directory.
project_root = Path('/content/ai4ed-qg') if IN_COLAB else Path.cwd()
# If the current directory is named 'notebooks', step up one level to the
# repository root.
if project_root.name == 'notebooks':
    project_root = project_root.parent
# Work from the project root so relative paths resolve against it, and put it
# first on sys.path so modules under it can be imported.
os.chdir(project_root)
sys.path.insert(0, str(project_root))
print(f"Working dir: {os.getcwd()}")

## API Keys

Gemini evaluation requires `GOOGLE_API_KEY` (or `GEMINI_API_KEY`). Set it in Colab Secrets or your local `.env`.

Ollama models are auto-discovered — just have Ollama running locally.

In [None]:
# Load the Gemini API key into the process environment.
# On Colab the key is read from Colab Secrets; elsewhere a local .env file is
# loaded when python-dotenv is installed. Either GOOGLE_API_KEY or
# GEMINI_API_KEY is accepted.
import os

if IN_COLAB:
    from google.colab import userdata

    # Try each accepted secret name in turn; stop at the first usable value.
    for key_name in ('GOOGLE_API_KEY', 'GEMINI_API_KEY'):
        try:
            secret = userdata.get(key_name)
        except Exception as exc:
            # Still best-effort (a missing secret must not abort the notebook),
            # but say why so "secret not defined" and "notebook access denied"
            # are distinguishable instead of silently ignored.
            print(f"{key_name} not loaded from Colab Secrets ({type(exc).__name__})")
            continue
        if not secret:
            # An empty secret is not a usable key; fall through to the next name.
            continue
        os.environ[key_name] = secret
        print(f"{key_name} loaded from Colab Secrets")
        break
else:
    try:
        from dotenv import load_dotenv
    except ImportError:
        # python-dotenv is optional; rely on variables already exported.
        pass
    else:
        load_dotenv()

# GOOGLE_API_KEY takes precedence; an empty string means "no key".
api_key = os.environ.get('GOOGLE_API_KEY') or os.environ.get('GEMINI_API_KEY', '')
print(f"Gemini API key: {'set (' + api_key[:6] + '...)' if api_key else 'not set (Gemini models will be skipped)'}")

## Initialise Pipeline

In [None]:
from src.pipeline import Pipeline

# Construct the project pipeline from its YAML config (path is relative to the
# working directory) and call its status() report.
# `pipe` is reused by the evaluation and generation cells below.
pipeline_config = 'config/pipeline.yaml'
pipe = Pipeline(pipeline_config)
pipe.status()

## Evaluate — KhanQ

Evaluates all available models on the MixKhanQ test set (653 pairs) and prints a comparison table that includes paper baselines.

> The table shows both word-level BLEU (correct) and char-level BLEU (paper's original buggy method) for direct comparison with the paper's Table 2 numbers.

In [None]:
# Run the evaluation on KhanQ and keep the returned results in `results`.
#
# `models` accepts:
#   'all'                  → T5 checkpoints (disk) + Ollama (live server) + Gemini (config)
#   't5:topic,t5:baseline' → evaluate specific models only

khanq_eval_args = {
    'models': 'all',
    'dataset': 'khanq',
    # 'output_dir': 'results/my_eval',  # optional: pin the output directory
}
results = pipe.evaluate(**khanq_eval_args)

## Evaluate — SQuAD (optional)

Evaluates on the SQuAD test split. Only relevant if you ran the full SQuAD data pipeline.

In [None]:
# Uncomment to also evaluate on SQuAD:
# results_squad = pipe.evaluate(models='all', dataset='squad')

## Inspect Results

In [None]:
import pandas as pd

# Display column -> key in each model's metrics dict, in display order.
# The "(paper)" columns are the char-level BLEU variants.
METRIC_COLUMNS = {
    'B1':          'bleu1',
    'B2':          'bleu2',
    'B3':          'bleu3',
    'B4':          'bleu4',
    'B1c (paper)': 'bleu1_char',
    'B4c (paper)': 'bleu4_char',
    'F1':          'f1',
    'METEOR':      'meteor',
    'ROUGE-L':     'rouge_l',
    'PPL':         'perplexity',
}


def _metric(metrics, key):
    """Return ``metrics[key]`` rounded to 3 decimals, or NaN when absent/None.

    A missing metric is reported as NaN rather than 0 so it cannot be mistaken
    for a genuine zero score and is ignored by NaN-aware operations.
    """
    value = metrics.get(key)
    if value is None:
        return float('nan')
    return round(value, 3)


# Build a tidy DataFrame from the results dict: one row per evaluated model.
rows = [
    {
        'model': key,
        'n': m.get('num_samples', '-'),
        **{column: _metric(m, metric_key)
           for column, metric_key in METRIC_COLUMNS.items()},
    }
    for key, m in results.items()
]

# Passing the columns explicitly keeps their order fixed and lets an empty
# `results` produce an empty, correctly shaped table instead of a KeyError
# from set_index('model').
df = pd.DataFrame(rows, columns=['model', 'n', *METRIC_COLUMNS]).set_index('model')
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 140)
df

In [None]:
# Report the model with the highest word-level BLEU-4, provided the table has
# a B4 column containing at least one non-missing score.
has_b4_scores = 'B4' in df.columns and df['B4'].notna().any()
if has_b4_scores:
    best = df['B4'].idxmax()
    print(f"Best model by BLEU-4 (word-level): {best}")
    # Full metric row for the winning model.
    print(df.loc[best])

## Qualitative Examples

Compare what different models generate for the same input.

In [None]:
import random

# Pick the first model whose results carry a non-empty 'predictions' entry.
sample_key = None
for model_name, model_result in results.items():
    if model_result.get('predictions'):
        sample_key = model_name
        break

if not sample_key:
    print("No predictions stored (check results dict)")
else:
    preds = results[sample_key]['predictions']
    refs = results[sample_key]['references']

    # Up to five distinct positions, drawn at random without replacement.
    how_many = min(5, len(preds))
    indices = random.sample(range(len(preds)), how_many)

    print(f"Sample predictions from: {sample_key}")
    print("=" * 70)
    # Show each sampled reference next to the model's prediction.
    for i in indices:
        print(f"\n[{i}] Reference : {refs[i]}")
        print(f"     Prediction: {preds[i]}")

## Single Question Generation

Interactively generate a question from any topic + context.

In [None]:
# Example input: edit `topic` and `context` to try your own passage.
topic = "Electronegativity"
context = (
    "Electronegativity is a measure of the tendency of an atom to attract "
    "a bonding pair of electrons. The Pauling scale is the most commonly "
    "used scale for electronegativity. Fluorine is the most electronegative "
    "element and is assigned a value of 4.0 on the Pauling scale."
)

# T5 fine-tuned model, topic-conditioned mode.
# A FileNotFoundError is handled as "no trained T5 model available yet".
try:
    q_t5 = pipe.generate(topic=topic, context=context, mode='topic')
except FileNotFoundError:
    print("T5 model not trained yet — run 02_distillation_training.ipynb first")
else:
    print(f"T5 (topic)  : {q_t5}")

# Gemini zero-shot (requires GOOGLE_API_KEY)
# from src.evaluation.models import GeminiBaseline
# gem = GeminiBaseline('gemini-2.5-flash')
# print(f"Gemini flash: {gem.generate_question(topic, context)}")

## Save Results (Colab)

In [None]:
# Copy the local results/ directory to Google Drive so it outlives the Colab
# session. Does nothing outside Colab (DRIVE_DIR is None there).
if IN_COLAB and DRIVE_DIR:
    import shutil
    src = project_root / 'results'
    dst = DRIVE_DIR / 'results'
    # Guard like the restore step in the setup cell: copytree raises
    # FileNotFoundError when the source directory does not exist, e.g. when
    # no evaluation has written results yet.
    if src.is_dir():
        # dirs_exist_ok=True copies into an existing Drive folder, so
        # re-running the cell does not fail.
        shutil.copytree(src, dst, dirs_exist_ok=True)
        print(f"Results synced to {dst}")
    else:
        print(f"Nothing to sync: {src} does not exist (run an evaluation first)")