# Week 2.6: Leakage-Controlled Evaluation

**Purpose**: Run inference on original vs sanitized text to measure actual F1 delta.

**Requires**: GPU runtime (T4 recommended)

In [None]:
# 1. Clone Repository
!git clone https://github.com/AngadSingh22/Text2Diag.git
%cd Text2Diag

In [None]:
# 2. Install Dependencies
!pip install -q torch transformers accelerate scikit-learn datasets pyyaml

In [None]:
# 3. Regenerate Dataset (needed for inference)
!python scripts/02_build_reddit_canonical.py

In [None]:
# 4. Upload Week 2 Checkpoint (or use existing if you have it)
# Option A: If you have the checkpoint locally, upload it
# Option B: Retrain (takes ~20 min)

import os
checkpoint_path = "results_week2/results/week2/checkpoints/checkpoint-4332"

if not os.path.exists(checkpoint_path):
    print("Checkpoint not found. Retraining...")
    !python scripts/03_train_baseline.py \
        --data_dir data/processed/reddit_mh_windows \
        --out_dir results_week2/results/week2 \
        --model_name distilbert-base-uncased \
        --max_len 256 \
        --batch_size 8 \
        --grad_accum 4 \
        --epochs 3 \
        --lr 2e-5
    checkpoint_path = "results_week2/results/week2/checkpoints/checkpoint-4332"
else:
    print(f"Using existing checkpoint: {checkpoint_path}")

In [None]:
# 5. Run Leakage-Controlled Evaluation
!python scripts/09_eval_sanitized.py \
    --checkpoint $checkpoint_path \
    --data_dir data/processed/reddit_mh_windows \
    --out_dir results/week2/remediation \
    --sanitize_config configs/sanitize.yaml \
    --batch_size 32

In [None]:
# 6. Check Results
!cat results/week2/remediation/leakage_eval_metrics.md

In [None]:
# 7. View JSON Metrics
# Parse the metrics file into `metrics` and pretty-print it with 2-space indentation.
import json

metrics_file = 'results/week2/remediation/leakage_eval_metrics.json'
with open(metrics_file, 'r') as fh:
    metrics = json.loads(fh.read())

pretty_metrics = json.dumps(metrics, indent=2)
print(pretty_metrics)

In [None]:
# 8. Zip and Download Results
!zip -r w26_results.zip results/week2/remediation
from google.colab import files
files.download('w26_results.zip')