# Zenodo RMHD Validation Pipeline (Colab)
This notebook orchestrates external validation on the Reddit Mental Health Dataset (RMHD; Low et al., 2020), downloaded from Zenodo.
**Note**: Before running, replace `YourRepo` in the `git clone` URL of the Setup cell with your actual GitHub username.

In [None]:
# 1. Setup
!git clone https://github.com/YourRepo/Text2Diag.git
%cd Text2Diag
!pip install -r requirements.txt
!pip install transformers torch numpy pandas scikit-learn

import sys
import os
sys.path.append("src")

In [None]:
# 2. Download Data
# Direct download from Zenodo Record 3941387 (Low et al., 2020)
!mkdir -p data/external/rmhd_raw

subreddits = [
    "adhd", "depression", "ptsd", "anxiety", "bipolar", 
    "schizophrenia", "autism", "suicidewatch", 
    # Controls (Verify existence or generic ones)
    "personalfinance", "relationships", "teaching"
]

base_url = "https://zenodo.org/record/3941387/files/"

for sub in subreddits:
    filename = f"{sub}.csv"
    url = f"{base_url}{filename}?download=1"
    dest = f"data/external/rmhd_raw/{filename}"
    print(f"Downloading {filename}...")
    !wget -q -O {dest} {url} || echo f"Failed to download {filename} (might not exist in this archive)"

In [None]:
# 3. Build JSONL
!python scripts/21_rmhd_build_jsonl.py \
  --data_dir data/external/rmhd_raw \
  --out_file data/external/rmhd_raw/rmhd_full.jsonl \
  --label_map configs/external/rmhd_label_mapping.json \
  --sample_n 5000

In [None]:
# 4. Leakage Report (Audit)
!python scripts/22_rmhd_leakage_report.py \
  --input_file data/external/rmhd_raw/rmhd_full.jsonl \
  --out_dir results/external/rmhd_audit

In [None]:
# 5. Strict Sanitization (Preprocessing)
# Stream the full JSONL record by record, replace each record's "text" field
# with the output of sanitize_text_strict, and write the result to a new JSONL.
import json
from tqdm.auto import tqdm
from text2diag.preprocess.sanitize_external import sanitize_text_strict

input_file = "data/external/rmhd_raw/rmhd_full.jsonl"
sanitized_file = "data/external/rmhd_sanitized.jsonl"

print("Sanitizing...")
# Pin UTF-8 on both files so decoding does not depend on the machine's locale.
with open(input_file, "r", encoding="utf-8") as f_in, \
        open(sanitized_file, "w", encoding="utf-8") as f_out:
    for line in tqdm(f_in, desc="Sanitizing"):
        # Tolerate blank lines (e.g. a trailing newline at end of file), which
        # json.loads would otherwise reject.
        if not line.strip():
            continue
        row = json.loads(line)
        row["text"] = sanitize_text_strict(row["text"])
        f_out.write(json.dumps(row) + "\n")

In [None]:
# 6. Run Week 5 E2E Batch Runner
# Ensure you have your model checkpoint available (upload or download)
CHECKPOINT = "temp_model" # UPDATE THIS PATH

!python scripts/14_run_e2e_contract_v1.py \
  --checkpoint {CHECKPOINT} \
  --temperature_json results/week2_sanitized/calibration/temperature_scaling.json \
  --label_map data/processed/reddit_mh_sanitized/labels.json \
  --input_jsonl data/external/rmhd_sanitized.jsonl \
  --out_jsonl results/external/rmhd_e2e_outputs.jsonl \
  --include_dependency_graph \
  --skip_sanitization

In [None]:
# 7. Verification
!python scripts/23_week5_verify_outputs.py \
  --input_file results/external/rmhd_e2e_outputs.jsonl \
  --out_report results/external/rmhd_verification.json

In [None]:
# 8. Evaluation Metrics
!python scripts/24_rmhd_eval_metrics.py \
  --pred_file results/external/rmhd_e2e_outputs.jsonl \
  --gold_file data/external/rmhd_raw/rmhd_full.jsonl \
  --out_dir results/external/rmhd_metrics