# 03c - Compare raw vs resized vs realistic augmentation
Compare three setups (raw images without augmentation, raw images with realistic augmentation, and processed 224 images with realistic augmentation) to quantify the impact of preprocessing and augmentation.

## Prepare paths and validate datasets
Detect the project root, define the raw/processed dataset folders, and fail fast if the raw folders are missing before launching runs (a missing processed dataset only triggers a warning).

In [None]:
from pathlib import Path
import sys

# Locate the project root: start at the current working directory and climb
# towards the filesystem root, probing at most 10 directories for a "Data" folder.
ROOT = Path.cwd().resolve()
_levels_left = 10
while _levels_left and not (ROOT / "Data").exists():
    ROOT = ROOT.parent
    _levels_left -= 1
if not _levels_left:
    # Every probe failed: none of the inspected directories holds "Data".
    raise FileNotFoundError("Project root not found (Data folder missing).")

# Dataset folders: raw images and their processed "_224" counterparts
RAW_TRAIN = ROOT.joinpath("Data", "raw", "train")
RAW_VAL = ROOT.joinpath("Data", "raw", "val")
PROC_TRAIN = ROOT.joinpath("Data", "processed", "train_224")
PROC_VAL = ROOT.joinpath("Data", "processed", "val_224")

# Training parameters
model = "resnet18"
image_size = 224
batch_size = 32
epochs = 5
limit_per_class = 600  # set to None to use the full dataset

# Python: prefer the project's .venv interpreter if available, otherwise fall
# back to the interpreter running this notebook. A virtual environment stores
# its interpreter under "Scripts" on Windows and under "bin" on POSIX systems,
# so both layouts are probed.
_VENV_CANDIDATES = (
    ROOT / ".venv" / "Scripts" / "python.exe",  # Windows layout
    ROOT / ".venv" / "bin" / "python",          # Linux / macOS layout
)
# Keep the Windows path as the default value so VENV_PY is always defined,
# exactly as before, even when no virtual environment exists.
VENV_PY = next((p for p in _VENV_CANDIDATES if p.exists()), _VENV_CANDIDATES[0])
PY = str(VENV_PY if VENV_PY.exists() else Path(sys.executable))

# Sanity checks: the raw dataset is required, so abort immediately when
# either of its folders is absent.
if not all(folder.exists() for folder in (RAW_TRAIN, RAW_VAL)):
    raise FileNotFoundError(f"Raw dataset missing: {RAW_TRAIN} / {RAW_VAL}")

# The processed dataset is optional at this point: only emit a warning.
if not (PROC_TRAIN.exists() and PROC_VAL.exists()):
    print("WARNING: Processed dataset missing (ok if you have not preprocessed).")
    print("   PROC_TRAIN =", PROC_TRAIN)
    print("   PROC_VAL   =", PROC_VAL)

# One entry per experiment: (name, train folder, val folder, augmentation mode)
variants = [
    {"name": name, "train": train_dir, "val": val_dir, "augment": augment}
    for name, train_dir, val_dir, augment in (
        ("raw_noaug", RAW_TRAIN, RAW_VAL, "none"),
        ("raw_realistic", RAW_TRAIN, RAW_VAL, "realistic"),
        ("resized_realistic", PROC_TRAIN, PROC_VAL, "realistic"),
    )
]

# Summary of the resolved configuration
print("ROOT =", ROOT)
print("PY   =", PY)
print("Variants:", [variant["name"] for variant in variants])

## Launch variants in parallel
Skip variants whose train/val folders are missing, then run the remaining training commands in parallel with a small worker pool (at most 3 workers) to avoid overloading the machine.

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from subprocess import run
from pathlib import Path
import os

# Output folders for model weights and training logs; created up front
# (including missing parents) and left untouched when they already exist.
WEIGHTS_DIR = ROOT.joinpath("Model", "weights")
LOG_DIR = ROOT.joinpath("Monitoring", "output")
WEIGHTS_DIR.mkdir(exist_ok=True, parents=True)
LOG_DIR.mkdir(exist_ok=True, parents=True)

def run_variant(var: dict):
    """Run one training subprocess for the given comparison variant.

    Builds a ``PY -m Model.training.train`` command line from the variant's
    folders and augmentation mode plus the notebook-level parameters, prints
    it, and executes it with ``ROOT`` as the working directory.

    Args:
        var: Variant description with the keys ``"name"``, ``"train"``,
            ``"val"`` and ``"augment"``.

    Returns:
        A ``(name, save_path)`` tuple, where ``save_path`` is the path passed
        to the training command as ``--save-path``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    variant_name = var["name"]
    weights_file = WEIGHTS_DIR / f"best_compare_{variant_name}.pt"

    # (flag, value) pairs, in the order they appear on the command line.
    flag_values = (
        ("--train-dir", str(var["train"])),
        ("--val-dir", str(var["val"])),
        ("--model", model),
        ("--optimizer", "adam"),
        ("--epochs", str(epochs)),
        ("--batch-size", str(batch_size)),
        ("--dropout", "0.3"),
        ("--image-size", str(image_size)),
        ("--augment", var["augment"]),
        ("--save-path", str(weights_file)),
        ("--log-dir", str(LOG_DIR)),
    )

    command = [PY, "-m", "Model.training.train"]
    for flag, value in flag_values:
        command.extend((flag, value))

    # The per-class limit is only forwarded when one is configured.
    if limit_per_class is not None:
        command.extend(("--limit-per-class", str(limit_per_class)))

    print("===", " ".join(command))
    run(command, check=True, cwd=str(ROOT))
    return variant_name, weights_file

# Keep only the variants whose train and val folders both exist. The skipped
# ones are derived from that result, so the two lists can never disagree.
jobs = [v for v in variants if v["train"].exists() and v["val"].exists()]
missing = [v["name"] for v in variants if v not in jobs]
if missing:
    print("Skipped variants (missing folders):", missing)

# At most 3 concurrent trainings, never more than the CPU count or the number
# of jobs. Clamp to at least 1: with no runnable variant len(jobs) is 0, and
# ThreadPoolExecutor raises ValueError when max_workers <= 0.
max_workers = max(1, min(3, os.cpu_count() or 1, len(jobs)))
results = []

if not jobs:
    print("WARNING: no runnable variant found, nothing to launch.")

print(f"Launching {len(jobs)} runs with {max_workers} workers")

# Threads are sufficient here: each worker only waits on a training subprocess.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_variant = {executor.submit(run_variant, var): var for var in jobs}
    for fut in as_completed(future_to_variant):
        # result() re-raises any exception raised inside run_variant.
        name, save_path = fut.result()
        results.append({"name": name, "save_path": str(save_path)})
        print(f"[done] {name} -> {save_path}")

print("Comparison finished:", len(results), "runs done")