In [None]:
# Mount Google Drive only when running on Colab. Cell 2 also supports a local
# environment, where `google.colab` is not installed, so a missing module is
# treated as "not on Colab" instead of aborting the notebook.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

In [1]:
# Cell 1 – Install & Imports
!pip install --quiet torch torchvision webdataset tqdm pillow scikit-learn joblib matplotlib seaborn pyyaml

import os, sys, json, yaml, joblib
from pathlib import Path
import torch
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Set figure.max_open_warning to 0, which matplotlib documents as disabling
# the "too many open figures" warning. No visual style is changed here.
plt.rcParams.update({"figure.max_open_warning": 0})


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Cell 2 – Load Configuration & Paths (robust version)
import os
import yaml
from pathlib import Path

# Default project roots per environment; the PROJECT_ROOT environment variable
# takes precedence over both.
DEFAULT_ENV_PATHS = {
    "colab": "/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project",
    "local": "/Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project",
}


def _load_yaml_config(path: Path) -> dict:
    """
    Load a YAML file and return its top-level mapping.

    Raises:
        FileNotFoundError: if `path` does not exist (the message names the
            actual file that is missing).
        ValueError: if the file does not contain a YAML mapping (e.g. it is
            empty), since the rest of the notebook indexes the config by key.
    """
    if not path.exists():
        raise FileNotFoundError(f"❌ {path.name} not found at: {path}")
    with path.open() as f:
        loaded = yaml.safe_load(f)
    if not isinstance(loaded, dict):
        raise ValueError(
            f"❌ {path.name} must contain a YAML mapping, got {type(loaded).__name__}"
        )
    return loaded


# Determine environment: the presence of /content is used as the Colab marker.
IN_COLAB = Path("/content").exists()
PROJECT_ROOT = Path(
    os.getenv("PROJECT_ROOT", DEFAULT_ENV_PATHS["colab" if IN_COLAB else "local"])
).resolve()

# 1) General config (always in config/): only used to locate the experiment.
cfg_path = PROJECT_ROOT / "config" / "training.yaml"
cfg = _load_yaml_config(cfg_path)

# Extract config values
EXP_CODE   = cfg.get("exp_code") or os.getenv("EXP_CODE") or "missing_code"
DATASET_ID = cfg["data"]["dataset_id"]

# Build central experiment path
EXP_DIR = PROJECT_ROOT / cfg["output"]["exp_dir"].format(
    dataset_id=DATASET_ID, exp_code=EXP_CODE
)

# 2) Override the general config with the experiment-specific copy stored in
#    EXP_DIR; from here on `cfg` and `cfg_path` refer to that file.
cfg_path = EXP_DIR / f"training_{EXP_CODE}.yaml"
cfg = _load_yaml_config(cfg_path)

# Display
print(f"📁 PROJECT_ROOT → {PROJECT_ROOT}")
print(f"📄 YAML loaded  → {cfg_path.name}")
print(f"🔑 EXP_CODE     → {EXP_CODE}")
print(f"📂 EXP_DIR      → {EXP_DIR}")


📁 PROJECT_ROOT → /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project
📄 YAML loaded  → training_20250709142129.yaml
🔑 EXP_CODE     → 20250709142129
📂 EXP_DIR      → /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/experiments/20250709142129


In [3]:
# Cell 3 – Extend PYTHONPATH & Import Trainers + Utils

import sys
import importlib
from pathlib import Path

# Extend PYTHONPATH to include the project root and src/. Only entries that
# are not already on sys.path are prepended, so re-running this cell does not
# pile up duplicate entries.
SRC_DIR = PROJECT_ROOT / "src"
sys.path[:0] = [p for p in (str(PROJECT_ROOT), str(SRC_DIR)) if p not in sys.path]

# Import training utils from utils.training_utils
from utils.training_utils.registry import TRAINER_REGISTRY
from utils.training_utils.device_io import (
    choose_device,
    get_latest_checkpoint,
    load_checkpoint,
    save_json,
    save_joblib,
)
from utils.training_utils.data_utils import (
    build_loader,
    load_classifier,
    parse_label_from_filename,
)
from utils.training_utils.model_utils import mc_dropout_predictions
from utils.training_utils.metrics import (
    compute_classification_metrics,
    aggregate_fold_metrics,
    expected_calibration_error,
)

# Import each trainer module for its side effect: the check further down
# expects the import to have added the trainer to TRAINER_REGISTRY.
trainer_names = ["simclr", "moco_v2", "rotation", "jepa", "supervised", "transfer"]
for name in trainer_names:
    try:
        importlib.import_module(f"trainers.{name}")
        print(f"✅ Imported trainer module: {name}")
    except ImportError as e:
        # Report and keep going so every failing module gets listed; the
        # registry check below still stops the cell if a trainer is missing.
        print(f"❌ Failed to import trainer {name}: {e}")

# Verify registration: fail on the first expected trainer that is absent.
for name in trainer_names:
    assert name in TRAINER_REGISTRY, f"❌ Missing trainer in registry: {name}"
print("📚 All trainers successfully registered.")

✅ Imported trainer module: simclr
✅ Imported trainer module: moco_v2
✅ Imported trainer module: rotation
✅ Imported trainer module: jepa
✅ Imported trainer module: supervised
✅ Imported trainer module: transfer
📚 All trainers successfully registered.


In [4]:
# Cell 4 – Evaluation Settings (debug-friendly)

# Debug switch: when True, MC-dropout passes and ECE bins are capped to the
# small values below so a notebook run stays fast. Set it to False to use the
# values from the YAML config unchanged.
DEBUG_FAST_EVAL     = True
DEBUG_MAX_MC_PASSES = 3
DEBUG_MAX_ECE_BINS  = 5

device = choose_device()

# `or {}` also covers an `evaluation:` / `gradcam:` key that is present in the
# YAML but left empty (parsed as None).
eval_cfg    = cfg.get("evaluation") or {}
gradcam_cfg = eval_cfg.get("gradcam") or {}

MC_PASSES = int(eval_cfg.get("mc_dropout_passes", 20))
ECE_BINS  = int(eval_cfg.get("ece_bins", 15))
if DEBUG_FAST_EVAL:
    MC_PASSES = min(MC_PASSES, DEBUG_MAX_MC_PASSES)
    ECE_BINS  = min(ECE_BINS, DEBUG_MAX_ECE_BINS)

GCAM_TOPK  = int(gradcam_cfg.get("top_k", 5))
GCAM_LAYER = gradcam_cfg.get("layer", None)

print(f"🖥️  Device:         {device}")
print(f"🔄  MC-dropout:     {MC_PASSES} passes")
print(f"📊  ECE bins:       {ECE_BINS}")
print(f"🔍  GradCAM++ top-k:{GCAM_TOPK}")
print(f"📐  GradCAM++ layer:{GCAM_LAYER}")


🖥️  Device:         cpu
🔄  MC-dropout:     3 passes
📊  ECE bins:       5
🔍  GradCAM++ top-k:5
📐  GradCAM++ layer:layer4


In [5]:
# Cell 5 – Helper Functions & Path Centralization

from pathlib import Path

def _paths(model_name: str, fold: int, patient_id: str = None) -> dict[str, Path]:
    """
    Resolve every artifact path for one model/fold from the patterns stored
    under cfg['output'].

    Each pattern is formatted with dataset_id, exp_code, model_name, fold_idx,
    patient_id, exp_dir and exp_model_dir, then joined onto PROJECT_ROOT.
    When `patient_id` is falsy the literal text "{patient_id}" is substituted,
    so that placeholder survives formatting.

    Side effect: for keys containing "dir" the path itself is created as a
    directory; for every other key the parent directory is created.
    """
    output_cfg = cfg["output"]

    placeholders = {
        "dataset_id": DATASET_ID,
        "exp_code":   EXP_CODE,
        "model_name": model_name,
        "fold_idx":   fold,
        "patient_id": patient_id or "{patient_id}",
    }
    # The two experiment directories are themselves patterns and become
    # placeholders for the remaining ones.
    for dir_key in ("exp_dir", "exp_model_dir"):
        placeholders[dir_key] = output_cfg[dir_key].format(**placeholders)

    # (config section, ((result key, pattern key inside the section), ...))
    layout = (
        ("training", (
            ("features_train", "features"),
            ("clf",            "clf"),
            ("scaler",         "scaler"),
            ("loss_json",      "loss_json"),
            ("log",            "log"),
        )),
        ("inference", (
            ("patch_preds",   "patch_preds"),
            ("patient_preds", "patient_preds"),
            ("mc_logits",     "mc_logits"),
            ("metrics",       "metrics"),
        )),
        ("explain", (
            ("gradcam_dir",  "gradcam_dir"),
            ("metadata_csv", "metadata_csv"),
        )),
        ("aggregate", (
            ("agg_metrics", "metrics"),
            ("agg_summary", "summary_img"),
        )),
        ("experiment_level", (
            ("exp_json", "comparison_json"),
            ("exp_img",  "comparison_img"),
        )),
    )

    resolved: dict[str, Path] = {
        "ckpt_dir": PROJECT_ROOT / placeholders["exp_model_dir"] / f"fold{fold}" / "training",
    }
    for section, entries in layout:
        patterns = output_cfg[section]
        for result_key, pattern_key in entries:
            resolved[result_key] = PROJECT_ROOT / patterns[pattern_key].format(**placeholders)

    # Make sure every location can be written to.
    for result_key, location in resolved.items():
        target_dir = location if "dir" in result_key else location.parent
        target_dir.mkdir(parents=True, exist_ok=True)

    return resolved

def _completed(paths: dict[str, Path], is_ssl: bool) -> bool:
    """
    Returns True if all the necessary inference artifacts for this fold
    are already on disk, so we can skip evaluation.
    """
    # Always require patch‐level preds + metrics JSON
    required = ["patch_preds", "metrics"]
    # For SSL models also require MC logits and patient‐level CSV
    if is_ssl:
        required += ["mc_logits", "patient_preds"]
    return all(paths[k].exists() for k in required)


def extract_patient_id(key: str) -> str:
    """
    Pull the patient ID out of a key shaped like 'CLASS_HPxxxx_x_y'.

    The key is split on underscores and the first piece beginning with "H"
    (which includes the "HP" prefix) is returned; "UNKNOWN" is returned when
    no piece matches.
    """
    for token in key.split("_"):
        if token.startswith("H"):
            return token
    return "UNKNOWN"


def ece(probs, labels):
    """
    Shortcut for expected_calibration_error using the notebook-wide ECE_BINS
    as the number of bins.
    """
    calibration_error = expected_calibration_error(probs, labels, n_bins=ECE_BINS)
    return calibration_error

In [6]:
# Cell 6 – Core Evaluation Functions

import torch
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict, Counter

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

from utils.training_utils.metrics import (
    TemperatureScaler,
    expected_calibration_error,
    mc_dropout_statistics
)
from utils.training_utils.data_utils import load_classifier, parse_label_from_filename
from utils.training_utils.device_io import (
    get_latest_checkpoint,
    load_checkpoint,
    save_json
)
from utils.training_utils.registry import TRAINER_REGISTRY
from utils.training_utils.model_utils import mc_dropout_predictions

def load_model_and_components(model_name: str, fold: int):
    """
    Load everything needed to evaluate one model on one fold.

    Loads:
      - the PyTorch model (encoder or full network) from the latest checkpoint
      - for SSL models: the probe (clf + label encoder) and, if a scaler file
        exists and holds a TemperatureScaler, that scaler

    Returns:
        (model, is_ssl, clf, le, temp_scaler); clf, le and temp_scaler are
        None for non-SSL models, and temp_scaler is None when no usable
        scaler file is found.

    Raises:
        FileNotFoundError: when no checkpoint is found in the directory used.
        AttributeError: for SSL trainers exposing neither `encoder` nor `model`.

    NOTE(review): the run recorded in this notebook failed inside
    load_checkpoint for the supervised model with an fc layer of shape
    [0, 2048] versus [5, 2048] in the checkpoint — presumably the trainer was
    built with zero classes from the config passed here; confirm against the
    trainer implementation.
    """
    paths   = _paths(model_name, fold)
    cfg_m   = cfg["models"][model_name]
    is_ssl  = cfg_m["type"] == "ssl"
    trainer = TRAINER_REGISTRY[model_name](cfg_m, cfg["data"])

    # 1️⃣ Choose the checkpoint: with train_encoder_once, SSL folds > 0 reuse
    #    the encoder checkpoint stored under fold 0.
    ckpt = None
    if is_ssl and cfg.get("train_encoder_once", False) and fold > 0:
        ckpt = get_latest_checkpoint(_paths(model_name, 0)["ckpt_dir"])
        if ckpt is None:
            raise FileNotFoundError(f"❌ Nessun checkpoint trovato in fold0 per {model_name}")
        print(f"   ➔ SSL+train_encoder_once: caricamento encoder da fold0 → {ckpt.name}")
    else:
        ckpt = get_latest_checkpoint(paths["ckpt_dir"])
        if ckpt is None:
            raise FileNotFoundError(f"❌ Nessun checkpoint trovato in fold{fold} per {model_name}")
        print(f"   ➔ Caricamento checkpoint fold{fold} → {ckpt.name}")

    # 2️⃣ Load weights
    if is_ssl:
        # The checkpoint is loaded into the model returned for resuming
        # (presumably encoder plus any projector — confirm in the trainer).
        full_model, _ = trainer.get_resume_model_and_optimizer()
        load_checkpoint(ckpt, model=full_model)
        # Feature extractor: prefer `encoder`, otherwise fall back to `model`
        feat_mod = getattr(trainer, "encoder", None) or getattr(trainer, "model", None)
        if feat_mod is None:
            raise AttributeError(f"No feature submodule found on {trainer}")
        model = feat_mod.to(device).eval()
    else:
        # Supervised and transfer trainers are used through `trainer.model`
        load_checkpoint(ckpt, model=trainer.model)
        model = trainer.model.to(device).eval()

    # 3️⃣ SSL only: load the probe and the temperature scaler
    clf = le = temp_scaler = None
    if is_ssl:
        clf, le = load_classifier(paths["clf"])
        scaler_path = paths["scaler"]
        if scaler_path.exists():
            # joblib.load unpickles the file; only load scaler files you trust.
            obj = joblib.load(scaler_path)
            # A file holding any other object type is ignored without a message.
            if isinstance(obj, TemperatureScaler):
                temp_scaler = obj
                print("   ➔ Loaded TemperatureScaler (calibrate probs)")
        else:
            print(f"   ➔ No TemperatureScaler found for {model_name} fold {fold}")

    # 4️⃣ Debug printout
    print(f"   ➔ SSL pipeline?  {is_ssl}")
    print(f"   ➔ Classifier?    {clf is not None}")
    print(f"   ➔ Temp-scaler?   {temp_scaler is not None}")

    return model, is_ssl, clf, le, temp_scaler


def run_patch_inference(model, loader, is_ssl, clf, temp_scaler, max_batches=2):
    """
    Run patch-level inference over `loader`.

    Args:
        model: feature extractor (SSL) or full classifier (otherwise).
        loader: iterable yielding (imgs, labels) batches.
        is_ssl: when True, `model` output is fed to `clf.predict_proba`;
            otherwise softmax is applied to the model's logits.
        clf: probe classifier, only used when `is_ssl` is True.
        temp_scaler: optional calibrator; when given, its `transform_proba`
            is applied to the log of the raw probabilities.
        max_batches: debug cap on the number of batches processed. The
            default of 2 keeps the previous hard-coded behaviour (stop after
            batch_idx=1); pass None to process the whole loader. Values
            below 1 still process one batch.

    Returns:
        (keys, y_true, y_pred, probs). `keys` is always an empty list because
        the loader only yields (imgs, labels); `probs` are the calibrated
        probabilities when `temp_scaler` is given, the raw ones otherwise.

    Raises:
        ValueError: if the loader yields no batches.
    """
    keys, y_true, y_pred = [], [], []
    probs_list = []

    for batch_idx, batch in enumerate(tqdm(loader, desc="Patches")):
        imgs, labels = batch
        imgs = imgs.to(device)

        if is_ssl:
            with torch.no_grad():
                feats = model(imgs).cpu().numpy()
            raw_p = clf.predict_proba(feats)
        else:
            with torch.no_grad():
                logits = model(imgs)
                raw_p = torch.softmax(logits, dim=1).cpu().numpy()

        # Calibration: the scaler is fed log-probabilities; the epsilon
        # avoids log(0).
        if temp_scaler is not None:
            logits_for_cal = np.log(raw_p + 1e-12)
            p = temp_scaler.transform_proba(logits_for_cal)
        else:
            p = raw_p

        preds = p.argmax(axis=1)
        t     = labels.cpu().numpy()

        y_true.extend(t.tolist())
        y_pred.extend(preds.tolist())
        probs_list.append(p)

        print(f"   • Batch {batch_idx} done.")
        if max_batches is not None and batch_idx + 1 >= max_batches:
            print(f"🛑 DEBUG: stopping after batch_idx={batch_idx}")
            break

    if not probs_list:
        # np.vstack([]) would raise a ValueError with an unhelpful message.
        raise ValueError("run_patch_inference: loader produced no batches")

    probs = np.vstack(probs_list)
    return keys, np.array(y_true), np.array(y_pred), probs


def save_patch_outputs(model_name, fold, keys, y_true, y_pred, probs):
    """
    Store the patch-level results for one model/fold with torch.save at the
    'patch_preds' path, as a dict with entries keys, true, pred and probs.
    """
    payload = dict(keys=keys, true=y_true, pred=y_pred, probs=probs)
    destination = _paths(model_name, fold)["patch_preds"]
    torch.save(payload, destination)


def save_mc_logits(model_name, fold, model, loader):
    """
    Run MC-dropout with MC_PASSES passes on the notebook's device, write the
    result with np.save to the 'mc_logits' path and return it.

    NOTE(review): np.save appends ".npy" when the target name lacks that
    extension — confirm the configured mc_logits pattern ends in ".npy",
    otherwise the saved file will not match the path checked by _completed.
    """
    mc_outputs = mc_dropout_predictions(model, loader, device=device, T=MC_PASSES)
    destination = _paths(model_name, fold)["mc_logits"]
    np.save(destination, mc_outputs)
    return mc_outputs


def compute_and_save_metrics(model_name, fold, y_true, y_pred, probs, mc=None):
    """
    Compute the patch-level metrics for one fold, save them as JSON at the
    'metrics' path and return them.

    Args:
        model_name, fold: identify the output location via _paths.
        y_true, y_pred: true and predicted class indices per patch.
        probs: per-class probabilities, one row per patch. As elsewhere in
            this notebook (argmax of probs is compared with y_true), column j
            is taken to be class index j.
        mc: optional MC-dropout output; when given, the statistics returned
            by mc_dropout_statistics are merged into the result.

    Returns:
        dict with accuracy, macro_f1, roc_auc (None when it cannot be
        computed), confusion_matrix, class_report, any MC statistics, and
        ece_post (ECE of `probs`, i.e. after calibration when a scaler was
        applied upstream).
    """
    mc_stats = mc_dropout_statistics(mc) if mc is not None else {}

    acc  = accuracy_score(y_true, y_pred)
    f1   = f1_score(y_true, y_pred, average="macro")

    # ROC-AUC must be computed from scores, not hard labels: with y_pred the
    # multiclass call always raised and roc_auc was stored as None.
    scores = np.asarray(probs)
    try:
        if scores.ndim == 2 and scores.shape[1] == 2:
            # Binary case: roc_auc_score expects the positive-class score only.
            auc = roc_auc_score(y_true, scores[:, 1])
        elif scores.ndim == 2:
            auc = roc_auc_score(
                y_true,
                scores,
                average="macro",
                multi_class="ovo",
                # Explicit labels keep the column/class mapping fixed even if
                # some class does not occur in y_true.
                labels=np.arange(scores.shape[1]),
            )
        else:
            auc = roc_auc_score(y_true, scores)
    except ValueError:
        # e.g. a class pair with a single class present in this fold
        auc = None
    cm = confusion_matrix(y_true, y_pred).tolist()
    cr = classification_report(y_true, y_pred, output_dict=True)

    mets = {
        "accuracy": acc,
        "macro_f1": f1,
        "roc_auc": auc,
        "confusion_matrix": cm,
        "class_report": cr,
        **mc_stats
    }

    # ECE after calibration
    mets["ece_post"] = expected_calibration_error(probs, y_true, n_bins=ECE_BINS)

    save_json(mets, _paths(model_name, fold)["metrics"])
    return mets


In [7]:
# Cell 7 – Per-Fold Evaluation (aggiornato)

from utils.training_utils.data_utils import default_transforms, build_loader
from utils.training_utils.device_io import get_latest_checkpoint
from collections import defaultdict, Counter
import numpy as np
import pandas as pd

def aggregate_patient_results(model_name, fold, keys, y_pred, probs):
    """
    Aggregate patch predictions per patient (majority vote) and save a CSV at
    the 'patient_preds' path.

    Each key is mapped to a patient with extract_patient_id; per patient the
    most common predicted label, the number of patches and the mean of the
    per-patch maximum probability are recorded.

    The CSV always carries its header row, even when there are no rows (for
    example when `keys` is empty), so the file stays readable with
    pd.read_csv.

    NOTE(review): true_label is parsed from the patient ID, not from the
    original key — confirm parse_label_from_filename can recover the class
    from an ID alone.
    """
    columns = ["patient_id", "true_label", "pred_label", "n_patches", "mean_conf_raw"]

    by_pt = defaultdict(list)
    for k, y, conf in zip(keys, y_pred, probs.max(axis=1)):
        pid = extract_patient_id(k)
        by_pt[pid].append((y, conf))

    rows = []
    for pid, recs in by_pt.items():
        votes, confs = zip(*recs)
        rows.append({
            "patient_id": pid,
            "true_label": parse_label_from_filename(pid),
            "pred_label": Counter(votes).most_common(1)[0][0],
            "n_patches": len(recs),
            "mean_conf_raw": float(np.mean(confs))
        })

    pd.DataFrame(rows, columns=columns).to_csv(
        _paths(model_name, fold)["patient_preds"], index=False
    )


def evaluate_fold(model_name: str, fold: int):
    """
    Evaluate one model on one fold and write all inference artifacts.

    Steps: check that a checkpoint is available, skip if the artifacts are
    already on disk, load the model (plus probe/scaler for SSL), build the
    test loader, run patch-level inference, then (SSL only) MC-dropout,
    compute metrics, and (SSL only) aggregate per patient.

    Returns None in every case; a missing checkpoint or already-present
    artifacts only produce a message.
    """
    print(f"\n🔍 Evaluating {model_name} fold {fold}…")

    paths   = _paths(model_name, fold)
    model_cfg = cfg["models"][model_name]
    is_ssl  = (model_cfg["type"] == "ssl")

    # 0️⃣ Checkpoint availability. Look in the same directory that
    #    load_model_and_components will use: with train_encoder_once, SSL
    #    folds > 0 load the encoder from fold 0, so checking only the current
    #    fold's directory would skip folds that can in fact be evaluated.
    reuse_fold0 = is_ssl and cfg.get("train_encoder_once", False) and fold > 0
    ckpt_dir = _paths(model_name, 0)["ckpt_dir"] if reuse_fold0 else paths["ckpt_dir"]
    if get_latest_checkpoint(ckpt_dir) is None:
        print(f"⚠️ Nessun checkpoint trovato per {model_name} fold {fold}, salto valutazione")
        return

    # 1️⃣ Skip when all artifacts are already present
    if _completed(paths, is_ssl):
        print(f"⚡ Skipping {model_name} fold {fold}: artifacts already present")
        return

    # 2️⃣ Load model, probe, scaler (the loader prints its own debug summary)
    print("📥 Loading model, classifier, scaler…")
    model, is_ssl, clf, le, temp_scaler = load_model_and_components(model_name, fold)

    # 3️⃣ Build test loader
    patch_size = model_cfg.get("patch_size", 224)
    batch_size = model_cfg["training"]["batch_size"]
    test_rel   = cfg["data"]["test"].format(
        fold_idx=fold,
        dataset_id=cfg["data"]["dataset_id"]
    )
    test_wds   = (PROJECT_ROOT / test_rel).resolve()
    print(f"🧪 Using test shard: {test_wds}")
    assert test_wds.exists(), f"❌ Test shard not found: {test_wds}"

    # Global class mapping taken from the config, so indices do not depend on
    # which classes happen to occur in this fold.
    all_classes  = cfg["data"]["classes"]
    class_to_idx = {cls: i for i, cls in enumerate(all_classes)}

    loader = build_loader(
        str(test_wds),
        class_to_idx=class_to_idx,
        patch_size=patch_size,
        batch_size=batch_size,
        device=device,
        augment=False,
    )
    print(f"📦 DataLoader ready with batch_size = {batch_size}")

    # 4️⃣ Patch-level inference
    print("▶️ Running inference (patch-level)…")
    keys, y_true, y_pred, probs = run_patch_inference(
        model, loader, is_ssl, clf, temp_scaler
    )
    print(f"   ➔ Patches processed: {len(y_true)}")
    save_patch_outputs(model_name, fold, keys, y_true, y_pred, probs)
    print("💾 Saved patch-level outputs.")

    # 5️⃣ MC-Dropout (SSL only)
    if is_ssl:
        print("🔄 Running MC-Dropout…")
        mc = save_mc_logits(model_name, fold, model, loader)
        print("💾 Saved MC-Dropout logits.")
    else:
        mc = None

    # 6️⃣ Compute & save metrics
    print("📊 Computing metrics…")
    compute_and_save_metrics(model_name, fold, y_true, y_pred, probs, mc)
    print("💾 Saved metrics JSON.")

    # 7️⃣ Patient-level aggregation (SSL only)
    if is_ssl:
        print("👨‍⚕️ Aggregating patient-level results…")
        aggregate_patient_results(model_name, fold, keys, y_pred, probs)
        print("💾 Saved patient-level CSV.")

    print(f"✅ Done {model_name} fold {fold}\n")


# 🚀 Run evaluation for every model in cfg["run_models"] on every fold in
# cfg["folds"]. evaluate_fold returns early (with a message) for folds that
# have no checkpoint or whose artifacts already exist; any other error
# propagates and stops the loop.
for model_name in cfg["run_models"]:
    for fold_idx in cfg["folds"]:
        evaluate_fold(model_name, fold_idx)


🔍 Evaluating simclr fold 0…
⚡ Skipping simclr fold 0: artifacts already present

🔍 Evaluating simclr fold 1…
⚠️ Nessun checkpoint trovato per simclr fold 1, salto valutazione

🔍 Evaluating jepa fold 0…
⚡ Skipping jepa fold 0: artifacts already present

🔍 Evaluating jepa fold 1…
⚠️ Nessun checkpoint trovato per jepa fold 1, salto valutazione

🔍 Evaluating moco_v2 fold 0…
⚠️ Nessun checkpoint trovato per moco_v2 fold 0, salto valutazione

🔍 Evaluating moco_v2 fold 1…
⚠️ Nessun checkpoint trovato per moco_v2 fold 1, salto valutazione

🔍 Evaluating rotation fold 0…
⚡ Skipping rotation fold 0: artifacts already present

🔍 Evaluating rotation fold 1…
⚠️ Nessun checkpoint trovato per rotation fold 1, salto valutazione

🔍 Evaluating supervised fold 0…
📥 Loading model, classifier, scaler…




   ➔ Caricamento checkpoint fold0 → SupervisedTrainer_bestepoch001.pt


RuntimeError: Error(s) in loading state_dict for ResNet:
	size mismatch for fc.weight: copying a param with shape torch.Size([5, 2048]) from checkpoint, the shape in current model is torch.Size([0, 2048]).
	size mismatch for fc.bias: copying a param with shape torch.Size([5]) from checkpoint, the shape in current model is torch.Size([0]).

In [None]:
# Debug helper: list the webdataset directory of this dataset on Drive.
# The path is hard-coded for the Colab mount and does not follow PROJECT_ROOT.
!ls /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/webdataset/


In [None]:
# Cell 8 – Fold-Level & Experiment-Level Aggregation
def aggregate_model(model_name: str):
    """
    Aggregate the per-fold metrics of one model.

    Reads the metrics JSON of every fold in cfg["folds"], summarises them
    with aggregate_fold_metrics, saves the summary at the 'agg_metrics' path
    and saves a one-row heatmap of the mean accuracy, macro_f1 and roc_auc at
    the 'agg_summary' path.

    Folds without a metrics file are skipped with a warning; when no fold has
    metrics nothing is written and the function returns.
    """
    # Collect per-fold metrics
    fold_metrics = []
    for f in cfg["folds"]:
        metrics_path = _paths(model_name, f)["metrics"]
        if not metrics_path.exists():
            print(f"⚠️ No metrics for {model_name} fold {f} at {metrics_path}, skipping fold")
            continue
        with open(metrics_path) as fh:
            fold_metrics.append(json.load(fh))

    if not fold_metrics:
        print(f"⚠️ No per-fold metrics found for {model_name}; nothing to aggregate")
        return

    summary = aggregate_fold_metrics(fold_metrics)
    agg_paths = _paths(model_name, 0)
    # Argument order follows the save_json(obj, path) call used when the
    # per-fold metrics are written.
    save_json(summary, agg_paths["agg_metrics"])

    # Heatmap of the mean scores. Only the scalar columns are averaged: the
    # metrics also hold a confusion matrix and a class report, which cannot
    # be averaged. roc_auc may be None, hence the numeric coercion.
    score_cols = ["accuracy", "macro_f1", "roc_auc"]
    dfm = pd.DataFrame(fold_metrics)
    mean_scores = dfm[score_cols].apply(pd.to_numeric, errors="coerce").mean()

    fig, ax = plt.subplots(figsize=(6,4))
    sns.heatmap(mean_scores.to_frame().T, annot=True, fmt=".3f", ax=ax)
    fig.savefig(agg_paths["agg_summary"], bbox_inches="tight")
    plt.close(fig)
    print(f"Aggregated {model_name}")

def aggregate_experiment():
    """
    Compare all models of the experiment.

    For every model in cfg["run_models"] the aggregated metrics are read from
    its 'agg_metrics' path and the "mean" entry of each metric is collected
    into one row. The rows are saved at the 'exp_json' path and a bar plot of
    accuracy per model is saved at the 'exp_img' path.

    Models without an aggregated metrics file are skipped with a warning;
    when no model has one, nothing is written and the function returns.
    """
    models = cfg["run_models"]
    rows = []
    for m in models:
        agg_path = _paths(m, 0)["agg_metrics"]
        if not agg_path.exists():
            print(f"⚠️ No aggregated metrics for {m} at {agg_path}, skipping model")
            continue
        with open(agg_path) as fh:
            met = json.load(fh)
        row = {"model": m}
        for k, v in met.items():
            row[k] = v["mean"]
        rows.append(row)

    if not rows:
        print("⚠️ No aggregated metrics found; experiment-level comparison skipped")
        return

    exp_paths = _paths(models[0], 0)
    # Argument order follows the save_json(obj, path) call used when the
    # per-fold metrics are written.
    save_json(rows, exp_paths["exp_json"])

    df = pd.DataFrame(rows)
    fig, ax = plt.subplots(figsize=(8,4))
    sns.barplot(data=df, x="model", y="accuracy", ax=ax)
    ax.set_title("Model Accuracy Comparison")
    fig.savefig(exp_paths["exp_img"], bbox_inches="tight")
    plt.close(fig)
    print("Experiment-level comparison complete")

# Run aggregations: per-model summaries first, because aggregate_experiment()
# reads the 'agg_metrics' file of every model in cfg["run_models"].
for m in cfg["run_models"]:
    aggregate_model(m)
aggregate_experiment()
