In [1]:
# ===================== 04_evaluate — CELL 0: Bootstrap =====================

import os
import sys
from pathlib import Path

# Walk upward from the current working directory (the directory itself first,
# then each parent) and take the first folder that contains src/imgofup as the
# repository root.
p = Path.cwd().resolve()
REPO_ROOT = None
for candidate in (p, *p.parents):
    if candidate.joinpath("src", "imgofup").is_dir():
        REPO_ROOT = candidate
        break

if REPO_ROOT is None:
    raise RuntimeError("Could not find repo root (no 'src/imgofup' found).")

# Put <repo>/src at the front of sys.path (once) so `imgofup` can be imported.
SRC_DIR = REPO_ROOT / "src"
_src_entry = str(SRC_DIR)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

# Publish the root through the PROJ_ROOT environment variable
# (the consumers of this variable are not visible in this notebook).
os.environ["PROJ_ROOT"] = str(REPO_ROOT)

print("ðŸ“¦ Repo root:", REPO_ROOT)
print("ðŸ“¦ Using src from:", SRC_DIR)
print("ðŸ”§ PROJ_ROOT env set to:", os.environ["PROJ_ROOT"])


ðŸ“¦ Repo root: /Users/amirdonyadide/Documents/GitHub/IMGOFUP
ðŸ“¦ Using src from: /Users/amirdonyadide/Documents/GitHub/IMGOFUP/src
ðŸ”§ PROJ_ROOT env set to: /Users/amirdonyadide/Documents/GitHub/IMGOFUP


In [2]:
# ===================== 04_evaluate — CELL 1: Load EXPERIMENTS =====================

from pathlib import Path

# The shared experiment registry is expected in the current working directory;
# fail early with a clear message when experiments.py is not there.
NOTEBOOKS_DIR = Path.cwd().resolve()
experiments_file = NOTEBOOKS_DIR / "experiments.py"
if not experiments_file.is_file():
    raise FileNotFoundError(
        "Missing notebooks/experiments.py. Create it to share EXPERIMENTS across notebooks."
    )

from experiments import make_experiments

EXPERIMENTS = make_experiments(REPO_ROOT)

# Normalise both output locations of every experiment to resolved Path objects
# (the config dicts are updated in place).
for cfg in EXPERIMENTS.values():
    for path_key in ("train_out", "model_out"):
        cfg[path_key] = Path(cfg[path_key]).resolve()

print("âœ… Loaded EXPERIMENTS:", list(EXPERIMENTS.keys()))


âœ… Loaded EXPERIMENTS: ['openai_prompt_only', 'use_prompt_only', 'map_only', 'use_map', 'openai_map']


In [3]:
# ===================== 04_evaluate — CELL 2: Load Stage-2 cache + bundle paths =====================

import json
import numpy as np
import pandas as pd
import joblib
from pathlib import Path

from imgofup.config.constants import (
    MAPS_ID_COL,
    PARAM_TARGET_NAME,
)

STAGE2_DIRNAME = "cache_stage2"
BUNDLE_NAME = "cls_plus_regressors.joblib"

# Per-experiment cached test set + labels.
# exp_name -> dict with X_test_s, df_test, y_test_cls, class_names, bundle_path, model_out
EVAL_DATA = {}

for exp_name, cfg in EXPERIMENTS.items():
    model_out = Path(cfg["model_out"]).expanduser().resolve()
    cache_dir = model_out / STAGE2_DIRNAME
    bundle_path = model_out / BUNDLE_NAME

    # Both artefacts must already exist; the messages name the notebook that creates them.
    if not cache_dir.is_dir():
        raise FileNotFoundError(
            f"Missing stage2 cache for {exp_name} at {cache_dir}. "
            "Run 02 (cache saving cell) first."
        )
    if not bundle_path.is_file():
        raise FileNotFoundError(
            f"Missing bundle for {exp_name} at {bundle_path}. "
            "Run 03 (bundle saving) first."
        )

    # np.load on an .npz archive returns an NpzFile that keeps the file open, so
    # it is used as a context manager; np.asarray materialises the array before
    # the archive is closed.
    # NOTE(review): allow_pickle=True permits unpickling object arrays, which can
    # execute arbitrary code. Acceptable only if these caches are always written
    # by this project's own notebooks — consider allow_pickle=False if the arrays
    # are plain numeric.
    with np.load(cache_dir / "X_scaled.npz", allow_pickle=True) as zX:
        X_test_s = np.asarray(zX["X_test_s"], dtype=np.float64)

    with np.load(cache_dir / "labels.npz", allow_pickle=True) as zL:
        y_test_cls = np.asarray(zL["y_test_cls"], dtype=int)

    class_names = json.loads((cache_dir / "class_names.json").read_text(encoding="utf-8"))

    df_test = pd.read_parquet(cache_dir / "df_test.parquet")

    # Basic sanity: features, class labels and the test frame must be row-aligned,
    # and the frame must carry the columns checked below.
    if X_test_s.shape[0] != len(y_test_cls) or len(y_test_cls) != len(df_test):
        raise ValueError(
            f"{exp_name}: mismatch lengths: X_test={X_test_s.shape[0]}, y_test={len(y_test_cls)}, df_test={len(df_test)}"
        )
    if PARAM_TARGET_NAME not in df_test.columns:
        raise KeyError(f"{exp_name}: df_test missing '{PARAM_TARGET_NAME}' needed for param evaluation.")
    if MAPS_ID_COL not in df_test.columns:
        raise KeyError(f"{exp_name}: df_test missing '{MAPS_ID_COL}' (useful sanity).")

    EVAL_DATA[exp_name] = {
        "X_test_s": X_test_s,
        "df_test": df_test,
        "y_test_cls": y_test_cls,
        "class_names": class_names,
        "bundle_path": str(bundle_path),
        "model_out": str(model_out),
    }

print("âœ… Loaded eval data for:", list(EVAL_DATA.keys()))


âœ… Loaded eval data for: ['openai_prompt_only', 'use_prompt_only', 'map_only', 'use_map', 'openai_map']


In [4]:
# ===================== 04_evaluate — CELL 3: Classifier comparison table =====================

import json
import pandas as pd
from pathlib import Path

# One row per experiment, filled from the classifier_meta.json stored in the
# experiment's model_out directory.
rows = []

for exp_name, cfg in EXPERIMENTS.items():
    model_dir = Path(cfg["model_out"]).expanduser().resolve()
    meta_path = model_dir / "classifier_meta.json"

    if not meta_path.is_file():
        raise FileNotFoundError(
            f"Missing classifier_meta.json for experiment '{exp_name}' at:\n  {meta_path}\n"
            "Run 03_train_models.ipynb (classifier training) first."
        )

    meta = json.loads(meta_path.read_text(encoding="utf-8"))

    # Missing sections fall back to an empty dict, so absent metrics become None.
    val_scores = meta.get("best_val", {})
    test_scores = meta.get("test", {})

    rows.append({
        "experiment": exp_name,
        "val_acc": val_scores.get("acc"),
        "val_f1_macro": val_scores.get("macro_f1"),
        "test_acc": test_scores.get("acc"),
        "test_f1_macro": test_scores.get("macro_f1"),
        "model_path": meta.get("model_path"),
    })

# Best test macro-F1 first; experiments without that metric go to the bottom.
df_clf = pd.DataFrame(rows)
df_clf = df_clf.sort_values(by="test_f1_macro", ascending=False, na_position="last")
df_clf = df_clf.reset_index(drop=True)

print("\n=== Classifier comparison (sorted by TEST macro-F1) ===")
df_clf



=== Classifier comparison (sorted by TEST macro-F1) ===


Unnamed: 0,experiment,val_acc,val_f1_macro,test_acc,test_f1_macro,model_path
0,openai_prompt_only,0.964912,0.96473,0.912281,0.914519,/Users/amirdonyadide/Documents/GitHub/IMGOFUP/...
1,use_prompt_only,0.947368,0.947169,0.877193,0.87625,/Users/amirdonyadide/Documents/GitHub/IMGOFUP/...
2,openai_map,0.947368,0.94436,0.859649,0.866066,/Users/amirdonyadide/Documents/GitHub/IMGOFUP/...
3,use_map,0.894737,0.889212,0.754386,0.766735,/Users/amirdonyadide/Documents/GitHub/IMGOFUP/...
4,map_only,0.385965,0.378513,0.263158,0.257236,/Users/amirdonyadide/Documents/GitHub/IMGOFUP/...


In [5]:
# ===================== 04_evaluate — CELL 4: Regressor comparison table (RMSE by operator) =====================

from pathlib import Path
import joblib
import pandas as pd
import numpy as np

# Operators reported as columns of the RMSE tables, in display order.
FIXED_CLASSES = ["simplify", "select", "aggregate", "displace"]
ops = FIXED_CLASSES.copy()

# exp_name -> cv_summary taken from the top level of the saved bundle.
bund_cv = {}

for exp_name, ed in EVAL_DATA.items():
    bundle_path = Path(ed["bundle_path"]).expanduser().resolve()
    # NOTE(review): joblib.load unpickles the file; only load bundles you trust.
    pack = joblib.load(bundle_path)

    # Only dict-shaped bundles are inspected; anything else counts as "no summary".
    if isinstance(pack, dict):
        cv_summary = pack.get("cv_summary", None)
    else:
        cv_summary = None

    if cv_summary is None:
        raise ValueError(
            f"{exp_name}: bundle has no cv_summary. "
            f"Bundle type: {type(pack)} | keys: {list(pack.keys()) if isinstance(pack, dict) else 'n/a'}"
        )

    bund_cv[exp_name] = cv_summary

def get_rmse_param(cv_summary, op_name):
    """Return the parameter RMSE recorded for one operator, or NaN if unavailable.

    Parameters
    ----------
    cv_summary : dict
        Mapping of operator name to a dict of metrics. The entry is looked up
        under ``op_name`` first and under ``str(op_name)`` second.
    op_name : hashable
        Operator whose RMSE is requested.

    Returns
    -------
    float
        The first non-None value among a fixed list of preferred keys; otherwise
        the first real-numbered value whose key contains both "rmse" and
        "param" (case-insensitive); otherwise ``np.nan``. ``np.nan`` is also
        returned when the operator's entry is missing or is not a dict.
    """
    d = cv_summary.get(op_name, cv_summary.get(str(op_name), {}))
    if not isinstance(d, dict):
        return np.nan

    preferred_keys = (
        "rmse_param_norm",
        "rmse_param",
        "rmse_param_units",
        "rmse_param_norm_units",
        "rmse_param_norm_units_mean",
    )
    for k in preferred_keys:
        if k in d and d[k] is not None:
            return float(d[k])

    # Fallback: any numeric value stored under an "rmse ... param"-like key.
    for k, v in d.items():
        # bool is a subclass of int; a True/False flag must not be read as an RMSE.
        if isinstance(v, (bool, np.bool_)):
            continue
        # Accept NumPy scalars too (np.float32 / np.int64 are not float/int subclasses).
        if not isinstance(v, (int, float, np.integer, np.floating)):
            continue
        key_text = str(k).lower()  # str() guards against non-string keys
        if "rmse" in key_text and "param" in key_text:
            return float(v)

    return np.nan

# One record per experiment: the experiment name followed by one RMSE per operator.
rows = [
    {"experiment": exp_name, **{op: get_rmse_param(cv_summary, op) for op in ops}}
    for exp_name, cv_summary in bund_cv.items()
]

df_rmse = pd.DataFrame(rows).set_index("experiment")
# Row-wise mean across the operator columns.
df_rmse["mean_rmse"] = df_rmse[ops].mean(axis=1)

# Lowest mean RMSE first.
df_rmse_sorted = df_rmse.sort_values("mean_rmse", ascending=True)

# Same table scaled by 100 and relabelled as percentages.
rename_ops = {
    **{op: f"{op} RMSE (%)" for op in ops},
    "mean_rmse": "Mean RMSE (%)",
}
df_pct = (df_rmse_sorted * 100).round(3).rename(columns=rename_ops)

print("\n=== RMSE (raw units; likely in [0,1] if param_norm) ===")
display(df_rmse_sorted.round(6))

print("\n=== RMSE as percent of [0,1] range ===")
display(df_pct)



=== RMSE (raw units; likely in [0,1] if param_norm) ===


Unnamed: 0_level_0,simplify,select,aggregate,displace,mean_rmse
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
map_only,0.004217,0.000253,0.003658,0.003258,0.002847
use_map,0.004395,0.000249,0.00371,0.00331,0.002916
use_prompt_only,0.004347,0.000362,0.003499,0.003669,0.002969
openai_prompt_only,0.00433,0.000359,0.003562,0.003713,0.002991
openai_map,0.00464,0.00025,0.003801,0.003386,0.003019



=== RMSE as percent of [0,1] range ===


Unnamed: 0_level_0,simplify RMSE (%),select RMSE (%),aggregate RMSE (%),displace RMSE (%),Mean RMSE (%)
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
map_only,0.422,0.025,0.366,0.326,0.285
use_map,0.44,0.025,0.371,0.331,0.292
use_prompt_only,0.435,0.036,0.35,0.367,0.297
openai_prompt_only,0.433,0.036,0.356,0.371,0.299
openai_map,0.464,0.025,0.38,0.339,0.302


In [6]:
# ===================== 04_evaluate — CELL 5: End-to-end evaluation (TEST) =====================

import numpy as np
import pandas as pd
import joblib

from imgofup.config.constants import PARAM_TARGET_NAME

# Absolute-error threshold for the joint-success metric computed below: a test
# sample counts as a success only if the predicted operator is correct AND
# |predicted param - true param| <= TOL. The value also appears in the
# "joint_success@<TOL>" column name of the result table.
TOL = 0.05  # tolerance in param_norm units (0..1)

def _predict_param(reg_and_scaler, Xi):
    if isinstance(reg_and_scaler, (tuple, list)):
        reg = reg_and_scaler[0]
        y_scaler = reg_and_scaler[1] if len(reg_and_scaler) > 1 else None
    else:
        reg = reg_and_scaler
        y_scaler = None

    y_hat = float(reg.predict(Xi)[0])

    if y_scaler is not None:
        try:
            y_hat = float(y_scaler.inverse_transform(np.array([[y_hat]], dtype=float))[0, 0])
        except Exception:
            pass

    return y_hat

def _safe_pack_get(pack, key, default=None):
    if isinstance(pack, dict):
        return pack.get(key, default)
    return getattr(pack, key, default)

# One summary row per experiment.
rows = []

for exp_name, cfg in EXPERIMENTS.items():
    ed = EVAL_DATA[exp_name]
    bundle = joblib.load(ed["bundle_path"])

    # Pull the three pieces needed for end-to-end prediction out of the bundle.
    clf = _safe_pack_get(bundle, "classifier")
    regs = _safe_pack_get(bundle, "regressors_by_class")
    class_names = list(map(str, _safe_pack_get(bundle, "class_names", [])))

    if clf is None or regs is None or not class_names:
        raise ValueError(
            f"{exp_name}: bundle missing required keys. "
            f"Have classifier={clf is not None}, regressors_by_class={regs is not None}, class_names={len(class_names)}"
        )

    X_test, y_true_cls, df_test = ed["X_test_s"], ed["y_test_cls"], ed["df_test"]
    y_true_param = df_test[PARAM_TARGET_NAME].to_numpy(dtype=float)

    # Stage 1: operator prediction, then class indices -> class names.
    y_pred_cls = clf.predict(X_test)
    op_acc = float((y_pred_cls == y_true_cls).mean())

    pred_names = [class_names[int(idx)] for idx in y_pred_cls]
    true_names = [class_names[int(idx)] for idx in y_true_cls]

    # Stage 2: parameter prediction with the regressor of the *predicted* operator.
    # Regressor keys are matched after trimming whitespace and lower-casing.
    regs_norm = {str(k).strip().lower(): v for k, v in regs.items()}
    y_pred_param = np.zeros_like(y_true_param, dtype=float)

    for i, op in enumerate(pred_names):
        key = str(op).strip().lower()
        if key not in regs_norm:
            raise KeyError(
                f"{exp_name}: no regressor for predicted class '{op}'. "
                f"Available keys: {list(regs_norm.keys())}"
            )
        # Slice (not index) so the regressor receives a one-row 2-D input.
        Xi = X_test[i:i+1]
        y_pred_param[i] = _predict_param(regs_norm[key], Xi)

    resid = y_pred_param - y_true_param
    abs_err = np.abs(resid)
    correct_mask = np.array(pred_names) == np.array(true_names)

    # Parameter RMSE/MAE restricted to samples whose operator was predicted correctly.
    if correct_mask.any():
        rmse_cond = float(np.sqrt(np.mean(resid[correct_mask] ** 2)))
        mae_cond = float(np.mean(abs_err[correct_mask]))
    else:
        rmse_cond = mae_cond = np.nan

    # Joint metric: operator correct AND parameter within tolerance.
    joint_success = float(np.mean(correct_mask & (abs_err <= TOL)))

    # Penalized RMSE over all samples: a wrong operator counts as error 1 (max on [0,1]).
    penalized_err = np.where(correct_mask, abs_err, 1.0)
    rmse_penalized = float(np.sqrt(np.mean(penalized_err ** 2)))

    rows.append({
        "experiment": exp_name,
        "op_acc": op_acc,
        "param_rmse_if_op_correct": rmse_cond,
        "param_mae_if_op_correct": mae_cond,
        f"joint_success@{TOL}": joint_success,
        "rmse_penalized_all": rmse_penalized,
        "n_test": int(len(y_true_cls)),
        "n_op_correct": int(correct_mask.sum()),
    })

# Lowest penalized RMSE first.
df_e2e = pd.DataFrame(rows)
df_e2e = df_e2e.sort_values("rmse_penalized_all", ascending=True)
df_e2e = df_e2e.reset_index(drop=True)
df_e2e


Unnamed: 0,experiment,op_acc,param_rmse_if_op_correct,param_mae_if_op_correct,joint_success@0.05,rmse_penalized_all,n_test,n_op_correct
0,openai_prompt_only,0.912281,0.002811,0.002044,0.912281,0.296187,57,52
1,use_prompt_only,0.877193,0.002993,0.002168,0.877193,0.35045,57,50
2,openai_map,0.859649,0.00328,0.002322,0.859649,0.374647,57,49
3,use_map,0.754386,0.003401,0.002483,0.754386,0.495603,57,43
4,map_only,0.263158,0.002274,0.001652,0.263158,0.858396,57,15
