In [3]:
# ===================== 03_train_models — CELL 0: Bootstrap =====================
# Locates the repository root, makes <repo>/src importable and publishes the root
# through the PROJ_ROOT environment variable.

import os
import sys
from pathlib import Path

# Walk from the working directory towards the filesystem root; the first directory
# that contains src/imgofup is taken as the repository root.
p = Path.cwd().resolve()
REPO_ROOT = None
for candidate in (p, *p.parents):
    if (candidate / "src" / "imgofup").is_dir():
        REPO_ROOT = candidate
        break
else:
    # The loop ran to completion without a match.
    raise RuntimeError("Could not find repo root (no 'src/imgofup' found).")

SRC_DIR = REPO_ROOT / "src"
DATA_DIR = REPO_ROOT / "data"

os.environ["PROJ_ROOT"] = str(REPO_ROOT)

# Put src first on the import path, but only once (the cell may be re-run).
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

print("ðŸ“¦ Repo root:", REPO_ROOT)
print("ðŸ“¦ Using src from:", SRC_DIR)
print("ðŸ”§ PROJ_ROOT env set to:", os.environ["PROJ_ROOT"])


ðŸ“¦ Repo root: /Users/amirdonyadide/Documents/GitHub/IMGOFUP
ðŸ“¦ Using src from: /Users/amirdonyadide/Documents/GitHub/IMGOFUP/src
ðŸ”§ PROJ_ROOT env set to: /Users/amirdonyadide/Documents/GitHub/IMGOFUP


In [5]:
# ===================== 03_train_models — CELL 1: Load experiment registry =====================
# Imports the shared EXPERIMENTS registry from notebooks/experiments.py and makes sure the
# output directories of every experiment exist.

import sys
from pathlib import Path

# REPO_ROOT is computed in the bootstrap cell.
# experiments.py lives next to the notebooks. Look in the working directory first (the usual
# Jupyter case) and fall back to <repo>/notebooks, so the cell also works when the kernel was
# started from another directory such as the repo root.
_registry_dirs = [Path.cwd().resolve(), (Path(REPO_ROOT) / "notebooks").resolve()]
NOTEBOOKS_DIR = next((d for d in _registry_dirs if (d / "experiments.py").is_file()), None)
if NOTEBOOKS_DIR is None:
    raise FileNotFoundError(
        "Missing notebooks/experiments.py. Create it so all notebooks share the same EXPERIMENTS registry."
    )

# Make the directory that was actually found importable, so the import below resolves to the
# same file whose existence was just checked.
if str(NOTEBOOKS_DIR) not in sys.path:
    sys.path.insert(0, str(NOTEBOOKS_DIR))

from experiments import make_experiments

EXPERIMENTS = make_experiments(REPO_ROOT)

# normalize paths and ensure dirs exist
# (expanduser() first, matching how the later cells normalise cfg["model_out"])
for cfg in EXPERIMENTS.values():
    for key in ("train_out", "model_out"):
        cfg[key] = Path(cfg[key]).expanduser().resolve()
        cfg[key].mkdir(parents=True, exist_ok=True)

print("âœ… Loaded EXPERIMENTS:", list(EXPERIMENTS.keys()))


âœ… Loaded EXPERIMENTS: ['openai_prompt_only', 'use_prompt_only', 'map_only', 'use_map', 'openai_map']


In [6]:
# ===================== 03_train_models — CELL: Load Stage-2 cache from disk =====================
# Rebuilds SPLITS / PREPROC / LABELS for every experiment from the files stored under
# <model_out>/cache_stage2, so that training can start from a fresh kernel.

import json
from pathlib import Path

import numpy as np
import pandas as pd

from imgofup.config import paths as CONFIG
from imgofup.config.constants import MAPS_ID_COL

STAGE2_DIRNAME = "cache_stage2"

SPLITS = {}
PREPROC = {}
LABELS = {}

print("\n=== Loading Stage-2 cache produced by notebook 02 ===")

for exp_name, cfg in EXPERIMENTS.items():
    cache_dir = Path(cfg["model_out"]).expanduser().resolve() / STAGE2_DIRNAME
    if not cache_dir.is_dir():
        raise FileNotFoundError(
            f"Missing cache for {exp_name} at {cache_dir}\n"
            "Fix: run 02_build_training_data.ipynb (with cache-saving cell) first."
        )

    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can execute
    # arbitrary code. Only load caches you produced yourself. It is kept because this notebook
    # does not show how notebook 02 wrote the arrays - TODO confirm and switch to False.

    # ---- load scaled arrays (the context manager closes the .npz archive handle)
    with np.load(cache_dir / "X_scaled.npz", allow_pickle=True) as zX:
        X_train_s = np.asarray(zX["X_train_s"], dtype=np.float64)
        X_val_s   = np.asarray(zX["X_val_s"], dtype=np.float64)
        X_test_s  = np.asarray(zX["X_test_s"], dtype=np.float64)

    # ---- load labels
    with np.load(cache_dir / "labels.npz", allow_pickle=True) as zL:
        y_train_cls = np.asarray(zL["y_train_cls"], dtype=int)
        y_val_cls   = np.asarray(zL["y_val_cls"], dtype=int)
        y_test_cls  = np.asarray(zL["y_test_cls"], dtype=int)
        sample_w    = np.asarray(zL["sample_w"], dtype=np.float64)

    # ---- load class names
    class_names = json.loads((cache_dir / "class_names.json").read_text(encoding="utf-8"))

    # ---- load dfs (needed for grouped CV + regressor targets)
    df_train = pd.read_parquet(cache_dir / "df_train.parquet")
    df_val   = pd.read_parquet(cache_dir / "df_val.parquet")
    df_test  = pd.read_parquet(cache_dir / "df_test.parquet")

    # quick sanity - done BEFORE registering anything, so a cache that fails the checks never
    # ends up in SPLITS / PREPROC / LABELS for later cells to pick up
    if MAPS_ID_COL not in df_train.columns:
        raise ValueError(f"{exp_name}: df_train missing '{MAPS_ID_COL}' (needed for grouped CV).")
    if X_train_s.shape[0] != len(y_train_cls) or X_train_s.shape[0] != len(df_train):
        raise ValueError(
            f"{exp_name}: mismatch lengths: X_train_s={X_train_s.shape[0]}, "
            f"y_train={len(y_train_cls)}, df_train={len(df_train)}"
        )

    # minimal SPLITS structure expected by your training code
    SPLITS[exp_name] = {
        "df_train": df_train,
        "df_val": df_val,
        "df_test": df_test,
    }

    PREPROC[exp_name] = {
        "X_train_s": X_train_s,
        "X_val_s": X_val_s,
        "X_test_s": X_test_s,
        "bundle_path": str(Path(cfg["model_out"]) / "preproc.joblib"),  # already saved earlier
    }

    LABELS[exp_name] = {
        "class_names": class_names,
        "y_train_cls": y_train_cls,
        "y_val_cls": y_val_cls,
        "y_test_cls": y_test_cls,
        "sample_w": sample_w,
    }

    print(f"âœ… {exp_name}: loaded cache from {cache_dir}")

# reload these from config each time (not from cache)
DISTANCE_OPS = CONFIG.DISTANCE_OPS
AREA_OPS = CONFIG.AREA_OPS
CFG = CONFIG.CFG



=== Loading Stage-2 cache produced by notebook 02 ===
âœ… openai_prompt_only: loaded cache from /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_openai_prompt_only/cache_stage2
âœ… use_prompt_only: loaded cache from /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_use_prompt_only/cache_stage2
âœ… map_only: loaded cache from /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_map_only/cache_stage2
âœ… use_map: loaded cache from /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_use_map/cache_stage2
âœ… openai_map: loaded cache from /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_openai_map/cache_stage2

âœ… Stage-2 cache loaded. You can now run classifier + regressor training.


In [7]:
# ===================== 03_train_models — CELL: Verify required inputs are loaded =====================
# Guard cell: stop early, with an actionable message, when the setup cells above have not been
# executed in this kernel or were executed for a different set of experiments.

required_globals = ["EXPERIMENTS", "SPLITS", "PREPROC", "LABELS", "DISTANCE_OPS", "AREA_OPS"]

missing = [k for k in required_globals if k not in globals()]
if missing:
    # These names are created by the registry and cache-loading cells of THIS notebook;
    # notebook 02 only provides the on-disk cache that the loader reads.
    raise RuntimeError(
        "This notebook expects the experiment registry and the Stage-2 cache to be loaded in this kernel.\n"
        f"Missing variables: {missing}\n\n"
        "Fix: run the cells above (bootstrap, experiment registry, Stage-2 cache loader) first. "
        "If the cache loader fails, run notebooks/02_build_training_data.ipynb to create the cache."
    )

# Catch stale kernel state: every registered experiment must have loaded data, otherwise the
# training loops below would fail with a KeyError part-way through.
incomplete = [
    name for name in EXPERIMENTS
    if name not in SPLITS or name not in PREPROC or name not in LABELS
]
if incomplete:
    raise RuntimeError(
        f"No Stage-2 cache loaded for experiments: {incomplete}\n"
        "Fix: re-run the Stage-2 cache loader cell above."
    )

print("âœ… Found required variables from notebook 02.")
print("Experiments:", list(EXPERIMENTS.keys()))


âœ… Found required variables from notebook 02.
Experiments: ['openai_prompt_only', 'use_prompt_only', 'map_only', 'use_map', 'openai_map']


In [None]:
# ===================== 03_train_models — CELL 2: Train classifier (per experiment) =====================
# Imports and result container for the classifier-training loop further down in this cell.

from pathlib import Path
import json
from dataclasses import asdict, is_dataclass

from imgofup.config.paths import CFG
from imgofup.config.constants import (
    MAPS_ID_COL,
    CLS_SEARCH_N_ITER_DEFAULT,
    CLS_SEARCH_N_SPLITS_DEFAULT,
    CLS_SEARCH_SEED_DEFAULT,
)
from imgofup.models.train_classifier import train_mlp_classifier_with_search

# exp_name -> object returned by train_mlp_classifier_with_search for that experiment.
# Re-created (emptied) every time this cell is run.
CLF_RESULTS = {}

def _safe_get(obj, *names, default=None):
    for n in names:
        if hasattr(obj, n):
            return getattr(obj, n)
    return default

# fields() lists the field names of a dataclass without touching the field values. The cell
# previously used asdict() for the debug listing, which deep-copies every value (potentially
# the whole fitted model) and fails on values that cannot be deep-copied.
from dataclasses import fields

print("\n=== Training operator classifiers for all experiments ===")

# The list of available result fields is printed for the first experiment only.
printed_debug_fields = False

for exp_name, cfg in EXPERIMENTS.items():
    split = SPLITS[exp_name]
    pre   = PREPROC[exp_name]
    lab   = LABELS[exp_name]

    X_train_s = pre["X_train_s"]
    X_val_s   = pre["X_val_s"]
    X_test_s  = pre["X_test_s"]

    y_train  = lab["y_train_cls"]
    y_val    = lab["y_val_cls"]
    y_test   = lab["y_test_cls"]
    sample_w = lab["sample_w"]

    class_names = [str(x) for x in lab["class_names"]]

    # Sanity checks: every feature matrix needs exactly one label per row
    if X_train_s.shape[0] != len(y_train):
        raise ValueError(f"{exp_name}: X_train rows {X_train_s.shape[0]} != y_train {len(y_train)}")
    if X_val_s.shape[0] != len(y_val):
        raise ValueError(f"{exp_name}: X_val rows {X_val_s.shape[0]} != y_val {len(y_val)}")
    if X_test_s.shape[0] != len(y_test):
        raise ValueError(f"{exp_name}: X_test rows {X_test_s.shape[0]} != y_test {len(y_test)}")

    # Grouped CV: group by map_id to avoid leakage across folds
    if MAPS_ID_COL not in split["df_train"].columns:
        raise ValueError(f"{exp_name}: df_train missing '{MAPS_ID_COL}' for grouped CV.")
    groups_tr = split["df_train"][MAPS_ID_COL].astype(str).to_numpy()

    model_out_dir = Path(cfg["model_out"]).expanduser().resolve()
    model_out_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nðŸ§ª Experiment: {exp_name}")
    print(f"   Classes   : {class_names}")
    print(f"   Train X   : {X_train_s.shape}")
    print(f"   Val X     : {X_val_s.shape}")
    print(f"   Test X    : {X_test_s.shape}")
    print(f"   Model out : {model_out_dir}")

    res_clf = train_mlp_classifier_with_search(
        exp_name=exp_name,
        X_train=X_train_s,
        y_train=y_train,
        groups_train=groups_tr,
        sample_w=sample_w,
        X_val=X_val_s,
        y_val=y_val,
        X_test=X_test_s,
        y_test=y_test,
        class_names=class_names,
        out_dir=model_out_dir,
        n_iter=int(CLS_SEARCH_N_ITER_DEFAULT),
        n_splits=int(CLS_SEARCH_N_SPLITS_DEFAULT),
        seed=int(getattr(CFG, "SEED", CLS_SEARCH_SEED_DEFAULT)),
        verbose=True,
        save_name="classifier.joblib",
    )

    CLF_RESULTS[exp_name] = res_clf

    # Robust reporting (no assumptions about field names): several aliases are tried per value
    model_path    = _safe_get(res_clf, "model_path", "path", default=str(model_out_dir / "classifier.joblib"))
    best_val_f1   = _safe_get(res_clf, "val_f1_macro", "best_val_f1", "val_f1", "best_f1", default=None)
    best_val_acc  = _safe_get(res_clf, "val_acc", "best_val_acc", "best_accuracy", default=None)
    test_f1       = _safe_get(res_clf, "test_f1_macro", "test_f1", default=None)
    test_acc      = _safe_get(res_clf, "test_acc", "accuracy_test", default=None)

    print("   âœ… Classifier training done.")
    print("   Saved to:", model_path)
    if best_val_f1 is not None or best_val_acc is not None:
        print("   Best VAL:", {"macro_f1": best_val_f1, "acc": best_val_acc})
    if test_f1 is not None or test_acc is not None:
        print("   TEST     :", {"macro_f1": test_f1, "acc": test_acc})

    # Save lightweight meta for evaluation / reporting
    clf_meta = {
        "experiment": exp_name,
        "feature_mode": cfg["feature_mode"],
        "class_names": class_names,
        "best_val": {"macro_f1": best_val_f1, "acc": best_val_acc},
        "test": {"macro_f1": test_f1, "acc": test_acc},
        "model_path": str(model_path),
    }
    (model_out_dir / "classifier_meta.json").write_text(json.dumps(clf_meta, indent=2), encoding="utf-8")

    # Print available fields once for debugging
    if not printed_debug_fields:
        printed_debug_fields = True
        if is_dataclass(res_clf):
            # Same names, in the same order, that asdict() would produce - without copying values.
            print("   (debug) Result fields:", [f.name for f in fields(res_clf)])
        else:
            print("   (debug) Result attrs :", [a for a in dir(res_clf) if not a.startswith("_")])

print("\nâœ… All classifiers trained.")



=== Training operator classifiers for all experiments ===

ðŸ§ª Experiment: openai_prompt_only
   Classes   : ['simplify', 'select', 'aggregate', 'displace']
   Train X   : (448, 1536)
   Val X     : (57, 1536)
   Test X    : (57, 1536)
   Model out : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_openai_prompt_only

Searching 50 MLP configs...
[01/50] cvF1=0.929Â±0.018 | VAL F1=0.923 acc=0.930 | (128, 64), Î±=2.02e-02, lr=1.2e-03, bs=16




[02/50] cvF1=0.918Â±0.038 | VAL F1=0.939 acc=0.947 | (256, 128), Î±=3.49e-05, lr=1.7e-04, bs=64


In [None]:
# ===================== 03_train_models — CELL 3: Train regressors + save final bundle =====================
# Imports and result containers for the regressor-training / bundle-saving loop further down
# in this cell.

from pathlib import Path
import joblib

from imgofup.config.paths import CFG
from imgofup.config.constants import (
    MAPS_ID_COL,
    PARAM_TARGET_NAME,
    EXTENT_DIAG_COL,
    EXTENT_AREA_COL,
    REG_USE_LOG1P_DEFAULT,
    REG_N_SPLITS_DEFAULT,
    REG_N_ITER_DEFAULT,
    REG_RANDOM_STATE_DEFAULT,
    REG_VERBOSE_DEFAULT,
)
from imgofup.models.train_regressors import train_regressors_per_operator
from imgofup.models.save_bundle import save_cls_plus_regressors_bundle

# Both containers are re-created (emptied) every time this cell is run.
BUNDLES = {}     # exp_name -> bundle path
REG_RESULTS = {} # exp_name -> regressor training result

def _safe_get(obj, *names, default=None):
    for n in names:
        if hasattr(obj, n):
            return getattr(obj, n)
    return default

print("\n=== Training per-operator regressors and saving final bundles ===")

# In-memory classifier results exist only when the classifier cell ran in this kernel. The
# classifier itself is saved to disk, so after a kernel restart the saved file is used instead
# of failing on a missing CLF_RESULTS entry.
_clf_results = globals().get("CLF_RESULTS", {})

for exp_name, cfg in EXPERIMENTS.items():
    split = SPLITS[exp_name]
    pre   = PREPROC[exp_name]
    lab   = LABELS[exp_name]
    res_clf = _clf_results.get(exp_name)  # None -> fall back to the default path below

    X_train_s = pre["X_train_s"]
    df_train  = split["df_train"]
    y_train_cls = lab["y_train_cls"]
    sample_w = lab["sample_w"]

    cn = [str(x) for x in lab["class_names"]]

    model_out_dir = Path(cfg["model_out"]).expanduser().resolve()
    model_out_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nðŸ§ª Experiment: {exp_name}")
    print(f"   Model out: {model_out_dir}")
    print(f"   Train X  : {X_train_s.shape} | df_train: {df_train.shape}")

    # (0) Locate the trained classifier BEFORE the expensive regressor search, so a missing
    #     classifier is reported immediately instead of after the regressors were trained.
    clf_model_path = Path(
        _safe_get(res_clf, "model_path", "path", default=str(model_out_dir / "classifier.joblib"))
    )
    if not clf_model_path.is_file():
        raise FileNotFoundError(
            f"{exp_name}: trained classifier not found at {clf_model_path}\n"
            "Fix: run the classifier training cell (CELL 2) for this experiment first."
        )
    if res_clf is None:
        print("   (info) No in-memory classifier result; using classifier saved at:", clf_model_path)

    # (1) Train per-operator regressors on TRAIN only
    reg_res = train_regressors_per_operator(
        X_train_s=X_train_s,
        df_train=df_train,
        y_train_cls=y_train_cls,
        class_names=cn,
        sample_w=sample_w,
        group_col=MAPS_ID_COL,
        target_col=PARAM_TARGET_NAME,
        use_log1p=bool(REG_USE_LOG1P_DEFAULT),
        n_splits=int(REG_N_SPLITS_DEFAULT),
        n_iter=int(REG_N_ITER_DEFAULT),
        random_state=int(getattr(CFG, "SEED", REG_RANDOM_STATE_DEFAULT)),
        verbose=int(REG_VERBOSE_DEFAULT),
    )

    REG_RESULTS[exp_name] = reg_res

    # (2) Load the trained classifier model from disk
    # NOTE(review): joblib.load unpickles the file and can execute arbitrary code; only load
    # model files produced by this project.
    clf_pack = joblib.load(clf_model_path)
    # The file holds either the bare estimator or a dict that wraps it under "model".
    final_clf = clf_pack["model"] if isinstance(clf_pack, dict) and "model" in clf_pack else clf_pack

    # (3) Save combined bundle for evaluation
    bundle_res = save_cls_plus_regressors_bundle(
        exp_name=exp_name,
        out_dir=model_out_dir,
        classifier=final_clf,
        regressors_by_class=reg_res.regressors_by_class,
        class_names=cn,
        use_log1p=reg_res.use_log1p,
        cv_summary=reg_res.cv_summary,
        distance_ops=DISTANCE_OPS,
        area_ops=AREA_OPS,
        diag_col=EXTENT_DIAG_COL,
        area_col=EXTENT_AREA_COL,
        save_name="cls_plus_regressors.joblib",
    )

    BUNDLES[exp_name] = bundle_res.bundle_path

    print("   âœ… Saved bundle:", bundle_res.bundle_path)
    print("   âœ… Regressors trained for:", sorted(reg_res.regressors_by_class.keys()))

print("\nâœ… All bundles saved.")
for k, v in BUNDLES.items():
    print(f" - {k:18s}: {v}")
