In [1]:
# ===================== 02_build_training_data — CELL 0: Bootstrap =====================

import os
import sys
from pathlib import Path

# Search the working directory and each of its parents for a folder that
# contains src/imgofup; the first match is taken as the repository root.
p = Path.cwd().resolve()
REPO_ROOT = next(
    (candidate for candidate in (p, *p.parents) if candidate.joinpath("src", "imgofup").is_dir()),
    None,
)
if REPO_ROOT is None:
    raise RuntimeError("Could not find repo root (no 'src/imgofup' found).")

# Put the repo's src directory at the front of sys.path (once) so that
# `import imgofup` resolves to the checkout.
SRC_DIR = REPO_ROOT.joinpath("src")
if sys.path.count(str(SRC_DIR)) == 0:
    sys.path.insert(0, str(SRC_DIR))

# Export the root through the PROJ_ROOT environment variable.
os.environ["PROJ_ROOT"] = str(REPO_ROOT)

print("ðŸ“¦ Repo root:", REPO_ROOT)
print("ðŸ“¦ Using src from:", SRC_DIR)
print("ðŸ”§ PROJ_ROOT env set to:", os.environ["PROJ_ROOT"])

# Base data folder used by the experiment registry.
DATA_DIR = REPO_ROOT.joinpath("data")


ðŸ“¦ Repo root: /Users/amirdonyadide/Documents/GitHub/IMGOFUP
ðŸ“¦ Using src from: /Users/amirdonyadide/Documents/GitHub/IMGOFUP/src
ðŸ”§ PROJ_ROOT env set to: /Users/amirdonyadide/Documents/GitHub/IMGOFUP


In [2]:
# ===================== 02_build_training_data — CELL 1: Experiment registry =====================

from pathlib import Path

def _experiment_entry(name, feature_mode, prompt_encoder_kind=None):
    """Build one registry entry.

    Output folders are derived from the experiment name:
    ``<DATA_DIR>/output/train_out_<name>`` and ``<DATA_DIR>/output/models/exp_<name>``.
    The ``prompt_encoder_kind`` key is only present when an encoder is given.
    """
    entry = {
        "train_out": DATA_DIR / "output" / f"train_out_{name}",
        "model_out": DATA_DIR / "output" / "models" / f"exp_{name}",
        "feature_mode": feature_mode,
    }
    if prompt_encoder_kind is not None:
        entry["prompt_encoder_kind"] = prompt_encoder_kind
    return entry


# Registry of experiments: name -> output folders, feature mode and (optionally)
# the prompt encoder kind. Insertion order is the processing order.
EXPERIMENTS = {
    name: _experiment_entry(name, mode, encoder)
    for name, mode, encoder in (
        ("openai_prompt_only", "prompt_only", "openai-small"),
        ("use_prompt_only", "prompt_only", "dan"),
        ("map_only", "map_only", None),
        ("use_map", "prompt_plus_map", "dan"),
        ("openai_map", "prompt_plus_map", "openai-small"),
    )
}

# Normalise both folder entries to Path objects and create them on disk.
for exp_cfg in EXPERIMENTS.values():
    for dir_key in ("train_out", "model_out"):
        exp_cfg[dir_key] = Path(exp_cfg[dir_key])
        exp_cfg[dir_key].mkdir(parents=True, exist_ok=True)

# Print one aligned row per experiment.
print("ðŸ§ª Experiments:")
# Column widths grow to fit the longest value. The earlier fixed width of 14 for
# the mode column was narrower than "prompt_plus_map" (15 characters), which
# pushed the following column out of alignment. The previous widths are kept as
# minimums so rows that already fitted are printed exactly as before.
name_w = max([18, *(len(str(name)) for name in EXPERIMENTS)])
mode_w = max([14, *(len(str(entry["feature_mode"])) for entry in EXPERIMENTS.values())])
enc_w = max([14, *(len(str(entry.get("prompt_encoder_kind", "-"))) for entry in EXPERIMENTS.values())])
for exp_name, cfg in EXPERIMENTS.items():
    # Experiments without a prompt encoder (map_only) show "-" in the last column.
    pe = cfg.get("prompt_encoder_kind", "-")
    print(f" - {exp_name:{name_w}s} | mode={cfg['feature_mode']:{mode_w}s} | prompt={pe:{enc_w}s}")


ðŸ§ª Experiments:
 - openai_prompt_only | mode=prompt_only    | prompt=openai-small  
 - use_prompt_only    | mode=prompt_only    | prompt=dan           
 - map_only           | mode=map_only       | prompt=-             
 - use_map            | mode=prompt_plus_map | prompt=dan           
 - openai_map         | mode=prompt_plus_map | prompt=openai-small  


In [3]:
# ===================== 02_build_training_data — CELL 2: Locate required artifacts =====================

from pathlib import Path
from imgofup.config import paths as CONFIG
from imgofup.config.constants import (
    MAP_EMBEDDINGS_NPZ_NAME, MAPS_PARQUET_NAME,
    PROMPT_EMBEDDINGS_NPZ_NAME, PROMPTS_PARQUET_NAME,
)

def _missing_artifacts_error(headline, first_path, second_path):
    """Build the FileNotFoundError raised when an expected pair of artifact files is incomplete."""
    return FileNotFoundError(
        f"{headline}\n"
        f"Expected:\n  {first_path}\n  {second_path}\n"
        "Run notebooks/01_generate_embeddings.ipynb first."
    )


# Map embeddings are shared by all experiments and live under <MAP_OUT>/shared_extent
# (matches 01 notebook convention).
MAP_EMB_DIR = Path(CONFIG.PATHS.MAP_OUT).joinpath("shared_extent")
maps_npz = MAP_EMB_DIR.joinpath(MAP_EMBEDDINGS_NPZ_NAME)
maps_pq = MAP_EMB_DIR.joinpath(MAPS_PARQUET_NAME)

print("MAP_EMB_DIR:", MAP_EMB_DIR)
print("maps_npz:", maps_npz, "| exists:", maps_npz.is_file())
print("maps_pq :", maps_pq, "| exists:", maps_pq.is_file())

# Both shared files must be present before anything else is attempted.
if not (maps_npz.is_file() and maps_pq.is_file()):
    raise _missing_artifacts_error("Missing shared map embedding artifacts.", maps_npz, maps_pq)

# Every experiment that uses prompts must have its own prompt artifacts under
# <train_out>/prompt_out/; map_only experiments are not checked.
for exp_name, cfg in EXPERIMENTS.items():
    if cfg["feature_mode"] != "map_only":
        prompt_out = Path(cfg["train_out"]).joinpath("prompt_out")
        prm_npz = prompt_out.joinpath(PROMPT_EMBEDDINGS_NPZ_NAME)
        prm_pq = prompt_out.joinpath(PROMPTS_PARQUET_NAME)
        print(f"\n{exp_name}: prompt_out={prompt_out}")
        print("  npz:", prm_npz.name, "| exists:", prm_npz.is_file())
        print("  pq :", prm_pq.name, "| exists:", prm_pq.is_file())
        if not (prm_npz.is_file() and prm_pq.is_file()):
            raise _missing_artifacts_error(f"Missing prompt artifacts for {exp_name}.", prm_npz, prm_pq)


MAP_EMB_DIR: /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/map_out/shared_extent
maps_npz: /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/map_out/shared_extent/maps_embeddings.npz | exists: True
maps_pq : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/map_out/shared_extent/maps.parquet | exists: True

openai_prompt_only: prompt_out=/Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_openai_prompt_only/prompt_out
  npz: prompts_embeddings.npz | exists: True
  pq : prompts.parquet | exists: True

use_prompt_only: prompt_out=/Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_use_prompt_only/prompt_out
  npz: prompts_embeddings.npz | exists: True
  pq : prompts.parquet | exists: True

use_map: prompt_out=/Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_use_map/prompt_out
  npz: prompts_embeddings.npz | exists: True
  pq : prompts.parquet | exists: True

openai_map: prompt_out=/Users/amirdonyadide/Document

In [4]:
# ===================== 02_build_training_data — CELL 3: Load training data + compute param_norm =====================

from dataclasses import replace
from pathlib import Path

from imgofup.config import paths
from imgofup.datasets.load_training_data import load_training_data_with_dynamic_param_norm

# Load the feature matrix and metadata frame of every registered experiment
# through the project loader and collect them in TRAIN_DATA for later cells.
TRAIN_DATA = {}  # exp_name -> {"X":..., "df":..., "paths":...}

print("\n=== Loading training data for all experiments (unified loader) ===")

for exp_name, exp_cfg in EXPERIMENTS.items():
    # Fail early if the experiment's output folder is missing.
    train_out_dir = Path(exp_cfg["train_out"]).expanduser().resolve()
    if not train_out_dir.is_dir():
        raise FileNotFoundError(f"Missing train_out directory for {exp_name}: {train_out_dir}")

    # Normalise the mode string (trim + lower-case) before comparing / passing it on.
    feature_mode = str(exp_cfg["feature_mode"]).strip().lower()

    print(f"\nðŸ§ª Experiment: {exp_name}")
    print(f"   train_out : {train_out_dir}")
    print(f"   mode      : {feature_mode}")

    # Important: override TRAIN_OUT per experiment
    # (dataclasses.replace returns a copy of paths.PATHS; the shared object is not modified).
    PATHS_EXP = replace(paths.PATHS, TRAIN_OUT=train_out_dir)

    # Only require text when prompts are part of the feature space
    require_text = feature_mode in {"prompt_only", "prompt_plus_map"}

    # Project loader; the result is read below via its .X and .df attributes.
    data = load_training_data_with_dynamic_param_norm(
        exp_name=exp_name,
        feature_mode=feature_mode,
        paths=PATHS_EXP,
        cfg=paths.CFG,
        distance_ops=paths.DISTANCE_OPS,
        area_ops=paths.AREA_OPS,
        require_text=require_text,
    )

    X = data.X
    df = data.df

    print(f"   âœ… Loaded: X={X.shape} | df={df.shape}")
    # List the distinct operator labels when the operator column is present.
    if PATHS_EXP.OPERATOR_COL in df.columns:
        print("   Operators:", sorted(df[PATHS_EXP.OPERATOR_COL].dropna().unique().tolist()))

    # Keep the per-experiment paths object alongside the data.
    TRAIN_DATA[exp_name] = {"X": X, "df": df, "paths": PATHS_EXP}

# Name of the first experiment that was loaded (registry order).
first_key = next(iter(TRAIN_DATA.keys()))
print("\nFirst loaded experiment:", first_key)



=== Loading training data for all experiments (unified loader) ===

ðŸ§ª Experiment: openai_prompt_only
   train_out : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_openai_prompt_only
   mode      : prompt_only
   âœ… Loaded: X=(562, 1536) | df=(562, 15)
   Operators: ['aggregate', 'displace', 'select', 'simplify']

ðŸ§ª Experiment: use_prompt_only
   train_out : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_use_prompt_only
   mode      : prompt_only
   âœ… Loaded: X=(562, 512) | df=(562, 15)
   Operators: ['aggregate', 'displace', 'select', 'simplify']

ðŸ§ª Experiment: map_only
   train_out : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_map_only
   mode      : map_only
   âœ… Loaded: X=(562, 165) | df=(562, 15)
   Operators: ['aggregate', 'displace', 'select', 'simplify']

ðŸ§ª Experiment: use_map
   train_out : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out_use_map
   mode      : prompt_plus_map

In [5]:
# ===================== 02_build_training_data — CELL 4: Shared Train/Val/Test Split =====================

from pathlib import Path

from imgofup.config import paths
from imgofup.config.constants import MAPS_ID_COL, PROMPTS_PROMPT_ID_COL
from imgofup.datasets.splitting import make_splits_multi_prompt_to_train

# Operator class names, in the order handed to the splitter (and reused by the
# label-building cell further down).
FIXED_CLASSES = ["simplify", "select", "aggregate", "displace"]
# Forwarded to the splitter as use_intensity_for_strat; it also gates whether
# the intensity column name is passed at all.
USE_INTENSITY_FOR_STRAT = True

# Column names taken from the project path/config object.
OP_COL  = paths.PATHS.OPERATOR_COL
INT_COL = paths.PATHS.INTENSITY_COL

# Identifier columns used to build the per-row split key.
MAP_ID_COL = MAPS_ID_COL
PROMPT_ID_COL = PROMPTS_PROMPT_ID_COL

# Save ONE shared split for all experiments
SPLITS_DIR = Path(paths.PATHS.SPLIT_OUT).expanduser().resolve()
SPLITS_DIR.mkdir(parents=True, exist_ok=True)
split_path = SPLITS_DIR / "splits_shared.json"

def _row_key(frame):
    """Return the per-row split key as a Series.

    The key is the map id (cast to str, zero-padded to 4 characters), then "::",
    then the prompt id (cast to str). It is the only link between the reference
    split and the rows of each experiment.
    """
    return frame[MAP_ID_COL].astype(str).str.zfill(4) + "::" + frame[PROMPT_ID_COL].astype(str)


# Choose reference experiment (prefer prompt-based); fall back to whichever
# experiment was loaded first when none of the preferred names is available.
preferred_order = ["use_prompt_only", "use_map", "openai_map", "map_only"]
ref_exp = next((name for name in preferred_order if name in TRAIN_DATA), None)
if ref_exp is None:
    ref_exp = next(iter(TRAIN_DATA.keys()))

# Work on a copy with a clean 0..n-1 index so the positional indices returned by
# the splitter can be used as labels with .loc below.
ref_df = TRAIN_DATA[ref_exp]["df"].copy().reset_index(drop=True)
ref_X  = TRAIN_DATA[ref_exp]["X"]

if not {MAP_ID_COL, PROMPT_ID_COL}.issubset(ref_df.columns):
    raise ValueError(f"Expected columns {{{MAP_ID_COL!r},{PROMPT_ID_COL!r}}} in df for split mapping.")
if OP_COL not in ref_df.columns:
    raise ValueError(f"Reference df missing operator column '{OP_COL}'.")

ref_df["row_key"] = _row_key(ref_df)

print(f"\n=== Computing shared split using reference experiment: {ref_exp} ===")
print("ref_df:", ref_df.shape, "| ref_X:", ref_X.shape)
print("Saving split to:", split_path)

# Project splitter; the result is read below via .train_idx / .val_idx / .test_idx.
split = make_splits_multi_prompt_to_train(
    df=ref_df,
    X=ref_X,
    op_col=OP_COL,
    intensity_col=INT_COL if (USE_INTENSITY_FOR_STRAT and INT_COL in ref_df.columns) else None,
    map_id_col=MAP_ID_COL,
    fixed_classes=FIXED_CLASSES,
    use_intensity_for_strat=USE_INTENSITY_FOR_STRAT,
    seed=int(paths.CFG.SEED),
    val_ratio=float(paths.CFG.VAL_RATIO),
    test_ratio=float(paths.CFG.TEST_RATIO),
    max_attempts=500,
    save_splits_json=split_path,
    verbose=True,
)

train_idx_ref, val_idx_ref, test_idx_ref = split.train_idx, split.val_idx, split.test_idx

# Translate the reference row positions into row keys.
train_keys = set(ref_df.loc[train_idx_ref, "row_key"].tolist())
val_keys   = set(ref_df.loc[val_idx_ref,   "row_key"].tolist()) if len(val_idx_ref) else set()
test_keys  = set(ref_df.loc[test_idx_ref,  "row_key"].tolist()) if len(test_idx_ref) else set()

# A key must never belong to two partitions.
assert train_keys.isdisjoint(val_keys)
assert train_keys.isdisjoint(test_keys)
assert val_keys.isdisjoint(test_keys)

print("\nâœ… Shared split created:")
print(f"   Train keys: {len(train_keys)} | Val keys: {len(val_keys)} | Test keys: {len(test_keys)}")
print(f"   Saved to  : {split_path}")

# Apply split to each experiment
SPLITS = {}
needed_keys = train_keys | val_keys | test_keys

for exp_name, pack in TRAIN_DATA.items():
    df = pack["df"].copy().reset_index(drop=True)
    X  = pack["X"]

    if not {MAP_ID_COL, PROMPT_ID_COL}.issubset(df.columns):
        raise ValueError(f"Experiment '{exp_name}' df missing {MAP_ID_COL}/{PROMPT_ID_COL} needed for split mapping.")

    df["row_key"] = _row_key(df)
    exp_keys = df["row_key"]

    # Every key of the shared split has to exist in this experiment.
    missing = needed_keys - set(exp_keys.tolist())
    if missing:
        raise ValueError(
            f"Experiment '{exp_name}' is missing {len(missing)} rows from the shared split "
            f"(first few: {list(sorted(missing))[:5]}).\n"
            "This usually means the pairs universe differs between experiments.\n"
            "Fix: ensure map_only uses the same prompts.parquet universe and concat is consistent."
        )

    # The reverse direction is not an error, but it must not pass unnoticed:
    # rows whose key is in no partition are left out of train/val/test, and
    # rows that repeat a key all land in that key's partition.
    n_unassigned = int((~exp_keys.isin(needed_keys)).sum())
    n_repeated = int(exp_keys.duplicated().sum())

    # Row positions of this experiment per partition (in this experiment's row order).
    train_idx = df.index[exp_keys.isin(train_keys)].to_numpy()
    val_idx   = df.index[exp_keys.isin(val_keys)].to_numpy() if val_keys else df.index[:0].to_numpy()
    test_idx  = df.index[exp_keys.isin(test_keys)].to_numpy() if test_keys else df.index[:0].to_numpy()

    X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]
    df_train = df.loc[train_idx].reset_index(drop=True)
    df_val   = df.loc[val_idx].reset_index(drop=True)
    df_test  = df.loc[test_idx].reset_index(drop=True)

    SPLITS[exp_name] = {
        "train_idx": train_idx,
        "val_idx": val_idx,
        "test_idx": test_idx,
        "X_train": X_train, "X_val": X_val, "X_test": X_test,
        "df_train": df_train, "df_val": df_val, "df_test": df_test,
    }

    print(f"\nðŸ§ª {exp_name}")
    print("Rows -> Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)
    if n_unassigned:
        print(f"   WARNING: {n_unassigned} rows are not part of the shared split and were left out of train/val/test.")
    if n_repeated:
        print(f"   WARNING: {n_repeated} rows repeat an earlier row_key; repeats share the partition of that key.")



=== Computing shared split using reference experiment: use_prompt_only ===
ref_df: (562, 16) | ref_X: (562, 512)
Saving split to: /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out/splits/splits_shared.json
=== DATASET SUMMARY ===
Total rows (prompts): 562
Unique maps: 399
Multi-prompt maps (>1 prompt): 22
Single-prompt maps (=1 prompt): 377

Top 10 maps by prompt count:
map_id
1646    30
1304    29
1755    26
1532    13
0127    10
0168     8
0142     7
0078     6
0080     6
0001     6
dtype: int64

âœ… Saved splits to /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out/splits/splits_shared.json

âœ… Shared split created:
   Train keys: 448 | Val keys: 57 | Test keys: 57
   Saved to  : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/train_out/splits/splits_shared.json

ðŸ§ª openai_prompt_only
Rows -> Train: (448, 1536) Val: (57, 1536) Test: (57, 1536)

ðŸ§ª use_prompt_only
Rows -> Train: (448, 512) Val: (57, 512) Test: (57, 512)

ðŸ§ª map_onl

In [6]:
# ===================== 02_build_training_data — CELL 5: Preprocessing (per experiment) =====================

from pathlib import Path
import numpy as np

from imgofup.preprocessing.preprocessing import fit_transform_modality_preproc
from imgofup.config.constants import (
    MAP_CLIP_Q_DEFAULT,
    MAP_IMPUTE_STRATEGY_DEFAULT,
    MAP_ROBUST_QRANGE_DEFAULT,
    MAP_VAR_EPS_DEFAULT,
    MAP_EMBEDDINGS_NPZ_NAME,
    PROMPT_EMBEDDINGS_NPZ_NAME,
)

def _infer_dim_from_npz(npz_path: Path) -> int:
    with np.load(npz_path, allow_pickle=True) as z:
        E = z["E"]
    return int(E.shape[1])

# Width of the shared map embedding matrix, read from the shared .npz file.
MAP_DIM_INF = _infer_dim_from_npz(Path(MAP_EMB_DIR).joinpath(MAP_EMBEDDINGS_NPZ_NAME))

# Record map_dim / prompt_dim on every registry entry (like 01 did). A modality
# that an experiment does not use gets dimension 0.
for exp_name, exp_cfg in EXPERIMENTS.items():
    fm = str(exp_cfg["feature_mode"]).strip().lower()
    if fm not in {"prompt_only", "map_only", "prompt_plus_map"}:
        raise ValueError(f"Unknown feature_mode: {fm}")

    # Prompt width comes from the experiment's own prompt embeddings.
    if fm == "map_only":
        prompt_dim = 0
    else:
        prm_npz = Path(exp_cfg["train_out"]).joinpath("prompt_out", PROMPT_EMBEDDINGS_NPZ_NAME)
        prompt_dim = _infer_dim_from_npz(prm_npz)

    exp_cfg["map_dim"] = 0 if fm == "prompt_only" else MAP_DIM_INF
    exp_cfg["prompt_dim"] = prompt_dim

# Filled by the preprocessing loop: exp_name -> scaled arrays + bundle path.
PREPROC = dict()
print("\n=== Fitting modality-aware preprocessing per experiment ===")

def _to_preproc_mode(feature_mode: str) -> str:
    fm = str(feature_mode).strip().lower()
    if fm == "prompt_only":
        return "prompt_only"
    if fm in {"prompt_plus_map", "map_only"}:
        return "prompt_plus_map"
    raise ValueError(f"Unsupported feature_mode for preprocessing: {feature_mode}")

# Fit the preprocessing bundle of each experiment on its train partition,
# transform val/test with it, and keep the scaled arrays in PREPROC.
for exp_name, cfg in EXPERIMENTS.items():
    # NOTE: `split` here rebinds the name that the shared-split cell used for
    # the splitter's result object; from this point on it is a SPLITS entry.
    split = SPLITS[exp_name]
    feature_mode = cfg["feature_mode"]
    preproc_mode = _to_preproc_mode(feature_mode)

    # Dimensions recorded on the registry entry earlier in this cell.
    map_dim    = int(cfg["map_dim"])
    prompt_dim = int(cfg["prompt_dim"])

    model_out_dir = Path(cfg["model_out"]).expanduser().resolve()
    model_out_dir.mkdir(parents=True, exist_ok=True)

    # Location passed to the fit call as save_path.
    preproc_path = model_out_dir / "preproc.joblib"

    print(f"\nðŸ§ª Experiment: {exp_name}")
    print(f"   Feature mode : {feature_mode} -> preproc_mode={preproc_mode}")
    print(f"   map_dim      : {map_dim}")
    print(f"   prompt_dim   : {prompt_dim}")
    print(f"   Save preproc : {preproc_path}")

    # Guard: the train matrix must be exactly map_dim + prompt_dim columns wide,
    # otherwise the per-modality dimensions given to the fit call would be wrong.
    Xtr = split["X_train"]
    if Xtr.shape[1] != (map_dim + prompt_dim):
        raise ValueError(
            f"Dim mismatch in {exp_name}: X_train has {Xtr.shape[1]} cols, "
            f"but map_dim+prompt_dim={map_dim + prompt_dim}."
        )

    # Project helper; the result is read below via .X_train_s / .X_val_s /
    # .X_test_s / .bundle_path. Hyper-parameters come from the project constants.
    res = fit_transform_modality_preproc(
        X_train=split["X_train"],
        X_val=split["X_val"],
        X_test=split["X_test"],
        feature_mode=preproc_mode,
        map_dim=map_dim,
        prompt_dim=prompt_dim,
        eps=float(MAP_VAR_EPS_DEFAULT),
        clip_q=tuple(MAP_CLIP_Q_DEFAULT),
        impute_strategy=str(MAP_IMPUTE_STRATEGY_DEFAULT),
        robust_qrange=tuple(MAP_ROBUST_QRANGE_DEFAULT),
        save_path=preproc_path,
    )

    # Keep the transformed partitions and the bundle location for the cache cell.
    PREPROC[exp_name] = {
        "X_train_s": res.X_train_s,
        "X_val_s":   res.X_val_s,
        "X_test_s":  res.X_test_s,
        "bundle_path": res.bundle_path,
    }

    print("   âœ… Preprocessing complete.")
    print("   Shapes:", res.X_train_s.shape, res.X_val_s.shape, res.X_test_s.shape)

print("\nâœ… All preprocessing finished.")



=== Fitting modality-aware preprocessing per experiment ===

ðŸ§ª Experiment: openai_prompt_only
   Feature mode : prompt_only -> preproc_mode=prompt_only
   map_dim      : 0
   prompt_dim   : 1536
   Save preproc : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_openai_prompt_only/preproc.joblib
   âœ… Preprocessing complete.
   Shapes: (448, 1536) (57, 1536) (57, 1536)

ðŸ§ª Experiment: use_prompt_only
   Feature mode : prompt_only -> preproc_mode=prompt_only
   map_dim      : 0
   prompt_dim   : 512
   Save preproc : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_use_prompt_only/preproc.joblib
   âœ… Preprocessing complete.
   Shapes: (448, 512) (57, 512) (57, 512)

ðŸ§ª Experiment: map_only
   Feature mode : map_only -> preproc_mode=prompt_plus_map
   map_dim      : 165
   prompt_dim   : 0
   Save preproc : /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_map_only/preproc.joblib
   âœ… Preprocessing complete.
   Shapes

In [7]:
# ===================== 02_build_training_data — CELL 6: Labels + sample weights =====================

import numpy as np

from imgofup.config import paths
from imgofup.config.constants import (
    MAPS_ID_COL,
    CLASS_WEIGHT_MODE_DEFAULT,
    USE_MAP_WEIGHT_DEFAULT,
)
from imgofup.datasets.labels_and_weights import build_labels_and_sample_weights

# Column names used by the label builder.
OP_COL = paths.PATHS.OPERATOR_COL
MAP_ID_COL = MAPS_ID_COL

# exp_name -> class names, encoded labels per partition, sample weights, class weights.
LABELS = {}

print("\n=== Building labels and sample weights per experiment ===")

for exp_name, split in SPLITS.items():
    df_train = split["df_train"].copy()
    df_val   = split["df_val"].copy()
    df_test  = split["df_test"].copy()

    # Every partition needs the operator column with no missing values before
    # labels can be built from it.
    for part_name, dfi in [("train", df_train), ("val", df_val), ("test", df_test)]:
        if OP_COL not in dfi.columns:
            raise ValueError(f"{exp_name}: df_{part_name} missing operator column '{OP_COL}'.")
        n_miss = int(dfi[OP_COL].isna().sum())
        if n_miss:
            raise ValueError(
                f"{exp_name}: df_{part_name} has {n_miss} missing operator labels. "
                "Fix the label merge before training."
            )

    # Project helper; the result is read below via .class_names, .y_train,
    # .y_val, .y_test, .sample_w and .class_weight_map.
    lab = build_labels_and_sample_weights(
        df_train=df_train,
        df_val=df_val,
        df_test=df_test,
        op_col=OP_COL,
        map_id_col=MAP_ID_COL,
        fixed_classes=FIXED_CLASSES,
        use_map_weight=bool(USE_MAP_WEIGHT_DEFAULT),
        class_weight_mode=str(CLASS_WEIGHT_MODE_DEFAULT),
    )

    class_names = np.array(lab.class_names)

    LABELS[exp_name] = {
        "class_names": class_names,
        "y_train_cls": lab.y_train,
        "y_val_cls":   lab.y_val,
        "y_test_cls":  lab.y_test,
        "sample_w":    lab.sample_w,
        "class_weight_map": lab.class_weight_map,
    }

    print(f"\nðŸ§ª {exp_name}")
    # .tolist() converts the NumPy string scalars to plain Python str; list(...)
    # kept them as NumPy scalars, which printed as np.str_('...') in the output.
    print("Classes (fixed order):", class_names.tolist())
    print("Class weights:", lab.class_weight_map)
    print("y_train/y_val/y_test shapes:", lab.y_train.shape, lab.y_val.shape, lab.y_test.shape)
    sw = lab.sample_w
    print("Sample weight summary:", {"min": float(sw.min()), "max": float(sw.max()), "mean": float(sw.mean())})

# All experiments must agree on the class order; compare each to the first one.
first = next(iter(LABELS.keys()))
base_classes = LABELS[first]["class_names"].tolist()
for exp_name in LABELS.keys():
    if LABELS[exp_name]["class_names"].tolist() != base_classes:
        raise ValueError(f"Class order differs in experiment {exp_name}.")

print("\nâœ… Label build complete for all experiments (class order consistent).")



=== Building labels and sample weights per experiment ===

ðŸ§ª openai_prompt_only
Classes (fixed order): [np.str_('simplify'), np.str_('select'), np.str_('aggregate'), np.str_('displace')]
Class weights: {'simplify': 1.0275229357798166, 'select': 0.7777777777777778, 'aggregate': 0.835820895522388, 'displace': 1.8360655737704918}
y_train/y_val/y_test shapes: (448,) (57,) (57,)
Sample weight summary: {'min': 0.025925925925925925, 'max': 1.8360655737704918, 'mean': 0.6487687942076353}

ðŸ§ª use_prompt_only
Classes (fixed order): [np.str_('simplify'), np.str_('select'), np.str_('aggregate'), np.str_('displace')]
Class weights: {'simplify': 1.0275229357798166, 'select': 0.7777777777777778, 'aggregate': 0.835820895522388, 'displace': 1.8360655737704918}
y_train/y_val/y_test shapes: (448,) (57,) (57,)
Sample weight summary: {'min': 0.025925925925925925, 'max': 1.8360655737704918, 'mean': 0.6487687942076353}

ðŸ§ª map_only
Classes (fixed order): [np.str_('simplify'), np.str_('select'), np.st

In [8]:
# ===================== 02_build_training_data — FINAL CELL: Save Stage-2 cache =====================

import json
from pathlib import Path
import numpy as np

# Persist, per experiment, everything the next notebook needs: split indices,
# scaled feature arrays, labels/weights, class names, dataframes and a small
# meta file.
print("\n=== Saving Stage-2 cache for notebook 03 (disk persistence) ===")

STAGE2_DIRNAME = "cache_stage2"  # folder created inside each experiment's model_out

for exp_name, cfg in EXPERIMENTS.items():
    out_dir = Path(cfg["model_out"]).expanduser().resolve() / STAGE2_DIRNAME
    out_dir.mkdir(parents=True, exist_ok=True)

    split = SPLITS[exp_name]
    pre   = PREPROC[exp_name]
    lab   = LABELS[exp_name]

    # ---- 1) Save split indices (so we can reconstruct in 03 if needed)
    # np.int64 instead of the builtin `int`: the builtin maps to NumPy's default
    # integer, whose width depends on platform / NumPy version, so a fixed width
    # keeps the cache files identical across machines.
    np.savez_compressed(
        out_dir / "splits.npz",
        train_idx=np.asarray(split["train_idx"], dtype=np.int64),
        val_idx=np.asarray(split["val_idx"], dtype=np.int64),
        test_idx=np.asarray(split["test_idx"], dtype=np.int64),
    )

    # ---- 2) Save scaled arrays (what classifier/regressor actually trains on)
    np.savez_compressed(
        out_dir / "X_scaled.npz",
        X_train_s=np.asarray(pre["X_train_s"], dtype=np.float64),
        X_val_s=np.asarray(pre["X_val_s"], dtype=np.float64),
        X_test_s=np.asarray(pre["X_test_s"], dtype=np.float64),
    )

    # ---- 3) Save labels + weights
    np.savez_compressed(
        out_dir / "labels.npz",
        y_train_cls=np.asarray(lab["y_train_cls"], dtype=np.int64),
        y_val_cls=np.asarray(lab["y_val_cls"], dtype=np.int64),
        y_test_cls=np.asarray(lab["y_test_cls"], dtype=np.int64),
        sample_w=np.asarray(lab["sample_w"], dtype=np.float64),
    )

    # ---- 4) Save class names
    class_names = [str(x) for x in lab["class_names"]]
    (out_dir / "class_names.json").write_text(
        json.dumps(class_names, indent=2), encoding="utf-8"
    )

    # ---- 5) Save dataframes (needed for grouped CV + regressors target)
    split["df_train"].to_parquet(out_dir / "df_train.parquet", index=False)
    split["df_val"].to_parquet(out_dir / "df_val.parquet", index=False)
    split["df_test"].to_parquet(out_dir / "df_test.parquet", index=False)

    # ---- 6) Save small meta (for sanity/debug)
    meta_map_dim = int(cfg.get("map_dim", -1))
    meta_prompt_dim = int(cfg.get("prompt_dim", -1))
    # No cell in this notebook stores "fused_dim" on the registry entry, so the
    # plain lookup always produced -1. When both parts are known, fall back to
    # their sum: the preprocessing cell requires the raw feature matrix to be
    # exactly map_dim + prompt_dim columns wide. An explicit value still wins.
    fused_fallback = (
        meta_map_dim + meta_prompt_dim
        if (meta_map_dim >= 0 and meta_prompt_dim >= 0)
        else -1
    )
    meta = {
        "exp_name": exp_name,
        "feature_mode": cfg["feature_mode"],
        "map_dim": meta_map_dim,
        "prompt_dim": meta_prompt_dim,
        "fused_dim": int(cfg.get("fused_dim", fused_fallback)),
        "paths": {
            "model_out": str(Path(cfg["model_out"]).expanduser().resolve()),
            "cache_dir": str(out_dir),
        },
        "shapes": {
            "X_train_s": list(pre["X_train_s"].shape),
            "X_val_s": list(pre["X_val_s"].shape),
            "X_test_s": list(pre["X_test_s"].shape),
            "df_train": [int(split["df_train"].shape[0]), int(split["df_train"].shape[1])],
        },
    }
    (out_dir / "meta.json").write_text(json.dumps(meta, indent=2), encoding="utf-8")

    print(f"âœ… {exp_name}: wrote cache to {out_dir}")

print("\nâœ… Stage-2 cache saved. Notebook 03 can now run standalone.")



=== Saving Stage-2 cache for notebook 03 (disk persistence) ===
âœ… openai_prompt_only: wrote cache to /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_openai_prompt_only/cache_stage2
âœ… use_prompt_only: wrote cache to /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_use_prompt_only/cache_stage2
âœ… map_only: wrote cache to /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_map_only/cache_stage2
âœ… use_map: wrote cache to /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_use_map/cache_stage2
âœ… openai_map: wrote cache to /Users/amirdonyadide/Documents/GitHub/IMGOFUP/data/output/models/exp_openai_map/cache_stage2

âœ… Stage-2 cache saved. Notebook 03 can now run standalone.
