## 🧩 0) Setup & Imports ##

In [1]:
# ===================== PARAMETERS / IMPORTS =====================
from pathlib import Path
import sys, subprocess, numpy as np, pandas as pd, joblib

from sklearn.model_selection import train_test_split, StratifiedGroupKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer


from dataclasses import dataclass


from pprint import pprint

from scipy.stats import loguniform, randint

import joblib

# Project config: the repository root is resolved relative to the current
# working directory (one level up), then put on sys.path so that the `src`
# package below can be imported.
PROJ_ROOT = Path("../").resolve()
SRC_DIR   = PROJ_ROOT / "src"
if str(PROJ_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJ_ROOT))

# Must stay after the sys.path tweak above.
from src.config import PATHS, CFG, print_summary
print_summary()

# Embedding dimensions: placeholders only. They are overwritten later in the
# notebook by the `infer_dims(PATHS)` call once both embedding files exist.
MAP_DIM = PROMPT_DIM = FUSED_DIM = None

BATCH_SIZE  = CFG.BATCH_SIZE

# Clean outputs for a fresh run.
# NOTE(review): presumably deletes previously generated artifacts in the
# output folders -- confirm in src.config before re-running this cell on
# results you want to keep.
PATHS.clean_outputs()


=== CONFIG SUMMARY ===
PROJ_ROOT  : /Users/amirdonyadide/Documents/GitHub/Thesis
DATA_DIR   : /Users/amirdonyadide/Documents/GitHub/Thesis/data
INPUT_DIR  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input
OUTPUT_DIR : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output
MAPS_ROOT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/samples/pairs
INPUT PAT. : *_input.geojson
--- User Study ---
USER_STUDY_XLSX : /Users/amirdonyadide/Documents/GitHub/Thesis/data/userstudy/UserStudy.xlsx
RESPONSES_SHEET : Responses
TILE_ID_COL     : tile_id
COMPLETE_COL    : complete
REMOVE_COL      : remove
TEXT_COL        : cleaned_text
PARAM_VALUE_COL : param_value
OPERATOR_COL    : operator
INTENSITY_COL   : intensity
--- Filters / IDs / Split ---
ONLY_COMPLETE   : True
EXCLUDE_REMOVED : True
PROMPT_ID       : r{i:08d}
SPLIT_BY        : tile
--- Outputs ---
PROMPT_OUT : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
MAP_OUT    : /Users/amirdonyadide/Documents/GitH

## 📚 1) Build Prompt Embeddings (USE) ##

In [None]:
from pathlib import Path

# Import the module
from src.mapvec.prompts import prompt_embeddings as pe

# Choose paths (all taken from the project configuration object).
# NOTE: `out_dir`, `ids` and `E` are notebook globals that later cells rebind.
data_dir = PATHS.DATA_DIR  # or Path("../data").resolve()
in_path  = PATHS.USER_STUDY_XLSX
out_dir  = PATHS.PROMPT_OUT

# Logging (match what CLI does)
pe.setup_logging(verbosity=1)

# Load prompts from the user-study workbook.
# NOTE(review): filtering on the complete/remove columns is assumed to happen
# inside load_prompts_from_source (no only_complete/exclude_removed flag is
# passed here) -- confirm against the module.
# The third return value (`id_colname`) is not used below; the id column name
# is hard-coded to "prompt_id" when saving.
ids, texts, id_colname = pe.load_prompts_from_source(
    input_path=Path(in_path),
    sheet_name=PATHS.RESPONSES_SHEET,
    tile_id_col=PATHS.TILE_ID_COL,
    complete_col=PATHS.COMPLETE_COL,
    remove_col=PATHS.REMOVE_COL,
    text_col=PATHS.TEXT_COL,
)

print(f"Loaded {len(texts)} prompts.")

# Get embedder based on CFG.PROMPT_ENCODER (dan/transformer/openai-small/openai-large)
embed_fn, model_label = pe.get_embedder(
    kind=CFG.PROMPT_ENCODER,
    data_dir=Path(data_dir),
    l2_normalize=True,
    batch_size=CFG.BATCH_SIZE,
)

# Embed all prompt texts in one call; `E` is the resulting embedding matrix.
E = embed_fn(texts)

# Save outputs in the same format as before.
# `l2_normalized=True` must stay in sync with `l2_normalize=True` passed to
# get_embedder above. The "prompt_id" column name is what the concatenation
# cell later requires in prompts.parquet.
pe.save_outputs(
    out_dir=Path(out_dir),
    ids=ids,
    texts=texts,
    E=E,
    model_name=model_label,
    l2_normalized=True,
    id_colname="prompt_id",
    also_save_embeddings_csv=False,
)

print("‚úÖ Prompt embeddings completed.")


2026-01-21 21:24:23 | INFO | Reading Excel: /Users/amirdonyadide/Documents/GitHub/Thesis/data/userstudy/UserStudy.xlsx (sheet=Responses)
2026-01-21 21:24:23 | INFO | Filtered Excel rows: 786 ‚Üí 564 (only_complete=True, exclude_removed=True)


Loaded 564 prompts.


2026-01-21 21:24:23 | INFO | Embedding 564 prompts with OpenAI model=text-embedding-3-small (batch_size=512, l2=True)‚Ä¶
2026-01-21 21:24:25 | INFO | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-21 21:24:26 | INFO | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-21 21:24:26 | INFO | Done OpenAI embedding in 2.31s (dim=1536).
2026-01-21 21:24:26 | INFO | Writing outputs to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
2026-01-21 21:24:26 | INFO |   saved prompts_embeddings.npz (shape=(564, 1536))
2026-01-21 21:24:26 | INFO |   saved prompts.parquet (rows=564)
2026-01-21 21:24:26 | INFO |   saved meta.json


‚úÖ Prompt embeddings completed.


## 🗺️ 2) Build Map Embeddings (geometric) ##

In [3]:
from pathlib import Path
import numpy as np
import pandas as pd

from src.mapvec.maps import map_embeddings as me

# ------------------------------------------------------------
# 1) Load allowed tile_ids from Excel
# ------------------------------------------------------------
dfu = pd.read_excel(PATHS.USER_STUDY_XLSX, sheet_name=PATHS.RESPONSES_SHEET)

# Coerce both flag columns to plain booleans before they are used as filters.
# NOTE(review): astype(bool) maps missing cells (NaN) to True -- confirm the
# sheet has no blanks in these columns.
for _flag_col in (PATHS.COMPLETE_COL, PATHS.REMOVE_COL):
    dfu[_flag_col] = dfu[_flag_col].astype(bool)

# Start from "keep every row" and narrow down according to the config switches.
mask = pd.Series(True, index=dfu.index)
if PATHS.ONLY_COMPLETE:
    mask = mask & dfu[PATHS.COMPLETE_COL]
if PATHS.EXCLUDE_REMOVED:
    mask = mask & ~dfu[PATHS.REMOVE_COL]
dfu = dfu.loc[mask].copy()


# Tile ids are compared as zero-padded 4-character strings. When every id
# parses as a number the integer form is used, otherwise the stripped text.
tile_raw = dfu[PATHS.TILE_ID_COL]
tile_num = pd.to_numeric(tile_raw, errors="coerce")
_tile_text = (
    tile_num.astype(int).astype(str)
    if tile_num.notna().all()
    else tile_raw.astype(str).str.strip()
)
allowed_tile_ids = set(_tile_text.str.zfill(4).tolist())

print(f"Allowed tiles from Excel: {len(allowed_tile_ids)}")

# ------------------------------------------------------------
# 2) Discover GeoJSONs and filter by tile_id
# ------------------------------------------------------------
me.setup_logging(verbosity=1)

# Keep only the (map_id, path) pairs whose id appears in the Excel whitelist.
pairs = [
    (map_id, path)
    for map_id, path in list(me.find_geojsons(PATHS.MAPS_ROOT, PATHS.INPUT_MAPS_PATTERN))
    if str(map_id) in allowed_tile_ids
]

if len(pairs) == 0:
    raise RuntimeError("No maps left after Excel filtering.")

print(f"Maps to embed after filtering: {len(pairs)}")

# ------------------------------------------------------------
# 3) First pass: polygon counting
# ------------------------------------------------------------
# `counts` maps map_id -> number of valid polygons. A file that cannot be
# counted still gets 0 (best effort, as before), but the failure is now
# reported and remembered in `count_failures` instead of being swallowed.
counts = {}
count_failures = []
for map_id, path in pairs:
    try:
        counts[map_id] = me._count_valid_polygons(path)
    except Exception as exc:
        counts[map_id] = 0
        count_failures.append(map_id)
        print(f"WARNING: could not count polygons for {map_id} ({path}): {exc!r}; using 0")

if count_failures:
    print(f"WARNING: polygon counting failed for {len(count_failures)} map(s)")

# Never let the normaliser drop below 1, even if every count is 0 or
# `pairs` happens to be empty.
max_polygons = max(max(counts.values(), default=0), 1)

# ------------------------------------------------------------
# 4) Second pass: embed maps
# ------------------------------------------------------------
ids, vecs, rows = [], [], []
feat_names = None
first_dim = None

for map_id, path in pairs:
    vec, names = me.embed_one_map(path, max_polygons=max_polygons, norm="fixed", norm_wh="400x400")
    _vec_dim = vec.shape[0]

    # The first embedded map fixes the expected width and the feature names;
    # any later map with a different width is skipped.
    if first_dim is None:
        first_dim, feat_names = _vec_dim, names
    if _vec_dim != first_dim:
        print(f"Skipping {map_id}: dim mismatch")
        continue

    ids.append(map_id)
    vecs.append(vec)
    rows.append(dict(map_id=map_id, geojson=str(path), n_polygons=counts.get(map_id, 0)))

E = np.vstack(vecs).astype(np.float32)

# ------------------------------------------------------------
# 5) Save outputs (same format as script)
# ------------------------------------------------------------
_feature_names = feat_names if feat_names else []
me.save_outputs(out_dir=PATHS.MAP_OUT, rows=rows, E=E, ids=ids, feat_names=_feature_names, save_csv=False)

print("‚úÖ Map embeddings completed.")


Allowed tiles from Excel: 401
Maps to embed after filtering: 401
‚úÖ Map embeddings completed.


In [4]:
import numpy as np

def infer_dims(paths):
    """Read the embedding widths from the saved ``.npz`` archives.

    Parameters
    ----------
    paths : object
        Anything exposing ``PROMPT_OUT`` and ``MAP_OUT`` as path-like objects
        supporting ``/`` (e.g. ``pathlib.Path``). The archives
        ``prompts_embeddings.npz`` and ``maps_embeddings.npz`` are expected
        inside those folders, each holding a 2-D array under the key ``"E"``.

    Returns
    -------
    tuple[int, int, int]
        ``(map_dim, prompt_dim, map_dim + prompt_dim)``.

    Raises
    ------
    FileNotFoundError
        If either archive is missing. The prompt archive is checked and read
        first, then the map archive.
    """

    def _embedding_width(npz_path, hint):
        # Number of columns of the "E" matrix stored in one archive.
        if not npz_path.exists():
            raise FileNotFoundError(f"Missing {npz_path} ({hint})")
        # np.load on an .npz returns a lazy NpzFile that keeps the file open;
        # the context manager closes the handle once the shape is read.
        # NOTE(review): allow_pickle=True is kept from the original; it is only
        # safe because these archives are produced locally by this notebook.
        with np.load(npz_path, allow_pickle=True) as archive:
            return int(archive["E"].shape[1])

    prompt_dim = _embedding_width(
        paths.PROMPT_OUT / "prompts_embeddings.npz", "run prompt embeddings first"
    )
    map_dim = _embedding_width(
        paths.MAP_OUT / "maps_embeddings.npz", "run map embeddings first"
    )
    return map_dim, prompt_dim, map_dim + prompt_dim

# Replace the None placeholders from the setup cell with the widths read from
# the embedding files written by the two cells above.
MAP_DIM, PROMPT_DIM, FUSED_DIM = infer_dims(PATHS)
print("‚úÖ Inferred dims:", {"MAP_DIM": MAP_DIM, "PROMPT_DIM": PROMPT_DIM, "FUSED_DIM": FUSED_DIM})


‚úÖ Inferred dims: {'MAP_DIM': 165, 'PROMPT_DIM': 1536, 'FUSED_DIM': 1701}


## 🔗 3) Concatenate (pairs → fused rows) ##

In [5]:
from pathlib import Path
import numpy as np
import pandas as pd
import json

from src.mapvec.concat import concat_embeddings as ce
ce.setup_logging(verbosity=1)

map_npz_path = Path(PATHS.MAP_OUT / "maps_embeddings.npz")
prm_npz_path = Path(PATHS.PROMPT_OUT / "prompts_embeddings.npz")
prompts_pq   = Path(PATHS.PROMPT_OUT / "prompts.parquet")
out_dir      = Path(PATHS.TRAIN_OUT)
out_dir.mkdir(parents=True, exist_ok=True)

# ---- build pairs from prompts.parquet (authoritative) ----
pairs = pd.read_parquet(prompts_pq)

if "prompt_id" not in pairs.columns or "tile_id" not in pairs.columns:
    raise RuntimeError("prompts.parquet must contain columns: prompt_id, tile_id")

# Missing ids must be dropped BEFORE casting to str: astype(str) turns
# NaN/None into the literal strings "nan"/"None", which dropna() can no
# longer detect (the original order made the dropna a no-op).
pairs = (
    pairs.rename(columns={"tile_id": "map_id"})[["map_id", "prompt_id"]]
    .dropna(subset=["map_id", "prompt_id"])
    .assign(
        map_id=lambda d: d["map_id"].astype(str).str.strip(),
        prompt_id=lambda d: d["prompt_id"].astype(str).str.strip(),
    )
)
# Discard ids that are empty after stripping, then de-duplicate the pairs.
pairs = pairs[(pairs["map_id"] != "") & (pairs["prompt_id"] != "")]
pairs = pairs.drop_duplicates(subset=["map_id", "prompt_id"])

# ---- load embeddings ----
E_map, map_ids = ce.load_npz(map_npz_path)
E_prm, prm_ids = ce.load_npz(prm_npz_path)

# id -> row position inside each embedding matrix
idx_map = dict((key, pos) for pos, key in enumerate(map_ids))
idx_prm = dict((key, pos) for pos, key in enumerate(prm_ids))

# ---- match & build X ----
# `chosen_rows` holds positional indices into `pairs`; im_list / ip_list hold
# the matching row positions in E_map / E_prm. A pair is kept only when both
# ids are known.
chosen_rows, im_list, ip_list = [], [], []
missing = 0

for i, (_map_key, _prompt_key) in enumerate(zip(pairs["map_id"], pairs["prompt_id"])):
    im, ip = idx_map.get(_map_key), idx_prm.get(_prompt_key)
    if im is not None and ip is not None:
        chosen_rows.append(i)
        im_list.append(im)
        ip_list.append(ip)
    else:
        missing += 1

if len(im_list) == 0:
    raise RuntimeError("No valid pairs after ID matching.")

if missing:
    print(f"‚ö†Ô∏è Skipped {missing} rows with missing IDs")

# Gather the selected rows from both blocks and place them side by side
# (map features first, prompt features second).
X_map = E_map[np.array(im_list, dtype=int)].astype(np.float32, copy=False)
X_prm = E_prm[np.array(ip_list, dtype=int)].astype(np.float32, copy=False)
X = np.concatenate((X_map, X_prm), axis=1).astype(np.float32, copy=False)

# Persist the fused feature matrix and, row-aligned with it, the (map_id,
# prompt_id) pairs that were actually matched. `chosen_rows` are positional
# indices, hence `.iloc`.
np.save(out_dir / "X_concat.npy", X)
join_df = pairs.iloc[chosen_rows].reset_index(drop=True)
join_df.to_parquet(out_dir / "train_pairs.parquet", index=False)

# Small provenance record: shapes, per-modality widths, how many pairs were
# skipped during id matching, and which input files were used.
meta = {
    "shape": [int(X.shape[0]), int(X.shape[1])],
    "map_dim": int(E_map.shape[1]),
    "prompt_dim": int(E_prm.shape[1]),
    "rows": int(X.shape[0]),
    "skipped_pairs": int(missing),
    "sources": {
        "prompts_parquet": str(prompts_pq),
        "map_npz": str(map_npz_path),
        "prompt_npz": str(prm_npz_path),
    },
}
(out_dir / "meta.json").write_text(json.dumps(meta, indent=2))

print("‚úÖ Concatenation completed.")

‚úÖ Concatenation completed.


## 📥 4) Load & Basic Cleaning ##

In [9]:
# === LOAD FUSED DATA (operator + param_value only) ===
import json
import numpy as np
import pandas as pd

_train_out = PATHS.TRAIN_OUT
X = np.load(_train_out / "X_concat.npy")
pairs_df = pd.read_parquet(_train_out / "train_pairs.parquet")
print(f"Loaded X: {X.shape}, pairs: {pairs_df.shape}")

# --- Rebuild labels from Excel and merge (since train_pairs.parquet has only ids) ---
dfu = pd.read_excel(PATHS.USER_STUDY_XLSX, sheet_name=PATHS.RESPONSES_SHEET)

# Same boolean coercion and row filter as in the map-embedding cell.
for _flag_col in (PATHS.COMPLETE_COL, PATHS.REMOVE_COL):
    dfu[_flag_col] = dfu[_flag_col].astype(bool)

mask_excel = pd.Series(True, index=dfu.index)
if PATHS.ONLY_COMPLETE:
    mask_excel = mask_excel & dfu[PATHS.COMPLETE_COL]
if PATHS.EXCLUDE_REMOVED:
    mask_excel = mask_excel & ~dfu[PATHS.REMOVE_COL]
dfu = dfu.loc[mask_excel].copy()

# match prompt_embeddings.py: prompt_id uses original Excel row index
# (the pre-filter index is kept as the "_row" column).
dfu = dfu.reset_index(drop=False).rename(columns={"index": "_row"})
prefix = PATHS.PROMPT_ID_PREFIX
width  = PATHS.PROMPT_ID_WIDTH


def _format_prompt_id(row_number):
    """Render one Excel row number as `<prefix><zero-padded number>`."""
    return f"{prefix}{int(row_number):0{width}d}"


dfu["prompt_id"] = dfu["_row"].apply(_format_prompt_id)

# normalize tile_id -> map_id (4-digit folder ids like 0001, 0345)
tile_raw = dfu[PATHS.TILE_ID_COL]
tile_num = pd.to_numeric(tile_raw, errors="coerce")
_tile_text = (
    tile_num.astype(int).astype(str)
    if tile_num.notna().all()
    else tile_raw.astype(str).str.strip()
)
dfu["map_id"] = _tile_text.str.zfill(4)

# Join keys plus the three target columns taken from the workbook.
_label_cols = [
    "map_id",
    "prompt_id",
    PATHS.OPERATOR_COL,
    PATHS.PARAM_VALUE_COL,
    PATHS.INTENSITY_COL,
]
labels = dfu[_label_cols].copy()


# X is row-aligned with pairs_df, so the label merge must not add or drop rows.
if len(pairs_df) != X.shape[0]:
    raise RuntimeError(
        f"X has {X.shape[0]} rows but train_pairs.parquet has {len(pairs_df)}; "
        "re-run the concatenation cell."
    )

# validate="many_to_one" makes pandas raise if (map_id, prompt_id) is not
# unique in `labels`, which would otherwise silently duplicate rows and break
# the alignment with X.
df = pairs_df.merge(labels, on=["map_id", "prompt_id"], how="left", validate="many_to_one")

OP_COL = PATHS.OPERATOR_COL               # "operator"
PARAM_COL = PATHS.PARAM_VALUE_COL         # "param_value"

# clean target columns
# astype(str) turns missing values into the literal string "nan", so the
# missing/blank cells are restored to NaN afterwards; otherwise the notna()
# filter below could never drop a row for a missing operator.
_op_text = df[OP_COL].astype(str).str.strip().str.lower()
df[OP_COL] = _op_text.where(df[OP_COL].notna() & (_op_text != ""))
df[PARAM_COL] = pd.to_numeric(df[PARAM_COL], errors="coerce")

# keep only rows that have both targets
mask = df[OP_COL].notna() & df[PARAM_COL].notna()

X  = X[mask.values].astype("float64", copy=False)
df = df.loc[mask].reset_index(drop=True)

print(f"After cleaning: X={X.shape}, df={df.shape}, ops={sorted(df[OP_COL].unique())}")
print("Example rows:")
display(df)


Loaded X: (564, 1701), pairs: (564, 2)
After cleaning: X=(564, 1701), df=(564, 5), ops=['aggregate', 'displace', 'select', 'simplify']
Example rows:


Unnamed: 0,map_id,prompt_id,operator,param_value,intensity
0,1304,r00000000,aggregate,0.000,medium
1,1690,r00000001,select,47.584,low
2,1646,r00000002,select,129.722,medium
3,1663,r00000005,aggregate,0.000,medium
4,0856,r00000006,simplify,16.917,high
...,...,...,...,...,...
559,1697,r00000781,aggregate,0.000,low
560,0078,r00000782,aggregate,4.000,high
561,1525,r00000783,aggregate,7.000,high
562,1288,r00000784,select,80.000,medium


## ✂️ 5) Split & Targets ##

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

FIXED_CLASSES = ["simplify", "select", "aggregate", "displace"]
USE_INTENSITY_FOR_STRAT = True  # will fall back to operator-only if needed

# Column names: "operator", "param_value", "intensity"
OP_COL, PARAM_COL, INT_COL = (
    PATHS.OPERATOR_COL,
    PATHS.PARAM_VALUE_COL,
    PATHS.INTENSITY_COL,
)

# Work on a copy and normalise the label text (trimmed, lower-case).
# The intensity column is only touched when it is present.
df = df.copy()
for _label_col in [OP_COL] + ([INT_COL] if INT_COL in df.columns else []):
    df[_label_col] = df[_label_col].astype(str).str.strip().str.lower()

# ------------------------------------------------------------
# 1) Group stats: prompts per map_id
# ------------------------------------------------------------
prompt_counts = df.groupby("map_id").size()
multi_map_ids = prompt_counts.index[(prompt_counts > 1).to_numpy()].tolist()
single_map_ids = prompt_counts.index[(prompt_counts == 1).to_numpy()].tolist()

print(
    "\n".join(
        [
            "=== DATASET SUMMARY ===",
            f"Total rows (prompts): {len(df)}",
            f"Unique maps: {prompt_counts.shape[0]}",
            f"Multi-prompt maps (>1 prompt): {len(multi_map_ids)}",
            f"Single-prompt maps (=1 prompt): {len(single_map_ids)}",
            "\nTop 10 maps by prompt count:",
        ]
    )
)
print(prompt_counts.sort_values(ascending=False).head(10))

# ------------------------------------------------------------
# 2) Map-level table for single maps (one row per map_id)
# ------------------------------------------------------------
df_single = df[df["map_id"].isin(single_map_ids)].copy()
map_level = df_single.groupby("map_id").first().reset_index()

# Stratification label: operator-only by default; upgraded to the combined
# operator x intensity label when that is enabled, the intensity column
# exists, and every combined group has at least 2 single-prompt maps.
map_level["_strat"] = map_level[OP_COL]
if USE_INTENSITY_FOR_STRAT and INT_COL in map_level.columns:
    _combined = map_level[OP_COL] + "__" + map_level[INT_COL]
    vc = _combined.value_counts()
    if (vc < 2).any():
        print("\n‚ö†Ô∏è Some operator√óintensity groups too rare (<2 single-maps). Falling back to operator-only stratification.")
    else:
        map_level["_strat"] = _combined

def has_all_ops(dfx: pd.DataFrame) -> bool:
    """Return True when every operator in FIXED_CLASSES occurs in dfx[OP_COL]."""
    operators_present = set(dfx[OP_COL].unique())
    return set(FIXED_CLASSES).issubset(operators_present)

# ------------------------------------------------------------
# 3) Split single maps into train/val/test with retries
# ------------------------------------------------------------
test_ratio = CFG.TEST_RATIO
val_ratio = CFG.VAL_RATIO
# Validation share expressed relative to what is left after removing TEST.
val_rel = val_ratio / (1.0 - test_ratio)

X_idx = np.arange(len(map_level))
y_strat = map_level["_strat"].to_numpy()
map_ids_arr = map_level["map_id"].to_numpy()

MAX_SPLIT_ATTEMPTS = 500  # number of seeds tried before giving up

best = None
last_split_error = None
for attempt in range(MAX_SPLIT_ATTEMPTS):
    rs = CFG.SEED + attempt

    # Stage 1: carve TEST out of the single-prompt maps.
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=rs)
    trainval_i, test_i = next(sss1.split(X_idx, y_strat))

    # Stage 2: split the remainder into TRAIN / VAL. Depending on the seed,
    # stage 1 can leave a stratum with a single member, which makes
    # StratifiedShuffleSplit raise ValueError. That is a property of this
    # particular seed, so try the next one instead of aborting the search.
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=val_rel, random_state=rs + 999)
    try:
        train_i, val_i = next(sss2.split(trainval_i, y_strat[trainval_i]))
    except ValueError as exc:
        last_split_error = exc
        continue

    single_train_maps = set(map_ids_arr[trainval_i[train_i]])
    single_val_maps   = set(map_ids_arr[trainval_i[val_i]])
    single_test_maps  = set(map_ids_arr[test_i])

    # Every multi-prompt map is forced into TRAIN; VAL/TEST only ever hold
    # single-prompt maps.
    train_maps = set(multi_map_ids) | single_train_maps
    val_maps   = single_val_maps
    test_maps  = single_test_maps

    # leakage check
    if (train_maps & val_maps) or (train_maps & test_maps) or (val_maps & test_maps):
        continue

    df_train_tmp = df[df["map_id"].isin(train_maps)]
    df_val_tmp   = df[df["map_id"].isin(val_maps)]
    df_test_tmp  = df[df["map_id"].isin(test_maps)]

    # must contain all operators in each split
    if not (has_all_ops(df_train_tmp) and has_all_ops(df_val_tmp) and has_all_ops(df_test_tmp)):
        continue

    best = (train_maps, val_maps, test_maps, rs)
    break

if best is None:
    # Chain the last stage-2 error (if any) so the root cause stays visible.
    raise RuntimeError(
        "Could not find a leakage-safe split with operator coverage in all splits "
        "and multi-prompt maps forced to TRAIN. "
        "Try: USE_INTENSITY_FOR_STRAT=False, or adjust VAL/TEST ratios."
    ) from last_split_error

train_maps, val_maps, test_maps, used_seed = best

# ------------------------------------------------------------
# 4) Build row-level splits (no leakage)
# ------------------------------------------------------------
def _rows_of(map_ids):
    """Index labels of the df rows whose map_id belongs to `map_ids`."""
    return df.index[df["map_id"].isin(map_ids)].to_numpy()


# The index labels are also used to index X positionally below.
train_idx, val_idx, test_idx = (_rows_of(m) for m in (train_maps, val_maps, test_maps))

X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]
df_train, df_val, df_test = (
    df.loc[_rows].reset_index(drop=True) for _rows in (train_idx, val_idx, test_idx)
)

print("\n=== SPLIT SUMMARY ===")
print(f"‚úÖ Split found (seed={used_seed})")
print(f"Train maps: {len(train_maps)}  (includes multi-prompt maps: {len(set(multi_map_ids))})")
print(f"Val maps:   {len(val_maps)}")
print(f"Test maps:  {len(test_maps)}")
print(f"Rows -> Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# Hard guarantees: no map_id is shared between any two splits, and every
# multi-prompt map ended up in TRAIN.
assert not (set(df_train["map_id"]) & set(df_val["map_id"]))
assert not (set(df_train["map_id"]) & set(df_test["map_id"]))
assert not (set(df_val["map_id"]) & set(df_test["map_id"]))
assert set(multi_map_ids) <= train_maps
print("‚úÖ Verified: no map_id leakage across splits.")
print("‚úÖ Verified: all multi-prompt maps are in TRAIN.")

# ------------------------------------------------------------
# 5) Detailed diagnostics: operator + intensity coverage
# ------------------------------------------------------------
def print_operator_counts(dfx, name):
    """Print how often each operator occurs in `dfx`, under a `name` header."""
    header = f"\n{name} ‚Äî Operator counts"
    print(header)
    per_operator = dfx[OP_COL].value_counts()
    print(per_operator)

def print_op_intensity_table(dfx, name):
    """Print the operator-by-intensity count table of `dfx` with its margins.

    When the intensity column is absent only a notice is printed.
    """
    if INT_COL in dfx.columns:
        print(f"\n{name} ‚Äî Operator √ó Intensity table (counts)")
        pair_sizes = dfx.groupby([OP_COL, INT_COL]).size()
        # rows = operators, columns = intensities, absent pairs filled with 0
        tab = pair_sizes.unstack(fill_value=0).sort_index()
        print(tab)
        print(f"\n{name} ‚Äî Operator totals (row sums)")
        print(tab.sum(axis=1))
        print(f"\n{name} ‚Äî Intensity totals (col sums)")
        print(tab.sum(axis=0))
    else:
        print(f"\n{name} ‚Äî intensity column missing; skipping op√óintensity table.")

def report_missing_combos(dfx, name, all_ops, all_ints):
    """Report which (operator, intensity) pairs never occur in `dfx`.

    Does nothing when the intensity column is absent. At most the first 40
    missing pairs are printed.
    """
    if INT_COL not in dfx.columns:
        return

    seen_pairs = set(zip(dfx[OP_COL], dfx[INT_COL]))
    missing = []
    for op in all_ops:
        for it in all_ints:
            if (op, it) not in seen_pairs:
                missing.append((op, it))

    if not missing:
        print(f"\n‚úÖ {name}: All operator√óintensity combos present.")
        return

    print(f"\n‚ö†Ô∏è {name}: Missing operator√óintensity combos ({len(missing)}):")
    print(missing[:40], "..." if len(missing) > 40 else "")

ALL_OPS = list(FIXED_CLASSES)  # enforce fixed order
if INT_COL in df.columns:
    ALL_INTS = sorted(df[INT_COL].unique())
else:
    ALL_INTS = []

# Each report is printed for all three splits before the next report starts.
_named_splits = (("TRAIN", df_train), ("VAL", df_val), ("TEST", df_test))

for _split_name, _split_df in _named_splits:
    print_operator_counts(_split_df, _split_name)

for _split_name, _split_df in _named_splits:
    print_op_intensity_table(_split_df, _split_name)

for _split_name, _split_df in _named_splits:
    report_missing_combos(_split_df, _split_name, ALL_OPS, ALL_INTS)

# ------------------------------------------------------------
# 6) Map-level prompt multiplicity info per split
# ------------------------------------------------------------
def map_prompt_stats(map_set, name):
    """Print how many maps of `map_set` carry more than one prompt."""
    sub_counts = prompt_counts.loc[list(map_set)]
    print(f"\n{name} ‚Äî prompts per map statistics")
    n_multi_prompt = int(sub_counts.gt(1).sum())
    print(f"{name} ‚Äî #maps with >1 prompt:", n_multi_prompt)

for _split_name, _split_maps in (("TRAIN", train_maps), ("VAL", val_maps), ("TEST", test_maps)):
    map_prompt_stats(_split_maps, _split_name)

# ------------------------------------------------------------
# 7) Optional: show top multi-prompt maps in TRAIN
# ------------------------------------------------------------
_multi_counts = prompt_counts.loc[multi_map_ids]
top_multi = _multi_counts.sort_values(ascending=False).head(20)
print("\nTRAIN ‚Äî Top multi-prompt maps (forced to train):")
print(top_multi)


=== DATASET SUMMARY ===
Total rows (prompts): 564
Unique maps: 401
Multi-prompt maps (>1 prompt): 22
Single-prompt maps (=1 prompt): 379

Top 10 maps by prompt count:
map_id
1646    30
1304    29
1755    26
1532    13
0127    10
0168     8
0142     7
0078     6
0080     6
0001     6
dtype: int64

=== SPLIT SUMMARY ===
‚úÖ Split found (seed=42)
Train maps: 287  (includes multi-prompt maps: 22)
Val maps:   57
Test maps:  57
Rows -> Train: (450, 1701), Val: (57, 1701), Test: (57, 1701)
‚úÖ Verified: no map_id leakage across splits.
‚úÖ Verified: all multi-prompt maps are in TRAIN.

TRAIN ‚Äî Operator counts
operator
select       146
aggregate    134
simplify     109
displace      61
Name: count, dtype: int64

VAL ‚Äî Operator counts
operator
select       19
aggregate    16
simplify     13
displace      9
Name: count, dtype: int64

TEST ‚Äî Operator counts
operator
select       19
aggregate    16
simplify     13
displace      9
Name: count, dtype: int64

TRAIN ‚Äî Operator √ó Intensity tab

## 🧼 6) Modality-Aware Preprocessing (map only) ##

In [11]:
# === MODALITY-AWARE PREPROCESSING ===
def split_blocks(X):
    """Split a fused matrix into float64 copies of its map and prompt blocks.

    The first MAP_DIM columns form the map block and the next PROMPT_DIM
    columns the prompt block. Returns `(X_map, X_prompt)`.
    """
    map_cols = slice(0, MAP_DIM)
    prompt_cols = slice(MAP_DIM, MAP_DIM + PROMPT_DIM)
    return (
        X[:, map_cols].astype(np.float64, copy=True),
        X[:, prompt_cols].astype(np.float64, copy=True),
    )

def l2_normalize_rows(A, eps=1e-12):
    """Scale each row of `A` to unit Euclidean length.

    Row norms smaller than `eps` are replaced by `eps` before dividing, so
    all-zero rows stay zero instead of producing a division by zero.
    """
    squared = A * A
    row_norms = np.sqrt(squared.sum(axis=1, keepdims=True))
    safe_norms = np.maximum(row_norms, eps)
    return A / safe_norms

# split each fused matrix into its map block and its prompt block
Xm_tr, Xp_tr = split_blocks(X_train)
Xm_va, Xp_va = split_blocks(X_val)
Xm_te, Xp_te = split_blocks(X_test)

# prompts: L2 only
Xp_tr = l2_normalize_rows(Xp_tr)
Xp_va = l2_normalize_rows(Xp_va)
Xp_te = l2_normalize_rows(Xp_te)

# maps: inf -> NaN so the imputer treats them as missing
# (in-place edit is safe: split_blocks returned fresh copies)
for A in (Xm_tr, Xm_va, Xm_te):
    A[~np.isfinite(A)] = np.nan

# impute (statistics fitted on train only)
# keep_empty_features=True (scikit-learn >= 1.2): by default SimpleImputer
# DROPS columns that are entirely NaN in the training data, which would
# silently shrink the map block below MAP_DIM and misalign every later
# column-wise step. Kept columns are filled with 0 and are then neutralised
# by the zero-variance mask further down.
imp = SimpleImputer(strategy="median", keep_empty_features=True)
Xm_tr_imp = imp.fit_transform(Xm_tr)
Xm_va_imp = imp.transform(Xm_va)
Xm_te_imp = imp.transform(Xm_te)
assert Xm_tr_imp.shape[1] == Xm_tr.shape[1], "Imputer changed the map feature width."

# clip (5-95%) thresholds, computed per column on train only
q_lo = np.nanpercentile(Xm_tr_imp, 5, axis=0)
q_hi = np.nanpercentile(Xm_tr_imp, 95, axis=0)
def clip_to_q(A, lo, hi):
    """Clip `A` element-wise into [lo, hi]; the bounds broadcast against `A`."""
    return np.clip(A, a_min=lo, a_max=hi)

# Clip all three map blocks with the train-derived thresholds.
Xm_tr_imp, Xm_va_imp, Xm_te_imp = (
    clip_to_q(_block, q_lo, q_hi) for _block in (Xm_tr_imp, Xm_va_imp, Xm_te_imp)
)

# drop zero-variance cols on train
stds = np.nanstd(Xm_tr_imp, axis=0)
keep_mask = stds > 1e-12

# scale kept columns (train fit)
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(5, 95))
Xm_tr_kept = scaler.fit_transform(Xm_tr_imp[:, keep_mask])
Xm_va_kept, Xm_te_kept = (
    scaler.transform(_block[:, keep_mask]) for _block in (Xm_va_imp, Xm_te_imp)
)


def _restore_full_width(kept, template):
    """Put the scaled columns back into a zero matrix shaped like `template`."""
    full = np.zeros_like(template, dtype=np.float64)
    full[:, keep_mask] = kept.astype(np.float64)
    return full


# rebuild full map dim (dropped cols = 0)
Xm_tr_s = _restore_full_width(Xm_tr_kept, Xm_tr_imp)
Xm_va_s = _restore_full_width(Xm_va_kept, Xm_va_imp)
Xm_te_s = _restore_full_width(Xm_te_kept, Xm_te_imp)

# fuse back: map block first, prompt block second
X_train_s = np.hstack((Xm_tr_s, Xp_tr)).astype(np.float64)
X_val_s   = np.hstack((Xm_va_s, Xp_va)).astype(np.float64)
X_test_s  = np.hstack((Xm_te_s, Xp_te)).astype(np.float64)

assert all(
    np.isfinite(_fused).all() for _fused in (X_train_s, X_val_s, X_test_s)
), "Non-finite after preprocessing."
print("‚úÖ Modality-aware preprocessing complete.")

# save preprocessing bundle (everything needed to repeat the map-block steps)
_preproc_bundle = {
    "imp": imp, "q_lo": q_lo, "q_hi": q_hi,
    "keep_mask": keep_mask, "scaler": scaler,
    "map_dim": MAP_DIM, "prompt_dim": PROMPT_DIM
}
joblib.dump(_preproc_bundle, PATHS.TRAIN_OUT / "preproc.joblib")


‚úÖ Modality-aware preprocessing complete.


['/Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out/preproc.joblib']

## ⚖️ 7) Class Weights ##

In [14]:
# === BUILD y_train_cls + CLASS + MAP-AWARE SAMPLE WEIGHTS ===

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

OP_COL = PATHS.OPERATOR_COL  # "operator"

# ------------------------------------------------------------
# 1) Build y_train_cls directly from df_train
# ------------------------------------------------------------
# Integer codes follow the sorted order of the operator names.
y_train_cls, class_names = pd.factorize(df_train[OP_COL], sort=True)
n_classes = len(class_names)

print("Operator classes:", list(class_names))

# ------------------------------------------------------------
# 2) Class weights (operator imbalance)
# ------------------------------------------------------------
classes = np.arange(n_classes)
cls_w = np.asarray(
    compute_class_weight(class_weight="balanced", classes=classes, y=y_train_cls),
    dtype="float64",
)

class_weight_map = {op_name: weight for op_name, weight in zip(class_names, cls_w)}
print("Class weights:", class_weight_map)

# per-sample class weight: look up each row's class code in cls_w
w_class = cls_w[y_train_cls]

# ------------------------------------------------------------
# 3) Map-level weighting (prompt multiplicity correction)
# ------------------------------------------------------------
map_counts = df_train["map_id"].value_counts()

# each map contributes ~1 total weight: a row gets 1 / (#rows of its map)
w_map = np.array([1.0 / map_counts[m] for m in df_train["map_id"]], dtype="float64")

# ------------------------------------------------------------
# 4) Final sample weights
# ------------------------------------------------------------
sample_w = w_class * w_map

print(
    "Sample weight summary:",
    {_stat: float(getattr(sample_w, _stat)()) for _stat in ("min", "max", "mean")},
)


Operator classes: ['aggregate', 'displace', 'select', 'simplify']
Class weights: {'aggregate': np.float64(0.8395522388059702), 'displace': np.float64(1.8442622950819672), 'select': np.float64(0.7705479452054794), 'simplify': np.float64(1.0321100917431192)}
Sample weight summary: {'min': 0.025684931506849314, 'max': 1.8442622950819672, 'mean': 0.6500530407829777}


In [16]:
# === BUILD CLASS LABELS (train / val / test) ===

OP_COL = PATHS.OPERATOR_COL  # "operator"

# Train labels + class names (codes follow the sorted operator names)
y_train_cls, class_names = pd.factorize(df_train[OP_COL], sort=True)


def _encode_with_train_classes(operators):
    """Integer codes in the TRAIN class order; labels unseen in train become -1."""
    return pd.Categorical(operators, categories=class_names).codes


# Val/Test labels must use SAME class order
y_val_cls = _encode_with_train_classes(df_val[OP_COL])
y_test_cls = _encode_with_train_classes(df_test[OP_COL])

# Safety checks: a negative code means the operator never occurred in train.
for _split_name, _codes in (("VAL", y_val_cls), ("TEST", y_test_cls)):
    assert (_codes >= 0).all(), f"{_split_name} contains unseen operator labels"

print("Classes:", list(class_names))
for _var_name, _codes in (
    ("y_train_cls", y_train_cls),
    ("y_val_cls", y_val_cls),
    ("y_test_cls", y_test_cls),
):
    print(f"{_var_name} shape:", _codes.shape)


Classes: ['aggregate', 'displace', 'select', 'simplify']
y_train_cls shape: (450,)
y_val_cls shape: (57,)
y_test_cls shape: (57,)


## 🧠 8) Train MLP ##

In [17]:
# =========================
# MLP search where each model trains on ALL training data
# =========================
import numpy as np
from pathlib import Path
from pprint import pprint
from dataclasses import dataclass

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

# ---- numerics: keep float64 everywhere ----
X_train_s, X_val_s, X_test_s, sample_w = (
    _arr.astype(np.float64, copy=False)
    for _arr in (X_train_s, X_val_s, X_test_s, sample_w)
)

# ---- group by map_id (maps can repeat; prompts don't) ----
# Rows that share a map_id are kept in the same CV fold.
assert "map_id" in df_train.columns, "df_train must contain 'map_id' for grouped CV."
groups_tr = df_train["map_id"].astype(str).values

# ---- CV splitter (for scoring only) ----
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# ---- search space helpers ----
# Module-level generator consumed by draw_params below.
rng = np.random.RandomState(42)

def draw_params(n):
    """Yield `n` randomly drawn MLPClassifier keyword dictionaries.

    Uses the module-level `rng`. Per configuration the draws happen in this
    order: layer layout, alpha (log-uniform in [1e-5, 3e-2]), initial
    learning rate (log-uniform in [1e-4, 3e-3]), batch size. All remaining
    settings are fixed; early stopping is off so a fit uses every training
    row it is given.
    """
    layer_options = ((64,), (128,), (256,), (128, 64), (256, 128), (256, 128, 64))
    batch_options = (16, 32, 64, 128)
    for _ in range(n):
        layers = layer_options[rng.randint(len(layer_options))]
        alpha = 10**rng.uniform(-5, np.log10(3e-2))
        lr_init = 10**rng.uniform(-4, np.log10(3e-3))
        batch = batch_options[rng.randint(len(batch_options))]
        yield dict(
            hidden_layer_sizes=layers,
            alpha=alpha,
            learning_rate_init=lr_init,
            batch_size=batch,
            activation="relu",
            solver="adam",
            max_iter=800,
            early_stopping=False,
            random_state=42,
            verbose=False,
            tol=1e-4,
        )

# ---- CV scorer using grouped folds; model sees only its fold-train here (for the score only) ----
def cv_macro_f1(params):
    """Grouped cross-validated macro-F1 of an MLP built from `params`.

    For every fold of the module-level `cv` splitter a fresh MLPClassifier is
    fitted on the fold's training rows (with their sample weights) and scored
    on the held-out rows. Returns `(mean, std)` of the fold scores as floats.
    """
    fold_scores = []
    for fit_rows, held_rows in cv.split(X_train_s, y_train_cls, groups_tr):
        model = MLPClassifier(**params)
        model.fit(
            X_train_s[fit_rows],
            y_train_cls[fit_rows],
            sample_weight=sample_w[fit_rows],
        )
        held_pred = model.predict(X_train_s[held_rows])
        fold_scores.append(f1_score(y_train_cls[held_rows], held_pred, average="macro"))
    mean_score = float(np.mean(fold_scores))
    std_score = float(np.std(fold_scores))
    return mean_score, std_score

@dataclass
class Candidate:
    """One evaluated hyper-parameter configuration from the MLP search."""
    # Keyword arguments the classifier was built with: MLPClassifier(**params).
    params: dict
    # Mean and standard deviation of macro-F1 over the grouped CV folds.
    cv_mean: float
    cv_std: float
    # Macro-F1 and accuracy on the external validation split, measured after
    # refitting the same params on the full training split.
    val_f1: float
    val_acc: float

# ---- run search ----
N_ITER = 50   # number of sampled configurations (time/quality trade-off)
candidates = []

print(f"\nSearching {N_ITER} MLP configs...")
for i, params in enumerate(draw_params(N_ITER), start=1):
    # Grouped-CV estimate computed on TRAIN only.
    cv_mean, cv_std = cv_macro_f1(params)

    # Refit the SAME params on the FULL TRAIN split (early_stopping is off in
    # params, so no rows are held out internally).
    clf_full = MLPClassifier(**params).fit(X_train_s, y_train_cls, sample_weight=sample_w)

    # Score the refitted model on VAL; VAL rows are never used for fitting weights.
    val_pred = clf_full.predict(X_val_s)
    val_f1 = f1_score(y_val_cls, val_pred, average="macro")
    val_acc = accuracy_score(y_val_cls, val_pred)

    candidates.append(
        Candidate(params=params, cv_mean=cv_mean, cv_std=cv_std, val_f1=val_f1, val_acc=val_acc)
    )
    print(
        f"[{i:02d}/{N_ITER}] cvF1={cv_mean:.3f}¬±{cv_std:.3f} | "
        f"VAL F1={val_f1:.3f} acc={val_acc:.3f} | "
        f"{params['hidden_layer_sizes']}, Œ±={params['alpha']:.2e}, "
        f"lr={params['learning_rate_init']:.1e}, bs={params['batch_size']}"
    )

# ---- pick winner by external VAL macro-F1 (tie-breaker: VAL acc, then CV mean) ----
# NOTE: VAL drives model selection here, so the VAL metrics printed further down
# are optimistically biased; TEST is the only split untouched by the search.
candidates.sort(key=lambda c: (c.val_f1, c.val_acc, c.cv_mean), reverse=True)
best = candidates[0]
print("\n=== Top candidates (by VAL macro-F1) ===")
for c in candidates[:5]:
    print(f"VAL F1={c.val_f1:.3f} (acc={c.val_acc:.3f}) | cvF1={c.cv_mean:.3f}¬±{c.cv_std:.3f} | params={c.params}")

print("\nüèÜ Selected params:")
pprint(best.params)

# ---- train final model on FULL TRAIN (no early_stopping so it uses 100% of train) ----
final_mlp = MLPClassifier(**best.params)
final_mlp.fit(X_train_s, y_train_cls, sample_weight=sample_w)

# ---- evaluate on VAL & TEST ----
# Class labels are integer indices into class_names. Passing them explicitly keeps
# the report and confusion matrix aligned with class_names even when a small split
# is missing a class in both truth and prediction; without `labels`,
# classification_report raises ValueError in that case because the number of
# observed classes no longer matches len(target_names).
label_ids = np.arange(len(class_names))
for name, Xs, ys in [("VAL", X_val_s, y_val_cls), ("TEST", X_test_s, y_test_cls)]:
    yhat = final_mlp.predict(Xs)
    acc  = accuracy_score(ys, yhat)
    f1m  = f1_score(ys, yhat, average="macro")
    print(f"\n===== {name} =====")
    print(f"{name}: acc={acc:.4f}  f1_macro={f1m:.4f}")
    print(classification_report(ys, yhat, labels=label_ids, target_names=list(class_names)))
    print("Confusion matrix:\n", confusion_matrix(ys, yhat, labels=label_ids))

# ---- save final model ----
# Persist the fitted classifier together with its class names and the winning
# hyper-parameters as a single joblib file under PATHS.TRAIN_OUT.
out_dir = Path(PATHS.TRAIN_OUT)
out_dir.mkdir(exist_ok=True, parents=True)
import joblib
joblib.dump(
    dict(
        model=final_mlp,
        class_names=list(class_names),
        best_params=best.params,
    ),
    out_dir / "best_mlp_fulltrain.joblib",
)
print(f"\n‚úÖ Saved final MLP (trained on ALL TRAIN) to: {out_dir / 'best_mlp_fulltrain.joblib'}")


Searching 50 MLP configs...
[01/50] cvF1=0.694¬±0.068 | VAL F1=0.772 acc=0.772 | (128, 64), Œ±=2.02e-02, lr=1.2e-03, bs=16
[02/50] cvF1=0.868¬±0.058 | VAL F1=0.896 acc=0.895 | (256, 128), Œ±=3.49e-05, lr=1.7e-04, bs=64
[03/50] cvF1=0.791¬±0.044 | VAL F1=0.801 acc=0.807 | (256,), Œ±=1.03e-02, lr=7.7e-04, bs=128
[04/50] cvF1=0.835¬±0.043 | VAL F1=0.889 acc=0.895 | (256,), Œ±=1.18e-05, lr=2.7e-03, bs=128
[05/50] cvF1=0.856¬±0.052 | VAL F1=0.853 acc=0.860 | (256, 128, 64), Œ±=5.47e-05, lr=1.9e-04, bs=16
[06/50] cvF1=0.870¬±0.067 | VAL F1=0.844 acc=0.842 | (64,), Œ±=1.14e-04, lr=6.0e-04, bs=128
[07/50] cvF1=0.870¬±0.060 | VAL F1=0.881 acc=0.877 | (64,), Œ±=1.03e-04, lr=8.0e-04, bs=32
[08/50] cvF1=0.753¬±0.075 | VAL F1=0.842 acc=0.842 | (128, 64), Œ±=2.43e-02, lr=2.2e-04, bs=32
[09/50] cvF1=0.833¬±0.030 | VAL F1=0.860 acc=0.860 | (256, 128, 64), Œ±=4.95e-05, lr=5.7e-04, bs=128
[10/50] cvF1=0.864¬±0.064 | VAL F1=0.881 acc=0.877 | (64,), Œ±=1.45e-05, lr=7.9e-04, bs=16
[11/50] cvF1=0.858¬±0.05

In [18]:
# =========================
# Regression branch (one MLPRegressor per operator)
# =========================
import numpy as np
from pathlib import Path
from pprint import pprint
import joblib

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import loguniform

# ---- 1) Prepare numeric regression targets
def _coerce_param_to_float(s):
    """Convert ``s`` with ``float()``; return NaN when the conversion raises.

    Any ``Exception`` raised by ``float(s)`` (e.g. for ``None`` or a
    non-numeric string) is turned into ``np.nan`` instead of propagating.
    """
    try:
        value = float(s)
    except Exception:
        value = np.nan
    return value

PARAM_COL = PATHS.PARAM_VALUE_COL  # "param_value"

# Parse the target column of every split into a float array; values that
# float() cannot convert become NaN.
y_train_reg, y_val_reg, y_test_reg = (
    split_df[PARAM_COL].apply(_coerce_param_to_float).to_numpy()
    for split_df in (df_train, df_val, df_test)
)

# Refuse to continue if any split contains NaN/inf targets.
assert all(np.isfinite(arr).all() for arr in (y_train_reg, y_val_reg, y_test_reg)), \
    f"Non-finite values found in regression target '{PARAM_COL}'. Clean/parse them first."

# Optional: log1p transform if param is positive and skewed.
# The *_t arrays are the targets in "transformed" space; inv_t maps a value in
# that space back to the original scale.
USE_LOG1P = False
if not USE_LOG1P:
    ytr_reg_t = y_train_reg.copy()
    yva_reg_t = y_val_reg.copy()
    yte_reg_t = y_test_reg.copy()

    def inv_t(x):
        return x
else:
    assert all((arr >= 0).all() for arr in (y_train_reg, y_val_reg, y_test_reg)), \
        "log1p selected but param has negatives."
    ytr_reg_t = np.log1p(y_train_reg)
    yva_reg_t = np.log1p(y_val_reg)
    yte_reg_t = np.log1p(y_test_reg)

    def inv_t(x):
        return np.expm1(x)

# ---- 2) Grouped CV by map_id for *regression* (no stratification needed on a numeric target)
# groups_tr holds one string map_id per training row and is passed as `groups`
# to the splitter below.
assert "map_id" in df_train.columns
gk = GroupKFold(n_splits=5)
groups_tr = df_train["map_id"].astype(str).values

# ---- 3) Search space for MLPRegressor (kept modest; widen n_iter to search more)
# base_reg carries the settings shared by every candidate; param_dist_reg lists
# the hyper-parameters sampled by the randomized search.
base_reg = MLPRegressor(
    activation="relu",
    solver="adam",
    learning_rate="adaptive",   # NOTE(review): scikit-learn documents this option as used only with solver="sgd"; it looks like a no-op with "adam" - confirm
    early_stopping=False,       # keep OFF so no rows are held out internally and all class data is used
    max_iter=2000,              # generous iteration budget
    tol=1e-3,                   # looser convergence threshold than the 1e-4 used for the classifier search
    random_state=42,
    verbose=False,
    batch_size="auto"           # not tuned; original note: avoids batch-size clipping warnings on small subsets
)
param_dist_reg = {
    "hidden_layer_sizes": [(64,), (128,), (256,), (128, 64), (256, 128)],
    "alpha": loguniform(1e-6, 3e-2),        # L2 penalty; upper end widened for stronger regularization
    "learning_rate_init": loguniform(1e-4, 3e-3),
    # "batch_size": ["auto"]  # not tuning batch size anymore
}

# ---- 4) Fit one regressor per class
# For every operator class: take the TRAIN rows of that class, standardize their
# targets, run a grouped randomized search, then fit one final MLPRegressor with
# the winning hyper-parameters on the whole class subset.
n_classes = len(class_names)
regressors = {}        # class name -> (fitted MLPRegressor, fitted target StandardScaler)
search_summaries = {}  # class name -> best CV score and params

for cls_idx, cls_name in enumerate(class_names):
    m_tr = (y_train_cls == cls_idx)

    Xk = X_train_s[m_tr]
    yk = ytr_reg_t[m_tr]
    gk_tr = groups_tr[m_tr]
    wk = sample_w[m_tr]

    if Xk.shape[0] < 10:
        print(f"‚ö†Ô∏è Skipping class '{cls_name}' (too few samples: {Xk.shape[0]}).")
        continue

    # GroupKFold raises when n_splits exceeds the number of distinct groups, and a
    # class subset can contain fewer map_ids than the global fold count. Shrink the
    # fold count for such classes and skip only when grouped CV is impossible.
    n_groups_k = np.unique(gk_tr).size
    n_splits_k = min(gk.n_splits, n_groups_k)
    if n_splits_k < 2:
        print(f"Skipping class '{cls_name}' (only {n_groups_k} distinct map_id group(s); grouped CV needs at least 2).")
        continue
    splitter_k = gk if n_splits_k == gk.n_splits else GroupKFold(n_splits=n_splits_k)

    # Standardize the target per class. NOTE: the scaler is fitted on the whole
    # class subset before CV, so fold scores see target mean/std from held-out rows.
    t_scaler = StandardScaler()
    yk_s = t_scaler.fit_transform(yk.reshape(-1, 1)).ravel()

    splits = list(splitter_k.split(Xk, yk_s, groups=gk_tr))

    search = RandomizedSearchCV(
        estimator=base_reg,
        param_distributions=param_dist_reg,
        n_iter=40,
        scoring="neg_root_mean_squared_error",
        cv=splits,
        n_jobs=-1,
        # The final model is fitted explicitly below; refit=True would train the
        # same configuration on the same data a second time.
        refit=False,
        random_state=42,
        verbose=1
    )

    # weighted fitting of every CV fold
    search.fit(Xk, yk_s, sample_weight=wk)

    # The score is computed on the standardized target, so this RMSE is in units
    # of the per-class target standard deviation, not in param_value units.
    print(f"\n=== Regressor for class '{cls_name}' ===")
    print("best CV RMSE:", -search.best_score_)
    print("best params:"); pprint(search.best_params_)
    search_summaries[cls_name] = {"neg_rmse_cv": search.best_score_, "params": search.best_params_}

    # Base settings + winning hyper-parameters; the explicit overrides guarantee
    # that the final fit uses every row of the class subset.
    reg_full = MLPRegressor(
        **{
            **base_reg.get_params(),
            **search.best_params_,
            "early_stopping": False,
            "max_iter": 2000,
            "random_state": 42,
        }
    )

    # weighted fit on the full class subset
    reg_full.fit(Xk, yk_s, sample_weight=wk)

    regressors[cls_name] = (reg_full, t_scaler)


# ---- 5) Evaluate on VAL & TEST using your classifier's prediction to route to regressors
def route_and_predict(Xs, pred_cls_idx):
    """Predict the regression target of each row with the regressor of its class.

    Rows are grouped by class index so that each per-class regressor (looked up
    in the module-level ``regressors`` dict via ``class_names``) is called once
    on all of its rows, instead of once per row.

    Parameters
    ----------
    Xs : array
        Feature rows; must support integer-array row indexing
        (assumed to be a 2-D NumPy array - confirm against callers).
    pred_cls_idx : sequence of int
        Class index per row of ``Xs``; each value indexes ``class_names``.

    Returns
    -------
    numpy.ndarray
        Float array with one prediction per row, after undoing the per-class
        target standardization (the module-level ``inv_t`` is NOT applied here).
        Rows whose class has no fitted regressor are NaN.
    """
    pred_cls_idx = np.asarray(pred_cls_idx)
    # Start from NaN so classes without a regressor need no special handling.
    yhat_reg = np.full(len(pred_cls_idx), np.nan, dtype=float)

    for cidx in np.unique(pred_cls_idx):
        pack = regressors.get(class_names[cidx])
        if pack is None:
            continue
        reg, t_scaler = pack
        rows = np.flatnonzero(pred_cls_idx == cidx)
        y_pred_scaled = np.asarray(reg.predict(Xs[rows]), dtype=float)
        # inverse target scaling (the scaler expects a column vector)
        yhat_reg[rows] = t_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

    return yhat_reg


# helper to print metrics (older sklearn: no squared=False)
def print_reg_metrics(name, y_true, y_pred_transformed):
    """Print MAE and RMSE for one split and return them as ``(mae, rmse)``.

    Predictions are first mapped back with the module-level ``inv_t``. Pairs in
    which either the truth or the prediction is non-finite are excluded; the
    number of excluded rows is printed. If nothing remains, a message is
    printed and ``(nan, nan)`` is returned.

    Parameters
    ----------
    name : str
        Label used as the prefix of every printed line.
    y_true : numpy.ndarray
        Ground-truth targets on the original scale.
    y_pred_transformed : numpy.ndarray
        Predictions in the transformed target space.
    """
    # Undo the optional log1p transform.
    y_pred = inv_t(y_pred_transformed)

    # Keep only rows where both values are finite (e.g. a class without a
    # regressor yields NaN predictions).
    usable = np.isfinite(y_true) & np.isfinite(y_pred)
    n_usable = usable.sum()
    if n_usable == 0:
        print(f"{name}: no finite pairs to evaluate.")
        return np.nan, np.nan

    n_dropped = len(y_true) - n_usable
    if n_dropped > 0:
        print(f"{name}: dropped {n_dropped} samples with NaNs.")

    truth = y_true[usable]
    guess = y_pred[usable]

    mae = mean_absolute_error(truth, guess)
    # RMSE via sqrt(MSE): older sklearn doesn't support squared=False.
    rmse = np.sqrt(mean_squared_error(truth, guess))
    print(f"{name}: MAE={mae:.4f}  RMSE={rmse:.4f}")
    return mae, rmse


# Classification predictions (already trained classifier)
# final_mlp is the classifier fitted earlier in the notebook; its predicted class
# indices decide which per-class regressor handles each row.
clf_cls = final_mlp 
val_pred_cls = clf_cls.predict(X_val_s)
test_pred_cls = clf_cls.predict(X_test_s)

# route to per-class regressors
# The *_t suffix marks predictions that are still in the transformed target
# space; print_reg_metrics applies inv_t before scoring.
yhat_val_reg_t  = route_and_predict(X_val_s,  val_pred_cls)
yhat_test_reg_t = route_and_predict(X_test_s, test_pred_cls)

# End-to-end error: classifier mistakes send rows to the wrong regressor.
print("\n--- Regression with predicted classes (realistic) ---")
print_reg_metrics("VAL",  y_val_reg,  yhat_val_reg_t)
print_reg_metrics("TEST", y_test_reg, yhat_test_reg_t)

# ---- 6) Optional: 'oracle' evaluation to isolate regressor quality (use TRUE class for routing)
# The gap between these numbers and the ones above is the part of the error
# caused by misclassification.
yhat_val_oracle_t  = route_and_predict(X_val_s,  y_val_cls)
yhat_test_oracle_t = route_and_predict(X_test_s, y_test_cls)

print("\n--- Regression with TRUE classes (oracle routing) ---")
print_reg_metrics("VAL-oracle",  y_val_reg,  yhat_val_oracle_t)
print_reg_metrics("TEST-oracle", y_test_reg, yhat_test_oracle_t)

# ---- 7) Save bundle
# Everything needed to reproduce route_and_predict on new rows:
#   classifier          - fitted MLPClassifier that picks the class
#   regressors_by_class - class name -> (fitted MLPRegressor, fitted target StandardScaler)
#   class_names         - maps class indices to the keys of regressors_by_class
#   use_log1p           - whether predictions must additionally go through expm1
bundle = {
    "classifier": clf_cls,
    "regressors_by_class": regressors,
    "class_names": class_names,
    "use_log1p": USE_LOG1P
}


# Write the bundle as one joblib file under PATHS.TRAIN_OUT (created if missing).
out_dir = Path(PATHS.TRAIN_OUT)
out_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(bundle, out_dir / "cls_plus_regressors.joblib")
print(f"\n‚úÖ Saved classification+regression bundle to: {out_dir / 'cls_plus_regressors.joblib'}")


Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'aggregate' ===
best CV RMSE: 1.0973357636023973
best params:
{'alpha': np.float64(0.0041619125396912095),
 'hidden_layer_sizes': (64,),
 'learning_rate_init': np.float64(0.00010558059144381523)}
Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'displace' ===
best CV RMSE: 1.0909824522953682
best params:
{'alpha': np.float64(2.1453931225439485e-06),
 'hidden_layer_sizes': (128,),
 'learning_rate_init': np.float64(0.0001483039268456802)}
Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'select' ===
best CV RMSE: 0.775812459723739
best params:
{'alpha': np.float64(3.11927680501103e-05),
 'hidden_layer_sizes': (256,),
 'learning_rate_init': np.float64(0.00010725209743172001)}
Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'simplify' ===
best CV RMSE: 1.0010446621807334
best params:
{'alpha

In [19]:
import numpy as np
import pandas as pd

PARAM_COL = PATHS.PARAM_VALUE_COL


def summarize(name, dfx):
    """Print descriptive statistics of the ``PARAM_COL`` column of ``dfx``.

    The column is converted with ``pd.to_numeric(errors="coerce")``, so values
    that cannot be parsed become NaN. Prints a header, ``describe()`` with the
    50/90/95/99th percentiles, and then the maximum and minimum.

    Parameters
    ----------
    name : str
        Label of the split, used in the printed header.
    dfx : pandas.DataFrame
        Frame containing the ``PARAM_COL`` column.
    """
    values = pd.to_numeric(dfx[PARAM_COL], errors="coerce")
    stats = values.describe(percentiles=[.5, .9, .95, .99])

    print(f"\n{name} param_value summary")
    print(stats)
    print("max:", values.max())
    print("min:", values.min())

# Target distribution of each split. In the recorded output the TEST maximum
# (753.846) lies far above the TRAIN maximum (167.0), i.e. TEST contains values
# outside the range seen during training.
summarize("TRAIN", df_train)
summarize("VAL", df_val)
summarize("TEST", df_test)

# describe() of the numeric-coerced TEST targets, computed separately for each
# value of the operator column.
print("\nPer-operator TEST ranges:")
print(df_test.groupby(PATHS.OPERATOR_COL)[PARAM_COL].apply(lambda x: pd.to_numeric(x, errors="coerce").describe()))



TRAIN param_value summary
count    450.000000
mean      30.689262
std       44.916061
min        0.000000
50%        5.631000
90%      121.822000
95%      129.722000
99%      142.458090
max      167.000000
Name: param_value, dtype: float64
max: 167.0
min: 0.0

VAL param_value summary
count     57.000000
mean      29.399544
std       41.171192
min        0.000000
50%        5.172000
90%       93.567200
95%      113.033800
99%      120.280000
max      127.000000
Name: param_value, dtype: float64
max: 127.0
min: 0.0

TEST param_value summary
count     57.00000
mean      38.30614
std      102.82649
min        0.00000
50%        5.57300
90%       88.90940
95%      101.47200
99%      393.43448
max      753.84600
Name: param_value, dtype: float64
max: 753.846
min: 0.0

Per-operator TEST ranges:
operator        
aggregate  count     16.000000
           mean       1.082500
           std        2.141785
           min        0.000000
           25%        0.000000
           50%        0.0000