## 🧩 0) Setup & Imports ##

In [1]:
# ===================== PARAMETERS / IMPORTS =====================
from pathlib import Path
import sys, subprocess, numpy as np, pandas as pd, joblib

from sklearn.model_selection import train_test_split, StratifiedGroupKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer


from dataclasses import dataclass


from pprint import pprint

from scipy.stats import loguniform, randint

import joblib

# Project config: make the repository root importable so `src.*` resolves
# when the notebook is run from its own sub-directory.
PROJ_ROOT = Path("../").resolve()
SRC_DIR   = PROJ_ROOT / "src"
if str(PROJ_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJ_ROOT))

# Shared project configuration objects and operator/extent settings.
from src.config import (
    PATHS, CFG, print_summary,
    DISTANCE_OPS, AREA_OPS,
    USE_DYNAMIC_EXTENT_REFS, ALLOW_FALLBACK_EXTENT,
    EXTENT_DIAG_COL, EXTENT_AREA_COL
)
# Echo the active configuration so the run is self-describing.
print_summary()
print("USE_DYNAMIC_EXTENT_REFS:", USE_DYNAMIC_EXTENT_REFS)
print("ALLOW_FALLBACK_EXTENT  :", ALLOW_FALLBACK_EXTENT)
print("EXTENT_DIAG_COL:", EXTENT_DIAG_COL, " EXTENT_AREA_COL:", EXTENT_AREA_COL)


# Constant tile references taken from CFG.
# NOTE(review): not used in the cells visible here, which rely on per-map extent_* columns.
DEFAULT_TILE_DIAG_M = CFG.DEFAULT_TILE_DIAG_M
DEFAULT_TILE_AREA_M2 = CFG.DEFAULT_TILE_AREA_M2

# Placeholders only: the real dimensions are assigned from the saved
# embedding files by infer_dims(PATHS) once both embedding steps have run.
MAP_DIM = PROMPT_DIM = FUSED_DIM = None

BATCH_SIZE  = CFG.BATCH_SIZE

# Clean outputs for a fresh run.
# NOTE(review): presumably removes previously generated output artifacts —
# confirm in src.config before running against results you want to keep.
PATHS.clean_outputs()


=== CONFIG SUMMARY ===
PROJ_ROOT  : /Users/amirdonyadide/Documents/GitHub/Thesis
DATA_DIR   : /Users/amirdonyadide/Documents/GitHub/Thesis/data
INPUT_DIR  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input
OUTPUT_DIR : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output
MAPS_ROOT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/samples/pairs
INPUT PAT. : *_input.geojson
--- User Study ---
USER_STUDY_XLSX : /Users/amirdonyadide/Documents/GitHub/Thesis/data/userstudy/UserStudy.xlsx
RESPONSES_SHEET : Responses
TILE_ID_COL     : tile_id
COMPLETE_COL    : complete
REMOVE_COL      : remove
TEXT_COL        : cleaned_text
PARAM_VALUE_COL : param_value
OPERATOR_COL    : operator
INTENSITY_COL   : intensity
--- Filters / IDs / Split ---
ONLY_COMPLETE   : True
EXCLUDE_REMOVED : True
PROMPT_ID       : r{i:08d}
SPLIT_BY        : tile
--- Outputs ---
PROMPT_OUT : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
MAP_OUT    : /Users/amirdonyadide/Documents/GitH

## 📚 1) Build Prompt Embeddings (USE) ##

In [2]:
from pathlib import Path

# Import the module
from src.mapvec.prompts import prompt_embeddings as pe

# Choose paths (all taken from the shared PATHS config).
data_dir = PATHS.DATA_DIR  # or Path("../data").resolve()
in_path  = PATHS.USER_STUDY_XLSX
out_dir  = PATHS.PROMPT_OUT

# Logging (match what CLI does)
pe.setup_logging(verbosity=1)

# Load prompts from the user-study workbook. The column names are passed in
# explicitly; the loader's log line reports how many rows survive its
# complete/remove filtering.
ids, texts, tile_ids, id_colname = pe.load_prompts_from_source(
    input_path=Path(in_path),
    sheet_name=PATHS.RESPONSES_SHEET,
    tile_id_col=PATHS.TILE_ID_COL,
    complete_col=PATHS.COMPLETE_COL,
    remove_col=PATHS.REMOVE_COL,
    text_col=PATHS.TEXT_COL,
)


print(f"Loaded {len(texts)} prompts.")

# Get the embedder selected by CFG.PROMPT_ENCODER
# (dan/transformer/openai-small/openai-large). Returns the embedding
# callable and a label that is stored alongside the outputs.
embed_fn, model_label = pe.get_embedder(
    kind=CFG.PROMPT_ENCODER,
    data_dir=Path(data_dir),
    l2_normalize=True,
    batch_size=CFG.BATCH_SIZE,
)

# Embed all prompt texts in one call; E has one row per prompt.
E = embed_fn(texts)

# Persist ids, texts, tile ids and the embedding matrix. Later cells read
# prompts_embeddings.npz and prompts.parquet from this directory.
pe.save_outputs(
    out_dir=Path(out_dir),
    ids=ids,
    texts=texts,
    tile_ids=tile_ids,          
    E=E,
    model_name=model_label,
    l2_normalized=True,         # must agree with l2_normalize passed to get_embedder above
    id_colname=id_colname,      
    also_save_embeddings_csv=False,
)


print("‚úÖ Prompt embeddings completed.")


2026-01-25 00:08:30 | INFO | Reading Excel: /Users/amirdonyadide/Documents/GitHub/Thesis/data/userstudy/UserStudy.xlsx (sheet=Responses)
2026-01-25 00:08:30 | INFO | Filtered Excel rows: 786 ‚Üí 562 (only_complete=True, exclude_removed=True)


Loaded 562 prompts.


2026-01-25 00:08:31 | INFO | Embedding 562 prompts with OpenAI model=text-embedding-3-small (batch_size=512, l2=True)‚Ä¶
2026-01-25 00:08:33 | INFO | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-25 00:08:35 | INFO | HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2026-01-25 00:08:35 | INFO | Done OpenAI embedding in 3.97s (dim=1536).
2026-01-25 00:08:35 | INFO | Writing outputs to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
2026-01-25 00:08:35 | INFO |   saved prompts_embeddings.npz (shape=(562, 1536))
2026-01-25 00:08:35 | INFO |   saved prompts.parquet (rows=562)
2026-01-25 00:08:35 | INFO |   saved meta.json


‚úÖ Prompt embeddings completed.


## 🗺️ 2) Build Map Embeddings (geometric) ##

In [3]:
from pathlib import Path
import numpy as np
import pandas as pd

from src.mapvec.maps import map_embeddings as me

# ------------------------------------------------------------
# 1) Load allowed tile_ids from Excel
# ------------------------------------------------------------
# Read the user-study responses and derive the set of tile ids that are
# allowed to be embedded (zero-padded to 4 characters).
dfu = pd.read_excel(PATHS.USER_STUDY_XLSX, sheet_name=PATHS.RESPONSES_SHEET)

# NOTE(review): astype(bool) turns NaN / non-empty strings into True, so a
# blank "remove" cell would exclude the row and a blank "complete" cell
# would keep it — confirm the sheet has no blanks in these two columns.
dfu[PATHS.COMPLETE_COL] = dfu[PATHS.COMPLETE_COL].astype(bool)
dfu[PATHS.REMOVE_COL]   = dfu[PATHS.REMOVE_COL].astype(bool)

# Apply the same row filters that the config advertises (ONLY_COMPLETE / EXCLUDE_REMOVED).
mask = pd.Series(True, index=dfu.index)
if PATHS.ONLY_COMPLETE:
    mask &= (dfu[PATHS.COMPLETE_COL] == True)
if PATHS.EXCLUDE_REMOVED:
    mask &= (dfu[PATHS.REMOVE_COL] == False)
dfu = dfu[mask].copy()

# Normalise tile ids: if every value parses as a number, go through int so
# that e.g. 116.0 becomes "0116"; otherwise fall back to stripped strings.
tile_raw = dfu[PATHS.TILE_ID_COL]
tile_num = pd.to_numeric(tile_raw, errors="coerce")
if tile_num.notna().all():
    allowed_tile_ids = set(tile_num.astype(int).astype(str).str.zfill(4).tolist())
else:
    allowed_tile_ids = set(tile_raw.astype(str).str.strip().str.zfill(4).tolist())

print(f"Allowed tiles from Excel: {len(allowed_tile_ids)}")

# ------------------------------------------------------------
# 2) Discover GeoJSONs and filter by tile_id
# ------------------------------------------------------------
me.setup_logging(verbosity=1)

# Walk every discovered (map_id, path) pair and keep only those whose
# zero-padded id is present in the Excel-derived allow-list.
pairs = []
for map_id, path in list(me.find_geojsons(PATHS.MAPS_ROOT, PATHS.INPUT_MAPS_PATTERN)):
    padded_id = str(map_id).strip().zfill(4)
    if padded_id in allowed_tile_ids:
        pairs.append((map_id, path))

# Nothing to embed means the Excel ids and the GeoJSON ids do not overlap.
if len(pairs) == 0:
    raise RuntimeError("No maps left after Excel filtering.")

print(f"Maps to embed after filtering: {len(pairs)}")

# ------------------------------------------------------------
# 3) First pass: polygon counting
# ------------------------------------------------------------
# First pass: count valid polygons per map so that every map can be embedded
# against the same `max_polygons` reference.
counts = {}
for map_id, path in pairs:
    try:
        counts[map_id] = me._count_valid_polygons(path)
    except Exception as exc:
        # Best effort: an unreadable map still counts as 0 polygons, but the
        # failure is reported instead of being silently swallowed.
        print(f"WARNING: could not count polygons for {map_id} ({path}): {exc!r}; using 0")
        counts[map_id] = 0

# Never let the reference drop below 1 (avoids a zero reference downstream).
max_polygons = max(max(counts.values(), default=0), 1)

# ------------------------------------------------------------
# 4) Second pass: embed maps (DYNAMIC extent normalization)
# ------------------------------------------------------------
# Second pass: embed every map and collect, in lock-step,
#   ids  -> map ids (one per embedded map)
#   vecs -> embedding vectors (same order as ids)
#   rows -> metadata rows for maps.parquet (same order as ids)
ids, vecs, rows = [], [], []
feat_names = None
first_dim = None

for map_id, path in pairs:
    # Validate the per-map extent references BEFORE anything is appended.
    # Previously ids/vecs were appended first, so a map skipped for a bad
    # extent still ended up in ids/E but not in rows (misaligned outputs).
    extent = me.compute_extent_refs(path)
    diag_m = extent.get("extent_diag_m", np.nan)
    area_m2 = extent.get("extent_area_m2", np.nan)
    if not np.isfinite(diag_m) or diag_m <= 0:
        print(f"Skipping {map_id}: bad extent_diag_m")
        continue
    if not np.isfinite(area_m2) or area_m2 <= 0:
        print(f"Skipping {map_id}: bad extent_area_m2")
        continue

    vec, names = me.embed_one_map(
        path,
        max_polygons=max_polygons,
        norm="extent",     # dynamic per-map normalization
        norm_wh=None,
    )

    # The first successfully embedded map fixes the expected dimensionality
    # and the feature names; any later map with a different size is skipped.
    if first_dim is None:
        first_dim = vec.shape[0]
        feat_names = names
    elif vec.shape[0] != first_dim:
        print(f"Skipping {map_id}: dim mismatch")
        continue

    # All checks passed: append to the three containers together.
    ids.append(map_id)
    vecs.append(vec)
    rows.append({
        "map_id": map_id,
        "geojson": str(path),
        "n_polygons": counts.get(map_id, 0),

        # Save these into maps.parquet so concat can merge later
        **extent,
    })

if not vecs:
    raise RuntimeError("No maps could be embedded: every map was skipped.")

E = np.vstack(vecs).astype(np.float32)
assert len(ids) == len(rows) == E.shape[0], "ids / rows / E are misaligned."

# ------------------------------------------------------------
# 5) Save outputs (same format as script)
# ------------------------------------------------------------
# Persist the map embeddings and their metadata rows under PATHS.MAP_OUT.
# Later cells read maps_embeddings.npz and maps.parquet from that directory.
me.save_outputs(
    out_dir=PATHS.MAP_OUT,
    rows=rows,
    E=E,
    ids=ids,
    feat_names=feat_names or [],   # feat_names stays None only if no map was embedded
    save_csv=False,
)

print("‚úÖ Map embeddings completed.")

Allowed tiles from Excel: 399
Maps to embed after filtering: 399
‚úÖ Map embeddings completed.


In [4]:
import numpy as np

def infer_dims(paths):
    """Read the embedding widths from the saved .npz files.

    Parameters
    ----------
    paths : object
        Anything exposing ``PROMPT_OUT`` and ``MAP_OUT`` as ``pathlib.Path``
        directories containing ``prompts_embeddings.npz`` and
        ``maps_embeddings.npz`` (each with a 2-D array stored under ``"E"``).

    Returns
    -------
    tuple[int, int, int]
        ``(map_dim, prompt_dim, fused_dim)`` where ``fused_dim`` is the sum
        of the other two.

    Raises
    ------
    FileNotFoundError
        If either embeddings file does not exist yet.
    """

    def _embedding_dim(npz_path, stage):
        # Fail early with an actionable message when a pipeline stage was not run.
        if not npz_path.exists():
            raise FileNotFoundError(f"Missing {npz_path} (run {stage} embeddings first)")
        # Use a context manager so the underlying archive handle is closed;
        # a bare np.load on an .npz keeps the file open.
        # NOTE: allow_pickle=True is kept for parity with how these files
        # were read before; only load archives produced by this project.
        with np.load(npz_path, allow_pickle=True) as archive:
            return int(archive["E"].shape[1])

    # Same check/read order as before: prompts first, then maps.
    prompt_dim = _embedding_dim(paths.PROMPT_OUT / "prompts_embeddings.npz", "prompt")
    map_dim = _embedding_dim(paths.MAP_OUT / "maps_embeddings.npz", "map")

    return map_dim, prompt_dim, map_dim + prompt_dim

# The saved embedding files are the source of truth for the dimensions;
# CFG may still carry defaults or environment overrides.
MAP_DIM, PROMPT_DIM, FUSED_DIM = infer_dims(PATHS)

dims_match_cfg = (MAP_DIM == CFG.MAP_DIM) and (PROMPT_DIM == CFG.PROMPT_DIM)
if not dims_match_cfg:
    # Report both sides so the stale configuration value is easy to spot.
    print("‚ö†Ô∏è CFG dims differ from inferred dims. Using inferred dims.")
    print(f"   inferred: MAP_DIM={MAP_DIM}, PROMPT_DIM={PROMPT_DIM}")
    print(f"   CFG:      MAP_DIM={CFG.MAP_DIM}, PROMPT_DIM={CFG.PROMPT_DIM}")


print(
    "‚úÖ Inferred dims:",
    dict(MAP_DIM=MAP_DIM, PROMPT_DIM=PROMPT_DIM, FUSED_DIM=FUSED_DIM),
)


‚ö†Ô∏è CFG dims differ from inferred dims. Using inferred dims.
   inferred: MAP_DIM=165, PROMPT_DIM=1536
   CFG:      MAP_DIM=165, PROMPT_DIM=512
‚úÖ Inferred dims: {'MAP_DIM': 165, 'PROMPT_DIM': 1536, 'FUSED_DIM': 1701}


## 🔗 3) Concatenate (pairs → fused rows) ##

In [5]:
from pathlib import Path
import numpy as np
import pandas as pd
import json

from src.mapvec.concat import concat_embeddings as ce
ce.setup_logging(verbosity=1)

# Inputs produced by the two embedding steps, and the output directory for
# the fused training matrix.
map_npz_path = Path(PATHS.MAP_OUT / "maps_embeddings.npz")
maps_pq      = Path(PATHS.MAP_OUT / "maps.parquet")           # carries the per-map extent_* columns
prm_npz_path = Path(PATHS.PROMPT_OUT / "prompts_embeddings.npz")
prompts_pq   = Path(PATHS.PROMPT_OUT / "prompts.parquet")
out_dir      = Path(PATHS.TRAIN_OUT)
out_dir.mkdir(parents=True, exist_ok=True)

# ---- build pairs from prompts.parquet (authoritative) ----
# Each prompt row names the tile it was written for, which defines the
# (map, prompt) pairs to fuse.
pairs = pd.read_parquet(prompts_pq)

if "prompt_id" not in pairs.columns or "tile_id" not in pairs.columns:
    raise RuntimeError("prompts.parquet must contain columns: prompt_id, tile_id")

# keep text for hybrid/rule-based param extraction later
need_cols = ["tile_id", "prompt_id", "text"]
missing_cols = [c for c in need_cols if c not in pairs.columns]
if missing_cols:
    raise RuntimeError(f"prompts.parquet missing required columns for option B: {missing_cols}")

# Normalise the pair identifiers.
# Order matters:
#   1. drop missing ids while they are still real NA values (after
#      astype(str) they would become the literal string "nan"),
#   2. strip and drop blank ids BEFORE zero-padding (zfill(4) would turn an
#      empty id into "0000", which the blank check could never catch),
#   3. only then zero-pad map ids to the 4-character convention.
pairs = pairs.rename(columns={"tile_id": "map_id"})[["map_id", "prompt_id", "text"]].copy()
pairs = pairs.dropna(subset=["map_id", "prompt_id"]).copy()
pairs["map_id"] = pairs["map_id"].astype(str).str.strip()
pairs["prompt_id"] = pairs["prompt_id"].astype(str).str.strip()
pairs = pairs[(pairs["map_id"] != "") & (pairs["prompt_id"] != "")].copy()
pairs["map_id"] = pairs["map_id"].str.zfill(4)
# One row per (map, prompt) pair.
pairs = pairs.drop_duplicates(subset=["map_id", "prompt_id"]).reset_index(drop=True)

# ---- load map extent refs from maps.parquet and merge into pairs ----
if not maps_pq.exists():
    raise FileNotFoundError(f"Missing {maps_pq}. Run map embedding first to create maps.parquet with extent_* columns.")

maps_df = pd.read_parquet(maps_pq)
# Same 4-character zero-padded convention as pairs["map_id"].
# NOTE(review): this line already requires a "map_id" column, so the
# "map_id" entry in the check below can never be reported as missing.
maps_df["map_id"] = maps_df["map_id"].astype(str).str.strip().str.zfill(4)

required_extent_cols = ["map_id", "extent_diag_m", "extent_area_m2"]
missing = [c for c in required_extent_cols if c not in maps_df.columns]
if missing:
    raise RuntimeError(f"maps.parquet is missing required extent columns: {missing}. Re-run map embedding with extent saving.")

# keep a minimal set (add more if you like); optional columns are carried
# along only when maps.parquet actually has them.
extent_cols = [
    "map_id",
    "extent_diag_m",
    "extent_area_m2",
    "extent_width_m",
    "extent_height_m",
    "extent_minx",
    "extent_miny",
    "extent_maxx",
    "extent_maxy",
]
extent_cols = [c for c in extent_cols if c in maps_df.columns]

# Left merge keeps every pair; pairs whose map has no extent row get NaN.
# NOTE(review): duplicate map_id rows in maps.parquet would multiply pair
# rows here — confirm map ids are unique in that file.
pairs = pairs.merge(maps_df[extent_cols], on="map_id", how="left")

# safety: rows missing extents means mismatch between prompts and embedded maps
n_missing_extent = pairs["extent_diag_m"].isna().sum()
if n_missing_extent:
    print(f"‚ö†Ô∏è Dropping {n_missing_extent} rows with missing extent refs after merge.")
    pairs = pairs.dropna(subset=["extent_diag_m", "extent_area_m2"]).reset_index(drop=True)


# ---- load embeddings ----
E_map, map_ids = ce.load_npz(map_npz_path)
E_prm, prm_ids = ce.load_npz(prm_npz_path)

# id -> row position in the corresponding embedding matrix.
# NOTE(review): lookups below use the zero-padded string map_id, so the ids
# returned by ce.load_npz must be formatted the same way — confirm.
idx_map = {k: i for i, k in enumerate(map_ids)}
idx_prm = {k: i for i, k in enumerate(prm_ids)}

# ---- match & build X ----
# chosen_rows: positions in `pairs` that have both embeddings available;
# im_list / ip_list: matching row positions in E_map / E_prm.
chosen_rows, im_list, ip_list = [], [], []
missing_ids = 0

for i, row in enumerate(pairs.itertuples(index=False), start=0):
    im = idx_map.get(row.map_id)
    ip = idx_prm.get(row.prompt_id)
    if im is None or ip is None:
        # Pair references a map or prompt that has no saved embedding.
        missing_ids += 1
        continue
    chosen_rows.append(i)
    im_list.append(im)
    ip_list.append(ip)

if not im_list:
    raise RuntimeError("No valid pairs after ID matching.")

if missing_ids:
    print(f"‚ö†Ô∏è Skipped {missing_ids} rows with missing IDs in embeddings")

# Fused feature matrix: map block first, prompt block second. Later cells
# rely on this column order when splitting by MAP_DIM / PROMPT_DIM.
X_map = E_map[np.asarray(im_list, dtype=int)].astype(np.float32, copy=False)
X_prm = E_prm[np.asarray(ip_list, dtype=int)].astype(np.float32, copy=False)
X = np.hstack([X_map, X_prm]).astype(np.float32, copy=False)

np.save(out_dir / "X_concat.npy", X)

# Metadata rows in exactly the same order as the rows of X.
join_df = pairs.iloc[chosen_rows].reset_index(drop=True)

# No constant tile_* columns anymore. We keep dynamic extent_* columns from maps.parquet.
assert X.shape[0] == len(join_df), "Row count mismatch between X and join_df."

join_df.to_parquet(out_dir / "train_pairs.parquet", index=False)

# Small provenance record written next to the fused matrix.
meta = {
    "shape": [int(X.shape[0]), int(X.shape[1])],
    "map_dim": int(E_map.shape[1]),
    "prompt_dim": int(E_prm.shape[1]),
    "rows": int(X.shape[0]),
    "skipped_pairs_missing_ids": int(missing_ids),
    "sources": {
        "prompts_parquet": str(prompts_pq),
        "maps_parquet": str(maps_pq),
        "map_npz": str(map_npz_path),
        "prompt_npz": str(prm_npz_path),
    },
    "extent_cols_saved": [c for c in extent_cols if c != "map_id"],
}
(out_dir / "meta.json").write_text(json.dumps(meta, indent=2))

print("‚úÖ Concatenation completed.")

‚úÖ Concatenation completed.


## 📥 4) Load & Basic Cleaning ##

In [7]:
# === LOAD FUSED DATA (operator + param_value + DYNAMIC-EXTENT normalized target) ===
import numpy as np
import pandas as pd

# IMPORTANT: make sure these are imported from src.config somewhere above in the notebook:
# from src.config import PATHS, CFG, DISTANCE_OPS, AREA_OPS

# Fused feature matrix and its row-aligned metadata, as written by the
# concatenation step.
X = np.load(PATHS.TRAIN_OUT / "X_concat.npy")
pairs_df = pd.read_parquet(PATHS.TRAIN_OUT / "train_pairs.parquet")
print(f"Loaded X: {X.shape}, pairs: {pairs_df.shape}")

# --- Rebuild labels from Excel and merge (train_pairs already has extent_* from maps.parquet) ---
dfu = pd.read_excel(PATHS.USER_STUDY_XLSX, sheet_name=PATHS.RESPONSES_SHEET)

# NOTE(review): astype(bool) maps NaN / blank cells to True — confirm these
# two columns contain no blanks.
dfu[PATHS.COMPLETE_COL] = dfu[PATHS.COMPLETE_COL].astype(bool)
dfu[PATHS.REMOVE_COL]   = dfu[PATHS.REMOVE_COL].astype(bool)

# Same row filters as in the map-embedding cell.
mask_excel = pd.Series(True, index=dfu.index)
if PATHS.ONLY_COMPLETE:
    mask_excel &= (dfu[PATHS.COMPLETE_COL] == True)
if PATHS.EXCLUDE_REMOVED:
    mask_excel &= (dfu[PATHS.REMOVE_COL] == False)
dfu = dfu[mask_excel].copy()

# prompt_id mapping must match prompt_embeddings.py.
# The id is built from the ORIGINAL Excel row index: filtering above keeps
# the index labels, and reset_index(drop=False) exposes them as "_row".
# That is why ids have gaps (r00000000, r00000002, ...).
dfu = dfu.reset_index(drop=False).rename(columns={"index": "_row"})
prefix = PATHS.PROMPT_ID_PREFIX
width  = PATHS.PROMPT_ID_WIDTH
dfu["prompt_id"] = dfu["_row"].apply(lambda r: f"{prefix}{int(r):0{width}d}")

# normalize tile_id -> map_id (4-character zero-padded string, numeric path
# first so values like 116.0 become "0116").
tile_raw = dfu[PATHS.TILE_ID_COL]
tile_num = pd.to_numeric(tile_raw, errors="coerce")
if tile_num.notna().all():
    dfu["map_id"] = tile_num.astype(int).astype(str).str.zfill(4)
else:
    dfu["map_id"] = tile_raw.astype(str).str.strip().str.zfill(4)

# Only the join keys and the three target columns are needed downstream.
labels = dfu[
    ["map_id", "prompt_id", PATHS.OPERATOR_COL, PATHS.PARAM_VALUE_COL, PATHS.INTENSITY_COL]
].copy()

# Merge labels onto pairs_df (which already contains extent_* columns from maps.parquet)
# pairs_df is row-aligned with X; everything below must preserve that.
assert len(pairs_df) == X.shape[0], (
    f"train_pairs.parquet has {len(pairs_df)} rows but X has {X.shape[0]}; "
    "re-run the concatenation step."
)

# Merge labels onto pairs_df (which already contains extent_* columns from
# maps.parquet). validate="many_to_one" makes pandas raise if a
# (map_id, prompt_id) key is duplicated in `labels`, which would otherwise
# silently multiply rows and break the alignment with X.
df = pairs_df.merge(
    labels, on=["map_id", "prompt_id"], how="left", validate="many_to_one"
)
assert len(df) == X.shape[0], "Label merge changed the number of rows."

# Option B needs prompt text for rule-based parsing
if "text" not in df.columns:
    raise RuntimeError(
        "train_pairs.parquet is missing 'text'. "
        "Update concat step to keep 'text' from prompts.parquet."
    )
df["text"] = df["text"].astype("string")

OP_COL    = PATHS.OPERATOR_COL
PARAM_COL = PATHS.PARAM_VALUE_COL

# Clean targets: lower-cased operator names with blanks / "nan" turned into
# real missing values, and a numeric parameter column.
df[OP_COL] = df[OP_COL].astype("string").str.strip().str.lower()
df.loc[df[OP_COL].isin(["", "nan"]), OP_COL] = pd.NA
df[PARAM_COL] = pd.to_numeric(df[PARAM_COL], errors="coerce")

# --- Ensure dynamic extent refs exist (from concat merge of maps.parquet) ---
REQ_EXT = ["extent_diag_m", "extent_area_m2"]
missing = [c for c in REQ_EXT if c not in df.columns]
assert not missing, f"Missing {missing} in df. Check concat step merged maps.parquet extent_* columns."

df["extent_diag_m"]  = pd.to_numeric(df["extent_diag_m"], errors="coerce")
df["extent_area_m2"] = pd.to_numeric(df["extent_area_m2"], errors="coerce")

# --- Keep only rows with valid targets + extents ---
# Blank and "nan" operators were already converted to NA above, so notna()
# covers them; this keeps the mask a plain boolean Series.
mask = (
    df[OP_COL].notna() &
    df[PARAM_COL].notna() &
    df["extent_diag_m"].notna() &
    df["extent_area_m2"].notna() &
    (df["extent_diag_m"] > 0) &
    (df["extent_area_m2"] > 0)
)

# Index X and df with the same plain NumPy boolean array so both are
# filtered identically regardless of the pandas dtype of `mask`.
mask_np = mask.to_numpy(dtype=bool)
X  = X[mask_np].astype(np.float64, copy=False)
df = df.loc[mask_np].reset_index(drop=True)
assert len(df) == X.shape[0], "X and df are misaligned after filtering."

# --- Build normalized regression target using DYNAMIC extents ---
# Operators whose parameter is a length vs. an area decide which per-map
# extent reference the raw parameter is divided by.
DIST_OPS_SET = set(DISTANCE_OPS)  # from src.config
AREA_OPS_SET = set(AREA_OPS)      # from src.config

df["param_norm"] = np.nan

m_dist = df[OP_COL].isin(DIST_OPS_SET)
m_area = df[OP_COL].isin(AREA_OPS_SET)

# Distance-type parameters are scaled by the map's extent diagonal,
# area-type parameters by the map's extent area. If an operator were listed
# in both sets, the area assignment (second) would win.
df.loc[m_dist, "param_norm"] = df.loc[m_dist, PARAM_COL] / df.loc[m_dist, "extent_diag_m"]
df.loc[m_area, "param_norm"] = df.loc[m_area, PARAM_COL] / df.loc[m_area, "extent_area_m2"]

# Sanity: everything should be filled
bad = df["param_norm"].isna().sum()
assert bad == 0, (
    f"param_norm has {bad} NaNs. "
    "This usually means an operator is not in DISTANCE_OPS/AREA_OPS or extents are missing."
)

print(f"After cleaning: X={X.shape}, df={df.shape}, ops={sorted(df[OP_COL].unique())}")
print("Example rows:")
display(df.head(10))

Loaded X: (562, 1701), pairs: (562, 11)
After cleaning: X=(562, 1701), df=(562, 15), ops=['aggregate', 'displace', 'select', 'simplify']
Example rows:


Unnamed: 0,map_id,prompt_id,text,extent_diag_m,extent_area_m2,extent_width_m,extent_height_m,extent_minx,extent_miny,extent_maxx,extent_maxy,operator,param_value,intensity,param_norm
0,1304,r00000000,Union few of the buildings.,582.418016,169604.830164,412.352091,411.310707,369009.126498,5624443.0,369421.478589,5624855.0,aggregate,0.0,medium,0.0
1,1457,r00000002,Remove small buildings and eliminate narrow an...,496.278103,116190.319796,404.903965,286.957723,370209.074685,5626969.0,370613.97865,5627256.0,select,17.805,low,0.000153
2,1663,r00000005,Bundle nearby buildings into larger blocks.,533.109561,138157.356917,329.923688,418.755494,371811.357509,5630840.0,372141.281197,5631259.0,aggregate,0.0,medium,0.0
3,1122,r00000006,Simplify small geometric details below a speci...,597.176996,178021.909966,434.102877,410.091523,367409.757832,5630048.0,367843.86071,5630458.0,simplify,1.0,low,0.001675
4,1706,r00000009,Eliminate repeated blocks.,465.952069,108172.503069,315.34572,343.02829,372305.411445,5628541.0,372620.757165,5628884.0,select,18.226,low,0.000168
5,1174,r00000010,Make a space between the polygons.,485.597704,105023.12109,414.108392,253.612636,367806.423992,5631230.0,368220.532385,5631484.0,displace,1.975,low,0.004067
6,116,r00000011,Aggregate some of the buildings.,577.475387,166688.512862,403.286009,413.325801,359417.860505,5619642.0,359821.146514,5620056.0,aggregate,0.0,medium,0.0
7,417,r00000013,Exclude shapes with an area less than 20 squar...,571.706715,163423.957488,404.661369,403.853617,361813.128457,5622451.0,362217.789827,5622855.0,select,20.0,low,0.000122
8,1488,r00000014,Aggregate the blocks.,500.054694,115279.716486,276.818974,416.444418,370607.033823,5619641.0,370883.852797,5620058.0,aggregate,0.0,medium,0.0
9,175,r00000015,Simplify the shapes of the small buildings by ...,503.061104,119618.905293,409.629591,292.017247,359816.877837,5623767.0,360226.507427,5624059.0,aggregate,0.0,medium,0.0


## ✂️ 5) Split & Targets ##

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import json

# Operator classes that every split must contain (see has_all_ops).
FIXED_CLASSES = ["simplify", "select", "aggregate", "displace"]
USE_INTENSITY_FOR_STRAT = True  # will fall back to operator-only if needed

OP_COL   = PATHS.OPERATOR_COL        # "operator"
PARAM_COL = PATHS.PARAM_VALUE_COL    # "param_value"
INT_COL  = PATHS.INTENSITY_COL       # "intensity"

# Work on a copy so the cleaning below does not modify the frame produced
# by the loading cell in place.
df = df.copy()

# operator: keep NA properly, and drop "nan"/empty
df[OP_COL] = df[OP_COL].astype("string").str.strip().str.lower()
df.loc[df[OP_COL].isin(["", "nan"]), OP_COL] = pd.NA

# intensity: keep NA properly (do NOT turn NA into "nan" string)
if INT_COL in df.columns:
    df[INT_COL] = df[INT_COL].astype("string").str.strip().str.lower()
    df.loc[df[INT_COL].isin(["", "nan"]), INT_COL] = pd.NA

# ------------------------------------------------------------
# 1) Group stats: prompts per map_id
# ------------------------------------------------------------
# Maps with several prompts are later forced into TRAIN as a whole; only
# single-prompt maps are distributed over train/val/test.
prompt_counts = df.groupby("map_id").size()
multi_map_ids = prompt_counts[prompt_counts > 1].index.tolist()
single_map_ids = prompt_counts[prompt_counts == 1].index.tolist()

print("=== DATASET SUMMARY ===")
print(f"Total rows (prompts): {len(df)}")
print(f"Unique maps: {prompt_counts.shape[0]}")
print(f"Multi-prompt maps (>1 prompt): {len(multi_map_ids)}")
print(f"Single-prompt maps (=1 prompt): {len(single_map_ids)}")
print("\nTop 10 maps by prompt count:")
print(prompt_counts.sort_values(ascending=False).head(10))

# ------------------------------------------------------------
# 2) Map-level table for single maps (one row per map_id)
# ------------------------------------------------------------
# One row per single-prompt map; maps without an operator cannot be
# stratified and are left out of the split table.
df_single = df[df["map_id"].isin(single_map_ids)].copy()
map_level = df_single.groupby("map_id").first().reset_index()
map_level = map_level.dropna(subset=[OP_COL]).copy()

# Build strat label: operator x intensity if feasible, else operator only.
if USE_INTENSITY_FOR_STRAT and INT_COL in map_level.columns:
    # A missing intensity would make the concatenated label NA; NA labels
    # are ignored by value_counts() and cannot be used as stratification
    # classes, so give them an explicit bucket instead.
    intensity_for_strat = map_level[INT_COL].fillna("missing")
    map_level["_strat"] = map_level[OP_COL] + "__" + intensity_for_strat
    vc = map_level["_strat"].value_counts()
    if (vc < 2).any():
        print("\n‚ö†Ô∏è Some operator√óintensity groups too rare (<2 single-maps). Falling back to operator-only stratification.")
        map_level["_strat"] = map_level[OP_COL]
else:
    map_level["_strat"] = map_level[OP_COL]

def has_all_ops(dfx: pd.DataFrame) -> bool:
    """Return True when every operator in FIXED_CLASSES occurs in ``dfx[OP_COL]``."""
    operators_present = set(dfx[OP_COL].unique())
    return set(FIXED_CLASSES).issubset(operators_present)

# ------------------------------------------------------------
# 3) Split single maps into train/val/test with retries
# ------------------------------------------------------------
# Ratios are applied to the single-prompt maps only (map_level); the
# multi-prompt maps are added to TRAIN on top of that.
test_ratio = CFG.TEST_RATIO
val_ratio = CFG.VAL_RATIO
# Validation share relative to what remains after the test split.
val_rel = val_ratio / (1.0 - test_ratio)

X_idx = np.arange(len(map_level))
y_strat = map_level["_strat"].to_numpy()
map_ids_arr = map_level["map_id"].to_numpy()

# Try successive seeds until a split satisfies all constraints.
# NOTE(review): StratifiedShuffleSplit raises ValueError when a class has
# fewer than 2 members; a class with very few maps can hit that in the
# second split and abort the loop instead of moving to the next seed.
best = None
for attempt in range(500):
    rs = CFG.SEED + attempt

    # Step 1: carve out TEST from all single-prompt maps.
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio, random_state=rs)
    trainval_i, test_i = next(sss1.split(X_idx, y_strat))

    # Step 2: carve VAL out of the remainder. train_i / val_i are positions
    # INTO trainval_i, hence the double indexing below.
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=val_rel, random_state=rs + 999)
    train_i, val_i = next(sss2.split(trainval_i, y_strat[trainval_i]))

    single_train_maps = set(map_ids_arr[trainval_i[train_i]])
    single_val_maps   = set(map_ids_arr[trainval_i[val_i]])
    single_test_maps  = set(map_ids_arr[test_i])

    # All multi-prompt maps go to TRAIN; VAL/TEST hold single-prompt maps only.
    train_maps = set(multi_map_ids) | single_train_maps
    val_maps   = single_val_maps
    test_maps  = single_test_maps

    # leakage check
    if (train_maps & val_maps) or (train_maps & test_maps) or (val_maps & test_maps):
        continue

    df_train_tmp = df[df["map_id"].isin(train_maps)]
    df_val_tmp   = df[df["map_id"].isin(val_maps)]
    df_test_tmp  = df[df["map_id"].isin(test_maps)]

    # must contain all operators in each split
    if not (has_all_ops(df_train_tmp) and has_all_ops(df_val_tmp) and has_all_ops(df_test_tmp)):
        continue

    # First seed that satisfies every constraint wins.
    best = (train_maps, val_maps, test_maps, rs)
    break

if best is None:
    raise RuntimeError(
        "Could not find a leakage-safe split with operator coverage in all splits "
        "and multi-prompt maps forced to TRAIN. "
        "Try: USE_INTENSITY_FOR_STRAT=False, or adjust VAL/TEST ratios."
    )

train_maps, val_maps, test_maps, used_seed = best

# ------------------------------------------------------------
# 4) Build row-level splits (no leakage)
# ------------------------------------------------------------
# X is indexed by POSITION, so compute positional row indices rather than
# relying on df's index labels happening to be 0..n-1.
assert len(df) == X.shape[0], "df and X are out of sync; re-run the load/clean cell."

train_idx = np.flatnonzero(df["map_id"].isin(train_maps).to_numpy(dtype=bool))
val_idx   = np.flatnonzero(df["map_id"].isin(val_maps).to_numpy(dtype=bool))
test_idx  = np.flatnonzero(df["map_id"].isin(test_maps).to_numpy(dtype=bool))

X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]
df_train = df.iloc[train_idx].reset_index(drop=True)
df_val   = df.iloc[val_idx].reset_index(drop=True)
df_test  = df.iloc[test_idx].reset_index(drop=True)

print("\n=== SPLIT SUMMARY ===")
print(f"‚úÖ Split found (seed={used_seed})")
print(f"Train maps: {len(train_maps)}  (includes multi-prompt maps: {len(set(multi_map_ids))})")
print(f"Val maps:   {len(val_maps)}")
print(f"Test maps:  {len(test_maps)}")
print(f"Rows -> Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# Hard guarantees: no map appears in two splits, and every multi-prompt
# map ended up in TRAIN.
assert set(df_train["map_id"]).isdisjoint(df_val["map_id"])
assert set(df_train["map_id"]).isdisjoint(df_test["map_id"])
assert set(df_val["map_id"]).isdisjoint(df_test["map_id"])
assert set(multi_map_ids).issubset(train_maps)
print("‚úÖ Verified: no map_id leakage across splits.")
print("‚úÖ Verified: all multi-prompt maps are in TRAIN.")

# Persist the row positions so the exact split can be reloaded later.
# The context manager guarantees the file is flushed and closed.
split_path = PATHS.TRAIN_OUT / "splits.json"
with open(split_path, "w", encoding="utf-8") as split_file:
    json.dump(
        {
            "train_idx": train_idx.tolist(),
            "val_idx": val_idx.tolist(),
            "test_idx": test_idx.tolist(),
            "seed_used": int(used_seed),
            "use_intensity_for_strat": bool(USE_INTENSITY_FOR_STRAT),
        },
        split_file,
        indent=2,
    )
print("‚úÖ Saved splits to", split_path)

# ------------------------------------------------------------
# 5) Detailed diagnostics: operator + intensity coverage
# ------------------------------------------------------------
def print_operator_counts(dfx, name):
    """Print how many rows of ``dfx`` belong to each operator, under a ``name`` header."""
    print(f"\n{name} ‚Äî Operator counts")
    per_operator = dfx[OP_COL].value_counts()
    print(per_operator)

def print_op_intensity_table(dfx, name):
    """Print the operator-by-intensity count table of ``dfx`` plus its row and column sums.

    Prints a short notice and does nothing else when the intensity column
    is absent from ``dfx``.
    """
    if INT_COL in dfx.columns:
        print(f"\n{name} ‚Äî Operator √ó Intensity table (counts)")
        pair_sizes = dfx.groupby([OP_COL, INT_COL]).size()
        tab = pair_sizes.unstack(fill_value=0).sort_index()
        print(tab)

        # Row sums: total rows per operator.
        print(f"\n{name} ‚Äî Operator totals (row sums)")
        print(tab.sum(axis=1))

        # Column sums: total rows per intensity level.
        print(f"\n{name} ‚Äî Intensity totals (col sums)")
        print(tab.sum(axis=0))
    else:
        print(f"\n{name} ‚Äî intensity column missing; skipping op√óintensity table.")
        return

def report_missing_combos(dfx, name, all_ops, all_ints):
    """Report which (operator, intensity) combinations never occur in ``dfx``.

    ``all_ops`` and ``all_ints`` define the full grid to check. Returns
    silently when ``dfx`` has no intensity column.
    """
    if INT_COL not in dfx.columns:
        return

    # Every (operator, intensity) pair that actually appears in this split.
    observed = set(zip(dfx[OP_COL], dfx[INT_COL]))

    missing = []
    for op in all_ops:
        for it in all_ints:
            if (op, it) not in observed:
                missing.append((op, it))

    if not missing:
        print(f"\n‚úÖ {name}: All operator√óintensity combos present.")
        return

    print(f"\n‚ö†Ô∏è {name}: Missing operator√óintensity combos ({len(missing)}):")
    # Show at most 40 combos and flag truncation with an ellipsis.
    print(missing[:40], "..." if len(missing) > 40 else "")

ALL_OPS = FIXED_CLASSES[:]  # enforce fixed order
# Drop missing intensities before sorting: sorting values that include
# pd.NA raises TypeError, and a missing value is not a real intensity level.
ALL_INTS = sorted(df[INT_COL].dropna().unique()) if INT_COL in df.columns else []

# Same report order as before: all operator counts, then all tables, then
# all missing-combo reports, each in TRAIN / VAL / TEST order.
splits_for_report = (("TRAIN", df_train), ("VAL", df_val), ("TEST", df_test))

for split_name, split_df in splits_for_report:
    print_operator_counts(split_df, split_name)

for split_name, split_df in splits_for_report:
    print_op_intensity_table(split_df, split_name)

for split_name, split_df in splits_for_report:
    report_missing_combos(split_df, split_name, ALL_OPS, ALL_INTS)

# ------------------------------------------------------------
# 6) Map-level prompt multiplicity info per split
# ------------------------------------------------------------
def map_prompt_stats(map_set, name):
    """Print how many maps in ``map_set`` have more than one prompt (per ``prompt_counts``)."""
    counts_for_split = prompt_counts.loc[list(map_set)]
    print(f"\n{name} ‚Äî prompts per map statistics")
    n_multi_prompt = int((counts_for_split > 1).sum())
    print(f"{name} ‚Äî #maps with >1 prompt:", n_multi_prompt)

# Only TRAIN can contain multi-prompt maps by construction; VAL and TEST
# are expected to report 0 here.
map_prompt_stats(train_maps, "TRAIN")
map_prompt_stats(val_maps,   "VAL")
map_prompt_stats(test_maps,  "TEST")

# ------------------------------------------------------------
# 7) Optional: show top multi-prompt maps in TRAIN
# ------------------------------------------------------------
# The 20 maps with the most prompts (all of them were forced into TRAIN).
top_multi = prompt_counts.loc[multi_map_ids].sort_values(ascending=False).head(20)
print("\nTRAIN ‚Äî Top multi-prompt maps (forced to train):")
print(top_multi)


=== DATASET SUMMARY ===
Total rows (prompts): 562
Unique maps: 399
Multi-prompt maps (>1 prompt): 22
Single-prompt maps (=1 prompt): 377

Top 10 maps by prompt count:
map_id
1646    30
1304    29
1755    26
1532    13
0127    10
0168     8
0142     7
0078     6
0080     6
0001     6
dtype: int64

=== SPLIT SUMMARY ===
‚úÖ Split found (seed=42)
Train maps: 285  (includes multi-prompt maps: 22)
Val maps:   57
Test maps:  57
Rows -> Train: (448, 1701), Val: (57, 1701), Test: (57, 1701)
‚úÖ Verified: no map_id leakage across splits.
‚úÖ Verified: all multi-prompt maps are in TRAIN.
‚úÖ Saved splits to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out/splits.json

TRAIN ‚Äî Operator counts
operator
select       144
aggregate    134
simplify     109
displace      61
Name: count, dtype: Int64

VAL ‚Äî Operator counts
operator
select       19
aggregate    16
simplify     13
displace      9
Name: count, dtype: Int64

TEST ‚Äî Operator counts
operator
select       19
aggregate  

## 🧼 6) Modality-Aware Preprocessing (map only) ##

In [9]:
# === MODALITY-AWARE PREPROCESSING ===
def split_blocks(X):
    """Split a fused matrix into its map block and prompt block.

    The first MAP_DIM columns are the map features and the following
    PROMPT_DIM columns are the prompt features. Both parts are returned as
    independent float64 copies, as ``(map_block, prompt_block)``.
    """
    map_cols = slice(0, MAP_DIM)
    prompt_cols = slice(MAP_DIM, MAP_DIM + PROMPT_DIM)
    return (
        X[:, map_cols].astype(np.float64, copy=True),
        X[:, prompt_cols].astype(np.float64, copy=True),
    )

def l2_normalize_rows(A, eps=1e-12):
    """Scale each row of ``A`` to unit Euclidean length.

    Row norms smaller than ``eps`` are replaced by ``eps`` before dividing,
    so all-zero rows stay zero instead of producing NaN.
    """
    row_energy = (A * A).sum(axis=1, keepdims=True)
    safe_norms = np.maximum(np.sqrt(row_energy), eps)
    return A / safe_norms

# Split every fused matrix into its map block and prompt block.
Xm_tr, Xp_tr = split_blocks(X_train)
Xm_va, Xp_va = split_blocks(X_val)
Xm_te, Xp_te = split_blocks(X_test)

# prompts: L2 only
Xp_tr = l2_normalize_rows(Xp_tr)
Xp_va = l2_normalize_rows(Xp_va)
Xp_te = l2_normalize_rows(Xp_te)

# maps: turn +/-inf into NaN so the imputer treats them as missing
for A in (Xm_tr, Xm_va, Xm_te):
    A[~np.isfinite(A)] = np.nan

# Impute with per-column medians learned on TRAIN only.
imp = SimpleImputer(strategy="median")
Xm_tr_imp = imp.fit_transform(Xm_tr)
Xm_va_imp = imp.transform(Xm_va)
Xm_te_imp = imp.transform(Xm_te)

# SimpleImputer discards columns that are entirely missing in the training
# data. That would silently shrink the map block, so the later steps and
# the saved "map_dim" would no longer describe the real column layout.
assert Xm_tr_imp.shape[1] == Xm_tr.shape[1], (
    f"Imputer dropped {Xm_tr.shape[1] - Xm_tr_imp.shape[1]} all-NaN map column(s); "
    "the map block no longer has MAP_DIM columns."
)

# clip (5‚Äì95%) train thresholds
q_lo = np.nanpercentile(Xm_tr_imp, 5, axis=0)
q_hi = np.nanpercentile(Xm_tr_imp, 95, axis=0)
def clip_to_q(A, lo, hi):
    """Clamp the values of ``A`` into ``[lo, hi]`` (bounds follow NumPy broadcasting)."""
    clipped = np.clip(A, a_min=lo, a_max=hi)
    return clipped

# apply the train-derived clipping bounds to every split
Xm_tr_imp, Xm_va_imp, Xm_te_imp = (
    clip_to_q(block, q_lo, q_hi) for block in (Xm_tr_imp, Xm_va_imp, Xm_te_imp)
)

# columns that are (numerically) constant on train are excluded from scaling
stds = np.nanstd(Xm_tr_imp, axis=0)
keep_mask = stds > 1e-12

# robust scaling of the kept columns; statistics come from train only
scaler = RobustScaler(quantile_range=(5, 95), with_centering=True, with_scaling=True)
Xm_tr_kept = scaler.fit_transform(Xm_tr_imp[:, keep_mask])
Xm_va_kept, Xm_te_kept = (
    scaler.transform(block[:, keep_mask]) for block in (Xm_va_imp, Xm_te_imp)
)

# restore the full map width: dropped columns stay at 0, kept columns get scaled values
Xm_tr_s, Xm_va_s, Xm_te_s = (
    np.zeros_like(block, dtype=np.float64) for block in (Xm_tr_imp, Xm_va_imp, Xm_te_imp)
)
Xm_tr_s[:, keep_mask] = Xm_tr_kept.astype(np.float64)
Xm_va_s[:, keep_mask] = Xm_va_kept.astype(np.float64)
Xm_te_s[:, keep_mask] = Xm_te_kept.astype(np.float64)

# glue map and prompt blocks back together (map columns first)
X_train_s = np.hstack([Xm_tr_s, Xp_tr]).astype(np.float64)
X_val_s   = np.hstack([Xm_va_s, Xp_va]).astype(np.float64)
X_test_s  = np.hstack([Xm_te_s, Xp_te]).astype(np.float64)

assert all(
    np.isfinite(fused).all() for fused in (X_train_s, X_val_s, X_test_s)
), "Non-finite after preprocessing."
print("‚úÖ Modality-aware preprocessing complete.")

# persist everything needed to replay this preprocessing on new data
joblib.dump(
    dict(
        imp=imp,
        q_lo=q_lo,
        q_hi=q_hi,
        clip_quantiles=(5, 95),
        keep_mask=keep_mask,
        scaler=scaler,
        map_dim=MAP_DIM,
        prompt_dim=PROMPT_DIM,
        prompt_l2_eps=1e-12,
    ),
    PATHS.TRAIN_OUT / "preproc.joblib",
)


‚úÖ Modality-aware preprocessing complete.


['/Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out/preproc.joblib']

## ⚖️ 7) Class Weights ##

In [10]:
# === BUILD y_train_cls + CLASS + MAP-AWARE SAMPLE WEIGHTS ===

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

OP_COL = PATHS.OPERATOR_COL  # "operator"

# ------------------------------------------------------------
# 1) Integer labels for TRAIN, coded in the fixed class order
# ------------------------------------------------------------
# Categorical codes follow the order of `categories`; a value that is not in
# FIXED_CLASSES is coded as -1 and caught by the assertion below.
class_names = np.array(FIXED_CLASSES)  # ["simplify","select","aggregate","displace"]
n_classes = len(class_names)
y_train_cls = pd.Categorical(df_train[OP_COL], categories=class_names).codes
assert (y_train_cls >= 0).all(), "Found an operator not in FIXED_CLASSES"

print("Operator classes:", list(class_names))

# ------------------------------------------------------------
# 2) Class weights (operator imbalance)
# ------------------------------------------------------------
classes = np.arange(n_classes)
cls_w = np.asarray(
    compute_class_weight(class_weight="balanced", classes=classes, y=y_train_cls),
    dtype="float64",
)

class_weight_map = {name: weight for name, weight in zip(class_names, cls_w)}
print("Class weights:", class_weight_map)

# per-sample class weight: look up each row's class weight by its label code
w_class = cls_w[y_train_cls].astype("float64")

# ------------------------------------------------------------
# 3) Map-level weighting (prompt multiplicity correction)
# ------------------------------------------------------------
# a map with k prompts gives each of its rows weight 1/k, i.e. ~1 per map in total
map_counts = df_train["map_id"].value_counts()
w_map = (
    df_train["map_id"]
    .map(lambda map_id: 1.0 / map_counts[map_id])
    .to_numpy(dtype="float64")
)

# ------------------------------------------------------------
# 4) Final sample weights = class weight x map weight
# ------------------------------------------------------------
sample_w = w_class * w_map

print(
    "Sample weight summary:",
    {stat: float(getattr(sample_w, stat)()) for stat in ("min", "max", "mean")},
)


Operator classes: [np.str_('simplify'), np.str_('select'), np.str_('aggregate'), np.str_('displace')]
Class weights: {np.str_('simplify'): np.float64(1.0275229357798166), np.str_('select'): np.float64(0.7777777777777778), np.str_('aggregate'): np.float64(0.835820895522388), np.str_('displace'): np.float64(1.8360655737704918)}
Sample weight summary: {'min': 0.025925925925925925, 'max': 1.8360655737704918, 'mean': 0.6487687942076353}


In [11]:
# === BUILD CLASS LABELS (train / val / test) ===

OP_COL = PATHS.OPERATOR_COL  # "operator"

# Fixed, global class order (MUST match training + bundle)
class_names = np.array(FIXED_CLASSES)

# Encode every split against the same category list so codes are comparable;
# a label outside FIXED_CLASSES is coded as -1.
y_train_cls, y_val_cls, y_test_cls = (
    pd.Categorical(frame[OP_COL], categories=class_names).codes
    for frame in (df_train, df_val, df_test)
)

# Safety checks: no split may contain a label outside the fixed list
assert (y_train_cls >= 0).all(), "TRAIN contains unseen operator labels"
assert (y_val_cls >= 0).all(), "VAL contains unseen operator labels"
assert (y_test_cls >= 0).all(), "TEST contains unseen operator labels"

print("Classes (fixed order):", list(class_names))
print("y_train_cls shape:", y_train_cls.shape)
print("y_val_cls shape:", y_val_cls.shape)
print("y_test_cls shape:", y_test_cls.shape)


Classes (fixed order): [np.str_('simplify'), np.str_('select'), np.str_('aggregate'), np.str_('displace')]
y_train_cls shape: (448,)
y_val_cls shape: (57,)
y_test_cls shape: (57,)


## 🧠 8) Train MLP ##

In [12]:
# =========================
# MLP search where each model trains on ALL training data
# =========================
import numpy as np
from pathlib import Path
from pprint import pprint
from dataclasses import dataclass

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

# ---- numerics: keep float64 everywhere (no copy when already float64) ----
X_train_s, X_val_s, X_test_s, sample_w = (
    arr.astype(np.float64, copy=False)
    for arr in (X_train_s, X_val_s, X_test_s, sample_w)
)

# ---- group by map_id so rows of one map never straddle a CV fold ----
assert "map_id" in df_train.columns, "df_train must contain 'map_id' for grouped CV."
groups_tr = df_train["map_id"].astype(str).values

# ---- fixed class order (must match how y_*_cls were created) ----
class_names = np.array(FIXED_CLASSES)

# ---- CV splitter (for scoring only) ----
cv = StratifiedGroupKFold(shuffle=True, n_splits=5, random_state=42)

# ---- shared random generator for the search-space sampler ----
rng = np.random.RandomState(seed=42)

def draw_params(n):
    sizes = [(64,), (128,), (256,), (128, 64), (256, 128), (256, 128, 64)]
    batches = [16, 32, 64, 128]
    for _ in range(n):
        yield {
            "hidden_layer_sizes": sizes[rng.randint(len(sizes))],
            "alpha": 10**rng.uniform(-5, np.log10(3e-2)),          # loguniform(1e-5, 3e-2)
            "learning_rate_init": 10**rng.uniform(-4, np.log10(3e-3)),  # loguniform(1e-4, 3e-3)
            "batch_size": batches[rng.randint(len(batches))],
            "activation": "relu",
            "solver": "adam",
            "max_iter": 800,            # allow convergence w/o early stopping
            "early_stopping": False,    # <‚Äî IMPORTANT: use ALL training samples
            "random_state": 42,
            "verbose": False,
            "tol": 1e-4
        }

# ---- CV scorer using grouped folds; model sees only its fold-train here (for the score only) ----
def cv_macro_f1(params):
    """Score one hyper-parameter dict with grouped cross-validation on TRAIN.

    For each fold of the module-level ``cv`` splitter, a fresh
    ``MLPClassifier(**params)`` is fitted on the fold's training rows (with
    their sample weights) and scored by macro-F1 on the held-out rows.

    Returns:
        tuple[float, float]: mean and standard deviation of the fold scores.
    """
    fold_scores = []
    for fit_rows, holdout_rows in cv.split(X_train_s, y_train_cls, groups_tr):
        fold_model = MLPClassifier(**params)
        fold_model.fit(
            X_train_s[fit_rows],
            y_train_cls[fit_rows],
            sample_weight=sample_w[fit_rows],
        )
        holdout_pred = fold_model.predict(X_train_s[holdout_rows])
        fold_f1 = f1_score(y_train_cls[holdout_rows], holdout_pred, average="macro")
        fold_scores.append(fold_f1)
    mean_f1 = float(np.mean(fold_scores))
    std_f1 = float(np.std(fold_scores))
    return mean_f1, std_f1

@dataclass
class Candidate:
    """One evaluated hyper-parameter configuration from the MLP search."""
    # keyword arguments that were passed to MLPClassifier(**params)
    params: dict
    # mean / std of macro-F1 across the grouped CV folds on TRAIN
    cv_mean: float
    cv_std: float
    # macro-F1 and accuracy on the external VAL split after fitting on all of TRAIN
    val_f1: float
    val_acc: float

# ---- run search ----
N_ITER = 50   # tune this for time/quality tradeoff
candidates = []

print(f"\nSearching {N_ITER} MLP configs...")
for i, params in enumerate(draw_params(N_ITER), 1):
    # grouped-CV score on TRAIN (recorded for reporting and as a tie-breaker)
    cv_mean, cv_std = cv_macro_f1(params)

    # IMPORTANT PART: refit SAME PARAMS on FULL TRAIN (no early_stopping) so the model sees ALL training data
    clf_full = MLPClassifier(**params)
    clf_full.fit(X_train_s, y_train_cls, sample_weight=sample_w)

    # evaluate on external VAL (never used for training)
    val_pred = clf_full.predict(X_val_s)
    val_f1 = f1_score(y_val_cls, val_pred, average="macro")
    val_acc = accuracy_score(y_val_cls, val_pred)

    candidates.append(Candidate(params, cv_mean, cv_std, val_f1, val_acc))
    print(f"[{i:02d}/{N_ITER}] cvF1={cv_mean:.3f}¬±{cv_std:.3f} | VAL F1={val_f1:.3f} acc={val_acc:.3f} | {params['hidden_layer_sizes']}, Œ±={params['alpha']:.2e}, lr={params['learning_rate_init']:.1e}, bs={params['batch_size']}")

# ---- pick winner by external VAL macro-F1 (tie-breaker: VAL acc, then CV mean) ----
candidates.sort(key=lambda c: (c.val_f1, c.val_acc, c.cv_mean), reverse=True)
best = candidates[0]
print("\n=== Top candidates (by VAL macro-F1) ===")
for c in candidates[:5]:
    print(f"VAL F1={c.val_f1:.3f} (acc={c.val_acc:.3f}) | cvF1={c.cv_mean:.3f}¬±{c.cv_std:.3f} | params={c.params}")

print("\nüèÜ Selected params:")
pprint(best.params)

# ---- train final model on FULL TRAIN (no early_stopping so it uses 100% of train) ----
final_mlp = MLPClassifier(**best.params)
final_mlp.fit(X_train_s, y_train_cls, sample_weight=sample_w)

# ---- evaluate on VAL & TEST ----
# VAL was used to pick the winner, so only the TEST numbers are unbiased.
# The same explicit label list is passed to the report and the confusion
# matrix: without it, confusion_matrix only covers labels that occur in
# ys/yhat, so its rows/columns can stop lining up with class_names when a
# class is absent from a split.
label_ids = np.arange(len(class_names))
for name, Xs, ys in [("VAL", X_val_s, y_val_cls), ("TEST", X_test_s, y_test_cls)]:
    yhat = final_mlp.predict(Xs)
    acc  = accuracy_score(ys, yhat)
    f1m  = f1_score(ys, yhat, average="macro")
    print(f"\n===== {name} =====")
    print(f"{name}: acc={acc:.4f}  f1_macro={f1m:.4f}")
    print(classification_report(ys, yhat, labels=label_ids, target_names=list(class_names)))
    print("Confusion matrix:\n", confusion_matrix(ys, yhat, labels=label_ids))

# ---- save final model ----
out_dir = Path(PATHS.TRAIN_OUT); out_dir.mkdir(parents=True, exist_ok=True)
import joblib
joblib.dump(
    {
        "model": final_mlp,
        "class_names": list(class_names),
        "best_params": best.params,
    },
    out_dir / "best_mlp_fulltrain.joblib"
)
print(f"\n‚úÖ Saved final MLP (trained on ALL TRAIN) to: {out_dir / 'best_mlp_fulltrain.joblib'}")


Searching 50 MLP configs...
[01/50] cvF1=0.717¬±0.087 | VAL F1=0.830 acc=0.842 | (128, 64), Œ±=2.02e-02, lr=1.2e-03, bs=16
[02/50] cvF1=0.794¬±0.093 | VAL F1=0.944 acc=0.947 | (256, 128), Œ±=3.49e-05, lr=1.7e-04, bs=64
[03/50] cvF1=0.710¬±0.113 | VAL F1=0.830 acc=0.842 | (256,), Œ±=1.03e-02, lr=7.7e-04, bs=128
[04/50] cvF1=0.784¬±0.086 | VAL F1=0.920 acc=0.930 | (256,), Œ±=1.18e-05, lr=2.7e-03, bs=128
[05/50] cvF1=0.798¬±0.077 | VAL F1=0.903 acc=0.912 | (256, 128, 64), Œ±=5.47e-05, lr=1.9e-04, bs=16
[06/50] cvF1=0.814¬±0.064 | VAL F1=0.944 acc=0.947 | (64,), Œ±=1.14e-04, lr=6.0e-04, bs=128
[07/50] cvF1=0.818¬±0.074 | VAL F1=0.944 acc=0.947 | (64,), Œ±=1.03e-04, lr=8.0e-04, bs=32
[08/50] cvF1=0.662¬±0.122 | VAL F1=0.808 acc=0.825 | (128, 64), Œ±=2.43e-02, lr=2.2e-04, bs=32
[09/50] cvF1=0.782¬±0.083 | VAL F1=0.940 acc=0.947 | (256, 128, 64), Œ±=4.95e-05, lr=5.7e-04, bs=128
[10/50] cvF1=0.834¬±0.069 | VAL F1=0.944 acc=0.947 | (64,), Œ±=1.45e-05, lr=7.9e-04, bs=16
[11/50] cvF1=0.821¬±0.06

In [16]:
# =========================
# Regression branch (one MLPRegressor per operator) — DYNAMIC EXTENT VERSION
# =========================
import numpy as np
from pathlib import Path
from pprint import pprint
import joblib

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import loguniform

# -------------------------
# 0) Targets: use param_norm (created earlier) + keep raw param_value for reporting
# -------------------------
OP_COL    = PATHS.OPERATOR_COL        # "operator"
PARAM_COL = PATHS.PARAM_VALUE_COL     # "param_value"

# Same class order everywhere: labels are re-derived against FIXED_CLASSES
class_names = np.array(FIXED_CLASSES)
y_train_cls, y_val_cls, y_test_cls = (
    pd.Categorical(frame[OP_COL], categories=class_names).codes
    for frame in (df_train, df_val, df_test)
)

# Safety checks (no unseen labels; those would be coded as -1)
assert (y_train_cls >= 0).all(), "TRAIN contains unseen operator labels (check FIXED_CLASSES)"
assert (y_val_cls   >= 0).all(), "VAL contains unseen operator labels (check FIXED_CLASSES)"
assert (y_test_cls  >= 0).all(), "TEST contains unseen operator labels (check FIXED_CLASSES)"

# param_norm must exist from the edited LOAD-FUSED cell
assert all("param_norm" in frame.columns for frame in (df_train, df_val, df_test)), \
    "Missing param_norm in df_* (make sure you computed it in the earlier cell)."

# Dynamic per-map refs must exist (merged from maps.parquet into train_pairs.parquet);
# they are coerced to numeric in place, unparseable entries become NaN
REQ_EXT = ["extent_diag_m", "extent_area_m2"]
for split_name, dfx in [("df_train", df_train), ("df_val", df_val), ("df_test", df_test)]:
    missing = [ext_col for ext_col in REQ_EXT if ext_col not in dfx.columns]
    assert not missing, f"Missing {missing} in {split_name}. Check concat merged maps.parquet extent_* columns."
    for ext_col in REQ_EXT:
        dfx[ext_col] = pd.to_numeric(dfx[ext_col], errors="coerce")

# normalized regression targets for all splits
y_train_norm, y_val_norm, y_test_norm = (
    pd.to_numeric(frame["param_norm"], errors="coerce").to_numpy()
    for frame in (df_train, df_val, df_test)
)

# raw parameter values are only needed for reporting on VAL/TEST
y_val_raw, y_test_raw = (
    pd.to_numeric(frame[PARAM_COL], errors="coerce").to_numpy()
    for frame in (df_val, df_test)
)

assert all(np.isfinite(v).all() for v in (y_train_norm, y_val_norm, y_test_norm)), \
    "Non-finite values found in param_norm. Check your normalization step."
assert all(np.isfinite(v).all() for v in (y_val_raw, y_test_raw)), \
    "Non-finite values found in param_value in val/test. Check Excel parsing."

# Optional: log1p on normalized values (usually not needed after normalization).
# Whichever branch runs defines the transformed targets (ytr_t / yva_t / yte_t)
# and the matching inverse inv_t.
USE_LOG1P = False
if not USE_LOG1P:
    # identity transform: work on copies of the normalized targets
    ytr_t, yva_t, yte_t = y_train_norm.copy(), y_val_norm.copy(), y_test_norm.copy()

    def inv_t(x):
        """Inverse of the identity transform."""
        return x
else:
    assert (y_train_norm >= 0).all() and (y_val_norm >= 0).all() and (y_test_norm >= 0).all(), \
        "log1p selected but param_norm has negatives."
    ytr_t, yva_t, yte_t = np.log1p(y_train_norm), np.log1p(y_val_norm), np.log1p(y_test_norm)

    def inv_t(x):
        """Inverse of log1p."""
        return np.expm1(x)

# -------------------------
# 1) Grouped CV by map_id (no leakage)
# -------------------------
assert "map_id" in df_train.columns
groups_tr = df_train["map_id"].astype(str).values
gk = GroupKFold(n_splits=5)

# -------------------------
# 2) Search space for MLPRegressor
# -------------------------
# template estimator: everything that is not sampled by the search is fixed here
base_reg = MLPRegressor(
    solver="adam",
    activation="relu",
    batch_size="auto",
    learning_rate="adaptive",
    max_iter=2000,
    tol=1e-3,
    early_stopping=False,
    random_state=42,
    verbose=False,
)

# sampled settings: architecture plus log-uniform alpha and initial learning rate
param_dist_reg = dict(
    hidden_layer_sizes=[(64,), (128,), (256,), (128, 64), (256, 128)],
    alpha=loguniform(1e-6, 3e-2),
    learning_rate_init=loguniform(1e-4, 3e-3),
)

# -------------------------
# 3) Fit one regressor per operator (on normalized target)
# -------------------------
regressors = {}        # operator name -> (fitted MLPRegressor, fitted target StandardScaler)
search_summaries = {}  # operator name -> best CV score and hyper-parameters

for cls_idx, cls_name in enumerate(class_names):
    cls_name = str(cls_name)  # ensure dict keys are normal Python strings

    # rows of TRAIN that belong to this operator
    m_tr = (y_train_cls == cls_idx)

    Xk = X_train_s[m_tr]
    yk = ytr_t[m_tr]
    gk_tr = groups_tr[m_tr]
    wk = sample_w[m_tr]

    if Xk.shape[0] < 10:
        print(f"‚ö†Ô∏è Skipping class '{cls_name}' (too few samples: {Xk.shape[0]}).")
        continue

    # GroupKFold raises when n_splits exceeds the number of distinct groups.
    # An operator can have >= 10 rows that come from only a few maps, so the
    # fold count is capped at the number of maps; with a single map grouped
    # CV is impossible and the operator is skipped.
    n_groups_k = int(np.unique(gk_tr).size)
    if n_groups_k < 2:
        print(f"Skipping class '{cls_name}' (only {n_groups_k} distinct map(s); grouped CV needs at least 2).")
        continue
    n_splits_k = min(gk.n_splits, n_groups_k)
    if n_splits_k < gk.n_splits:
        print(f"Class '{cls_name}': only {n_groups_k} distinct maps, using {n_splits_k}-fold grouped CV.")

    # standardize the target per operator; the scaler is stored to invert predictions later
    t_scaler = StandardScaler()
    yk_s = t_scaler.fit_transform(yk.reshape(-1, 1)).ravel()

    # materialize the grouped folds so the search evaluates every candidate on the same splits
    splits = list(GroupKFold(n_splits=n_splits_k).split(Xk, yk_s, groups=gk_tr))

    search = RandomizedSearchCV(
        estimator=base_reg,
        param_distributions=param_dist_reg,
        n_iter=40,
        scoring="neg_root_mean_squared_error",
        cv=splits,
        n_jobs=-1,
        refit=True,
        random_state=42,
        verbose=1
    )

    search.fit(Xk, yk_s, sample_weight=wk)

    # best_score_ is a negated RMSE in standardized-target units;
    # multiplying by the scaler's scale converts it back to param_norm units
    rmse_scaled = -search.best_score_
    rmse_norm_units = rmse_scaled * float(t_scaler.scale_[0])

    print(f"\n=== Regressor for class '{cls_name}' (predicting param_norm) ===")
    print("best CV RMSE (scaled):", rmse_scaled)
    print("best CV RMSE (param_norm units):", rmse_norm_units)
    print("best params:"); pprint(search.best_params_)

    search_summaries[cls_name] = {
        "rmse_scaled": float(rmse_scaled),
        "rmse_param_norm": float(rmse_norm_units),
        "params": search.best_params_
    }

    # final regressor: best settings, fitted on all rows of this operator
    reg_full = MLPRegressor(
        **{**search.best_estimator_.get_params(), "early_stopping": False, "max_iter": 2000, "random_state": 42}
    )
    reg_full.fit(Xk, yk_s, sample_weight=wk)

    regressors[cls_name] = (reg_full, t_scaler)

# -------------------------
# 4) Routing + prediction: output REAL param_value using DYNAMIC extents
# -------------------------
# set views of the configured operator groups, used for membership tests when routing
DIST_OPS_SET = {*DISTANCE_OPS}
AREA_OPS_SET = {*AREA_OPS}

def route_and_predict_param_value(Xs, df_s, pred_cls_idx):
    """
    Predict param_value in original units (meters or m²).

    Each row is routed to the regressor of the operator given by
    ``pred_cls_idx`` (an index into ``class_names``). Regressors predict
    param_norm; the prediction is then:
      - mapped back through the operator's StandardScaler
      - mapped back through ``inv_t`` (inverse log1p, if used)
      - clamped at 0
      - multiplied by the row's dynamic extent: ``extent_diag_m`` for
        operators in DIST_OPS_SET, ``extent_area_m2`` for operators in
        AREA_OPS_SET

    Rows are predicted in one batch per operator instead of one ``predict``
    call per row.

    Parameters:
        Xs: feature matrix, one row per sample.
        df_s: frame providing ``extent_diag_m`` and ``extent_area_m2``,
            positionally aligned with ``Xs``.
        pred_cls_idx: operator index per row.

    Returns:
        1-D float array with one entry per row. An entry is NaN when the
        operator has no fitted regressor, belongs to neither operator set,
        has an index outside ``class_names``, has a non-finite or
        non-positive extent, or when the regressor output itself is NaN.
    """
    pred_cls_idx = np.asarray(pred_cls_idx)
    yhat = np.full(len(pred_cls_idx), np.nan, dtype=float)

    extent_diag = pd.to_numeric(df_s["extent_diag_m"], errors="coerce").to_numpy(dtype=float)
    extent_area = pd.to_numeric(df_s["extent_area_m2"], errors="coerce").to_numpy(dtype=float)

    for k, cname in enumerate(map(str, class_names)):
        rows = np.flatnonzero(pred_cls_idx == k)
        if rows.size == 0:
            continue

        pack = regressors.get(cname)
        if pack is None:
            continue  # operator was skipped during fitting -> stays NaN

        # choose the un-normalization reference for this operator
        if cname in DIST_OPS_SET:
            extent = extent_diag[rows]
        elif cname in AREA_OPS_SET:
            extent = extent_area[rows]
        else:
            continue  # no known unit for this operator -> stays NaN

        reg, t_scaler = pack
        pred_scaled = np.asarray(reg.predict(Xs[rows]), dtype=float).reshape(-1, 1)
        pred_t = np.asarray(t_scaler.inverse_transform(pred_scaled), dtype=float).ravel()  # back to ytr_t units

        # clamp negatives; np.maximum propagates NaN, whereas the builtin
        # max(0.0, nan) would silently report a NaN prediction as 0.0
        pred_norm = np.maximum(0.0, inv_t(pred_t))

        usable = np.isfinite(extent) & (extent > 0)
        yhat[rows[usable]] = pred_norm[usable] * extent[usable]

    return yhat

def print_reg_metrics(name, y_true_raw, y_pred_raw):
    """Print and return MAE / RMSE over the rows where truth and prediction are both finite.

    Rows with a non-finite value on either side are excluded, and the number
    of excluded rows is printed. If no row remains, a notice is printed and
    ``(nan, nan)`` is returned.

    Returns:
        tuple: ``(mae, rmse)``.
    """
    usable = np.isfinite(y_true_raw) & np.isfinite(y_pred_raw)
    n_usable = usable.sum()
    if n_usable == 0:
        print(f"{name}: no finite pairs to evaluate.")
        return np.nan, np.nan

    n_dropped = len(y_true_raw) - n_usable
    if n_dropped > 0:
        print(f"{name}: dropped {n_dropped} samples with NaNs.")

    truth, pred = y_true_raw[usable], y_pred_raw[usable]
    mae = mean_absolute_error(truth, pred)
    rmse = float(np.sqrt(mean_squared_error(truth, pred)))
    print(f"{name}: MAE={mae:.4f}  RMSE={rmse:.4f}")
    return mae, rmse

# Classification predictions (already trained classifier)
clf_cls = final_mlp
val_pred_cls  = clf_cls.predict(X_val_s)
test_pred_cls = clf_cls.predict(X_test_s)

# Predict param_value (real units), routing each row by the PREDICTED operator
yhat_val  = route_and_predict_param_value(X_val_s,  df_val,  val_pred_cls)
yhat_test = route_and_predict_param_value(X_test_s, df_test, test_pred_cls)

print("\n--- Regression with predicted classes (realistic) ---")
print_reg_metrics("VAL",  y_val_raw,  yhat_val)
print_reg_metrics("TEST", y_test_raw, yhat_test)

# Oracle routing (true operator): regression error without classifier mistakes
yhat_val_or  = route_and_predict_param_value(X_val_s,  df_val,  y_val_cls)
yhat_test_or = route_and_predict_param_value(X_test_s, df_test, y_test_cls)

print("\n--- Regression with TRUE classes (oracle routing) ---")
print_reg_metrics("VAL-oracle",  y_val_raw,  yhat_val_or)
print_reg_metrics("TEST-oracle", y_test_raw, yhat_test_or)

# -------------------------
# 5) Save bundle (include normalization metadata)
# -------------------------
# everything needed at inference: classifier, per-operator (regressor, target scaler)
# pairs, the class order, and how to turn param_norm back into real units
bundle = {
    "classifier": clf_cls,
    "regressors_by_class": regressors,
    "class_names": list(map(str, class_names)),
    "use_log1p": USE_LOG1P,
    "target": "param_norm",
    "normalization": {
        "type": "dynamic_extent",
        "distance_ops": list(DISTANCE_OPS),
        "area_ops": list(AREA_OPS),
        "distance_ref_col": "extent_diag_m",
        "area_ref_col": "extent_area_m2",
    },
    "cv_summary": search_summaries,
}

out_dir = Path(PATHS.TRAIN_OUT)
out_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(bundle, out_dir / "cls_plus_regressors.joblib")
# The oracle metrics are reported once above; the repeated report that used
# to follow the save printed the same numbers again and left a stray tuple
# as the cell's output.
print(f"\n‚úÖ Saved classification+regression bundle to: {out_dir / 'cls_plus_regressors.joblib'}")


Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'simplify' (predicting param_norm) ===
best CV RMSE (scaled): 1.2821859869441417
best CV RMSE (param_norm units): 0.0051663700097502215
best params:
{'alpha': np.float64(0.0041619125396912095),
 'hidden_layer_sizes': (64,),
 'learning_rate_init': np.float64(0.00010558059144381523)}
Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'select' (predicting param_norm) ===
best CV RMSE (scaled): 0.679893022270926
best CV RMSE (param_norm units): 0.00026332285975812415
best params:
{'alpha': np.float64(2.1453931225439485e-06),
 'hidden_layer_sizes': (128,),
 'learning_rate_init': np.float64(0.0001483039268456802)}
Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'aggregate' (predicting param_norm) ===
best CV RMSE (scaled): 1.0448835330852817
best CV RMSE (param_norm units): 0.003430202896463292
best params:
{'alpha': np.float64(0.00

(11.37863762165307, 22.556855347571947)

In [17]:
import pandas as pd

def per_operator_reg_report(split_name, df_s, y_true_raw, y_pred_raw, op_idx, class_names):
    """Build a per-operator table of regression errors for one split.

    Rows are grouped by ``op_idx`` (an index into ``class_names``); only rows
    where truth and prediction are both finite are scored, and operators
    without any such row are left out. ``df_s`` is accepted but not used.

    Returns:
        pandas.DataFrame with columns split, operator, n, MAE, RMSE — one row
        per reported operator.
    """
    both_finite = np.isfinite(y_true_raw) & np.isfinite(y_pred_raw)
    records = []
    for k, op in enumerate(class_names):
        selected = (op_idx == k) & both_finite
        n_rows = int(selected.sum())
        if not n_rows:
            continue
        truth, pred = y_true_raw[selected], y_pred_raw[selected]
        records.append(dict(
            split=split_name,
            operator=str(op),
            n=n_rows,
            MAE=mean_absolute_error(truth, pred),
            RMSE=float(np.sqrt(mean_squared_error(truth, pred))),
        ))
    return pd.DataFrame(records)

# -------- realistic: rows grouped by the PREDICTED operator --------
df_val_perop_real = per_operator_reg_report(
    split_name="VAL_realistic_by_pred_op",
    df_s=df_val,
    y_true_raw=y_val_raw,
    y_pred_raw=yhat_val,
    op_idx=val_pred_cls,
    class_names=class_names,
)
df_test_perop_real = per_operator_reg_report(
    split_name="TEST_realistic_by_pred_op",
    df_s=df_test,
    y_true_raw=y_test_raw,
    y_pred_raw=yhat_test,
    op_idx=test_pred_cls,
    class_names=class_names,
)

# -------- oracle: rows grouped by the TRUE operator --------
df_val_perop_or = per_operator_reg_report(
    split_name="VAL_oracle_by_true_op",
    df_s=df_val,
    y_true_raw=y_val_raw,
    y_pred_raw=yhat_val_or,
    op_idx=y_val_cls,
    class_names=class_names,
)
df_test_perop_or = per_operator_reg_report(
    split_name="TEST_oracle_by_true_op",
    df_s=df_test,
    y_true_raw=y_test_raw,
    y_pred_raw=yhat_test_or,
    op_idx=y_test_cls,
    class_names=class_names,
)

# stack the four reports into one table ordered by split, then operator
df_perop = (
    pd.concat(
        [df_val_perop_real, df_test_perop_real, df_val_perop_or, df_test_perop_or],
        ignore_index=True,
    )
    .sort_values(["split", "operator"])
    .reset_index(drop=True)
)

display(df_perop)


Unnamed: 0,split,operator,n,MAE,RMSE
0,TEST_oracle_by_true_op,aggregate,22,1.647941,2.411753
1,TEST_oracle_by_true_op,displace,12,2.026431,2.185604
2,TEST_oracle_by_true_op,select,25,30.494501,39.160429
3,TEST_oracle_by_true_op,simplify,17,2.461298,2.92982
4,TEST_realistic_by_pred_op,aggregate,21,1.691911,2.463439
5,TEST_realistic_by_pred_op,displace,12,2.026431,2.185604
6,TEST_realistic_by_pred_op,select,24,30.553488,39.52473
7,TEST_realistic_by_pred_op,simplify,19,7.572395,22.373621
8,VAL_oracle_by_true_op,aggregate,11,1.706704,2.266044
9,VAL_oracle_by_true_op,displace,6,1.396595,1.551651
