# MapVec end-to-end pipeline 📒

This notebook runs the **entire pipeline**:
1. Prompt embeddings (Universal Sentence Encoder)
2. Map embeddings (handcrafted polygon features)
3. Concatenation into a training matrix
4. Helper cells to inspect vectors by `prompt_id` or `map_id`

**Edit the Parameters** in the next cell to match your project layout.


In [4]:
# ===================== PARAMETERS =====================
from pathlib import Path
import sys
import subprocess
import importlib
import shutil
import numpy as np
import pandas as pd
import shlex
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, mean_squared_error, mean_absolute_error
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# "../" is resolved against the notebook's current working directory, so the
# result depends on where the kernel was started.
PROJ_ROOT = Path("../").resolve()       # adjust if your notebook sits elsewhere
SRC_DIR   = PROJ_ROOT / "src"
# Put project root on sys.path
if str(PROJ_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJ_ROOT))

# Must stay after the sys.path tweak above: `src` is imported as a package
# relative to PROJ_ROOT.
from src.config import PATHS, CFG, print_summary

In [5]:
print_summary()  # optional

# Access paths like:
MAPS_ROOT = PATHS.MAPS_ROOT
INPUT_MAPS_PATTERN = PATHS.INPUT_MAPS_PATTERN
PROMPTS_CSV = PATHS.PROMPTS_CSV
PRM_NPZ = PATHS.PRM_NPZ

# Dims (auto-inferred if available; else None until you set them)
# `x or fallback` substitutes the fallback for any falsy value (None or 0).
# NOTE(review): the logged concat step reports map_dim=249 / X width 761, so the
# 996 fallback (996 + 512 = 1508) looks stale; a later cell hard-codes
# MAP_DIM = 249 — confirm which value is authoritative.
MAP_DIM = CFG.MAP_DIM or 996         # fallback
PROMPT_DIM = CFG.PROMPT_DIM or 512   # fallback
FUSED_DIM = CFG.FUSED_DIM or (MAP_DIM + PROMPT_DIM)
BATCH_SIZE = CFG.BATCH_SIZE

=== CONFIG SUMMARY ===
PROJ_ROOT  : /Users/amirdonyadide/Documents/GitHub/Thesis
DATA_DIR   : /Users/amirdonyadide/Documents/GitHub/Thesis/data
INPUT_DIR  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input
OUTPUT_DIR : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output
MAPS_ROOT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/samples/pairs
INPUT PAT. : *_input.geojson
PROMPTS_CSV: /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv
PAIRS_CSV  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/pairs.csv
PROMPT_OUT : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
MAP_OUT    : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out
TRAIN_OUT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out
MODEL_OUT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/models
SPLIT_OUT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out/splits
PRM_NPZ    : /Users/amirdonyadide/Document

In [69]:
# Destructive: per the logged output this removes and recreates the output
# folders (prompt_out, map_out, train_out, models), so previously generated
# artifacts are gone after this cell runs.
PATHS.clean_outputs()

üßπ Removing old directory: /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
üßπ Removing old directory: /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out
üßπ Removing old directory: /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out
üßπ Removing old directory: /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/models
‚úÖ All output folders cleaned and recreated fresh.



## 1) Prompt embeddings
Runs `src/mapvec/prompts/prompt_embeddings.py` using your chosen USE model and saves artifacts to `PROMPT_OUT`.

In [70]:
# Build the CLI call as an argv list; it is executed without a shell, so paths
# containing spaces are passed through unchanged.
cmd = [
    sys.executable, "-m", "src.mapvec.prompts.prompt_embeddings",
    "--input",    str(PATHS.PROMPTS_CSV),
    "--model",    str(CFG.USE_MODEL),
    "--l2",
    "--out_dir",  str(PATHS.PROMPT_OUT),
    "-v"
]
# shlex.join quotes every argument, so the echoed command can be pasted into a
# shell verbatim (a plain " ".join breaks on spaces and glob characters).
print("CMD:", shlex.join(cmd))
res = subprocess.run(cmd, cwd=str(PATHS.PROJ_ROOT))  # shell=False by default
if res.returncode != 0:
    # Stop here: later cells need the artifacts this step writes.
    raise SystemExit("Prompt embedding step failed.")
print("Prompt embeddings completed.")

CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.prompts.prompt_embeddings --input /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv --model dan --l2 --out_dir /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out -v


18:12:46 | DEBUG | FILE_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/src/mapvec/prompts
18:12:46 | DEBUG | PROJECT_ROOT=/Users/amirdonyadide/Documents/GitHub/Thesis
18:12:46 | DEBUG | DEFAULT_DATA_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data
18:12:46 | INFO | DATA_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data
18:12:46 | INFO | INPUT=/Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv
18:12:46 | INFO | OUT_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
18:12:46 | INFO | Reading CSV: /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv
18:12:46 | INFO | Loaded 500 prompts (id_col=prompt_id). Sample IDs: p001, p002, p003‚Ä¶
18:12:46 | INFO | Using local USE-dan at /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/model_dan
18:12:46 | INFO | Loading USE-dan from local path: /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/model_dan ‚Ä¶
18:12:50 | INFO | Fingerprint not found. Saved model loading will 

Prompt embeddings completed.


## 2) Map embeddings
Runs the map embedding module on the GeoJSON inputs. Skips problematic features, logs warnings, and writes `maps_embeddings.npz` to `MAP_OUT`.

In [71]:
# Build the CLI call as an argv list; it is executed without a shell, so the
# glob pattern reaches the module unexpanded.
cmd = [
    sys.executable, "-m", "src.mapvec.maps.map_embeddings",
    "--root", str(PATHS.MAPS_ROOT),
    "--pattern", str(PATHS.INPUT_MAPS_PATTERN),
    "--out_dir", str(PATHS.MAP_OUT),
    "-v"
]
# shlex.join quotes the pattern (e.g. '*_input.geojson'); with a plain
# " ".join the echoed command would be glob-expanded if pasted into a shell.
print("CMD:", shlex.join(cmd))
res = subprocess.run(cmd, cwd=str(PATHS.PROJ_ROOT))
if res.returncode != 0:
    # Stop here: the concatenation step needs the map embeddings.
    raise SystemExit("Map embedding step failed.")
print("‚úÖ Input map embeddings completed.")

CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.maps.map_embeddings --root /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/samples/pairs --pattern *_input.geojson --out_dir /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out -v


18:12:56 | DEBUG | PROJECT_ROOT=/Users/amirdonyadide/Documents/GitHub/Thesis
18:12:56 | DEBUG | DATA_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data
18:12:56 | INFO | Scanning /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/samples/pairs (pattern=*_input.geojson)‚Ä¶
18:12:58 | INFO | OK  map_id=0073  -> vector[249]
18:12:59 | INFO | OK  map_id=0080  -> vector[249]
18:13:00 | INFO | OK  map_id=0093  -> vector[249]
18:13:03 | INFO | OK  map_id=0122  -> vector[249]
18:13:04 | INFO | OK  map_id=0123  -> vector[249]
18:13:05 | INFO | OK  map_id=0127  -> vector[249]
18:13:06 | INFO | OK  map_id=0158  -> vector[249]
18:13:08 | INFO | OK  map_id=0159  -> vector[249]
18:13:09 | INFO | OK  map_id=0160  -> vector[249]
18:13:10 | INFO | OK  map_id=0165  -> vector[249]
18:13:11 | INFO | OK  map_id=0167  -> vector[249]
18:13:12 | INFO | OK  map_id=0168  -> vector[249]
18:13:13 | INFO | OK  map_id=0171  -> vector[249]
18:13:16 | INFO | OK  map_id=0208  -> vector[249]
18:13:20 | INFO | O

‚úÖ Input map embeddings completed.


18:20:43 | INFO | OK  map_id=1757  -> vector[249]
18:20:43 | INFO | Saved 300 vectors (failed=0) to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out


## 3) Concatenate
Joins map & prompt vectors using `pairs.csv` and writes `X_concat.npy` and `train_pairs.parquet` to `TRAIN_OUT`.

In [72]:
# Build the CLI call that joins map and prompt vectors according to pairs.csv.
cmd = [
    sys.executable, "-m", "src.mapvec.concat.concat_embeddings",
    "--pairs",      str(PATHS.PAIRS_CSV),
    "--map_npz",    str(PATHS.MAP_OUT / "maps_embeddings.npz"),   # single-map embeddings
    "--prompt_npz", str(PATHS.PROMPT_OUT / "prompts_embeddings.npz"),
    "--out_dir",    str(PATHS.TRAIN_OUT),
    "--drop_dupes",          # optional flag (keep if you want)
    # "--fail_on_missing",   # optional: uncomment if you prefer hard failure on missing IDs
]

# shlex.join quotes each argument so the echoed command is safe to copy-paste.
print("CMD:", shlex.join(cmd))
res = subprocess.run(cmd, cwd=str(PATHS.PROJ_ROOT))  # run from project root
if res.returncode != 0:
    # Stop here: the training cells load the files this step writes.
    raise SystemExit("Concatenation step failed.")
print("‚úÖ Concatenation completed successfully.")

CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.concat.concat_embeddings --pairs /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/pairs.csv --map_npz /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out/maps_embeddings.npz --prompt_npz /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out/prompts_embeddings.npz --out_dir /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out --drop_dupes
‚úÖ Concatenation completed successfully.


18:20:56 | INFO | Map  embeddings: (300, 249) from /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out/maps_embeddings.npz
18:20:56 | INFO | Prompt embeddings: (500, 512) from /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out/prompts_embeddings.npz
18:20:56 | INFO | X shape = (450, 761)  (map_dim=249, prompt_dim=512)
18:20:56 | INFO | Saved to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out in 0.02s


## 4) Load + basic cleaning 

In [73]:
# Load the fused feature matrix and the pair table written by the concat step.
X = np.load(PATHS.TRAIN_OUT / "X_concat.npy")
pairs_df = pd.read_parquet(PATHS.TRAIN_OUT / "train_pairs.parquet")
print(f"Loaded X: {X.shape}, pairs: {pairs_df.shape}")

# The boolean row mask built from pairs_df is applied to X below, so both must
# have the same number of rows.
if len(X) != len(pairs_df):
    raise ValueError(
        f"Row mismatch: X has {len(X)} rows but pairs has {len(pairs_df)} rows."
    )

# Basic cleaning (lowercase + remove missing operator/param)
OP_COL = "operator"
PARAM_COLS = ["param"]

df = pairs_df.copy()

# Build the missing-value mask BEFORE the string cast: astype(str) turns NaN
# into the literal "nan", after which notna() can no longer detect it.
mask = df[OP_COL].notna()
for c in PARAM_COLS:
    mask &= df[c].notna()

# Normalize operator names (rows with a missing operator are dropped below).
df[OP_COL] = df[OP_COL].astype(str).str.strip().str.lower()

X = X[mask.values].astype("float32", copy=False)
df = df.loc[mask].reset_index(drop=True)
print(f"After cleaning: {X.shape}, {df.shape}")

Loaded X: (450, 761), pairs: (450, 4)
After cleaning: (450, 761), (450, 4)


## 5) Split + target encoding + Feature preprocessing (fit only on train)

In [74]:
# ========== SPLIT + TARGETS ==========
FIXED_CLASSES = ["simplify", "select", "aggregate", "displace"]

# --- Split into train / val / test (stratified by operator when more than one
#     operator is present)
X_train, X_temp, df_train, df_temp = train_test_split(
    X, df,
    test_size=CFG.VAL_RATIO + CFG.TEST_RATIO,
    random_state=CFG.SEED,
    shuffle=True,
    stratify=df[OP_COL] if df[OP_COL].nunique() > 1 else None
)
# Share of the held-out part that becomes the test set.
rel_test = CFG.TEST_RATIO / (CFG.VAL_RATIO + CFG.TEST_RATIO)
X_val, X_test, df_val, df_test = train_test_split(
    X_temp, df_temp,
    test_size=rel_test,
    random_state=CFG.SEED,
    shuffle=True,
    stratify=df_temp[OP_COL] if df_temp[OP_COL].nunique() > 1 else None
)
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# --- Encode targets
# The encoder is fitted on the fixed class list, so label ids do not depend on
# which operators happen to occur in a given split.
le = LabelEncoder().fit(FIXED_CLASSES)
# Class names in encoded order (position i <-> label id i). Previously `classes`
# was used below and in later cells without ever being defined.
classes = le.classes_.tolist()

y_train_cls = le.transform(df_train[OP_COL])
y_val_cls   = le.transform(df_val[OP_COL])
y_test_cls  = le.transform(df_test[OP_COL])

y_train_reg = df_train[PARAM_COLS].to_numpy(dtype="float32")
y_val_reg   = df_val[PARAM_COLS].to_numpy(dtype="float32")
y_test_reg  = df_test[PARAM_COLS].to_numpy(dtype="float32")

# ========== IMPUTE + SCALE (TRAIN-ONLY FIT) ==========
# In-place on the split arrays: mark +/-inf as NaN so the imputer handles them.
for X_ in (X_train, X_val, X_test):
    X_[~np.isfinite(X_)] = np.nan  # replace inf -> NaN

# Imputer statistics come from the training split only.
imp = SimpleImputer(strategy="median")
X_train_imp = imp.fit_transform(X_train)
X_val_imp   = imp.transform(X_val)
X_test_imp  = imp.transform(X_test)

# Scaler statistics come from the training split only.
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(5, 95))
X_train_s = scaler.fit_transform(X_train_imp)
X_val_s   = scaler.transform(X_val_imp)
X_test_s  = scaler.transform(X_test_imp)

assert np.isfinite(X_train_s).all() and np.isfinite(X_val_s).all() and np.isfinite(X_test_s).all(), \
    "Non-finite values after scaling."

print("‚úÖ Preprocessing complete ‚Äî data ready for model training")

# Class weights for imbalance; per-sample weights are looked up by label id.
cls_w    = compute_class_weight("balanced", classes=np.arange(len(classes)), y=y_train_cls)
sample_w = np.array([cls_w[c] for c in y_train_cls], dtype="float32")
print("Class weights:", dict(zip(classes, cls_w)))


Train: (315, 761), Val: (67, 761), Test: (68, 761)
‚úÖ Preprocessing complete ‚Äî data ready for model training
Class weights: {'aggregate': np.float64(0.9264705882352942), 'displace': np.float64(1.0361842105263157), 'select': np.float64(0.984375), 'simplify': np.float64(1.0641891891891893)}


In [None]:
import numpy as np
import pandas as pd
import re
import json

# --- inputs you already have ---
# X_train_s (fused) is after downstream scaling; for sanity of the raw map embeddings,
# this cell looks at the unscaled map block taken from X_train.

# Load the real feature names (saved by the map step in map_out/feature_names.json).
feat_path = Path(PATHS.MAP_OUT) / "feature_names.json"
map_feat_names = np.array(json.loads(feat_path.read_text()))

# The map block occupies the leading columns of the fused matrix. Its width is
# taken from the feature-name list instead of a hard-coded 249, because every
# report below indexes map_feat_names by column position.
n_map_feats = len(map_feat_names)
assert 0 < n_map_feats <= X_train.shape[1], (
    f"feature_names.json lists {n_map_feats} names but X_train has "
    f"{X_train.shape[1]} columns."
)
X_map = X_train[:, :n_map_feats]

def family_of(name: str):
    """Split a feature name into (base family, statistic suffix).

    Names of the form ``<base>__<stat>`` with ``<stat>`` in
    mean/min/max/std/q25/q50/q75 yield ``(base, stat)``; the statistic is
    returned without the ``__`` separator. Any other name is returned
    unchanged together with an empty statistic.
    """
    hit = re.match(r"^([^_]+(?:_[^_]+)*)__(mean|min|max|std|q25|q50|q75)$", name)
    if hit is None:
        return (name, "")
    base, stat = hit.group(1), hit.group(2)
    return (base, stat)

# 1) Basic integrity: shape/dtype, share of finite entries, columns with NaN
print("shape:", X_map.shape, "dtype:", X_map.dtype)
print("finite %:", np.mean(np.isfinite(X_map)) * 100)
nan_cols = np.sum(np.any(np.isnan(X_map), axis=0))
print("cols having any NaN:", nan_cols)

# 2) Zero/near-zero variance columns (uninformative)
stds = np.nanstd(X_map, axis=0)
zz = np.sum(stds < 1e-12)
print("zero-variance cols:", zz)

# 3) Per-family expected ranges.
#    The caps assume area was normalized to [0,1] then log1p'ed (max log(2) ~= 0.693)
#    and lengths/distances were divided by the diagonal then log1p'ed.
#    NOTE(review): the logged maxima (e.g. 222 for length families) suggest X_map
#    is still un-normalized at this point — confirm where normalization happens.
families_log_cap = dict(
    area=0.70,  # safety margin over log(2)=0.693
)
len_fams = {
    "perimeter",
    "eq_diameter",
    "bbox_width",
    "bbox_height",
    "nn_dist_min",
    "nn_dist_median",
    "nn_dist_max",
    "knn1",
    "knn2",
    "knn3",
    "mean_neighbor_distance_touches",
    "mean_neighbor_distance_intersects",
}
count_fams = {
    "vertex_count",
    "neighbor_count_touches",
    "neighbor_count_intersects",
    "hole_count",
    "density_r05",
    "density_r10",
    "reflex_count",
}
ratio_fams = {
    "compactness",
    "circularity",
    "elongation",
    "convexity",
    "rectangularity",
    "straightness",
    "bbox_aspect",
    "extent",
    "eccentricity",
    "hole_area_ratio",
    "reflex_ratio",
}
coord_fams = {"centroid_x", "centroid_y"}  # bbox-normalized (approx. [0,1])

def colmask(families):
    """Return a boolean array flagging the map columns whose base family is in `families`.

    The array has one entry per name in the global `map_feat_names`; the base
    family is the first element returned by `family_of`.
    """
    flags = []
    for feat_name in map_feat_names:
        base = family_of(feat_name)[0]
        flags.append(base in families)
    return np.array(flags)

# Hard checks
def assert_max(mask, cap, label):
    """Report the largest value among the masked X_map columns against `cap`.

    Does nothing when the mask selects no column. A non-finite maximum fails an
    assertion; exceeding `cap` only prints a warning so the flow keeps running.
    """
    if not mask.any():
        return
    vmax = np.nanmax(X_map[:, mask])
    print(f"{label}: max={vmax:.3f}  (cap={cap})")
    assert np.isfinite(vmax)
    # soft warning rather than an assert, so the notebook is not interrupted
    if vmax > cap:
        print(f"‚ö†Ô∏è {label} exceeds expected cap {cap:.2f}. Inspect tails/features.")

# Area cap
assert_max(
    mask=colmask({"area"}),
    cap=families_log_cap["area"],
    label="area (log1p of [0,1])",
)

# Length/distance families: expected log1p(value/diag), usually < ~2.5; warn above 4.
assert_max(
    mask=colmask(len_fams),
    cap=4.0,
    label="length/distance families (log1p normalized)",
)

# Counts: expected log1p(count), usually small; warn above 6.
assert_max(
    mask=colmask(count_fams),
    cap=6.0,
    label="count families (log1p)",
)

# Ratios: scale-free (not logged). Should generally be bounded: extent in [0,1], bbox_aspect>=0, etc.
def describe_family(fams, name):
    """Print the absolute maximum and the 99th percentile of |value| for the given families.

    Prints nothing when no column of X_map belongs to `fams`.
    """
    selected = colmask(fams)
    if not selected.any():
        return
    magnitudes = np.abs(X_map[:, selected])
    abs_max = np.nanmax(magnitudes)
    p99 = np.nanpercentile(magnitudes, 99)
    print(f"{name}: abs max={abs_max:.3f}  p99={p99:.3f}")

describe_family(fams=ratio_fams, name="ratios")
describe_family(fams=coord_fams, name="centroid coords")

# 4) Ten columns with the largest absolute value (to spot remaining blow-ups)
absmax = np.nanmax(np.absolute(X_map), axis=0)
idx = np.argsort(-absmax)[:10]  # descending by |value|
print("\nTop-10 |value| columns:")
for j in idx:
    print(
        f"{absmax[j]:10.3f}  {map_feat_names[j]}"
    )


shape: (315, 249) dtype: float32
finite %: 100.0
cols having any NaN: 0
zero-variance cols: 24
area (log1p of [0,1]): max=0.122  (cap=0.7)
length/distance families (log1p normalized): max=222.054  (cap=4.0)
‚ö†Ô∏è length/distance families (log1p normalized) exceeds expected cap 4.00. Inspect tails/features.
count families (log1p): max=98.000  (cap=6.0)
‚ö†Ô∏è count families (log1p) exceeds expected cap 6.00. Inspect tails/features.
ratios: abs max=22876.488  p99=19.147
centroid coords: abs max=0.998  p99=0.992

Top-10 |value| columns:
 22876.488  eccentricity__max
  1411.121  eccentricity__std
   222.054  knn3__max
   221.796  knn2__max
   179.998  orientation__max
   178.490  orientation__q75
   170.792  orientation__q50
   150.490  nn_dist_max__max
   150.490  nn_dist_median__max
   149.664  mean_neighbor_distance_touches__max


In [78]:
def split_map(X, n_map=249):
    """Return the first `n_map` columns of `X` (the map block of the fused matrix)."""
    map_block = X[:, :n_map]
    return map_block

def summarize(X):
    """Return per-column (median, interquartile range) of `X`, ignoring NaNs."""
    q25, q75 = (np.nanpercentile(X, q, axis=0) for q in (25, 75))
    return np.nanmedian(X, axis=0), q75 - q25

# Map block of each split (train / val / test).
Xtr_map, Xva_map, Xte_map = (
    split_map(block) for block in (X_train, X_val, X_test)
)

# Per-column median and IQR of each split's map block.
(med_tr, iqr_tr), (med_va, iqr_va), (med_te, iqr_te) = (
    summarize(block) for block in (Xtr_map, Xva_map, Xte_map)
)

def report_drift(med_a, iqr_a, med_b, iqr_b, label):
    """Print the ten features whose median moved most between split A and split B.

    The shift is |med_b - med_a| divided by split A's IQR (floored at 1e-6 to
    avoid division by zero). `iqr_b` is accepted but not used.
    """
    eps = 1e-6
    scale = np.maximum(iqr_a, eps)
    rel = np.abs(med_b - med_a) / scale
    print(f"\nTop-10 median shifts ({label}) normalized by train IQR:")
    for j in np.argsort(-rel)[:10]:
        print(f"{rel[j]:7.3f}  {map_feat_names[j]}  med_tr={med_a[j]:.3f}  med_{label}={med_b[j]:.3f}  IQR_tr={iqr_a[j]:.3f}")

# Train vs validation, then train vs test.
report_drift(med_a=med_tr, iqr_a=iqr_tr, med_b=med_va, iqr_b=iqr_va, label="val")
report_drift(med_a=med_tr, iqr_a=iqr_tr, med_b=med_te, iqr_b=iqr_te, label="test")



Top-10 median shifts (val) normalized by train IQR:
  0.368  nn_dist_min__q50  med_tr=3.405  med_val=4.306  IQR_tr=2.449
  0.333  density_r10__min  med_tr=2.000  med_val=1.000  IQR_tr=3.000
  0.333  density_r05__q50  med_tr=8.000  med_val=7.000  IQR_tr=3.000
  0.294  density_r10__max  med_tr=54.000  med_val=49.000  IQR_tr=17.000
  0.251  angle_std__min  med_tr=0.000  med_val=0.000  IQR_tr=0.001
  0.250  density_r05__max  med_tr=21.000  med_val=19.000  IQR_tr=8.000
  0.250  density_r05__q75  med_tr=11.000  med_val=10.000  IQR_tr=4.000
  0.244  convexity__q25  med_tr=0.975  med_val=0.983  IQR_tr=0.031
  0.232  eq_diameter__q25  med_tr=0.010  med_val=0.011  IQR_tr=0.003
  0.228  centroid_y__min  med_tr=0.018  med_val=0.020  IQR_tr=0.009

Top-10 median shifts (test) normalized by train IQR:
  0.345  elongation__q25  med_tr=1.054  med_test=1.002  IQR_tr=0.151
  0.277  map_bbox_h  med_tr=0.964  med_test=0.958  IQR_tr=0.019
  0.255  nn_dist_median__q50  med_tr=4.508  med_test=3.761  IQR_tr=2

In [79]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between each map feature (train split) and the encoded
# operator label; y_train_cls comes from the LabelEncoder.
mi = mutual_info_classif(
    Xtr_map,
    y_train_cls,
    discrete_features=False,
    random_state=0,
)
top = np.argsort(-mi)[:20]  # descending by MI
print("\nTop-20 features by mutual information with class:")
for j in top:
    print(
        f"MI={mi[j]:.4f}  {map_feat_names[j]}"
    )



Top-20 features by mutual information with class:
MI=0.4162  nn_dist_median__q75
MI=0.4147  nn_dist_min__mean
MI=0.4080  orientation__max
MI=0.3770  knn3__q50
MI=0.3726  neighbor_count_touches__std
MI=0.3694  rectangularity__q50
MI=0.3686  knn1__q50
MI=0.3679  knn1__max
MI=0.3654  bbox_height__q25
MI=0.3606  eq_diameter__mean
MI=0.3599  eq_diameter__q75
MI=0.3591  elongation__q50
MI=0.3577  angle_std__q75
MI=0.3544  eccentricity__min
MI=0.3485  centroid_x__q50
MI=0.3481  nn_dist_max__std
MI=0.3478  bbox_aspect__q50
MI=0.3476  map_bbox_w
MI=0.3402  centroid_y__min
MI=0.3391  area__std


In [80]:
import numpy as np
import re

# 1) Split fused features -> map (first 249 columns) + text (rest).
#    Each split gets its own float32 copy, so later edits never touch X_train/X_val/X_test.
MAP_DIM = 249
Xtr_map, Xva_map, Xte_map = (
    fused[:, :MAP_DIM].astype(np.float32, copy=True)
    for fused in (X_train, X_val, X_test)
)

# Feature names must be the exact 249 map names, in column order.
feat_path = Path(PATHS.MAP_OUT) / "feature_names.json"
map_feat_names = np.array(json.loads(feat_path.read_text()))

def family_of(name: str):
    """Split a feature name into (base family, statistic suffix).

    ``<base>__<stat>`` with ``<stat>`` in mean/min/max/std/q25/q50/q75 gives
    ``(base, stat)`` — the statistic carries no ``__`` prefix. Names that do
    not follow this pattern are returned as ``(name, "")``.
    """
    hit = re.match(r"^([^_]+(?:_[^_]+)*)__(mean|min|max|std|q25|q50|q75)$", name)
    if hit is None:
        return (name, "")
    base, stat = hit.group(1), hit.group(2)
    return (base, stat)

# Per-column base family and statistic suffix, aligned with map_feat_names.
bases, stats = (
    np.array(part) for part in zip(*map(family_of, map_feat_names))
)

# 2) Column positions of the global bbox width/height used as per-sample scale.
ix_w = int(np.flatnonzero(map_feat_names == "map_bbox_w")[0])
ix_h = int(np.flatnonzero(map_feat_names == "map_bbox_h")[0])

def diag_from_row(row):
    """Return the bbox diagonal of one sample as a Python float.

    Width and height are read from columns `ix_w` / `ix_h` and floored at 1e-12.
    """
    width, height = (max(row[i], 1e-12) for i in (ix_w, ix_h))
    return float((width * width + height * height) ** 0.5)

def area_from_row(row):
    """Return the bbox area (width * height) of one sample as a Python float.

    Width and height are read from columns `ix_w` / `ix_h` and floored at 1e-12.
    """
    width, height = (max(row[i], 1e-12) for i in (ix_w, ix_h))
    return float(width * height)

# Families measured in map units (normalized by the bbox diagonal later).
len_fams = {
    "perimeter","eq_diameter","bbox_width","bbox_height",
    "nn_dist_min","nn_dist_median","nn_dist_max",
    "knn1","knn2","knn3",
    "mean_neighbor_distance_touches","mean_neighbor_distance_intersects",
}
# Families holding counts (log1p-compressed later).
count_fams = {
    "vertex_count","neighbor_count_touches","neighbor_count_intersects",
    "hole_count","density_r05","density_r10","reflex_count",
}
area_fams = {"area"}  # extent is already normalized

# Boolean column masks, aligned with map_feat_names via `bases`.
len_mask   = np.isin(bases, list(len_fams))
count_mask = np.isin(bases, list(count_fams))
area_mask  = np.isin(bases, list(area_fams))

# Don't touch std columns when logging (std can be 0 and negative not applicable).
# family_of() returns the statistic WITHOUT the "__" separator, so the value to
# match is "std"; comparing against "__std" never matched and left every std
# column in the log1p set.
is_std = (stats == "std")

def normalize_and_log(X):
    """Return a normalized, log-compressed copy of a map-feature block.

    Steps, applied to a copy (the input is not modified):
      1. length/distance columns (`len_mask`) are divided by the per-row bbox diagonal;
      2. area columns (`area_mask`) are divided by the per-row bbox area;
      3. count and length columns, except those flagged by `is_std`, are replaced
         by log1p(value); entries that are negative or non-finite become 0.0.

    Winsorizing is NOT done here; the caller clips with train-derived caps.

    Parameters
    ----------
    X : 2-D array whose columns follow the order of the global masks.

    Returns
    -------
    Array with the same shape as X.
    """
    X = X.copy()
    # per-row diag and area, as column vectors for broadcasting
    diags = np.array([diag_from_row(r) for r in X], dtype=np.float32)[:, None]
    areas = np.array([area_from_row(r) for r in X], dtype=np.float32)[:, None]

    # lengths/distances: divide by diag
    if len_mask.any():
        X[:, len_mask] = X[:, len_mask] / np.clip(diags, 1e-12, None)

    # areas: divide by bbox area
    if area_mask.any():
        X[:, area_mask] = X[:, area_mask] / np.clip(areas, 1e-12, None)

    # log1p: counts + (now normalized) lengths/distances; skip std columns
    log_mask = (count_mask | len_mask) & (~is_std)
    if log_mask.any():
        A = X[:, log_mask]
        valid = np.isfinite(A) & (A >= 0)
        # Evaluate log1p only where it is defined; everything else stays 0.0.
        # This avoids the RuntimeWarnings raised by log1p on values <= -1.
        logged = np.zeros(A.shape, dtype=np.result_type(A.dtype, np.float32))
        np.log1p(A, out=logged, where=valid)
        X[:, log_mask] = logged

    return X

# Winsorizing caps (1st / 99th percentile per column) are estimated on the
# transformed TRAIN block only, then applied to every split.
Xtr_tmp = normalize_and_log(Xtr_map)
lo, hi = (np.percentile(Xtr_tmp, q, axis=0) for q in (1, 99))

def win(A):
    """Clip each column of `A` to the train-derived [lo, hi] caps."""
    return np.clip(A, lo, hi)

Xtr_map_sane, Xva_map_sane, Xte_map_sane = (
    win(normalize_and_log(block)) for block in (Xtr_map, Xva_map, Xte_map)
)

# Drop columns that are constant on train (decision made on train only).
stds = np.std(Xtr_map_sane, axis=0)
keep = stds > 1e-12
print("Dropping", (~keep).sum(), "zero-variance map columns")
Xtr_map_sane, Xva_map_sane, Xte_map_sane = (
    block[:, keep] for block in (Xtr_map_sane, Xva_map_sane, Xte_map_sane)
)

# sanity peek
print("After sanitize: max|train| =", np.max(np.abs(Xtr_map_sane)))


Dropping 24 zero-variance map columns
After sanitize: max|train| = 272.45865


In [81]:
# Hyper-parameters shared by the primary (adam) model and the lbfgs fallback.
_mlp_shared = dict(
    hidden_layer_sizes=(128, 64),   # smaller net for 450 samples
    activation="relu",
    alpha=1e-3,                     # L2 regularization
    tol=1e-4,
    random_state=CFG.SEED,
    verbose=True,
)

# Primary model: adam with early stopping on an internal 15% hold-out.
# NOTE(review): the logged run shows loss = nan from the first iteration and the
# matrix diagnostics report X_train_s values up to ~1e8, so the inputs are not
# O(1) as the learning-rate choice assumes — confirm the scaling.
clf = MLPClassifier(
    solver="adam",
    learning_rate_init=3e-4,
    batch_size=32,                  # smaller batches stabilize updates
    max_iter=300,                   # early stopping will cut it shorter anyway
    early_stopping=True,
    n_iter_no_change=15,
    validation_fraction=0.15,
    **_mlp_shared,
)

try:
    clf.fit(X_train_s, y_train_cls, sample_weight=sample_w)
except Exception as e:
    # Any failure of the adam fit triggers the unweighted lbfgs fallback.
    print(f"‚ö†Ô∏è Adam crashed ({e}). Falling back to lbfgs (no sample_weight).")
    clf = MLPClassifier(
        solver="lbfgs",
        max_iter=500,     # usually converges earlier
        **_mlp_shared,
    )
    clf.fit(X_train_s, y_train_cls)

Iteration 1, loss = nan
Validation score: 0.211241
Iteration 2, loss = nan
Validation score: 0.211241
Iteration 3, loss = nan
Validation score: 0.211241
Iteration 4, loss = nan
Validation score: 0.211241
Iteration 5, loss = nan
Validation score: 0.211241
Iteration 6, loss = nan
Validation score: 0.211241
Iteration 7, loss = nan
Validation score: 0.211241
Iteration 8, loss = nan
Validation score: 0.211241
Iteration 9, loss = nan
Validation score: 0.211241
Iteration 10, loss = nan
Validation score: 0.211241
Iteration 11, loss = nan
Validation score: 0.211241
Iteration 12, loss = nan
Validation score: 0.211241
Iteration 13, loss = nan
Validation score: 0.211241
Iteration 14, loss = nan
Validation score: 0.211241
Iteration 15, loss = nan
Validation score: 0.211241
Iteration 16, loss = nan
Validation score: 0.211241
Iteration 17, loss = nan
Validation score: 0.211241
Validation score did not improve more than tol=0.000100 for 15 consecutive epochs. Stopping.
‚ö†Ô∏è Adam crashed (Solver prod

ABNORMAL: 

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [72]:
def check_matrix(name, X):
    """Print quick diagnostics for matrix `X` under the heading `name`.

    Reports shape/dtype, whether all entries are finite, NaN-ignoring min/max
    and mean/std, and the number of all-NaN and zero-variance columns.
    """
    print(f"--- {name} ---")
    print("shape:", X.shape, "dtype:", X.dtype)
    print("finite:", np.all(np.isfinite(X)))
    print("min/max:", np.nanmin(X), np.nanmax(X))
    print("mean/std:", np.nanmean(X), np.nanstd(X))
    # column-level red flags: entirely NaN, or constant
    col_nan = np.sum(np.all(np.isnan(X), axis=0))
    col_zero_var = np.sum(np.nanstd(X, axis=0) == 0)
    print("all-NaN cols:", col_nan, "zero-variance cols:", col_zero_var)

# Raw splits first, then the imputed and scaled training matrices.
for _label, _matrix in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_test", X_test),
    ("X_train_imp", X_train_imp),
    ("X_train_s", X_train_s),
):
    check_matrix(_label, _matrix)

# Target sanity: NaN labels and the operators actually present in train.
print(
    "y_train_cls has NaN?",
    np.isnan(y_train_cls).any() if hasattr(y_train_cls, "__len__") else False,
)
print("classes present in train:", sorted(set(df_train[OP_COL])))

--- X_train ---
shape: (315, 1508) dtype: float32
finite: True
min/max: -552.0 4054873900.0
mean/std: 24875.152 8824922.0
all-NaN cols: 0 zero-variance cols: 58
--- X_val ---
shape: (67, 1508) dtype: float32
finite: True
min/max: -505.0 96049.9
mean/std: 22.502472 675.2408
all-NaN cols: 0 zero-variance cols: 72
--- X_test ---
shape: (68, 1508) dtype: float32
finite: True
min/max: -552.0 4054873900.0
mean/std: 115146.09 18993522.0
all-NaN cols: 0 zero-variance cols: 67
--- X_train_imp ---
shape: (315, 1508) dtype: float32
finite: True
min/max: -552.0 4054873900.0
mean/std: 24875.152 8824922.0
all-NaN cols: 0 zero-variance cols: 58
--- X_train_s ---
shape: (315, 1508) dtype: float32
finite: True
min/max: -20.723267 142484560.0
mean/std: 1179.2317 357503.12
all-NaN cols: 0 zero-variance cols: 58
y_train_cls has NaN? False
classes present in train: ['aggregate', 'displace', 'select', 'simplify']


In [57]:
# Class names in encoded order, read from the fitted encoder so this cell does
# not rely on a `classes` variable that was never defined in the notebook.
classes = le.classes_.tolist()
label_ids = np.arange(len(classes))

y_val_pred  = clf.predict(X_val_s)
val_acc     = accuracy_score(y_val_cls, y_val_pred)
# zero_division=0 keeps the scores identical to the default but silences the
# undefined-metric warnings for classes that are never predicted.
val_f1m     = f1_score(y_val_cls, y_val_pred, average="macro", zero_division=0)
print(f"[VAL] acc={val_acc:.3f}  f1_macro={val_f1m:.3f}")
# Explicit `labels` keeps the report aligned with `target_names` even when a
# class is missing from both the validation targets and the predictions.
print(classification_report(
    y_val_cls, y_val_pred,
    labels=label_ids, target_names=classes, zero_division=0,
))

[VAL] acc=0.224  f1_macro=0.191
              precision    recall  f1-score   support

   aggregate       0.00      0.00      0.00        17
    displace       0.42      0.42      0.42        12
      select       0.50      0.04      0.08        24
    simplify       0.17      0.64      0.27        14

    accuracy                           0.22        67
   macro avg       0.27      0.28      0.19        67
weighted avg       0.29      0.22      0.16        67



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [58]:
# One parameter regressor per operator class, trained only on that class's rows.
# Class names come from the fitted encoder (position == label id); `classes` and
# `SEED` were previously referenced here without being defined.
classes = le.classes_.tolist()

regressors = {}
for idx, name in enumerate(classes):
    sel = (y_train_cls == idx)
    n = int(sel.sum())
    if n < 5:
        # Too few rows to fit anything meaningful; the class gets no regressor.
        print(f"‚ö†Ô∏è  Skipping regressor for '{name}' (only {n} samples).")
        continue

    reg = MLPRegressor(
        hidden_layer_sizes=(128, 64),
        activation="relu",
        solver="adam",
        alpha=1e-3,
        learning_rate_init=5e-4,
        max_iter=600,
        early_stopping=True,
        n_iter_no_change=30,
        random_state=CFG.SEED,
        verbose=False
    )
    y_sel = y_train_reg[sel]
    # A single target column is passed as 1-D, which avoids sklearn's
    # column-vector DataConversionWarning seen in the logged output.
    if y_sel.ndim == 2 and y_sel.shape[1] == 1:
        y_sel = y_sel.ravel()
    reg.fit(X_train_s[sel], y_sel)
    regressors[name] = reg
    print(f"‚úÖ Regressor trained for '{name}' on {n} samples.")

  y = column_or_1d(y, warn=True)
  activation += self.intercepts_[i]
  y = column_or_1d(y, warn=True)


‚úÖ Regressor trained for 'aggregate' on 84 samples.


  y = column_or_1d(y, warn=True)


‚úÖ Regressor trained for 'displace' on 82 samples.


  y = column_or_1d(y, warn=True)


‚úÖ Regressor trained for 'select' on 77 samples.
‚úÖ Regressor trained for 'simplify' on 72 samples.




In [59]:
# Class names in encoded order, read from the fitted encoder so this cell does
# not rely on a `classes` variable that was never defined in the notebook.
classes = le.classes_.tolist()
label_ids = np.arange(len(classes))

# Classification
y_test_pred = clf.predict(X_test_s)
test_acc    = accuracy_score(y_test_cls, y_test_pred)
# zero_division=0 leaves the scores unchanged and silences undefined-metric warnings.
test_f1m    = f1_score(y_test_cls, y_test_pred, average="macro", zero_division=0)
print(f"[TEST] acc={test_acc:.3f}  f1_macro={test_f1m:.3f}")
# Explicit `labels` keeps the report aligned with `target_names` even when a
# class is absent from both targets and predictions.
print(classification_report(
    y_test_cls, y_test_pred,
    labels=label_ids, target_names=classes, zero_division=0,
))

# Parameter regression (conditioned on correct operator): only rows whose
# operator was predicted correctly are scored.
mask = (y_test_pred == y_test_cls)
print(f"Parameter evaluation on {int(mask.sum())}/{len(mask)} samples with correct operator prediction.")
y_pred_params = np.full_like(y_test_reg, np.nan, dtype="float32")  # (n,1)

for i, ok in enumerate(mask):
    if not ok:
        continue
    cls_name = classes[y_test_pred[i]]
    reg = regressors.get(cls_name)
    if reg is None:
        # No regressor was trained for this class; the row stays NaN.
        continue
    pred = reg.predict(X_test_s[i:i+1])[0]
    y_pred_params[i] = pred if hasattr(pred, "__len__") else [float(pred)]

# Score only rows that were both correct and actually received a prediction.
valid = np.isfinite(y_pred_params).all(axis=1) & mask
if valid.any():
    mse = mean_squared_error(y_test_reg[valid], y_pred_params[valid])
    mae = mean_absolute_error(y_test_reg[valid], y_pred_params[valid])
    print(f"[TEST Param | correct-ops] MSE={mse:.4f}  MAE={mae:.4f}")
else:
    print("No valid parameter predictions to score.")

[TEST] acc=0.279  f1_macro=0.214
              precision    recall  f1-score   support

   aggregate       0.00      0.00      0.00        20
    displace       0.42      0.33      0.37        15
      select       1.00      0.07      0.13        14
    simplify       0.24      0.68      0.35        19

    accuracy                           0.28        68
   macro avg       0.41      0.27      0.21        68
weighted avg       0.36      0.28      0.21        68

Parameter evaluation on 19/68 samples with correct operator prediction.
[TEST Param | correct-ops] MSE=390.9869  MAE=7.3307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [60]:
# `MODEL_OUT` was never defined in the notebook, so this cell failed on a fresh
# kernel. NOTE(review): PATHS.MODEL_OUT is assumed from the "MODEL_OUT" entry in
# the config summary — confirm the attribute name in src/config.py.
MODEL_OUT = Path(PATHS.MODEL_OUT)
MODEL_OUT.mkdir(parents=True, exist_ok=True)  # make sure the target folder exists

# Persist every fitted object needed to reproduce predictions.
joblib.dump(imp,        MODEL_OUT / "imputer.joblib")
joblib.dump(scaler,     MODEL_OUT / "scaler.joblib")
joblib.dump(le,         MODEL_OUT / "label_encoder.joblib")
joblib.dump(clf,        MODEL_OUT / "mlp_classifier.joblib")
joblib.dump(regressors, MODEL_OUT / "per_class_regressors.joblib")
print(f"Models saved to {MODEL_OUT}")

Models saved to ../data/output/models


In [61]:
def predict_pipeline(X_batch, clip_abs=None):
    """Run imputation, scaling, operator classification and parameter regression.

    Parameters
    ----------
    X_batch : array-like, shape (n, n_features) or (n_features,)
        Raw concatenated features with the same column layout the imputer and
        scaler were fitted on. The input is copied and never modified.
    clip_abs : float or None, default None
        If given, scaled features are clipped to [-clip_abs, clip_abs]. The
        default applies no clipping, matching the training cells, which fit the
        models on unclipped scaled features. (The earlier code referenced an
        undefined CLIP_ABS constant here.)

    Returns
    -------
    dict with
        operator_probs : (n, n_classes) array from clf.predict_proba
        operator_label : list of n operator names
        params_pred    : (n, k) float array; rows whose predicted operator has
                         no trained regressor hold NaN (k = 1 for the single-
                         parameter setup)
    """
    # Copy so the NaN replacement below cannot leak into the caller's array.
    Xb = np.array(X_batch, dtype="float32", copy=True)
    if Xb.ndim == 1:
        Xb = Xb[None, :]  # treat a single sample as a batch of one
    Xb[~np.isfinite(Xb)] = np.nan
    Xb = imp.transform(Xb)
    Xb = scaler.transform(Xb)
    if clip_abs is not None:
        Xb = np.clip(Xb, -clip_abs, clip_abs)

    proba = clf.predict_proba(Xb)                    # (n, n_classes)
    # predict_proba columns follow clf.classes_, so map the argmax through it
    # before decoding (they differ from 0..k-1 if a class was absent in training).
    pred_enc = clf.classes_[np.argmax(proba, axis=1)]
    pred_labels = le.inverse_transform(pred_enc).tolist()

    params = []
    for i, lbl in enumerate(pred_labels):
        reg = regressors.get(lbl)
        if reg is None:
            params.append([np.nan])                 # single param
        else:
            pred = reg.predict(Xb[i:i+1])[0]
            # Always store a list, so scalar predictions and the NaN placeholder
            # stack into one rectangular array.
            params.append(np.atleast_1d(pred).astype(float).tolist())

    return {"operator_probs": proba, "operator_label": pred_labels, "params_pred": np.array(params)}