## üß© 0) Setup & Imports ##

In [None]:
# ===================== PARAMETERS / IMPORTS =====================
from pathlib import Path
import sys, subprocess, numpy as np, pandas as pd, joblib

from sklearn.model_selection import train_test_split, StratifiedGroupKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, make_scorer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer


from dataclasses import dataclass


from pprint import pprint

from scipy.stats import loguniform, randint

import joblib

# Project config
PROJ_ROOT = Path("../").resolve()
SRC_DIR   = PROJ_ROOT / "src"
if str(PROJ_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJ_ROOT))

from src.config import PATHS, CFG, print_summary
print_summary()

# Dims (fallbacks if CFG unset)
MAP_DIM     = CFG.MAP_DIM or 165
PROMPT_DIM  = CFG.PROMPT_DIM or 512
FUSED_DIM   = CFG.FUSED_DIM or (MAP_DIM + PROMPT_DIM)
BATCH_SIZE  = CFG.BATCH_SIZE

# Clean outputs for a fresh run
PATHS.clean_outputs()


=== CONFIG SUMMARY ===
PROJ_ROOT  : /Users/amirdonyadide/Documents/GitHub/Thesis
DATA_DIR   : /Users/amirdonyadide/Documents/GitHub/Thesis/data
INPUT_DIR  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input
OUTPUT_DIR : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output
MAPS_ROOT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/samples/pairs
INPUT PAT. : *_input.geojson
PROMPTS_CSV: /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv
PAIRS_CSV  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/pairs.csv
PROMPT_OUT : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
MAP_OUT    : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out
TRAIN_OUT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out
MODEL_OUT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/models
SPLIT_OUT  : /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out/splits
PRM_NPZ    : /Users/amirdonyadide/Document

## üìö 1) Build Prompt Embeddings (USE) ##

In [74]:
# === PROMPT EMBEDDINGS ===
cmd = [
    sys.executable, "-m", "src.mapvec.prompts.prompt_embeddings",
    "--input",    str(PATHS.PROMPTS_CSV),
    "--model",    str(CFG.USE_MODEL),
    "--l2",
    "--out_dir",  str(PATHS.PROMPT_OUT),
    "-v",
]
print("CMD:", " ".join(cmd))
res = subprocess.run(cmd, cwd=str(PATHS.PROJ_ROOT))
if res.returncode != 0:
    raise SystemExit("Prompt embedding step failed.")
print("‚úÖ Prompt embeddings completed.")


CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.prompts.prompt_embeddings --input /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv --model dan --l2 --out_dir /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out -v


KeyboardInterrupt: 

## üó∫Ô∏è 2) Build Map Embeddings (geometric) ##

In [None]:
# === MAP EMBEDDINGS ===
cmd = [
    sys.executable, "-m", "src.mapvec.maps.map_embeddings",
    "--root", str(PATHS.MAPS_ROOT),
    "--pattern", PATHS.INPUT_MAPS_PATTERN,
    "--out_dir", str(PATHS.MAP_OUT),
    "--norm", "fixed",
    "--norm-wh", "400x400",
    "-v",
]
print("CMD:", " ".join(cmd))
res = subprocess.run(cmd, cwd=str(PATHS.PROJ_ROOT))
if res.returncode != 0:
    raise SystemExit("Map embedding step failed.")
print("‚úÖ Map embeddings completed.")


CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.maps.map_embeddings --root /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/samples/pairs --pattern *_input.geojson --out_dir /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out --norm fixed --norm-wh 400x400 -v


17:07:06 | DEBUG | PROJECT_ROOT=/Users/amirdonyadide/Documents/GitHub/Thesis
17:07:06 | DEBUG | DATA_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data
17:07:06 | INFO | Scanning /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/samples/pairs (pattern=*_input.geojson)‚Ä¶
17:07:06 | INFO | First pass: counting polygons to normalize poly_count‚Ä¶
17:07:11 | INFO | Max polygons across dataset: 789
17:07:12 | INFO | OK  map_id=0073  -> vector[165]
17:07:14 | INFO | OK  map_id=0080  -> vector[165]
17:07:14 | INFO | OK  map_id=0093  -> vector[165]
17:07:18 | INFO | OK  map_id=0122  -> vector[165]
17:07:19 | INFO | OK  map_id=0123  -> vector[165]
17:07:20 | INFO | OK  map_id=0127  -> vector[165]
17:07:21 | INFO | OK  map_id=0158  -> vector[165]
17:07:23 | INFO | OK  map_id=0159  -> vector[165]
17:07:24 | INFO | OK  map_id=0160  -> vector[165]
17:07:25 | INFO | OK  map_id=0165  -> vector[165]
17:07:26 | INFO | OK  map_id=0167  -> vector[165]
17:07:27 | INFO | OK  map_id=0168  -> vecto

‚úÖ Map embeddings completed.


17:14:47 | INFO | OK  map_id=1757  -> vector[165]
17:14:47 | INFO | Saved 300 vectors (failed=0) to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out


## üîó 3) Concatenate (pairs ‚Üí fused rows) ##

In [None]:
# === CONCATENATION ===
cmd = [
    sys.executable, "-m", "src.mapvec.concat.concat_embeddings",
    "--pairs",      str(PATHS.PAIRS_CSV),
    "--map_npz",    str(PATHS.MAP_OUT / "maps_embeddings.npz"),
    "--prompt_npz", str(PATHS.PROMPT_OUT / "prompts_embeddings.npz"),
    "--out_dir",    str(PATHS.TRAIN_OUT),
    "--drop_dupes",
    # "--l2-prompt",     # safety net if you want L2 here as well
    # "--fail_on_missing"
    # "--save-blocks"
]
print("CMD:", " ".join(cmd))
res = subprocess.run(cmd, cwd=str(PATHS.PROJ_ROOT))
if res.returncode != 0:
    raise SystemExit("Concatenation step failed.")
print("‚úÖ Concatenation completed.")


CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.concat.concat_embeddings --pairs /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/pairs.csv --map_npz /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out/maps_embeddings.npz --prompt_npz /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out/prompts_embeddings.npz --out_dir /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out --drop_dupes
‚úÖ Concatenation completed.


17:15:10 | INFO | Map  embeddings: (300, 165) from /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/map_out/maps_embeddings.npz
17:15:10 | INFO | Prompt embeddings: (500, 512) from /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out/prompts_embeddings.npz
17:15:10 | INFO | X shape = (450, 677)  (map_dim=165, prompt_dim=512)
17:15:10 | INFO | Saved to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out in 0.02s


## üì• 4) Load & Basic Cleaning ##

In [None]:
# === LOAD FUSED DATA ===
X = np.load(PATHS.TRAIN_OUT / "X_concat.npy")
pairs_df = pd.read_parquet(PATHS.TRAIN_OUT / "train_pairs.parquet")
print(f"Loaded X: {X.shape}, pairs: {pairs_df.shape}")

OP_COL = "operator"
PARAM_COLS = ["param"]

df = pairs_df.copy()
df[OP_COL] = df[OP_COL].astype(str).str.strip().str.lower()

mask = df[OP_COL].notna()
for c in PARAM_COLS:
    mask &= df[c].notna()

X  = X[mask.values].astype("float64", copy=False)
df = df.loc[mask].reset_index(drop=True)
print(f"After cleaning: X={X.shape}, df={df.shape}, ops={sorted(df[OP_COL].unique())}")


Loaded X: (450, 677), pairs: (450, 4)
After cleaning: X=(450, 677), df=(450, 4), ops=['aggregate', 'displace', 'select', 'simplify']


## ‚úÇÔ∏è 5) Split & Targets ##

In [None]:
# === SPLIT ===
FIXED_CLASSES = ["simplify", "select", "aggregate", "displace"]

X_train, X_temp, df_train, df_temp = train_test_split(
    X, df,
    test_size=CFG.VAL_RATIO + CFG.TEST_RATIO,
    random_state=CFG.SEED,
    shuffle=True,
    stratify=df[OP_COL] if df[OP_COL].nunique() > 1 else None
)
rel_test = CFG.TEST_RATIO / (CFG.VAL_RATIO + CFG.TEST_RATIO)
X_val, X_test, df_val, df_test = train_test_split(
    X_temp, df_temp,
    test_size=rel_test,
    random_state=CFG.SEED,
    shuffle=True,
    stratify=df_temp[OP_COL] if df_temp[OP_COL].nunique() > 1 else None
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# === TARGETS ===
le = LabelEncoder().fit(FIXED_CLASSES)
y_train_cls = le.transform(df_train[OP_COL])
y_val_cls   = le.transform(df_val[OP_COL])
y_test_cls  = le.transform(df_test[OP_COL])

y_train_reg = df_train[PARAM_COLS].to_numpy(dtype="float64")
y_val_reg   = df_val[PARAM_COLS].to_numpy(dtype="float64")
y_test_reg  = df_test[PARAM_COLS].to_numpy(dtype="float64")


Train: (315, 677), Val: (67, 677), Test: (68, 677)


## üßº 6) Modality-Aware Preprocessing (map only) ##

In [None]:
# === MODALITY-AWARE PREPROCESSING ===
MAP_DIM     = CFG.MAP_DIM or 165       # set to true map dim
PROMPT_DIM  = CFG.PROMPT_DIM or 512

def split_blocks(X):
    X_map    = X[:, :MAP_DIM].astype(np.float64, copy=True)
    X_prompt = X[:, MAP_DIM:MAP_DIM+PROMPT_DIM].astype(np.float64, copy=True)
    return X_map, X_prompt

def l2_normalize_rows(A, eps=1e-12):
    nrm = np.sqrt((A * A).sum(axis=1, keepdims=True))
    return A / np.maximum(nrm, eps)

# split
Xm_tr, Xp_tr = split_blocks(X_train)
Xm_va, Xp_va = split_blocks(X_val)
Xm_te, Xp_te = split_blocks(X_test)

# prompts: L2 only
Xp_tr = l2_normalize_rows(Xp_tr)
Xp_va = l2_normalize_rows(Xp_va)
Xp_te = l2_normalize_rows(Xp_te)

# maps: inf‚ÜíNaN
for A in (Xm_tr, Xm_va, Xm_te):
    A[~np.isfinite(A)] = np.nan

# impute (train)
imp = SimpleImputer(strategy="median")
Xm_tr_imp = imp.fit_transform(Xm_tr)
Xm_va_imp = imp.transform(Xm_va)
Xm_te_imp = imp.transform(Xm_te)

# clip (5‚Äì95%) train thresholds
q_lo = np.nanpercentile(Xm_tr_imp, 5, axis=0)
q_hi = np.nanpercentile(Xm_tr_imp, 95, axis=0)
def clip_to_q(A, lo, hi): return np.clip(A, lo, hi)

Xm_tr_imp = clip_to_q(Xm_tr_imp, q_lo, q_hi)
Xm_va_imp = clip_to_q(Xm_va_imp, q_lo, q_hi)
Xm_te_imp = clip_to_q(Xm_te_imp, q_lo, q_hi)

# drop zero-variance cols on train
stds = np.nanstd(Xm_tr_imp, axis=0)
keep_mask = stds > 1e-12

# scale kept columns (train fit)
scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(5, 95))
Xm_tr_kept = scaler.fit_transform(Xm_tr_imp[:, keep_mask])
Xm_va_kept = scaler.transform(Xm_va_imp[:, keep_mask])
Xm_te_kept = scaler.transform(Xm_te_imp[:, keep_mask])

# rebuild full map dim (dropped cols = 0)
Xm_tr_s = np.zeros_like(Xm_tr_imp, dtype=np.float64)
Xm_va_s = np.zeros_like(Xm_va_imp, dtype=np.float64)
Xm_te_s = np.zeros_like(Xm_te_imp, dtype=np.float64)
Xm_tr_s[:, keep_mask] = Xm_tr_kept.astype(np.float64)
Xm_va_s[:, keep_mask] = Xm_va_kept.astype(np.float64)
Xm_te_s[:, keep_mask] = Xm_te_kept.astype(np.float64)

# fuse back
X_train_s = np.concatenate([Xm_tr_s, Xp_tr], axis=1).astype(np.float64)
X_val_s   = np.concatenate([Xm_va_s, Xp_va], axis=1).astype(np.float64)
X_test_s  = np.concatenate([Xm_te_s, Xp_te], axis=1).astype(np.float64)

assert np.isfinite(X_train_s).all() and np.isfinite(X_val_s).all() and np.isfinite(X_test_s).all(), "Non-finite after preprocessing."
print("‚úÖ Modality-aware preprocessing complete.")

# save preprocessing bundle
joblib.dump({
    "imp": imp, "q_lo": q_lo, "q_hi": q_hi,
    "keep_mask": keep_mask, "scaler": scaler,
    "map_dim": MAP_DIM, "prompt_dim": PROMPT_DIM
}, PATHS.TRAIN_OUT / "preproc.joblib")


‚úÖ Modality-aware preprocessing complete.


['/Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out/preproc.joblib']

## ‚öñÔ∏è 7) Class Weights ##

In [None]:
classes  = list(le.classes_)
n_classes = len(classes)
cls_w    = compute_class_weight(class_weight="balanced",
                                classes=np.arange(n_classes),
                                y=y_train_cls)
sample_w = np.array([cls_w[c] for c in y_train_cls], dtype="float64")
print("Class weights:", dict(zip(classes, cls_w)))


Class weights: {np.str_('aggregate'): np.float64(0.9264705882352942), np.str_('displace'): np.float64(1.0361842105263157), np.str_('select'): np.float64(0.984375), np.str_('simplify'): np.float64(1.0641891891891893)}


## üß† 8) Train MLP ##

In [76]:
# =========================
# MLP search where each model trains on ALL training data
# =========================
import numpy as np
from pathlib import Path
from pprint import pprint
from dataclasses import dataclass

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

# ---- numerics: keep float64 everywhere ----
X_train_s = X_train_s.astype(np.float64, copy=False)
X_val_s   = X_val_s.astype(np.float64, copy=False)
X_test_s  = X_test_s.astype(np.float64, copy=False)
sample_w  = sample_w.astype(np.float64, copy=False)

# ---- group by map_id (maps can repeat; prompts don't) ----
assert "map_id" in df_train.columns, "df_train must contain 'map_id' for grouped CV."
groups_tr = df_train["map_id"].astype(str).values

# ---- CV splitter (for scoring only) ----
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

# ---- search space helpers ----
rng = np.random.RandomState(42)

def draw_params(n):
    sizes = [(64,), (128,), (256,), (128, 64), (256, 128), (256, 128, 64)]
    batches = [16, 32, 64, 128]
    for _ in range(n):
        yield {
            "hidden_layer_sizes": sizes[rng.randint(len(sizes))],
            "alpha": 10**rng.uniform(-5, np.log10(3e-2)),          # loguniform(1e-5, 3e-2)
            "learning_rate_init": 10**rng.uniform(-4, np.log10(3e-3)),  # loguniform(1e-4, 3e-3)
            "batch_size": batches[rng.randint(len(batches))],
            "activation": "relu",
            "solver": "adam",
            "max_iter": 800,            # allow convergence w/o early stopping
            "early_stopping": False,    # <‚Äî IMPORTANT: use ALL training samples
            "random_state": 42,
            "verbose": False,
            "tol": 1e-4
        }

# ---- CV scorer using grouped folds; model sees only its fold-train here (for the score only) ----
def cv_macro_f1(params):
    scores = []
    for tr_idx, va_idx in cv.split(X_train_s, y_train_cls, groups_tr):
        clf = MLPClassifier(**params)
        clf.fit(X_train_s[tr_idx], y_train_cls[tr_idx], sample_weight=sample_w[tr_idx])
        pred = clf.predict(X_train_s[va_idx])
        scores.append(f1_score(y_train_cls[va_idx], pred, average="macro"))
    return float(np.mean(scores)), float(np.std(scores))

@dataclass
class Candidate:
    params: dict
    cv_mean: float
    cv_std: float
    val_f1: float
    val_acc: float

# ---- run search ----
N_ITER = 50   # tune this for time/quality tradeoff
candidates = []

print(f"\nSearching {N_ITER} MLP configs...")
for i, params in enumerate(draw_params(N_ITER), 1):
    cv_mean, cv_std = cv_macro_f1(params)

    # IMPORTANT PART: refit SAME PARAMS on FULL TRAIN (no early_stopping) so the model sees ALL training data
    clf_full = MLPClassifier(**params)
    clf_full.fit(X_train_s, y_train_cls, sample_weight=sample_w)

    # evaluate on external VAL (never used for training)
    val_pred = clf_full.predict(X_val_s)
    val_f1 = f1_score(y_val_cls, val_pred, average="macro")
    val_acc = accuracy_score(y_val_cls, val_pred)

    candidates.append(Candidate(params, cv_mean, cv_std, val_f1, val_acc))
    print(f"[{i:02d}/{N_ITER}] cvF1={cv_mean:.3f}¬±{cv_std:.3f} | VAL F1={val_f1:.3f} acc={val_acc:.3f} | {params['hidden_layer_sizes']}, Œ±={params['alpha']:.2e}, lr={params['learning_rate_init']:.1e}, bs={params['batch_size']}")

# ---- pick winner by external VAL macro-F1 (tie-breaker: VAL acc, then CV mean) ----
candidates.sort(key=lambda c: (c.val_f1, c.val_acc, c.cv_mean), reverse=True)
best = candidates[0]
print("\n=== Top candidates (by VAL macro-F1) ===")
for c in candidates[:5]:
    print(f"VAL F1={c.val_f1:.3f} (acc={c.val_acc:.3f}) | cvF1={c.cv_mean:.3f}¬±{c.cv_std:.3f} | params={c.params}")

print("\nüèÜ Selected params:")
pprint(best.params)

# ---- train final model on FULL TRAIN (no early_stopping so it uses 100% of train) ----
final_mlp = MLPClassifier(**best.params)
final_mlp.fit(X_train_s, y_train_cls, sample_weight=sample_w)

# ---- evaluate on VAL & TEST ----
for name, Xs, ys in [("VAL", X_val_s, y_val_cls), ("TEST", X_test_s, y_test_cls)]:
    yhat = final_mlp.predict(Xs)
    acc  = accuracy_score(ys, yhat)
    f1m  = f1_score(ys, yhat, average="macro")
    print(f"\n===== {name} =====")
    print(f"{name}: acc={acc:.4f}  f1_macro={f1m:.4f}")
    print(classification_report(ys, yhat, target_names=list(le.classes_)))
    print("Confusion matrix:\n", confusion_matrix(ys, yhat))

# ---- save final model ----
out_dir = Path(PATHS.TRAIN_OUT); out_dir.mkdir(parents=True, exist_ok=True)
import joblib
joblib.dump({"model": final_mlp, "label_encoder": le, "best_params": best.params}, out_dir / "best_mlp_fulltrain.joblib")
print(f"\n‚úÖ Saved final MLP (trained on ALL TRAIN) to: {out_dir / 'best_mlp_fulltrain.joblib'}")



Searching 50 MLP configs...
[01/50] cvF1=0.230¬±0.078 | VAL F1=0.758 acc=0.761 | (128, 64), Œ±=2.02e-02, lr=1.2e-03, bs=16
[02/50] cvF1=0.241¬±0.014 | VAL F1=0.674 acc=0.687 | (256, 128), Œ±=3.49e-05, lr=1.7e-04, bs=64
[03/50] cvF1=0.212¬±0.037 | VAL F1=0.696 acc=0.701 | (256,), Œ±=1.03e-02, lr=7.7e-04, bs=128
[04/50] cvF1=0.225¬±0.034 | VAL F1=0.665 acc=0.672 | (256,), Œ±=1.18e-05, lr=2.7e-03, bs=128
[05/50] cvF1=0.218¬±0.052 | VAL F1=0.690 acc=0.701 | (256, 128, 64), Œ±=5.47e-05, lr=1.9e-04, bs=16
[06/50] cvF1=0.224¬±0.036 | VAL F1=0.696 acc=0.701 | (64,), Œ±=1.14e-04, lr=6.0e-04, bs=128
[07/50] cvF1=0.236¬±0.047 | VAL F1=0.695 acc=0.701 | (64,), Œ±=1.03e-04, lr=8.0e-04, bs=32
[08/50] cvF1=0.241¬±0.058 | VAL F1=0.773 acc=0.776 | (128, 64), Œ±=2.43e-02, lr=2.2e-04, bs=32
[09/50] cvF1=0.206¬±0.038 | VAL F1=0.677 acc=0.687 | (256, 128, 64), Œ±=4.95e-05, lr=5.7e-04, bs=128
[10/50] cvF1=0.243¬±0.034 | VAL F1=0.681 acc=0.687 | (64,), Œ±=1.45e-05, lr=7.9e-04, bs=16
[11/50] cvF1=0.229¬±0.03



[23/50] cvF1=0.217¬±0.031 | VAL F1=0.678 acc=0.687 | (64,), Œ±=4.84e-03, lr=2.0e-04, bs=128
[24/50] cvF1=0.226¬±0.046 | VAL F1=0.648 acc=0.657 | (256,), Œ±=4.91e-05, lr=1.1e-03, bs=64
[25/50] cvF1=0.226¬±0.043 | VAL F1=0.691 acc=0.701 | (64,), Œ±=1.28e-03, lr=2.3e-03, bs=32
[26/50] cvF1=0.225¬±0.028 | VAL F1=0.709 acc=0.716 | (64,), Œ±=1.52e-02, lr=1.8e-03, bs=128
[27/50] cvF1=0.213¬±0.021 | VAL F1=0.618 acc=0.627 | (128,), Œ±=2.15e-05, lr=3.5e-04, bs=32
[28/50] cvF1=0.236¬±0.022 | VAL F1=0.672 acc=0.687 | (256, 128), Œ±=3.44e-03, lr=8.7e-04, bs=64
[29/50] cvF1=0.222¬±0.031 | VAL F1=0.646 acc=0.657 | (256,), Œ±=4.38e-04, lr=1.5e-04, bs=32
[30/50] cvF1=0.210¬±0.034 | VAL F1=0.678 acc=0.687 | (256,), Œ±=4.42e-03, lr=6.7e-04, bs=64
[31/50] cvF1=0.206¬±0.046 | VAL F1=0.676 acc=0.687 | (256, 128, 64), Œ±=5.21e-04, lr=5.9e-04, bs=64
[32/50] cvF1=0.205¬±0.035 | VAL F1=0.618 acc=0.627 | (128,), Œ±=1.23e-05, lr=1.4e-04, bs=64
[33/50] cvF1=0.248¬±0.044 | VAL F1=0.677 acc=0.687 | (64,), Œ±=1.24e-

In [80]:
# =========================
# Regression branch (one MLPRegressor per operator)
# =========================
import numpy as np
from pathlib import Path
from pprint import pprint
import joblib

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import loguniform

# ---- 1) Prepare numeric regression targets
def _coerce_param_to_float(s):
    # Try robust parse; you can customize if your 'param' has units or JSON.
    try:
        return float(s)
    except Exception:
        return np.nan

y_train_reg = df_train["param"].apply(_coerce_param_to_float).to_numpy()
y_val_reg   = df_val["param"].apply(_coerce_param_to_float).to_numpy()
y_test_reg  = df_test["param"].apply(_coerce_param_to_float).to_numpy()

# Guard: drop if any NaNs (or you can filter rows; here we assert)
assert np.isfinite(y_train_reg).all() and np.isfinite(y_val_reg).all() and np.isfinite(y_test_reg).all(), \
    "Non-finite values found in regression target 'param'. Clean/parse them first."

# Optional: log1p transform if param is positive and skewed
USE_LOG1P = False
if USE_LOG1P:
    assert (y_train_reg >= 0).all() and (y_val_reg >= 0).all() and (y_test_reg >= 0).all(), \
        "log1p selected but param has negatives."
    ytr_reg_t = np.log1p(y_train_reg)
    yva_reg_t = np.log1p(y_val_reg)
    yte_reg_t = np.log1p(y_test_reg)
    def inv_t(x): return np.expm1(x)
else:
    ytr_reg_t = y_train_reg.copy()
    yva_reg_t = y_val_reg.copy()
    yte_reg_t = y_test_reg.copy()
    def inv_t(x): return x

# ---- 2) Grouped CV by map_id for *regression* (no stratification needed on a numeric target)
assert "map_id" in df_train.columns
gk = GroupKFold(n_splits=5)
groups_tr = df_train["map_id"].astype(str).values

# ---- 3) Search space for MLPRegressor (kept modest; widen n_iter to search more)
base_reg = MLPRegressor(
    activation="relu",
    solver="adam",
    learning_rate="adaptive",   # <‚Äî helps convergence on tough subsets
    early_stopping=False,       # keep OFF during search so it uses all class data
    max_iter=2000,              # <‚Äî more runway
    tol=1e-3,                   # <‚Äî slightly easier convergence threshold
    random_state=42,
    verbose=False,
    batch_size="auto"           # <‚Äî avoids clipping warnings
)
param_dist_reg = {
    "hidden_layer_sizes": [(64,), (128,), (256,), (128, 64), (256, 128)],
    "alpha": loguniform(1e-6, 3e-2),        # widen upper range for stronger regularization
    "learning_rate_init": loguniform(1e-4, 3e-3),
    # "batch_size": ["auto"]  # not tuning batch size anymore
}

# ---- 4) Fit one regressor per class
class_names = list(le.classes_)
n_classes = len(class_names)
regressors = {}
search_summaries = {}

for cls_idx, cls_name in enumerate(class_names):
    # mask for this class in TRAIN
    m_tr = (y_train_cls == cls_idx)
    Xk, yk, gk_tr = X_train_s[m_tr], ytr_reg_t[m_tr], groups_tr[m_tr]
    if Xk.shape[0] < 10:
        print(f"‚ö†Ô∏è Skipping class '{cls_name}' (too few samples: {Xk.shape[0]}).")
        continue

    # grouped CV splits for this class only
    splits = list(gk.split(Xk, yk, groups=gk_tr))

    # negative RMSE is a good search objective
    search = RandomizedSearchCV(
        estimator=base_reg,
        param_distributions=param_dist_reg,
        n_iter=40,
        scoring="neg_root_mean_squared_error",
        cv=splits,
        n_jobs=-1,
        refit=True,
        random_state=42,
        verbose=1
    )
    search.fit(Xk, yk)

    print(f"\n=== Regressor for class '{cls_name}' ===")
    print("best CV RMSE:", -search.best_score_)
    print("best params:"); pprint(search.best_params_)
    search_summaries[cls_name] = {"neg_rmse_cv": search.best_score_, "params": search.best_params_}

    # Refit on the FULL class-specific TRAIN subset
    reg_full = MLPRegressor(
        **{**search.best_estimator_.get_params(), "early_stopping": False, "max_iter": 2000, "random_state": 42}
    )
    reg_full.fit(Xk, yk)
    regressors[cls_name] = reg_full

# ---- 5) Evaluate on VAL & TEST using your classifier's prediction to route to regressors
def route_and_predict(Xs, pred_cls_idx):
    yhat_reg = np.zeros(len(pred_cls_idx), dtype=float)
    for i, cidx in enumerate(pred_cls_idx):
        cname = class_names[cidx]
        reg = regressors.get(cname, None)
        if reg is None:
            # Fallback: if a regressor is missing for a rare class, you can use a global mean or nearest regressor
            yhat_reg[i] = np.nan
        else:
            yhat_reg[i] = reg.predict(Xs[i:i+1])[0]
    return yhat_reg

# helper to print metrics (older sklearn: no squared=False)
def print_reg_metrics(name, y_true, y_pred_transformed):
    # inverse-transform predictions if you used log1p
    y_pred = inv_t(y_pred_transformed)

    # guard against NaNs (e.g., missing regressor for a class)
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() == 0:
        print(f"{name}: no finite pairs to evaluate.")
        return np.nan, np.nan
    if mask.sum() < len(y_true):
        print(f"{name}: dropped {len(y_true) - mask.sum()} samples with NaNs.")

    y_true_m = y_true[mask]
    y_pred_m = y_pred[mask]

    mae = mean_absolute_error(y_true_m, y_pred_m)
    mse = mean_squared_error(y_true_m, y_pred_m)   # older sklearn doesn't support squared=False
    rmse = np.sqrt(mse)
    print(f"{name}: MAE={mae:.4f}  RMSE={rmse:.4f}")
    return mae, rmse


# Classification predictions (already trained classifier)
clf_cls = clf  # <- ensure this is your trained best classifier
val_pred_cls = clf_cls.predict(X_val_s)
test_pred_cls = clf_cls.predict(X_test_s)

# route to per-class regressors
yhat_val_reg_t  = route_and_predict(X_val_s,  val_pred_cls)
yhat_test_reg_t = route_and_predict(X_test_s, test_pred_cls)

print("\n--- Regression with predicted classes (realistic) ---")
print_reg_metrics("VAL",  y_val_reg,  yhat_val_reg_t)
print_reg_metrics("TEST", y_test_reg, yhat_test_reg_t)

# ---- 6) Optional: 'oracle' evaluation to isolate regressor quality (use TRUE class for routing)
yhat_val_oracle_t  = route_and_predict(X_val_s,  y_val_cls)
yhat_test_oracle_t = route_and_predict(X_test_s, y_test_cls)

print("\n--- Regression with TRUE classes (oracle routing) ---")
print_reg_metrics("VAL-oracle",  y_val_reg,  yhat_val_oracle_t)
print_reg_metrics("TEST-oracle", y_test_reg, yhat_test_oracle_t)

# ---- 7) Save bundle
bundle = {
    "classifier": clf_cls,
    "regressors_by_class": regressors,
    "label_encoder": le,
    "use_log1p": USE_LOG1P
}
out_dir = Path(PATHS.TRAIN_OUT)
out_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(bundle, out_dir / "cls_plus_regressors.joblib")
print(f"\n‚úÖ Saved classification+regression bundle to: {out_dir / 'cls_plus_regressors.joblib'}")


Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'aggregate' ===
best CV RMSE: 3.776095454330027
best params:
{'alpha': np.float64(2.861167865082196e-05),
 'hidden_layer_sizes': (256, 128),
 'learning_rate_init': np.float64(0.0002516607127550297)}
Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'displace' ===
best CV RMSE: 0.5718840359495102
best params:
{'alpha': np.float64(0.0005248702648435531),
 'hidden_layer_sizes': (256, 128),
 'learning_rate_init': np.float64(0.0004628518674713464)}
Fitting 5 folds for each of 40 candidates, totalling 200 fits





=== Regressor for class 'select' ===
best CV RMSE: 59.038668859579765
best params:
{'alpha': np.float64(2.1453931225439485e-06),
 'hidden_layer_sizes': (128,),
 'learning_rate_init': np.float64(0.0001483039268456802)}




Fitting 5 folds for each of 40 candidates, totalling 200 fits

=== Regressor for class 'simplify' ===
best CV RMSE: 2.7372283634097707
best params:
{'alpha': np.float64(0.000522114714225509),
 'hidden_layer_sizes': (256, 128),
 'learning_rate_init': np.float64(0.00016149614799999194)}

--- Regression with predicted classes (realistic) ---
VAL: MAE=21.7280  RMSE=42.3725
TEST: MAE=23.2668  RMSE=41.8655

--- Regression with TRUE classes (oracle routing) ---
VAL-oracle: MAE=9.8681  RMSE=24.4868
TEST-oracle: MAE=10.0799  RMSE=23.9558

‚úÖ Saved classification+regression bundle to: /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out/cls_plus_regressors.joblib
