# MapVec end-to-end pipeline üìí

This notebook runs the **entire pipeline**:
1. Prompt embeddings (Universal Sentence Encoder)
2. Map embeddings (handcrafted polygon features)
3. Concatenation into a training matrix
4. Helper cells to inspect vectors by `prompt_id` or `map_id`

**Edit the Parameters** in the next cell to match your project layout.


In [27]:
# ===================== PARAMETERS =====================
from pathlib import Path
import sys
import subprocess
import importlib
import shutil
import numpy as np
import pandas as pd
import shlex
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, mean_squared_error, mean_absolute_error
from sklearn.utils.class_weight import compute_class_weight
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [50]:
# ---------- PROJECT ROOTS ----------
# Project root that contains `src/` and `data/`
PROJ_ROOT = Path("../")

# Data locations
DATA_DIR    = PROJ_ROOT / "data"
PROMPTS_CSV = DATA_DIR / "input" / "prompts.csv"   # columns: prompt_id,text (or id,text)
PAIRS_CSV   = DATA_DIR / "input" / "pairs.csv"     # columns: map_id,prompt_id
MAPS_ROOT   = DATA_DIR / "input" / "samples" / "pairs"

# File patterns
INPUT_MAPS_PATTERN  = "*_input.geojson"
OUTPUT_MAPS_PATTERN = "*_generalized.geojson"

# Output directories
PROMPT_OUT   = DATA_DIR / "output" / "prompt_out"
MAP_OUT      = DATA_DIR / "output" / "map_out"
PAIR_MAP_OUT = DATA_DIR / "output" / "pair_map_out"
TRAIN_OUT    = DATA_DIR / "output" / "train_out"
MODEL_OUT    = DATA_DIR / "output" / "models"

SPLIT_OUT    = TRAIN_OUT / "splits"

# Paths to precomputed embeddings (for concatenation)
MAP_NPZ = PAIR_MAP_OUT / "embeddings.npz"
PRM_NPZ = PROMPT_OUT / "embeddings.npz"

# ---------- EMBEDDINGS / MODEL CHOICES ----------
# USE model variant for prompt embeddings: 'dan' or 'transformer'
USE_MODEL  = "dan"

# Expected dims
MAP_DIM     = 996    # map embedding dim
PROMPT_DIM  = 512    # prompt (USE) embedding dim
FUSED_DIM   = MAP_DIM + PROMPT_DIM  # 1508
BATCH_SIZE  = 512

# ---------- DATA SPLITS ----------
VAL_RATIO = 0.15   # 15% validation
TEST_RATIO = 0.15  # 15% test (remaining 70% train)
SEED = 42          # reproducibility

In [11]:
# ===================== CLEAN PREVIOUS OUTPUTS =====================
for d in [PROMPT_OUT, MAP_OUT, TRAIN_OUT, MODEL_OUT, PAIR_MAP_OUT, SPLIT_OUT]:
    if d.exists():
        print(f"üßπ Removing old directory: {d}")
        shutil.rmtree(d)
    d.mkdir(parents=True, exist_ok=True)

print("‚úÖ All output folders cleaned and recreated fresh.")

üßπ Removing old directory: ../data/output/prompt_out
üßπ Removing old directory: ../data/output/map_out
üßπ Removing old directory: ../data/output/train_out
üßπ Removing old directory: ../data/models
üßπ Removing old directory: ../data/output/pair_map_out
‚úÖ All output folders cleaned and recreated fresh.


In [51]:
# Make sure Python can import your local modules (src/)
sys.path.insert(0, str(PROJ_ROOT))
PROMPT_OUT.mkdir(parents=True, exist_ok=True)
MAP_OUT.mkdir(parents=True, exist_ok=True)
TRAIN_OUT.mkdir(parents=True, exist_ok=True)
MODEL_OUT.mkdir(parents=True, exist_ok=True)
PAIR_MAP_OUT.mkdir(parents=True, exist_ok=True)
SPLIT_OUT.mkdir(parents=True, exist_ok=True)
MODEL_OUT.mkdir(parents=True, exist_ok=True)
print("‚úÖ All output folders created.")


‚úÖ All output folders created.


## 0) Dependency check (Parquet engine)
We ensure `pyarrow` or `fastparquet` is available for `pandas.to_parquet`.

## 1) Prompt embeddings
Runs `src/mapvec/prompts/prompt_embeddings.py` using your chosen USE model and saves artifacts to `PROMPT_OUT`.

In [13]:
cmd = (
    f"python {shlex.quote(str(PROJ_ROOT / 'src' / 'mapvec' / 'prompts' / 'prompt_embeddings.py'))} "
    f"--input {shlex.quote(str(PROMPTS_CSV))} --model {shlex.quote(str(USE_MODEL))} --l2 --out_dir {shlex.quote(str(PROMPT_OUT))} -v"
)
print(cmd)
res = subprocess.run(cmd, shell=True)
if res.returncode != 0:
    raise SystemExit('Prompt embedding step failed.')
print('Prompt embeddings completed.')

python ../src/mapvec/prompts/prompt_embeddings.py --input ../data/input/prompts.csv --model dan --l2 --out_dir ../data/output/prompt_out -v


14:54:18 | DEBUG | FILE_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/src/mapvec/prompts
14:54:18 | DEBUG | PROJECT_ROOT=/Users/amirdonyadide/Documents/GitHub/Thesis
14:54:18 | DEBUG | DEFAULT_DATA_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data
14:54:18 | INFO | DATA_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data
14:54:18 | INFO | INPUT=/Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv
14:54:18 | INFO | OUT_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out
14:54:18 | INFO | Reading CSV: /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/prompts.csv
14:54:18 | INFO | Loaded 500 prompts (id_col=prompt_id). Sample IDs: p001, p002, p003‚Ä¶
14:54:18 | INFO | Using local USE-dan at /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/model_dan
14:54:18 | INFO | Loading USE-dan from local path: /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/model_dan ‚Ä¶
14:54:21 | INFO | Fingerprint not found. Saved model loading will 

Prompt embeddings completed.


## 2) Map embeddings
Runs the map embedding module on the GeoJSON inputs. Skips problematic features, logs warnings, and writes `embeddings.npz` to `PAIR_MAP_OUT`.

In [14]:
# notebook snippet
cmd = [
    sys.executable, "-m", "src.mapvec.maps.pair_map_embeddings",
    "--root", str(MAPS_ROOT),
    "--input_pattern", str(INPUT_MAPS_PATTERN),
    "--gen_pattern", str(OUTPUT_MAPS_PATTERN),
    "--out_dir", str(PAIR_MAP_OUT),
    "-v"
]
print("CMD:", " ".join(cmd))
res = subprocess.run(cmd, cwd=str(PROJ_ROOT))
if res.returncode != 0:
    raise SystemExit("Pair map embedding step failed.")
print("Pair map embeddings completed.")

CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.maps.pair_map_embeddings --root ../data/input/samples/pairs --input_pattern *_input.geojson --gen_pattern *_generalized.geojson --out_dir ../data/output/pair_map_out -v


14:54:33 | DEBUG | PROJECT_ROOT=/Users/amirdonyadide/Documents/GitHub/Thesis
14:54:33 | DEBUG | DATA_DIR=/Users/amirdonyadide/Documents/GitHub/Thesis/data
14:54:33 | INFO | Scanning /Users/amirdonyadide/Documents/GitHub/Thesis/data/input/samples/pairs (in=*_input.geojson, gen=*_generalized.geojson)‚Ä¶
14:54:38 | INFO | OK  map_id=0073  -> pair_vec[996] (per_map_dim=249)
14:54:39 | INFO | OK  map_id=0080  -> pair_vec[996] (per_map_dim=249)
14:54:40 | INFO | OK  map_id=0093  -> pair_vec[996] (per_map_dim=249)
14:54:44 | INFO | OK  map_id=0122  -> pair_vec[996] (per_map_dim=249)
14:54:46 | INFO | OK  map_id=0123  -> pair_vec[996] (per_map_dim=249)
14:54:47 | INFO | OK  map_id=0127  -> pair_vec[996] (per_map_dim=249)
14:54:48 | INFO | OK  map_id=0158  -> pair_vec[996] (per_map_dim=249)
14:54:52 | INFO | OK  map_id=0159  -> pair_vec[996] (per_map_dim=249)
14:54:53 | INFO | OK  map_id=0160  -> pair_vec[996] (per_map_dim=249)
14:54:55 | INFO | OK  map_id=0165  -> pair_vec[996] (per_map_dim=24

Pair map embeddings completed.


15:06:47 | INFO | OK  map_id=1757  -> pair_vec[996] (per_map_dim=249)
15:06:48 | INFO | Saved 300 pair vectors (failed=0) to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/pair_map_out


## 3) Concatenate
Joins map & prompt vectors using `pairs.csv` and writes `X_concat.npy` and `train_pairs.parquet` to `TRAIN_OUT`.

In [18]:
cmd = [
    sys.executable, "-m", "src.mapvec.concat.concat_embeddings",
    "--pairs",      str(PAIRS_CSV),
    "--map_npz",    str(PAIR_MAP_OUT / "embeddings.npz"),   # from pair_map_out
    "--prompt_npz", str(PROMPT_OUT / "embeddings.npz"),
    "--out_dir",    str(TRAIN_OUT),
    "--drop_dupes",                                   # optional: drop duplicate (map_id,prompt_id)
    # "--fail_on_missing",                            # optional: stop instead of skipping missing IDs
]
print("CMD:", " ".join(cmd))

# Run from the project root so src/ is importable
res = subprocess.run(cmd, cwd=str(PROJ_ROOT))
if res.returncode != 0:
    raise SystemExit("Concatenation step failed.")
print("Concatenation completed.")

CMD: /opt/anaconda3/envs/thesis/bin/python -m src.mapvec.concat.concat_embeddings --pairs ../data/input/pairs.csv --map_npz ../data/output/pair_map_out/embeddings.npz --prompt_npz ../data/output/prompt_out/embeddings.npz --out_dir ../data/output/train_out --drop_dupes
Concatenation completed.


19:01:58 | INFO | Map  embeddings: (300, 996) from /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/pair_map_out/embeddings.npz
19:01:58 | INFO | Prompt embeddings: (500, 512) from /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/prompt_out/embeddings.npz
19:01:58 | INFO | X shape = (450, 1508)  (map_dim=996, prompt_dim=512)
19:01:58 | INFO | Saved to /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out in 0.05s


## 4) Split dataset  
Splits the concatenated feature matrix `X_concat.npy` and its metadata `train_pairs.parquet` into separate **training**, **validation**, and **test** subsets.  
Each split preserves row alignment between features and metadata, and the resulting files are saved under `TRAIN_OUT/splits/` as:  

- `X_train.npy`, `pairs_train.parquet`  
- `X_val.npy`, `pairs_val.parquet`  
- `X_test.npy`, `pairs_test.parquet`  

In [19]:
# Load data
X = np.load(TRAIN_OUT / "X_concat.npy")
pairs_df = pd.read_parquet(TRAIN_OUT / "train_pairs.parquet")

print(f"Loaded X: {X.shape}, pairs: {pairs_df.shape}")

# --- Step 1: Train/Test split
X_train, X_temp, df_train, df_temp = train_test_split(
    X, pairs_df, test_size=VAL_RATIO + TEST_RATIO, random_state=SEED, shuffle=True
)

# --- Step 2: Split temp into Val/Test
relative_test_ratio = TEST_RATIO / (VAL_RATIO + TEST_RATIO)
X_val, X_test, df_val, df_test = train_test_split(
    X_temp, df_temp, test_size=relative_test_ratio, random_state=SEED, shuffle=True
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# --- Save splits
split_dir = TRAIN_OUT / "splits"
split_dir.mkdir(exist_ok=True)

np.save(split_dir / "X_train.npy", X_train)
np.save(split_dir / "X_val.npy",   X_val)
np.save(split_dir / "X_test.npy",  X_test)

df_train.to_parquet(split_dir / "pairs_train.parquet", index=False)
df_val.to_parquet(split_dir / "pairs_val.parquet", index=False)
df_test.to_parquet(split_dir / "pairs_test.parquet", index=False)

print(f"Saved splits to {split_dir}")

Loaded X: (450, 1508), pairs: (450, 4)
Train: (315, 1508), Val: (67, 1508), Test: (68, 1508)
Saved splits to ../data/output/train_out/splits


In [67]:
# ===================== MODEL TRAINING PARAMETERS =====================
# --------- CONFIG ----------
OP_COL         = "operator"
PARAM_COLS     = ["param"]                     # keep list so y has shape (n,1)
FIXED_CLASSES  = ["simplify", "select", "aggregate", "displace"]
CLIP_ABS = None  # set to 8.0 to enable, or None to disable

In [68]:
def filter_X_and_df(X, df):
    """Filter rows that have operator and param; normalize operator; keep X aligned."""
    df2 = df.copy()
    df2[OP_COL] = df2[OP_COL].astype(str).str.strip().str.lower()
    mask = df2[OP_COL].notna()
    for c in PARAM_COLS:
        mask &= df2[c].notna()
    mask = mask.values  # numpy bool array aligned with X
    X2   = X[mask]
    df2  = df2.loc[mask].reset_index(drop=True)
    return X2.astype("float32", copy=False), df2

# Apply to your splits produced earlier
X_train, df_train = filter_X_and_df(X_train, df_train)
X_val,   df_val   = filter_X_and_df(X_val,   df_val)
X_test,  df_test  = filter_X_and_df(X_test,  df_test)

In [69]:
# Fixed class order (even if a class is missing in train)
le = LabelEncoder()
le.fit(FIXED_CLASSES)
classes = le.classes_.tolist()
print("Classes:", classes)

y_train_cls = le.transform(df_train[OP_COL])
y_val_cls   = le.transform(df_val[OP_COL])
y_test_cls  = le.transform(df_test[OP_COL])

# Single-parameter targets -> 2D arrays (n,1)
y_train_reg = df_train[PARAM_COLS].to_numpy(dtype="float32")
y_val_reg   = df_val[PARAM_COLS].to_numpy(dtype="float32")
y_test_reg  = df_test[PARAM_COLS].to_numpy(dtype="float32")

Classes: ['aggregate', 'displace', 'select', 'simplify']


In [70]:
# Replace inf with NaN so imputer can handle them
for X in (X_train, X_val, X_test):
    X[~np.isfinite(X)] = np.nan

imp = SimpleImputer(strategy="median")
X_train_imp = imp.fit_transform(X_train)
X_val_imp   = imp.transform(X_val)
X_test_imp  = imp.transform(X_test)

scaler = RobustScaler(with_centering=True, with_scaling=True, quantile_range=(5, 95))
X_train_s = scaler.fit_transform(X_train_imp)
X_val_s   = scaler.transform(X_val_imp)
X_test_s  = scaler.transform(X_test_imp)

# Clip extremes to stabilize optimizer
if CLIP_ABS is not None:
    X_train_s = np.clip(X_train_s, -CLIP_ABS, CLIP_ABS)
    X_val_s   = np.clip(X_val_s,   -CLIP_ABS, CLIP_ABS)
    X_test_s  = np.clip(X_test_s,  -CLIP_ABS, CLIP_ABS)

assert np.isfinite(X_train_s).all() and np.isfinite(X_val_s).all() and np.isfinite(X_test_s).all(), "Non-finite after scaling."

In [71]:
# Class weights for imbalance
cls_w    = compute_class_weight("balanced", classes=np.arange(len(classes)), y=y_train_cls)
sample_w = np.array([cls_w[c] for c in y_train_cls], dtype="float32")
print("Class weights:", dict(zip(classes, cls_w)))

clf = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    activation="relu",
    solver="adam",
    alpha=1e-3,
    learning_rate_init=1e-4,
    batch_size=128,
    max_iter=400,
    early_stopping=True,
    n_iter_no_change=20,
    random_state=SEED,
    verbose=True
)

try:
    clf.fit(X_train_s, y_train_cls, sample_weight=sample_w)
except Exception as e:
    print(f"‚ö†Ô∏è Adam crashed ({e}). Falling back to lbfgs (no sample_weight).")
    clf = MLPClassifier(
        hidden_layer_sizes=(256, 128),
        activation="relu",
        solver="lbfgs",
        alpha=1e-3,
        max_iter=500,
        random_state=SEED,
        verbose=True
    )
    clf.fit(X_train_s, y_train_cls)

Class weights: {'aggregate': np.float64(0.9375), 'displace': np.float64(0.9603658536585366), 'select': np.float64(1.0227272727272727), 'simplify': np.float64(1.09375)}
Iteration 1, loss = nan
Validation score: 0.415531
Iteration 2, loss = nan
Validation score: 0.415531
Iteration 3, loss = nan
Validation score: 0.415531
Iteration 4, loss = nan
Validation score: 0.415531
Iteration 5, loss = nan
Validation score: 0.415531
Iteration 6, loss = nan
Validation score: 0.415531
Iteration 7, loss = nan
Validation score: 0.415531
Iteration 8, loss = nan
Validation score: 0.415531
Iteration 9, loss = nan
Validation score: 0.415531
Iteration 10, loss = nan
Validation score: 0.415531
Iteration 11, loss = nan
Validation score: 0.415531
Iteration 12, loss = nan
Validation score: 0.415531
Iteration 13, loss = nan
Validation score: 0.415531
Iteration 14, loss = nan
Validation score: 0.415531
Iteration 15, loss = nan
Validation score: 0.415531
Iteration 16, loss = nan
Validation score: 0.415531
Iteration

ABNORMAL: 

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [72]:
def check_matrix(name, X):
    print(f"--- {name} ---")
    print("shape:", X.shape, "dtype:", X.dtype)
    print("finite:", np.isfinite(X).all())
    print("min/max:", np.nanmin(X), np.nanmax(X))
    print("mean/std:", np.nanmean(X), np.nanstd(X))
    # any all-NaN or all-constant columns before scaling?
    col_nan = np.isnan(X).all(axis=0).sum()
    col_zero_var = (np.nanstd(X, axis=0) == 0).sum()
    print("all-NaN cols:", col_nan, "zero-variance cols:", col_zero_var)

check_matrix("X_train", X_train)
check_matrix("X_val",   X_val)
check_matrix("X_test",  X_test)

check_matrix("X_train_imp", X_train_imp)
check_matrix("X_train_s",   X_train_s)

print("y_train_cls has NaN?", np.isnan(y_train_cls).any() if hasattr(y_train_cls, "__len__") else False)
print("classes present in train:", sorted(set(df_train[OP_COL])))

--- X_train ---
shape: (315, 1508) dtype: float32
finite: True
min/max: -552.0 4054873900.0
mean/std: 24875.152 8824922.0
all-NaN cols: 0 zero-variance cols: 58
--- X_val ---
shape: (67, 1508) dtype: float32
finite: True
min/max: -505.0 96049.9
mean/std: 22.502472 675.2408
all-NaN cols: 0 zero-variance cols: 72
--- X_test ---
shape: (68, 1508) dtype: float32
finite: True
min/max: -552.0 4054873900.0
mean/std: 115146.09 18993522.0
all-NaN cols: 0 zero-variance cols: 67
--- X_train_imp ---
shape: (315, 1508) dtype: float32
finite: True
min/max: -552.0 4054873900.0
mean/std: 24875.152 8824922.0
all-NaN cols: 0 zero-variance cols: 58
--- X_train_s ---
shape: (315, 1508) dtype: float32
finite: True
min/max: -20.723267 142484560.0
mean/std: 1179.2317 357503.12
all-NaN cols: 0 zero-variance cols: 58
y_train_cls has NaN? False
classes present in train: ['aggregate', 'displace', 'select', 'simplify']


In [57]:
y_val_pred  = clf.predict(X_val_s)
val_acc     = accuracy_score(y_val_cls, y_val_pred)
val_f1m     = f1_score(y_val_cls, y_val_pred, average="macro")
print(f"[VAL] acc={val_acc:.3f}  f1_macro={val_f1m:.3f}")
print(classification_report(y_val_cls, y_val_pred, target_names=classes))

[VAL] acc=0.224  f1_macro=0.191
              precision    recall  f1-score   support

   aggregate       0.00      0.00      0.00        17
    displace       0.42      0.42      0.42        12
      select       0.50      0.04      0.08        24
    simplify       0.17      0.64      0.27        14

    accuracy                           0.22        67
   macro avg       0.27      0.28      0.19        67
weighted avg       0.29      0.22      0.16        67



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [58]:
regressors = {}
for idx, name in enumerate(classes):
    sel = (y_train_cls == idx)
    n = int(sel.sum())
    if n < 5:
        print(f"‚ö†Ô∏è  Skipping regressor for '{name}' (only {n} samples).")
        continue

    reg = MLPRegressor(
        hidden_layer_sizes=(128, 64),
        activation="relu",
        solver="adam",
        alpha=1e-3,
        learning_rate_init=5e-4,
        max_iter=600,
        early_stopping=True,
        n_iter_no_change=30,
        random_state=SEED,
        verbose=False
    )
    reg.fit(X_train_s[sel], y_train_reg[sel])  # (n,d) -> (n,1)
    regressors[name] = reg
    print(f"‚úÖ Regressor trained for '{name}' on {n} samples.")

  y = column_or_1d(y, warn=True)
  activation += self.intercepts_[i]
  y = column_or_1d(y, warn=True)


‚úÖ Regressor trained for 'aggregate' on 84 samples.


  y = column_or_1d(y, warn=True)


‚úÖ Regressor trained for 'displace' on 82 samples.


  y = column_or_1d(y, warn=True)


‚úÖ Regressor trained for 'select' on 77 samples.
‚úÖ Regressor trained for 'simplify' on 72 samples.




In [59]:
# Classification
y_test_pred = clf.predict(X_test_s)
test_acc    = accuracy_score(y_test_cls, y_test_pred)
test_f1m    = f1_score(y_test_cls, y_test_pred, average="macro")
print(f"[TEST] acc={test_acc:.3f}  f1_macro={test_f1m:.3f}")
print(classification_report(y_test_cls, y_test_pred, target_names=classes))

# Parameter regression (conditioned on correct operator)
mask = (y_test_pred == y_test_cls)
print(f"Parameter evaluation on {int(mask.sum())}/{len(mask)} samples with correct operator prediction.")
y_pred_params = np.full_like(y_test_reg, np.nan, dtype="float32")  # (n,1)

for i, ok in enumerate(mask):
    if not ok:
        continue
    cls_name = classes[y_test_pred[i]]
    reg = regressors.get(cls_name)
    if reg is None:
        continue
    pred = reg.predict(X_test_s[i:i+1])[0]
    y_pred_params[i] = pred if hasattr(pred, "__len__") else [float(pred)]

valid = np.isfinite(y_pred_params).all(axis=1) & mask
if valid.any():
    mse = mean_squared_error(y_test_reg[valid], y_pred_params[valid])
    mae = mean_absolute_error(y_test_reg[valid], y_pred_params[valid])
    print(f"[TEST Param | correct-ops] MSE={mse:.4f}  MAE={mae:.4f}")
else:
    print("No valid parameter predictions to score.")

[TEST] acc=0.279  f1_macro=0.214
              precision    recall  f1-score   support

   aggregate       0.00      0.00      0.00        20
    displace       0.42      0.33      0.37        15
      select       1.00      0.07      0.13        14
    simplify       0.24      0.68      0.35        19

    accuracy                           0.28        68
   macro avg       0.41      0.27      0.21        68
weighted avg       0.36      0.28      0.21        68

Parameter evaluation on 19/68 samples with correct operator prediction.
[TEST Param | correct-ops] MSE=390.9869  MAE=7.3307


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [60]:
joblib.dump(imp,        MODEL_OUT / "imputer.joblib")
joblib.dump(scaler,     MODEL_OUT / "scaler.joblib")
joblib.dump(le,         MODEL_OUT / "label_encoder.joblib")
joblib.dump(clf,        MODEL_OUT / "mlp_classifier.joblib")
joblib.dump(regressors, MODEL_OUT / "per_class_regressors.joblib")
print(f"Models saved to {MODEL_OUT}")

Models saved to ../data/output/models


In [61]:
def predict_pipeline(X_batch):
    """
    X_batch: (n, 1508) raw concatenated features
    Returns: dict with operator_probs, operator_label, params_pred
    """
    Xb = np.asarray(X_batch, dtype="float32")
    Xb[~np.isfinite(Xb)] = np.nan
    Xb = imp.transform(Xb)
    Xb = scaler.transform(Xb)
    Xb = np.clip(Xb, -CLIP_ABS, CLIP_ABS)

    proba = clf.predict_proba(Xb)                    # (n, 4)
    pred_idx = np.argmax(proba, axis=1)
    pred_labels = le.inverse_transform(pred_idx).tolist()

    params = []
    for i, lbl in enumerate(pred_labels):
        reg = regressors.get(lbl)
        if reg is None:
            params.append([np.nan])                 # single param
        else:
            pred = reg.predict(Xb[i:i+1])[0]
            params.append(pred.tolist() if hasattr(pred, "tolist") else [float(pred)])

    return {"operator_probs": proba, "operator_label": pred_labels, "params_pred": np.array(params)}