In [10]:
# --- Bootstrap ---
from pathlib import Path
import sys

ROOT = None
try:
    import src.utils.notebook_bootstrap as nb
    # Try common names
    for fn in ["find_repo_root", "find_root", "repo_root", "get_repo_root"]:
        if hasattr(nb, fn):
            ROOT = getattr(nb, fn)()
            break
except Exception:
    pass

if ROOT is None:
    # fallback: assume notebook is inside Thesis/notebooks/
    ROOT = Path.cwd()
    while ROOT.name != "Thesis" and ROOT != ROOT.parent:
        ROOT = ROOT.parent

sys.path.insert(0, str(ROOT))

print("ROOT:", ROOT)
print("sys.path[0]:", sys.path[0])


ROOT: /Users/amirdonyadide/Documents/GitHub/Thesis
sys.path[0]: /Users/amirdonyadide/Documents/GitHub/Thesis


In [13]:
import json
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error

from src import config as cfg

def rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))



In [14]:
DATA = ROOT / "data" / "output"

EXPERIMENTS = {
    "exp_prompt_only": {
        "X": DATA / "train_out_prompt_only" / "X_prompt_only.npy",
        "pairs": DATA / "train_out_prompt_only" / "train_pairs_prompt_only.parquet",
        "bundle": DATA / "models" / "exp_prompt_only" / "cls_plus_regressors.joblib",
        "preproc": DATA / "models" / "exp_prompt_only" / "preproc.joblib",
    },
    "exp_use_map": {
        "X": DATA / "train_out_use" / "X_use_map.npy",
        "pairs": DATA / "train_out_use" / "train_pairs_use_map.parquet",
        "bundle": DATA / "models" / "exp_use_map" / "cls_plus_regressors.joblib",
        "preproc": DATA / "models" / "exp_use_map" / "preproc.joblib",
    },
    "exp_map_only": {
        "X": DATA / "train_out_map_only" / "X_map_only.npy",
        "pairs": DATA / "train_out_map_only" / "train_pairs_map_only.parquet",
        "bundle": DATA / "models" / "exp_map_only" / "cls_plus_regressors.joblib",
        "preproc": DATA / "models" / "exp_map_only" / "preproc.joblib",
    },
    "exp_openai_map": {
        "X": DATA / "train_out_openai" / "X_openai_map.npy",
        "pairs": DATA / "train_out_openai" / "train_pairs_openai_map.parquet",
        "bundle": DATA / "models" / "exp_openai_map" / "cls_plus_regressors.joblib",
        "preproc": DATA / "models" / "exp_openai_map" / "preproc.joblib",
    },
}

SPLITS_PATH = DATA / "train_out" / "splits" / "splits_shared.json"
USERSTUDY_XLSX = ROOT / "data" / "userstudy" / "UserStudy.xlsx"

print("Splits:", SPLITS_PATH.exists(), SPLITS_PATH)
print("UserStudy:", USERSTUDY_XLSX.exists(), USERSTUDY_XLSX)


Splits: True /Users/amirdonyadide/Documents/GitHub/Thesis/data/output/train_out/splits/splits_shared.json
UserStudy: True /Users/amirdonyadide/Documents/GitHub/Thesis/data/userstudy/UserStudy.xlsx


In [15]:
import inspect
from pathlib import Path


def _load_bundle(path):
    obj = joblib.load(path)
    # Common patterns: dict, namespace-like, dataclass-like
    return obj

def _get_from_bundle(bundle, *keys):
    """
    Tries keys in order in dict-like or attribute-like bundle.
    """
    for k in keys:
        if isinstance(bundle, dict) and k in bundle:
            return bundle[k]
        if hasattr(bundle, k):
            return getattr(bundle, k)
    raise KeyError(f"None of keys {keys} found in bundle. Type={type(bundle)}")

def _apply_preproc(preproc, X):
    """
    Works for sklearn transformers or your saved custom preproc bundle.
    """
    if hasattr(preproc, "transform"):
        return preproc.transform(X)
    # fallback: if it's a dict-like bundle from src/train/preprocessing.py
    if isinstance(preproc, dict):
        # try to use your internal helper if it exists
        try:
            from src.train.preprocessing import apply_preproc_bundle  # if you have it
            return apply_preproc_bundle(preproc, X)
        except Exception:
            pass
    raise TypeError("Don't know how to apply this preproc. "
                    "Expected sklearn transformer or supported bundle.")

def _read_splits(path: Path):
    with open(path, "r") as f:
        s = json.load(f)
    # Accept a few possible schemas
    # Expect keys like: train/val/test each containing list of map_ids
    for k in ["train", "val", "test"]:
        if k not in s:
            raise KeyError(f"Split file missing '{k}'. Keys={list(s.keys())}")
    return s

def _rmse(y_true, y_pred):
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def _call_by_signature(func, **kwargs):
    """
    Calls func with only the kwargs it accepts (based on signature).
    Also checks for required keyword-only args and raises a helpful error.
    """
    sig = inspect.signature(func)

    accepted = {k: v for k, v in kwargs.items() if k in sig.parameters}

    # Detect missing required keyword-only parameters
    missing_required_kwonly = []
    for name, p in sig.parameters.items():
        if p.kind == inspect.Parameter.KEYWORD_ONLY and p.default is inspect._empty:
            if name not in accepted:
                missing_required_kwonly.append(name)

    if missing_required_kwonly:
        raise TypeError(
            f"{func.__name__} is missing required keyword-only args: {missing_required_kwonly}. "
            f"Provided: {sorted(list(accepted.keys()))}"
        )

    return func(**accepted)


In [17]:
import inspect
from src.train.load_training_data import load_training_data_with_dynamic_param_norm

print(inspect.signature(load_training_data_with_dynamic_param_norm))

# Required inputs (from your config)
distance_ops = cfg.DISTANCE_OPS
area_ops = cfg.AREA_OPS


(*, exp_name: 'str', feature_mode: 'FeatureMode', paths: 'Any', cfg: 'Any', distance_ops: 'Sequence[str]', area_ops: 'Sequence[str]', require_text: 'bool' = True) -> 'LoadedTrainingData'


In [29]:
import pandas as pd
import numpy as np
from typing import Tuple, Any, cast

from src.train.load_training_data import load_training_data_with_dynamic_param_norm

# Map exp folder name -> feature_mode expected by loader
FEATURE_MODE_BY_EXP = {
    "exp_prompt_only": "prompt_only",
    "exp_use_map": "use_map",
    "exp_map_only": "map_only",
    "exp_openai_map": "openai_map",
}

def _get_paths_obj():
    # Try common config patterns
    for k in ["PATHS", "paths"]:
        if hasattr(cfg, k):
            return getattr(cfg, k)
    for fn in ["get_paths", "make_paths", "build_paths"]:
        if hasattr(cfg, fn):
            return getattr(cfg, fn)()
    raise AttributeError(
        "Couldn't find a PATHS object in cfg. "
        "Check src/config.py for PATHS/paths or a get_paths() function."
    )

PATHS_OBJ = _get_paths_obj()
print("Using PATHS object:", type(PATHS_OBJ))

def load_labeled_exp(exp_name: str) -> Tuple[pd.DataFrame, np.ndarray]:
    # Load YOUR correct artifacts
    X = np.load(EXPERIMENTS[exp_name]["X"])
    pairs = pd.read_parquet(EXPERIMENTS[exp_name]["pairs"])

    feature_mode = cast(Any, FEATURE_MODE_BY_EXP[exp_name])

    # Call loader, but force it to use provided X+pairs (not PATHS internal filenames)
    out = load_training_data_with_dynamic_param_norm(
        exp_name=exp_name,
        cfg=cfg,
        feature_mode=feature_mode,
        paths=PATHS_OBJ,              # still needed for excel/userstudy settings
        distance_ops=cfg.DISTANCE_OPS,
        area_ops=cfg.AREA_OPS,
        X=X,                          # ✅ override missing train_out/X_exp_*.npy
        pairs=pairs,                  # ✅ override missing pairs in train_out/
        excel_path=str(USERSTUDY_XLSX),  # if your function accepts it, great; if not, it will be ignored
        sheet_name="UserStudy",          # same
    )

    # normalize returns
    if isinstance(out, tuple):
        df = out[0]
        X_out = out[1] if len(out) > 1 else X
    else:
        df = out
        X_out = X

    assert isinstance(df, pd.DataFrame), f"Expected DataFrame, got {type(df)}"
    assert isinstance(X_out, np.ndarray), f"Expected ndarray, got {type(X_out)}"

    return df.reset_index(drop=True), X_out


Using PATHS object: <class 'src.config.ProjectPaths'>


In [30]:
def unnormalize_param(op_series: pd.Series, param_norm: np.ndarray, df: pd.DataFrame) -> np.ndarray:
    op = op_series.values
    out = np.full_like(param_norm, np.nan, dtype=float)

    is_dist = np.isin(op, list(cfg.DISTANCE_OPS))
    is_area = np.isin(op, list(cfg.AREA_OPS))

    out[is_dist] = param_norm[is_dist] * df.loc[is_dist, cfg.EXTENT_DIAG_COL].to_numpy(dtype=float)
    out[is_area] = param_norm[is_area] * df.loc[is_area, cfg.EXTENT_AREA_COL].to_numpy(dtype=float)
    return out

def predict_param_norm_by_op(bundle, Xp: np.ndarray, ops: np.ndarray) -> np.ndarray:
    regressors = _get_from_bundle(bundle, "regressors", "op_to_regressor", "regressor_by_op", "op_regressors", "reg_models")

    target_scalers = None
    for k in ["target_scalers", "y_scalers", "scalers_by_op"]:
        if isinstance(bundle, dict) and k in bundle:
            target_scalers = bundle[k]
            break
        if hasattr(bundle, k):
            target_scalers = getattr(bundle, k)
            break

    log1p = False
    for k in ["use_log1p", "log1p", "log_target"]:
        if isinstance(bundle, dict) and k in bundle:
            log1p = bool(bundle[k])
            break
        if hasattr(bundle, k):
            log1p = bool(getattr(bundle, k))
            break

    yhat = np.full(len(ops), np.nan, dtype=float)

    for op in pd.unique(ops):
        mask = (ops == op)
        if op not in regressors:
            continue

        reg = regressors[op]
        pred = reg.predict(Xp[mask]).astype(float)

        if target_scalers is not None and op in target_scalers:
            sc = target_scalers[op]
            if hasattr(sc, "inverse_transform"):
                pred = sc.inverse_transform(pred.reshape(-1, 1)).ravel()

        if log1p:
            pred = np.expm1(pred)

        yhat[mask] = pred

    return yhat

def _safe_reg_metrics(y_true, y_pred):
    mask = np.isfinite(y_true) & np.isfinite(y_pred)
    if mask.sum() == 0:
        return np.nan, np.nan
    return mean_absolute_error(y_true[mask], y_pred[mask]), _rmse(y_true[mask], y_pred[mask])

def evaluate_experiment(exp_name: str):
    df, X = load_labeled_exp(exp_name)

    bundle = _load_bundle(EXPERIMENTS[exp_name]["bundle"])
    preproc = joblib.load(EXPERIMENTS[exp_name]["preproc"])
    Xp = _apply_preproc(preproc, X)

    classifier = _get_from_bundle(bundle, "classifier", "clf", "cls", "model")

    splits = _read_splits(SPLITS_PATH)
    test_maps = set(splits["test"])
    test_mask = df["map_id"].isin(test_maps).to_numpy()

    idx = np.where(test_mask)[0]
    dfT = df.iloc[idx].reset_index(drop=True)
    XpT = Xp[idx]

    y_true = dfT["operator"].to_numpy()
    y_pred = classifier.predict(XpT)

    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")

    y_norm_true = dfT["param_norm"].to_numpy(dtype=float)
    y_real_true = dfT["param_value"].to_numpy(dtype=float)

    y_norm_oracle = predict_param_norm_by_op(bundle, XpT, y_true)
    y_real_oracle = unnormalize_param(pd.Series(y_true), y_norm_oracle, dfT)

    y_norm_pipe = predict_param_norm_by_op(bundle, XpT, y_pred)
    y_real_pipe = unnormalize_param(pd.Series(y_pred), y_norm_pipe, dfT)

    mae_norm_oracle, rmse_norm_oracle = _safe_reg_metrics(y_norm_true, y_norm_oracle)
    mae_norm_pipe, rmse_norm_pipe = _safe_reg_metrics(y_norm_true, y_norm_pipe)

    mae_real_oracle, rmse_real_oracle = _safe_reg_metrics(y_real_true, y_real_oracle)
    mae_real_pipe, rmse_real_pipe = _safe_reg_metrics(y_real_true, y_real_pipe)

    return {
        "exp": exp_name,
        "n_test": len(dfT),
        "cls_acc": acc,
        "cls_macro_f1": macro_f1,
        "oracle_mae_norm": mae_norm_oracle,
        "oracle_rmse_norm": rmse_norm_oracle,
        "pipe_mae_norm": mae_norm_pipe,
        "pipe_rmse_norm": rmse_norm_pipe,
        "oracle_mae_real": mae_real_oracle,
        "oracle_rmse_real": rmse_real_oracle,
        "pipe_mae_real": mae_real_pipe,
        "pipe_rmse_real": rmse_real_pipe,
    }


In [31]:
rows = []
for exp in EXPERIMENTS:
    try:
        res = evaluate_experiment(exp)
        rows.append(res)
    except Exception as e:
        rows.append({"exp": exp, "error": str(e)})

df_results = pd.DataFrame(rows)

# Put errors on top if any
if "error" in df_results.columns:
    display(df_results[df_results["error"].notna()])

if "error" in df_results.columns:
    df_ok = df_results[df_results["error"].isna()].copy()
else:
    df_ok = df_results.copy()

df_ok = df_ok.sort_values(["cls_macro_f1", "pipe_rmse_real"], ascending=[False, True])
df_ok



Unnamed: 0,exp,error
0,exp_prompt_only,load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'
1,exp_use_map,load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'
2,exp_map_only,load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'
3,exp_openai_map,load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'


KeyError: 'cls_macro_f1'

In [34]:
display(df_results)

for _, r in df_results.iterrows():
    if "error" in r and pd.notna(r["error"]):
        print("\n---", r["exp"], "---")
        print(r["error"])


Unnamed: 0,exp,error
0,exp_prompt_only,load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'
1,exp_use_map,load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'
2,exp_map_only,load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'
3,exp_openai_map,load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'



--- exp_prompt_only ---
load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'

--- exp_use_map ---
load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'

--- exp_map_only ---
load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'

--- exp_openai_map ---
load_training_data_with_dynamic_param_norm() got an unexpected keyword argument 'X'


In [None]:
def style_results(df):
    show = df.copy()
    num_cols = [c for c in show.columns if c not in ["exp"] and pd.api.types.is_numeric_dtype(show[c])]
    show[num_cols] = show[num_cols].astype(float)

    # Round
    show[num_cols] = show[num_cols].round(4)

    # Highlight: max for scores, min for errors
    score_cols = ["cls_acc", "cls_macro_f1"]
    err_cols = [c for c in show.columns if "mae" in c or "rmse" in c]

    sty = show.style
    for c in score_cols:
        if c in show.columns:
            sty = sty.highlight_max(subset=[c])
    for c in err_cols:
        if c in show.columns:
            sty = sty.highlight_min(subset=[c])

    return sty

style_results(df_ok)


In [None]:
from sklearn.metrics import classification_report

def per_class_report(exp_name: str):
    df, X = load_labeled_exp(exp_name)
    bundle = _load_bundle(EXPERIMENTS[exp_name]["bundle"])
    preproc = joblib.load(EXPERIMENTS[exp_name]["preproc"])
    Xp = _apply_preproc(preproc, X)
    classifier = _get_from_bundle(bundle, "classifier", "cls", "model")

    splits = _read_splits(SPLITS_PATH)
    test_maps = set(splits["test"])
    test_mask = df["map_id"].isin(test_maps).to_numpy()

    dfT = df.loc[test_mask].reset_index(drop=True)
    XpT = Xp[test_mask]
    y_true = dfT["operator"].to_numpy()
    y_pred = classifier.predict(XpT)

    rep = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    # keep only operator rows
    ops = sorted(set(y_true) | set(y_pred))
    rep_ops = pd.DataFrame({op: rep.get(op, {}) for op in ops}).T
    rep_ops = rep_ops[["precision", "recall", "f1-score", "support"]].sort_index()
    rep_ops.insert(0, "exp", exp_name)
    return rep_ops

all_reports = []
for exp in EXPERIMENTS:
    try:
        all_reports.append(per_class_report(exp))
    except Exception as e:
        print(exp, "failed:", e)

df_per_class = pd.concat(all_reports, axis=0).reset_index(names="operator")
df_per_class
