### Imports & Config

In [1]:
# 00) Imports & setup
from __future__ import annotations

import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# --- Project roots ---
NB_ROOT = Path.cwd()                 # .../notebooks
PROJ_ROOT = NB_ROOT.parent           # project root

if str(NB_ROOT) not in sys.path:
    sys.path.insert(0, str(NB_ROOT))
if str(PROJ_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJ_ROOT))

# --- Shared utilities ---
from shared.common.utils import (
    NumpyJSONEncoder,          # noqa: F401
    optimize_dtypes,           # noqa: F401
    log_basic_diagnostics,     # noqa: F401
    set_global_seed,
)
from shared.common.config import load_config, configure_logger
from shared.common.serving_transformers import GeoCanonizer, PriorsGuard, EnsureDerivedFeatures
from shared.n03_train_model.preprocessing import drop_leaky_and_target, list_required_serving_derivatives
from shared.common.sanity_checks import scale_gate_valuation_k, scale_gate_per_sqm
from shared.common.constants import (
    EXPECTED_VALUATION_TOTAL_KEUR_RANGE,
    EXPECTED_PRICE_PER_SQM_EUR_RANGE,
)

# (Models will be chosen later; avoid heavy imports here)

# --- Logger ---
LOG_LEVEL = os.getenv("NB_LOG_LEVEL", "INFO")
logger = configure_logger(name="model_trainer", level=LOG_LEVEL)

# --- Load config (optional). If YAML is missing, proceed with safe defaults. ---
CFG_PATH = NB_ROOT / "dataset_config.yaml"
if CFG_PATH.exists():
    CONFIG = load_config(str(CFG_PATH))
    logger.info("Loaded config YAML: %s", CFG_PATH.as_posix())
else:
    CONFIG = {}
    logger.warning("dataset_config.yaml not found; proceeding with defaults.")

TRAIN_CFG = CONFIG.get("training", {}) or {}

# --- Global seed ---
SEED = int(TRAIN_CFG.get("seed", CONFIG.get("seed", 42)))
set_global_seed(SEED)

# --- Output folders (relative to `notebooks/`) ---
BASE_OUT = NB_ROOT / "outputs"
MODEL_DIR = BASE_OUT / "modeling"
FIG_DIR   = MODEL_DIR / "figures"
ART_DIR   = MODEL_DIR / "artifacts"
PROP_DIR  = MODEL_DIR / "property"  # used by downstream registry/backends

for d in (BASE_OUT, MODEL_DIR, FIG_DIR, ART_DIR, PROP_DIR):
    d.mkdir(parents=True, exist_ok=True)

# --- Dataset path (can be overridden via YAML) ---
DATASET_PATH = Path(TRAIN_CFG.get("dataset_path", BASE_OUT / "dataset_generated.csv"))

# --- Display/QoL ---
pd.set_option("display.max_columns", 200)
np.set_printoptions(suppress=True)

logger.info("Setup OK | seed=%s | outputs_dir=%s", SEED, BASE_OUT.as_posix())

[2025-10-07 17:02:46,437] INFO model_trainer: Setup OK | seed=42 | outputs_dir=c:/Users/anven/OneDrive/Documenti/GitHub/axiomatic_oracle/notebooks/outputs


### Load Dataset

In [2]:
# 01) Load dataset from nb01 manifest (robust) + optimize + validate + immediate leakage cleanup
from __future__ import annotations

import json
from pathlib import Path
import re

import pandas as pd

from shared.common.utils import canonical_json_dumps, optimize_dtypes, log_basic_diagnostics
from shared.common.sanity_checks import validate_dataset

from shared.common.constants import VALUATION_K, PRICE_PER_SQM, PRICE_PER_SQM_CAPPED_VIOLATED
from shared.n03_train_model.preprocessing import ML_LEAKY_FEATURES as _ML_LEAKY

# --- helper: resolve relative paths against known bases (expects BASE_OUT/NB_ROOT/PROJ_ROOT already defined) ---
def _resolve_path(p: str | Path) -> Path | None:
    cand = Path(p)
    if cand.exists():
        return cand
    # if relative, try under standard bases
    for base in [BASE_OUT, NB_ROOT, PROJ_ROOT]:
        try:
            q = (base / str(p)).resolve()
            if q.exists():
                return q
        except Exception:
            continue
    return None

# 1) Find the latest nb01 manifest
snap_dir = BASE_OUT / "snapshots"
snap_dir.mkdir(parents=True, exist_ok=True)
manifests = sorted(snap_dir.glob("manifest_*.json"))
manifest01 = None
if manifests:
    try:
        manifest01 = json.loads(manifests[-1].read_text(encoding="utf-8"))
        logger.info("Found nb01 manifest: %s", manifests[-1].as_posix())
    except Exception as e:
        logger.warning("Unable to read latest manifest: %s", e)

# 2) Resolve dataset path from manifest (supports multiple keys)
data_path: Path | None = None
if isinstance(manifest01, dict):
    paths = (manifest01.get("paths") or {})  # type: ignore
    for k in ("dataset", "dataset_path", "output_path"):
        p = paths.get(k)
        if p:
            rp = _resolve_path(p)
            if rp:
                data_path = rp
                break

# 3) Fallbacks: use DATASET_PATH (from Cell 01) or search in BASE_OUT
if data_path is None or not data_path.exists():
    candidates = [
        Path(DATASET_PATH) if isinstance(DATASET_PATH, (str, Path)) else None,
        BASE_OUT / "dataset_generated.parquet",
        BASE_OUT / "dataset_generated.csv",
    ]
    candidates += sorted(BASE_OUT.glob("dataset_*.parquet"))
    candidates += sorted(BASE_OUT.glob("dataset_*.csv"))
    data_path = next((c for c in candidates if c and c.exists()), None)

if not data_path or not data_path.exists():
    raise FileNotFoundError(
        "Dataset not found. Check the nb01 manifest in notebooks/outputs/snapshots "
        "or ensure notebooks/outputs/dataset_generated.(csv|parquet) exists."
    )

logger.info("📄 Loading dataset from: %s", data_path.as_posix())

# 4) Load parquet/csv (graceful handling if parquet engine is missing)
if data_path.suffix.lower() in {".parquet", ".pq"}:
    try:
        df = pd.read_parquet(data_path)
    except Exception as e:
        raise RuntimeError(
            f"Failed to read Parquet at {data_path}. Ensure 'pyarrow' or 'fastparquet' is installed. Details: {e}"
        )
else:
    df = pd.read_csv(data_path)

# 5) Dtype optimization (log memory saving)
mem_before = df.memory_usage(deep=True).sum() / 1024**2
df = optimize_dtypes(df)
mem_after = df.memory_usage(deep=True).sum() / 1024**2
logger.info(
    "✅ Dtypes optimized: %.2f MB → %.2f MB (−%.2f MB, %.1f%%)",
    mem_before, mem_after, mem_before - mem_after,
    0.0 if mem_before == 0 else (mem_before - mem_after) / mem_before * 100.0
)

# 6) Quick diagnostics
log_basic_diagnostics(df, logger)

# 7) Schema validation (asset_type from nb01 config)
asset_type = str(CONFIG.get("generation", {}).get("asset_type", "property"))
try:
    val_report = validate_dataset(df, asset_type=asset_type, raise_on_failure=True)
    logger.info("✅ Schema validation passed")
except Exception as e:
    logger.warning("Schema validation warning: %s", e)
    val_report = {"overall_passed": False, "error": str(e)}

# 8) Persist validation report next to modeling outputs
(MODEL_DIR / "validation_nb03.json").write_text(
    canonical_json_dumps(val_report),
    encoding="utf-8"
)

# 9) Immediate LEAKAGE CLEANUP (on raw df, before building X/y)
# --- 1) Explicit removals (case-insensitive) ---
explicit_leaky = {
    PRICE_PER_SQM,
    "price_per_sqm",
    "price_per_sqm_vs_region_avg",
    "price_per_sqm_capped",
    "valuation_k_log",
    PRICE_PER_SQM_CAPPED_VIOLATED,
    "strongly_incoherent",
    "valuation_k_decile",
    "valuation_rank",
    "is_top_valuation",
}
# union with module defaults
explicit_leaky |= set(map(str, _ML_LEAKY))

# map lowercase -> original (preserve original casing when dropping)
lower_map = {c.lower(): c for c in df.columns}
present_explicit = [lower_map[n.lower()] for n in explicit_leaky if n and n.lower() in lower_map]

# --- 2) Pattern-based removals (regex, case-insensitive) ---
regex_patterns = [
    r"price_per_sqm",         # qualsiasi col contenente price_per_sqm
    r"_vs_region_avg$",       # *_vs_region_avg
    r"^valuation_k_.+$",      # derivati del target valuation_k
    r"^valuation(_|$)",       # eventuali 'valuation' grezzi
    r"(decile|rank)",         # indicatori di leakage "analitico"
]

present_regex = []
for col in df.columns:
    if col == VALUATION_K:  # NON rimuovere il target
        continue
    if any(re.search(pat, col, flags=re.IGNORECASE) for pat in regex_patterns):
        present_regex.append(col)

# --- 3) Apply removals ---
to_drop = sorted(set(present_explicit) | set(present_regex))
if to_drop:
    logger.warning("🔴 Removing leaky features: %s", to_drop)
    df.drop(columns=to_drop, inplace=True, errors="ignore")
    logger.info("✅ Dataset cleaned: %d columns remaining", df.shape[1])
else:
    logger.info("✅ No leaky features found in the dataset")

# --- 4) Final assertions ---
assert not any("price_per_sqm" in c.lower() for c in df.columns), \
    "ERROR: columns matching 'price_per_sqm*' are still present!"
assert not any(
    c.lower().startswith("valuation_k_") for c in df.columns if c.lower() != VALUATION_K.lower()
), "ERROR: 'valuation_k_*' derivatives are still present!"

# Minimal debug
logger.debug("Remaining columns: %s", list(df.columns))
print(f"Shape after cleanup: {df.shape}")
print(f"Numeric columns: {df.select_dtypes(include='number').columns.tolist()}")

[2025-10-07 17:02:46,457] INFO model_trainer: Found nb01 manifest: c:/Users/anven/OneDrive/Documenti/GitHub/axiomatic_oracle/notebooks/outputs/snapshots/manifest_20251005T102255Z.json
[2025-10-07 17:02:46,458] INFO model_trainer: 📄 Loading dataset from: C:/Users/anven/OneDrive/Documenti/GitHub/axiomatic_oracle/notebooks/outputs/dataset_generated.csv
[2025-10-07 17:02:46,576] INFO model_trainer: ✅ Dtypes optimized: 14.95 MB → 12.81 MB (−2.15 MB, 14.4%)
[2025-10-07 17:02:46,576] INFO model_trainer: [UTILS] Distribution by location:
location
Milan       3017
Rome        2700
Turin       1214
Naples      1190
Bologna      886
Genoa        773
Florence     770
Palermo      767
Venice       596
Bari         593
Verona       591
Padua        581
Catania      444
Cagliari     442
Trieste      436
[2025-10-07 17:02:46,576] INFO model_trainer: [UTILS] Valuation min: 53.70k€
[2025-10-07 17:02:46,583] INFO model_trainer: [UTILS] Valuation max: 2403.49k€
[2025-10-07 17:02:46,583] INFO model_trainer

Shape after cleanup: (15000, 43)
Numeric columns: ['valuation_k', 'listing_month', 'size_m2', 'rooms', 'bathrooms', 'year_built', 'age_years', 'floor', 'building_floors', 'is_top_floor', 'is_ground_floor', 'has_elevator', 'has_garden', 'has_balcony', 'garage', 'owner_occupied', 'public_transport_nearby', 'distance_to_center_km', 'parking_spot', 'cellar', 'attic', 'concierge', 'humidity_level', 'temperature_avg', 'noise_level', 'air_quality_index', 'condition_score', 'risk_score', 'luxury_score', 'env_score', 'confidence_score']


### Feature Engineering (derivations + priors + anomaly flags)

In [3]:
# 04) Feature Engineering — Derivations + Priors + Anomaly flags (single cell)
from __future__ import annotations

import json
from pathlib import Path

import numpy as np
import pandas as pd

from shared.common.utils import canonical_json_dumps
from shared.common.constants import (
    ASSET_ID, VALUATION_K,
    LAST_VERIFIED_TS, PREDICTION_TS, LAG_HOURS,
    CONDITION_SCORE, RISK_SCORE,
    SIZE_M2, ROOMS, FLOOR, BUILDING_FLOORS,
    HAS_ELEVATOR, PUBLIC_TRANSPORT_NEARBY, ZONE, REGION,
)
from shared.common.serving_transformers import PriorsGuard

REQUIRED_DERIVED = list_required_serving_derivatives()

# --------------------------------------------------------------------------------------
# A) Train/Valid/Test split
#    - group blocking on ASSET_ID (if present) + decile stratification on group medians
#    - otherwise row-level decile stratification on target
# --------------------------------------------------------------------------------------
def _strat_bins(y: pd.Series, q: int = 10) -> pd.Series:
    y_num = pd.to_numeric(y, errors="coerce")
    ranks = y_num.rank(method="first")
    unique = int(ranks.nunique())
    if unique < 2:
        return pd.Series(0, index=y.index, dtype=int)
    q_eff = max(2, min(int(q), unique))
    try:
        bins = pd.qcut(ranks, q=q_eff, labels=False, duplicates="drop")
    except Exception:
        bins = pd.Series(0, index=y.index, dtype=int)
    if bins.isna().any():
        mode_bin = int(bins.dropna().mode().iat[0]) if not bins.dropna().empty else 0
        bins = bins.fillna(mode_bin).astype(int)
    return bins.astype(int)

def _safe_stratify(labels: pd.Series | np.ndarray, min_per_class: int = 2):
    lab = pd.Series(labels)
    vc = lab.value_counts()
    if len(vc) < 2 or (vc < min_per_class).any():
        return None
    return lab.values

def _ensure_split(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    from sklearn.model_selection import train_test_split

    cfg = TRAIN_CFG if isinstance(TRAIN_CFG, dict) else {}
    test_size = float(cfg.get("test_size", 0.15))
    val_size  = float(cfg.get("val_size",  0.15))
    n_deciles = int(cfg.get("n_deciles",   10))
    group_col = str(cfg.get("group_col", ASSET_ID))

    # keep rows with numeric target
    mask_y = pd.to_numeric(df[VALUATION_K], errors="coerce").notna()
    if not mask_y.all():
        logger.warning("Rows without target removed before splitting: %d", (~mask_y).sum())
    df_clean = df.loc[mask_y].copy()

    if group_col in df_clean.columns and df_clean[group_col].notna().any():
        df_clean[group_col] = df_clean[group_col].astype(str)
        gstats = (
            df_clean[[group_col, VALUATION_K]]
            .groupby(group_col, as_index=False)[VALUATION_K]
            .median()
            .rename(columns={VALUATION_K: f"{VALUATION_K}__group_median"})
        )

        g_all = gstats[group_col].values
        g_bins_all = _strat_bins(gstats[f"{VALUATION_K}__group_median"], q=n_deciles).values
        strat_all = _safe_stratify(g_bins_all)

        g_tmp, g_test = train_test_split(
            g_all,
            test_size=test_size,
            random_state=SEED,
            stratify=strat_all,
        )

        val_rel = float(val_size / max(1e-9, (1.0 - test_size)))
        val_rel = min(max(val_rel, 0.05), 0.8)

        tmp_mask = np.isin(gstats[group_col].values, g_tmp)
        gstats_tmp = gstats.loc[tmp_mask].copy()
        bins_tmp = _strat_bins(gstats_tmp[f"{VALUATION_K}__group_median"], q=n_deciles).values
        bin_map_tmp = dict(zip(gstats_tmp[group_col].values, bins_tmp))
        y_tmp_bins = np.array([bin_map_tmp.get(g, 0) for g in g_tmp])
        strat_tmp = _safe_stratify(y_tmp_bins)

        g_train, g_valid = train_test_split(
            g_tmp,
            test_size=val_rel,
            random_state=SEED,
            stratify=strat_tmp,
        )

        G_TRAIN, G_VALID, G_TEST = set(g_train), set(g_valid), set(g_test)
        df_train = df_clean[df_clean[group_col].isin(G_TRAIN)].copy()
        df_valid = df_clean[df_clean[group_col].isin(G_VALID)].copy()
        df_test  = df_clean[df_clean[group_col].isin(G_TEST)].copy()
    else:
        logger.warning(
            "%s missing/invalid for grouping: falling back to row-level stratification.", group_col
        )
        bins_all = _strat_bins(df_clean[VALUATION_K], q=n_deciles)
        df_tmp, df_test = train_test_split(
            df_clean,
            test_size=test_size,
            random_state=SEED,
            stratify=_safe_stratify(bins_all),
        )

        val_rel = float(val_size / max(1e-9, (1.0 - test_size)))
        val_rel = min(max(val_rel, 0.05), 0.8)
        bins_tmp = _strat_bins(df_tmp[VALUATION_K], q=n_deciles)
        df_train, df_valid = train_test_split(
            df_tmp,
            test_size=val_rel,
            random_state=SEED,
            stratify=_safe_stratify(bins_tmp),
        )

    # disjointness checks
    for a_name, A in (("train", df_train), ("valid", df_valid), ("test", df_test)):
        logger.info("%s split: %d rows, %d cols", a_name, len(A), A.shape[1])
    assert len(set(df_train.index) & set(df_valid.index)) == 0
    assert len(set(df_train.index) & set(df_test.index)) == 0
    assert len(set(df_valid.index) & set(df_test.index)) == 0

    return df_train, df_valid, df_test

if not all(n in globals() for n in ("df_train", "df_valid", "df_test")):
    df_train, df_valid, df_test = _ensure_split(df)

# --------------------------------------------------------------------------------------
# B) Derivations (applied to all splits) – no target leakage
# --------------------------------------------------------------------------------------
def _ensure_datetime_and_lag(df_: pd.DataFrame) -> pd.DataFrame:
    out = df_.copy()
    if (LAG_HOURS not in out.columns) and ({LAST_VERIFIED_TS, PREDICTION_TS} <= set(out.columns)):
        out[LAST_VERIFIED_TS] = pd.to_datetime(out[LAST_VERIFIED_TS], utc=True, errors="coerce")
        out[PREDICTION_TS]   = pd.to_datetime(out[PREDICTION_TS],   utc=True, errors="coerce")
        lag = (out[PREDICTION_TS] - out[LAST_VERIFIED_TS]).dt.total_seconds().div(3600)
        out[LAG_HOURS] = lag.where(lag >= 0, other=pd.NA).astype("Float32")
    elif LAG_HOURS in out.columns:
        out[LAG_HOURS] = pd.to_numeric(out[LAG_HOURS], errors="coerce").astype("Float32")
    return out

def _derive_core_features(df_: pd.DataFrame) -> pd.DataFrame:
    out = df_.copy()

    # condition_minus_risk
    if (CONDITION_SCORE in out.columns) and (RISK_SCORE in out.columns):
        cs = pd.to_numeric(out[CONDITION_SCORE], errors="coerce")
        rs = pd.to_numeric(out[RISK_SCORE], errors="coerce")
        out["condition_minus_risk"] = (cs - rs).astype("Float32")

    # listing_month from prediction ts
    if ("listing_month" not in out.columns) and (PREDICTION_TS in out.columns):
        ts = pd.to_datetime(out[PREDICTION_TS], utc=True, errors="coerce")
        out["listing_month"] = ts.dt.month.astype("Int16")

    # rooms per 100 sqm
    s = pd.to_numeric(out.get(SIZE_M2), errors="coerce").replace(0, np.nan)
    r = pd.to_numeric(out.get(ROOMS), errors="coerce")
    out["rooms_per_100sqm"] = (100.0 * r / s).astype("Float32")

    # penalty for high floor without elevator
    f = pd.to_numeric(out.get(FLOOR), errors="coerce")
    e = pd.to_numeric(out.get(HAS_ELEVATOR), errors="coerce").fillna(0)
    out["no_elev_high_floor"] = ((1 - e) * np.maximum(f - 1, 0)).astype("Float32")

    # floor ratio in [0,1]
    bf = pd.to_numeric(out.get(BUILDING_FLOORS), errors="coerce")
    denom = (bf - 1.0).where((bf - 1.0) > 0, other=np.nan)
    out["floor_ratio"] = (f / denom).clip(lower=0.0, upper=1.0).astype("Float32")

    # public-transport importance (stronger in periphery)
    if (ZONE in out.columns) and (PUBLIC_TRANSPORT_NEARBY in out.columns):
        z_norm = out[ZONE].astype("string").str.strip().str.lower()
        mult = (
            z_norm.map({"center": 1.00, "semi_center": 1.15, "periphery": 1.30})
            .fillna(1.10)
            .astype("Float32")
        )
        pt = pd.to_numeric(out[PUBLIC_TRANSPORT_NEARBY], errors="coerce").fillna(0).astype("Float32")
        out["pt_importance"] = (pt * mult).astype("Float32")
        out["pt_x_periphery"]   = (pt * (z_norm == "periphery").astype("Int8")).astype("Int8")
        out["pt_x_semi_center"] = (pt * (z_norm == "semi_center").astype("Int8")).astype("Int8")
        out["pt_x_center"]      = (pt * (z_norm == "center").astype("Int8")).astype("Int8")

    return out

def _apply_derivations(df_: pd.DataFrame) -> pd.DataFrame:
    out = _ensure_datetime_and_lag(df_)
    out = _derive_core_features(out)
    if VALUATION_K not in out.columns:
        raise ValueError(f"{VALUATION_K} missing: cannot train.")
    out[VALUATION_K] = pd.to_numeric(out[VALUATION_K], errors="coerce").astype("Float32")
    return out

for _n in ("df_train", "df_valid", "df_test"):
    globals()[_n] = _apply_derivations(globals()[_n])

# --------------------------------------------------------------------------------------
# C) Priors (city_zone_prior, region_index_prior) — consistent with generation logic
#     + robust pre-creation of prior columns (Series) and manual fallback
# --------------------------------------------------------------------------------------
GEN_CFG = CONFIG.get("generation", {}) if isinstance(CONFIG, dict) else {}
_city_base = GEN_CFG.get("city_base_prices", {}) or {}
region_index_defaults = {"north": 1.05, "center": 1.00, "south": 0.92}
region_index_map = GEN_CFG.get("region_index", {}) or region_index_defaults

if _city_base:
    _city_base_norm = {
        str(c).strip().lower(): {str(z).strip().lower(): float(v) for z, v in d.items()}
        for c, d in _city_base.items()
    }
    all_zones = {z for d in _city_base_norm.values() for z in d}
    zone_medians = {
        z: float(np.nanmedian([d.get(z, np.nan) for d in _city_base_norm.values()]))
        for z in all_zones
    }
    global_median = float(np.nanmedian([v for d in _city_base_norm.values() for v in d.values()]))
else:
    _city_base_norm, zone_medians, global_median = {}, {}, 0.0

def _ensure_prior_cols(df_: pd.DataFrame) -> pd.DataFrame:
    """Make sure prior columns exist as Series (not scalars) and geo keys are normalized."""
    out = df_.copy()
    # create as nullable float series if missing
    for col in ("city_zone_prior", "region_index_prior"):
        if col not in out.columns:
            out[col] = pd.Series(pd.NA, index=out.index, dtype="Float32")
        else:
            out[col] = pd.to_numeric(out[col], errors="coerce").astype("Float32")
    # ensure city from location if missing
    if "city" not in out.columns and "location" in out.columns:
        out["city"] = out["location"]
    # normalize geo keys
    for c in ("city", "zone", "region"):
        if c in out.columns:
            out[c] = out[c].astype("string").str.strip().str.lower()
    return out

def _fallback_priors(df_: pd.DataFrame) -> pd.DataFrame:
    """Manual priors computation if PriorsGuard fails."""
    out = _ensure_prior_cols(df_)
    ci = out.get("city", pd.Series(index=out.index, dtype="string")).astype("string").str.lower()
    zo = out.get("zone", pd.Series(index=out.index, dtype="string")).astype("string").str.lower()
    re = out.get("region", pd.Series(index=out.index, dtype="string")).astype("string").str.lower()

    vals = []
    for c, z in zip(ci, zo):
        v = _city_base_norm.get(str(c), {}).get(str(z), np.nan)
        if pd.isna(v):
            v = zone_medians.get(str(z), global_median)
        vals.append(v)
    out["city_zone_prior"] = pd.to_numeric(vals, errors="coerce").astype("Float32")
    out["region_index_prior"] = re.map(region_index_map).astype("Float32")
    return out

# pre-create prior columns & normalize geo keys
for _n in ("df_train", "df_valid", "df_test"):
    globals()[_n] = _ensure_prior_cols(globals()[_n])

priors = PriorsGuard(
    city_base=_city_base_norm,
    region_index=region_index_map,
    zone_medians=zone_medians,
    global_cityzone_median=global_median,
)

# apply PriorsGuard with fallback
for _n in ("df_train", "df_valid", "df_test"):
    try:
        globals()[_n] = priors.transform(globals()[_n])
    except Exception as e:
        logger.warning("PriorsGuard.transform failed on %s (%s). Falling back to manual priors.", _n, e)
        globals()[_n] = _fallback_priors(globals()[_n])

# --------------------------------------------------------------------------------------
# D) Anti-leakage guards (post-derivations/priors)
# --------------------------------------------------------------------------------------
for split_name, split_df in (("train", df_train), ("valid", df_valid), ("test", df_test)):
    assert not any("price_per_sqm" in c.lower() for c in split_df.columns), \
        f"LEAKAGE in df_{split_name}: columns matching 'price_per_sqm*' are present!"
    assert not any(
        c.lower().startswith("valuation_k_") for c in split_df.columns if c.lower() != VALUATION_K.lower()
    ), f"LEAKAGE in df_{split_name}: 'valuation_k_*' derivatives are present!"

# --------------------------------------------------------------------------------------
# E) Train-only anomaly flags + sample_weight (no leakage)
# --------------------------------------------------------------------------------------
try:
    from shared.n02_explore_dataset.eda_core import AnomalyDetector  # type: ignore
except Exception:
    AnomalyDetector = None  # type: ignore

num_cols = [c for c in df_train.columns if pd.api.types.is_numeric_dtype(df_train[c])]
exclude = {VALUATION_K, "sample_weight"}

preferred = [
    "condition_minus_risk", SIZE_M2, "rooms_per_100sqm",
    "no_elev_high_floor", "floor_ratio",
    "pt_importance", "pt_x_periphery",
    "city_zone_prior", "region_index_prior",
    LAG_HOURS, "distance_to_center_km",
    "air_quality_index", "noise_level", "humidity_level", "temperature_avg",
]
feat_candidates = [c for c in preferred if c in num_cols and c not in exclude]

if len(feat_candidates) < 3:
    pool = []
    for c in num_cols:
        if c in exclude:
            continue
        s = pd.to_numeric(df_train[c], errors="coerce")
        if s.nunique(dropna=True) >= 10 and np.nanvar(s.values) > 0:
            pool.append((c, float(np.nanvar(s.values))))
    pool.sort(key=lambda x: x[1], reverse=True)
    feat_candidates = [c for c, _ in pool[:8]]

contamination       = float(TRAIN_CFG.get("anomaly_contamination", 0.03)) if isinstance(TRAIN_CFG, dict) else 0.03
strong_z_threshold  = float(TRAIN_CFG.get("anomaly_strong_z", 2.5)) if isinstance(TRAIN_CFG, dict) else 2.5
severity_percentile = float(TRAIN_CFG.get("anomaly_severity_pct", 90.0)) if isinstance(TRAIN_CFG, dict) else 90.0
n_estimators        = int(TRAIN_CFG.get("anomaly_n_estimators", 200)) if isinstance(TRAIN_CFG, dict) else 200

if feat_candidates:
    logger.info("Anomaly features (train only): %s", feat_candidates)

    if AnomalyDetector is not None:
        anom = AnomalyDetector(
            contamination=contamination,
            strong_z_threshold=strong_z_threshold,
            severity_percentile=severity_percentile,
            n_estimators=n_estimators,
            random_state=SEED,
        )
        df_train_ext, anom_rep = anom.detect_anomalies(
            df_train,
            feature_candidates=feat_candidates,
            exclude_features=set(),
        )
        for col in ("anomaly_flag", "anomaly_refined", "severity_score"):
            if col in df_train_ext.columns:
                df_train[col] = df_train_ext[col]
    else:
        X = df_train[feat_candidates].apply(pd.to_numeric, errors="coerce")
        mu = X.mean(axis=0)
        sd = X.std(axis=0).replace(0, np.nan)
        z  = (X - mu) / sd
        z_abs = z.abs()
        z_mean = z_abs.mean(axis=1)
        thr = np.nanpercentile(z_mean.dropna().values, severity_percentile)
        flags_raw = (z_abs > strong_z_threshold).any(axis=1)
        flags_ref = (z_mean >= thr)

        df_train["anomaly_flag"]    = flags_raw.astype("Int8")
        df_train["anomaly_refined"] = flags_ref.astype("Int8")
        df_train["severity_score"]  = z_mean.fillna(0).astype("Float32")

        anom_rep = {
            "method": "fallback_zscore",
            "features": feat_candidates,
            "strong_z_threshold": strong_z_threshold,
            "severity_percentile": severity_percentile,
            "n_anomalies_raw": int(flags_raw.sum()),
            "n_anomalies_refined": int(flags_ref.sum()),
        }

    # sample_weight (mean ≈ 1.0)
    if "severity_score" in df_train.columns and df_train["severity_score"].notna().any():
        sev = pd.to_numeric(df_train["severity_score"], errors="coerce").clip(lower=0).astype("Float32")
        w   = 1.0 / (1.0 + sev)
        w   = w.clip(lower=0.2, upper=1.0)
        w   = (w * (1.0 / max(w.mean(), 1e-6))).astype("Float32")
        df_train["sample_weight"] = w
    elif "confidence_score" in df_train.columns and df_train["confidence_score"].notna().any():
        w = pd.to_numeric(df_train["confidence_score"], errors="coerce").clip(0.2, 1.0).astype("Float32")
        w = (w * (1.0 / max(w.mean(), 1e-6))).astype("Float32")
        df_train["sample_weight"] = w
    else:
        df_train["sample_weight"] = np.float32(1.0)

    # persist anomaly report
    try:
        (ART_DIR / "anomaly_train_report.json").write_text(
            canonical_json_dumps(anom_rep),
            encoding="utf-8"
        )
    except Exception:
        pass
else:
    logger.info("Anomaly detection skipped: no valid candidate features.")
    df_train["sample_weight"] = np.float32(1.0)

# --------------------------------------------------------------------------------------
# F) Snapshot
# --------------------------------------------------------------------------------------
print("=" * 60)
print("FEATURE ENGINEERING COMPLETED — quick peek")
print("=" * 60)
display(df_train.head(3))
print(f"\ntrain/valid/test shapes: {df_train.shape} / {df_valid.shape} / {df_test.shape}")

[2025-10-07 17:02:49,035] INFO model_trainer: train split: 10500 rows, 43 cols
[2025-10-07 17:02:49,035] INFO model_trainer: valid split: 2250 rows, 43 cols
[2025-10-07 17:02:49,035] INFO model_trainer: test split: 2250 rows, 43 cols
[2025-10-07 17:02:57,768] INFO model_trainer: Anomaly features (train only): ['condition_minus_risk', 'size_m2', 'rooms_per_100sqm', 'no_elev_high_floor', 'floor_ratio', 'pt_importance', 'pt_x_periphery', 'city_zone_prior', 'region_index_prior', 'distance_to_center_km', 'air_quality_index', 'noise_level', 'humidity_level', 'temperature_avg']


FEATURE ENGINEERING COMPLETED — quick peek


Unnamed: 0,asset_id,asset_type,location,valuation_k,last_verified_ts,listing_month,region,urban_type,zone,size_m2,rooms,bathrooms,year_built,age_years,floor,building_floors,is_top_floor,is_ground_floor,has_elevator,has_garden,has_balcony,garage,owner_occupied,public_transport_nearby,distance_to_center_km,parking_spot,cellar,attic,concierge,energy_class,humidity_level,temperature_avg,noise_level,air_quality_index,condition_score,risk_score,luxury_score,env_score,orientation,view,condition,heating,confidence_score,condition_minus_risk,rooms_per_100sqm,no_elev_high_floor,floor_ratio,pt_importance,pt_x_periphery,pt_x_semi_center,pt_x_center,city_zone_prior,region_index_prior,city,anomaly_flag,anomaly_refined,severity_score,sample_weight
1,asset_000001,property,Turin,961.320007,2025-10-05 10:22:24+00:00,10,north,urban,center,170,3,2,2014,11,4,7,0,0,1,1,1,0,1,0,1.4,1,0,0,0,E,49.0,14.3,79,55,0.801,0.179,0.6,0.7,South-East,street,renovated,autonomous,0.7205,0.622,1.764706,0.0,0.666667,0.0,0,0,0,0.0,1.05,turin,False,False,-0.179232,1.00106
5,asset_000005,property,Cagliari,67.540001,2025-10-05 10:22:24+00:00,10,south,urban,semi_center,43,2,1,1953,72,0,8,0,1,1,0,1,1,1,1,2.29,0,0,0,0,G,62.0,18.4,55,97,0.684,0.326,0.4,0.0,South-East,street,needs_renovation,autonomous,0.462,0.358,4.651163,0.0,0.0,1.15,0,1,0,0.0,0.92,cagliari,False,False,-0.075212,1.00106
6,asset_000006,property,Venice,232.050003,2025-10-05 10:22:24+00:00,10,north,urban,periphery,77,2,1,1991,34,0,9,0,1,1,0,1,0,1,1,6.55,0,1,0,0,D,34.299999,24.4,41,77,0.792,0.24,0.2,0.7,East,sea,needs_renovation,none,0.596,0.552,2.597403,0.0,0.0,1.3,1,0,0,0.0,1.05,venice,False,False,-0.058198,1.00106



train/valid/test shapes: (10500, 58) / (2250, 54) / (2250, 54)


### Feature Preparation & Single Pipeline

In [6]:
# 05) Feature Preparation & Single Pipeline (replaces legacy A/B)
from __future__ import annotations

import json
from pathlib import Path
from typing import List
import re

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor

from shared.common.utils import canonical_json_dumps
from shared.common.constants import (
    VALUATION_K,
    LAST_VERIFIED_TS, PREDICTION_TS,
)
from shared.n03_train_model.preprocessing import ML_LEAKY_FEATURES as _ML_LEAKY  # static list

# -----------------------------
# A) Choose feature columns
# -----------------------------

# 0) Anti-leakage guard (regex, case-insensitive)
_LEAKY_PATTERNS = tuple(
    re.compile(p, re.IGNORECASE)
    for p in (
        r"^(y|label|target)$",    # target alias
        r"^valuation(_|$)",       # 'valuation', 'valuation_*'
        r"^valuation_k(_|$)",     # 'valuation_k*'
        r"^price_per_sqm",        # any price_per_sqm*
        r"_vs_region_avg$",       # *_vs_region_avg
        r"(decile|rank)",         # derived ranking/bins
    )
)
def _is_name_leaky(name: str) -> bool:
    if name in _ML_LEAKY:
        return True
    return any(rx.search(name) for rx in _LEAKY_PATTERNS)

# 1) Colonne da escludere sempre
EXCLUDE_ALWAYS = {
    VALUATION_K,                  # target
    "sample_weight",
    LAST_VERIFIED_TS, PREDICTION_TS,  # timestamps grezzi
    # artefatti & colonne analitiche
    "anomaly_flag", "anomaly_refined",
}

# 2) Prefer explicit engineered/robust features quando presenti
PREFERRED_NUMERIC = [
    # engineered in previous cell
    "condition_minus_risk",
    "rooms_per_100sqm",
    "no_elev_high_floor",
    "floor_ratio",
    "pt_importance", "pt_x_periphery", "pt_x_semi_center", "pt_x_center",
    "city_zone_prior", "region_index_prior",
    # common numeric predictors
    "size_m2", "rooms", "bathrooms",
    "floor", "building_floors",
    "distance_to_center_km",
    "humidity_level", "temperature_avg", "noise_level", "air_quality_index",
    "lag_hours",
]
PREFERRED_CATEGORICAL = [
    "location", "region", "urban_type", "zone",
    "energy_class", "orientation", "view", "condition", "heating",
    # binari che a volte arrivano come object/category
    "has_elevator", "has_garden", "has_balcony", "garage",
    "owner_occupied", "public_transport_nearby", "parking_spot",
    "cellar", "attic", "concierge",
    # time-derived discreto
    "listing_month",
]

def _pick_existing(cols: List[str], df_cols: pd.Index) -> List[str]:
    seen, out = set(), []
    for c in cols:
        if c in df_cols and c not in seen and (c not in EXCLUDE_ALWAYS) and (not _is_name_leaky(c)):
            seen.add(c); out.append(c)
    return out

# Base pick (rispettando anti-leakage)
cat_cols = _pick_existing(PREFERRED_CATEGORICAL, df_train.columns)
num_cols = _pick_existing(PREFERRED_NUMERIC, df_train.columns)

# 3) Auto-discover ulteriori candidati sicuri (no duplicati, no leakage)
_auto_num = [
    c for c in df_train.columns
    if pd.api.types.is_numeric_dtype(df_train[c])
    and c not in EXCLUDE_ALWAYS
    and not _is_name_leaky(c)
    and c not in num_cols
]
_auto_cat = [
    c for c in df_train.columns
    if (pd.api.types.is_object_dtype(df_train[c]) or pd.api.types.is_categorical_dtype(df_train[c]))
    and c not in EXCLUDE_ALWAYS
    and not _is_name_leaky(c)
    and c not in cat_cols
]

num_cols += [c for c in _auto_num if c not in num_cols]
cat_cols += [c for c in _auto_cat if c not in cat_cols]

# 4) Lista finale (l’ordine conta per importances downstream)
FEATURES = cat_cols + [c for c in num_cols if c not in set(cat_cols)]
if not FEATURES:
    raise RuntimeError("No features selected for training (after leakage guards).")

# Persist a small spec per riproducibilità
feature_spec = {
    "categorical": cat_cols,
    "numeric": num_cols,
    "all_features": FEATURES,
}
(ART_DIR / "feature_spec.json").write_text(canonical_json_dumps(feature_spec), encoding="utf-8")
print(f"Feature spec saved → {ART_DIR / 'feature_spec.json'}")
print(f"Using {len(cat_cols)} categorical + {len(num_cols)} numeric features ({len(FEATURES)} total).")

# --------------------------------
# B) Build preprocessing pipeline
# --------------------------------
# Categorical: impute missing con moda + Ordinal encode (gestione unknown)
cat_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Numeric: impute con mediana + standardize
num_pipe = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler(with_mean=True, with_std=True)),
])

prep = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_cols),
        ("num", num_pipe, num_cols),
    ],
    remainder="drop",
    sparse_threshold=0.0,
    n_jobs=None,
    verbose_feature_names_out=False,
)

# --------------------------------
# C) Estimator (single “champion”)
# --------------------------------
rf_params = {
    "n_estimators": int(TRAIN_CFG.get("rf_n_estimators", 400)) if isinstance(TRAIN_CFG, dict) else 400,
    "max_depth": TRAIN_CFG.get("rf_max_depth", None) if isinstance(TRAIN_CFG, dict) else None,
    "min_samples_leaf": int(TRAIN_CFG.get("rf_min_samples_leaf", 2)) if isinstance(TRAIN_CFG, dict) else 2,
    "n_jobs": -1,
    "random_state": SEED,
}
rf = RandomForestRegressor(**rf_params)

# Train in log-space (stabile) e predici in scala naturale (k€) via TTR
ttr = TransformedTargetRegressor(
    regressor=Pipeline(steps=[
        ("prep", prep),
        ("model", rf),
    ]),
    func=np.log1p,
    inverse_func=np.expm1,
    check_inverse=False,
)

# --------------------------------
# D) Prepare train/valid sets (aligned columns)
# --------------------------------
def _ensure_cols(df_part: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    dfp = df_part.copy()
    for c in cols:
        if c not in dfp.columns:
            dfp[c] = np.nan
    return dfp[cols]  # keep order

X_train = _ensure_cols(df_train, FEATURES)
y_train = pd.to_numeric(df_train[VALUATION_K], errors="coerce").astype("float64")

X_valid = _ensure_cols(df_valid, FEATURES)
y_valid = pd.to_numeric(df_valid[VALUATION_K], errors="coerce").astype("float64")

# sample weights (train only)
w_train = None
if "sample_weight" in df_train.columns:
    w = pd.to_numeric(df_train["sample_weight"], errors="coerce").astype("float64")
    w = w.replace([np.inf, -np.inf], np.nan).fillna(1.0).clip(lower=0.0)
    # normalize to mean≈1 per stabilità
    mean_w = float(w.mean()) if float(w.mean()) > 0 else 1.0
    w_train = (w / mean_w).to_numpy()

# Persist a small preview per sanity
preview = {
    "n_features": len(FEATURES),
    "categorical": len(cat_cols),
    "numeric": len(num_cols),
    "rf_params": rf_params,
}
(ART_DIR / "pipeline_preview.json").write_text(canonical_json_dumps(preview), encoding="utf-8")
print(f"Pipeline preview saved → {ART_DIR / 'pipeline_preview.json'}")

print("Prepared X/y:")
print(f"  X_train={X_train.shape}, y_train={y_train.shape}, sample_weight={'yes' if w_train is not None else 'no'}")
print(f"  X_valid={X_valid.shape}, y_valid={y_valid.shape}")

# Expose objects for next cells
pipeline = ttr          # main model object to fit
preprocessor = prep     # for introspection/feature names
chosen_features = FEATURES

Feature spec saved → c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\artifacts\feature_spec.json
Using 23 categorical + 41 numeric features (53 total).
Pipeline preview saved → c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\artifacts\pipeline_preview.json
Prepared X/y:
  X_train=(10500, 53), y_train=(10500,), sample_weight=yes
  X_valid=(2250, 53), y_valid=(2250,)


  if (pd.api.types.is_object_dtype(df_train[c]) or pd.api.types.is_categorical_dtype(df_train[c]))


### Train & Validation (single champion)

In [7]:
# 06) Train & Validation (single champion) — fixed sample_weight routing
from __future__ import annotations

import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
import joblib  # type: ignore

from shared.common.utils import canonical_json_dumps
from shared.common.constants import VALUATION_K

# --------------------------
# Safety checks
# --------------------------
req_objs = ["pipeline", "X_train", "y_train", "X_valid", "y_valid"]
missing = [o for o in req_objs if o not in globals()]
if missing:
    raise RuntimeError(f"Missing objects before training: {missing}")

# --------------------------
# Helper: robust sample_weight routing
# --------------------------
def _final_step_name(p: Pipeline) -> str:
    return list(p.named_steps.keys())[-1]

def _make_fit_kwargs(est, sample_weight):
    """
    Route sample_weight correctly across:
    1) Pipeline(..., ('ttr', TTR(regressor=Pipeline(..., ('model', RF))))) -> {'ttr__model__sample_weight': w}
    2) Pipeline(..., ('ttr', TTR(regressor=RF)))                           -> {'ttr__sample_weight': w}
    3) Pipeline(..., ('rf', RF))                                           -> {'rf__sample_weight': w}
    4) TransformedTargetRegressor(regressor=Pipeline(..., ('model', RF)))  -> {'model__sample_weight': w}
    5) TransformedTargetRegressor(regressor=RF)                            -> {'sample_weight': w}
    6) Plain estimator                                                     -> {'sample_weight': w}
    """
    # Case A: top-level is a Pipeline
    if isinstance(est, Pipeline):
        last_name = _final_step_name(est)
        last_obj  = est.named_steps[last_name]

        # A1) final is TTR
        if isinstance(last_obj, TransformedTargetRegressor):
            reg = getattr(last_obj, "regressor_", None) or getattr(last_obj, "regressor", None)
            # A1a) regressor is a Pipeline -> need two-level namespacing
            if isinstance(reg, Pipeline):
                inner_last = _final_step_name(reg)
                return {f"{last_name}__{inner_last}__sample_weight": sample_weight}
            # A1b) regressor is a plain estimator -> pass to TTR step
            return {f"{last_name}__sample_weight": sample_weight}

        # A2) final is a plain estimator
        return {f"{last_name}__sample_weight": sample_weight}

    # Case B: top-level is a TTR
    if isinstance(est, TransformedTargetRegressor):
        reg = getattr(est, "regressor_", None) or getattr(est, "regressor", None)
        if isinstance(reg, Pipeline):
            inner_last = _final_step_name(reg)
            return {f"{inner_last}__sample_weight": sample_weight}
        return {"sample_weight": sample_weight}

    # Case C: plain estimator
    return {"sample_weight": sample_weight}

# --------------------------
# A) Fit champion pipeline
# --------------------------
# Build/locate training weights if available
w_train = None
if "w_train" in globals() and w_train is not None:
    w_train = np.asarray(w_train, dtype=float).ravel()
elif "df_train" in globals() and isinstance(df_train, pd.DataFrame) and "sample_weight" in df_train.columns:
    # align by index if X_train has an index from df_train
    idx = getattr(X_train, "index", None)
    if idx is not None and set(idx).issubset(set(df_train.index)):
        w_train = df_train.loc[idx, "sample_weight"].to_numpy(dtype=float, copy=False).ravel()
    else:
        w_train = df_train["sample_weight"].to_numpy(dtype=float, copy=False).ravel()

fit_kwargs = {}
if w_train is not None:
    fit_kwargs = _make_fit_kwargs(pipeline, w_train)
    logger.info("Passing sample_weight with key(s): %s", list(fit_kwargs.keys()))

logger.info("Fitting champion pipeline…")
pipeline.fit(X_train, y_train, **fit_kwargs)
logger.info("Fit completed.")

# --------------------------
# B) Validation evaluation
# --------------------------
y_pred_valid = pipeline.predict(X_valid)
# guard tiny negatives
y_pred_valid = np.maximum(y_pred_valid, 0.0)

mae  = float(mean_absolute_error(y_valid, y_pred_valid))
rmse = float(mean_squared_error(y_valid, y_pred_valid))
r2   = float(r2_score(y_valid, y_pred_valid))

eval_report = {
    "split": "valid",
    "metrics": {"MAE": mae, "RMSE": rmse, "R2": r2},
    "n_valid": int(len(y_valid)),
    "target": VALUATION_K,
}
print("Validation metrics:", eval_report["metrics"])
(ART_DIR / "eval_valid.json").write_text(canonical_json_dumps(eval_report), encoding="utf-8")
logger.info("Saved validation report → %s", (ART_DIR / "eval_valid.json").as_posix())

# --------------------------
# C) Quick diagnostic plots
# --------------------------
FIG_DIR.mkdir(parents=True, exist_ok=True)

preds_valid = pd.DataFrame({
    "y_true": y_valid,
    "y_pred": y_pred_valid,
}, index=getattr(X_valid, "index", None))
preds_valid.to_csv(ART_DIR / "predictions_valid.csv", index=True)
logger.info("Saved validation predictions → %s", (ART_DIR / "predictions_valid.csv").as_posix())

# Scatter: y_true vs y_pred
plt.figure(figsize=(6, 6))
lim_lo = float(np.nanmin([preds_valid["y_true"].min(), preds_valid["y_pred"].min(), 0]))
lim_hi = float(np.nanmax([preds_valid["y_true"].max(), preds_valid["y_pred"].max()]))
plt.scatter(preds_valid["y_true"], preds_valid["y_pred"], s=8, alpha=0.5)
plt.plot([lim_lo, lim_hi], [lim_lo, lim_hi], lw=1)  # y=x
plt.xlabel("True valuation_k")
plt.ylabel("Predicted valuation_k")
plt.title("Validation — y_true vs y_pred")
plt.tight_layout()
plt.savefig(FIG_DIR / "valid_scatter_true_vs_pred.png", dpi=150)
plt.close()
print("Saved:", FIG_DIR / "valid_scatter_true_vs_pred.png")

# Residuals histogram
resid = preds_valid["y_pred"] - preds_valid["y_true"]
plt.figure(figsize=(7, 4))
plt.hist(resid, bins=40)
plt.xlabel("Residual (pred - true)")
plt.ylabel("Count")
plt.title("Validation residuals")
plt.tight_layout()
plt.savefig(FIG_DIR / "valid_residuals_hist.png", dpi=150)
plt.close()
print("Saved:", FIG_DIR / "valid_residuals_hist.png")

# --------------------------
# D) Persist model artifact
# --------------------------
MODEL_DIR.mkdir(parents=True, exist_ok=True)
model_path = MODEL_DIR / "champion_model.joblib"

feature_spec_safe = globals().get("feature_spec", {}) or {}
chosen_features_safe = globals().get("chosen_features", []) or []

joblib.dump(
    {
        "model": pipeline,
        "features": {
            "categorical": feature_spec_safe.get("categorical", []),
            "numeric": feature_spec_safe.get("numeric", []),
            "all": chosen_features_safe,
        },
        "seed": int(SEED) if "SEED" in globals() else None,
        "metrics_valid": eval_report["metrics"],
    },
    model_path,
)
logger.info("Saved champion model → %s", model_path.as_posix())

# Expose artifacts to later cells
champion = pipeline
valid_report = eval_report

[2025-10-07 17:05:55,496] INFO model_trainer: Passing sample_weight with key(s): ['model__sample_weight']
[2025-10-07 17:05:55,497] INFO model_trainer: Fitting champion pipeline…
[2025-10-07 17:05:59,245] INFO model_trainer: Fit completed.
[2025-10-07 17:05:59,331] INFO model_trainer: Saved validation report → c:/Users/anven/OneDrive/Documenti/GitHub/axiomatic_oracle/notebooks/outputs/modeling/artifacts/eval_valid.json
[2025-10-07 17:05:59,339] INFO model_trainer: Saved validation predictions → c:/Users/anven/OneDrive/Documenti/GitHub/axiomatic_oracle/notebooks/outputs/modeling/artifacts/predictions_valid.csv


Validation metrics: {'MAE': 55.02990196764213, 'RMSE': 5837.736915453582, 'R2': 0.9250257802906402}
Saved: c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\figures\valid_scatter_true_vs_pred.png
Saved: c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\figures\valid_residuals_hist.png


[2025-10-07 17:06:00,922] INFO model_trainer: Saved champion model → c:/Users/anven/OneDrive/Documenti/GitHub/axiomatic_oracle/notebooks/outputs/modeling/champion_model.joblib


### Feature Importance (on champion)

In [8]:
# 07) Feature importances (champion) — robust, single-pipeline
from __future__ import annotations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.compose import TransformedTargetRegressor
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline

from shared.common.constants import VALUATION_K

TOPN = 20  # top bars in charts

# ----------------------------
# Helpers
# ----------------------------
def _extract_parts(model_like):
    """
    Return (inner_pipe, preproc, final_est, final_step_name, is_ttr)
    - inner_pipe: sklearn Pipeline with .named_steps
    - preproc   : ColumnTransformer in step 'prep' (if present)
    - final_est : last estimator (e.g. RandomForestRegressor)
    - final_step_name: name of the last step (e.g. 'model'/'rf')
    - is_ttr    : True if model_like is a TransformedTargetRegressor
    """
    is_ttr = isinstance(model_like, TransformedTargetRegressor)
    inner = (
        model_like.regressor_ if is_ttr and hasattr(model_like, "regressor_")
        else (model_like.regressor if is_ttr and hasattr(model_like, "regressor") else model_like)
    )
    if not isinstance(inner, Pipeline):
        raise RuntimeError("Expected a Pipeline with a 'prep' step and a final estimator.")

    preproc = inner.named_steps.get("prep", None)

    # pick the last step as final estimator (prefer common names if present)
    final_step_name = None
    for cand in ("model", "rf", "regressor"):
        if cand in inner.named_steps:
            final_step_name = cand
            break
    if final_step_name is None:
        final_step_name = list(inner.named_steps.keys())[-1]
    final_est = inner.named_steps[final_step_name]
    return inner, preproc, final_est, final_step_name, is_ttr


def _feature_names_from_ct(preproc, fallback_cat: list[str], fallback_num: list[str]) -> list[str]:
    """Extract feature names in ColumnTransformer order (cat then num)."""
    if preproc is None or not hasattr(preproc, "transformers"):
        # fallback to provided lists
        seen, ordered = set(), []
        for c in list(fallback_cat) + list(fallback_num):
            if c not in seen:
                seen.add(c); ordered.append(c)
        return ordered

    cat_cols_ct, num_cols_ct = [], []
    for name, est, cols in preproc.transformers:
        if name == "cat":
            cat_cols_ct = list(cols) if isinstance(cols, (list, tuple, np.ndarray, pd.Index)) else list(fallback_cat)
        elif name == "num":
            num_cols_ct = list(cols) if isinstance(cols, (list, tuple, np.ndarray, pd.Index)) else list(fallback_num)

    seen, ordered = set(), []
    for c in list(cat_cols_ct) + list(num_cols_ct):
        if c not in seen:
            seen.add(c); ordered.append(c)
    return ordered


def _ensure_cols(df_part: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Add missing columns as NaN; return with exact column order."""
    dfp = df_part.copy()
    missing = [c for c in cols if c not in dfp.columns]
    for c in missing:
        dfp[c] = np.nan
    return dfp[cols]


def _to_parquet_optional(df: pd.DataFrame, path: Path):
    """Try writing Parquet; skip quietly if engine is missing."""
    try:
        df.to_parquet(path, index=False)
    except Exception as e:
        logger.info("Parquet export skipped for %s: %s", path, e)


# ----------------------------
# Choose champion (single pipeline)
# ----------------------------
if "champion" in globals() and champion is not None:
    chosen = champion
elif "pipeline" in globals() and pipeline is not None:
    chosen = pipeline
else:
    raise RuntimeError("No trained pipeline found. Expected variable 'champion' or 'pipeline'.")

inner_pipe, preproc, final_est, final_step, is_ttr = _extract_parts(chosen)

# Feature lists from our spec created in the previous cell
if "feature_spec" not in globals():
    raise RuntimeError("feature_spec is missing (expected from the previous cell).")

cat_fallback = list(feature_spec.get("categorical", []))
num_fallback = list(feature_spec.get("numeric", []))
feat_in_use  = _feature_names_from_ct(preproc, cat_fallback, num_fallback)

# Build aligned X_test
if "df_test" not in globals():
    raise RuntimeError("df_test is missing.")
X_tst_use = _ensure_cols(df_test, feat_in_use)

# ----------------------------
# Built-in importance (if available)
# ----------------------------
builtin_imp = None
fi_raw = getattr(final_est, "feature_importances_", None)
if fi_raw is not None:
    try:
        imp = np.asarray(fi_raw)
        if imp.ndim == 1 and imp.size > 0:
            feat_names = list(feat_in_use) if len(feat_in_use) == imp.shape[0] else [f"f{i}" for i in range(int(imp.shape[0]))]
            builtin_imp = (
                pd.DataFrame({"feature": feat_names, "importance": imp.astype(float, copy=False)})
                .sort_values("importance", ascending=False)
                .reset_index(drop=True)
            )
            builtin_imp.to_csv(MODEL_DIR / "feature_importance_builtin.csv", index=False)
            _to_parquet_optional(builtin_imp, MODEL_DIR / "feature_importance_builtin.parquet")

            plt.figure(figsize=(10, 6))
            _top = min(TOPN, len(builtin_imp))
            ax = builtin_imp.head(_top).plot(kind="bar", x="feature", y="importance", legend=False, rot=45)
            ax.set_title(f"Built-in Feature Importance (top {_top})")
            ax.set_ylabel("Importance")
            plt.tight_layout()
            plt.savefig(FIG_DIR / "feature_importance_builtin.png", dpi=150)
            plt.close()
            print("Saved:", FIG_DIR / "feature_importance_builtin.png")
        else:
            print("⚠️ feature_importances_ present but empty → skipping built-in.")
    except Exception as e:
        print(f"⚠️ Unable to compute built-in importance: {e}")
else:
    print("ℹ️ feature_importances_ not available on the final estimator → skipping built-in.")

# ----------------------------
# Permutation importance
# ----------------------------
# y scale:
#  - if using TTR, predictions are on natural scale → pass natural y
#  - if the pipeline was trained on log1p(y) directly, pass log1p(y)
if is_ttr:
    y_perm = df_test[VALUATION_K].to_numpy(dtype="float64", copy=False)
else:
    y_perm = np.log1p(df_test[VALUATION_K].to_numpy(dtype="float64", copy=False))

perm = permutation_importance(
    estimator=chosen,           # TTR or Pipeline
    X=X_tst_use,
    y=y_perm,
    n_repeats=8,
    random_state=SEED if "SEED" in globals() else 42,
    n_jobs=-1,
    scoring="r2",
)

feat_names_pi = list(feat_in_use) if len(feat_in_use) == perm.importances_mean.shape[0] \
                else [f"f{i}" for i in range(perm.importances_mean.shape[0])]

perm_imp = (
    pd.DataFrame({
        "feature": feat_names_pi,
        "importance": perm.importances_mean.astype(float, copy=False),
        "std": perm.importances_std.astype(float, copy=False),
    })
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

perm_imp.to_csv(MODEL_DIR / "feature_importance_permutation.csv", index=False)
_to_parquet_optional(perm_imp, MODEL_DIR / "feature_importance_permutation.parquet")

plt.figure(figsize=(10, 6))
_top = min(TOPN, len(perm_imp))
ax = perm_imp.head(_top).plot(kind="bar", x="feature", y="importance", yerr="std", legend=False, rot=45)
ax.set_title(f"Permutation Importance (top {_top})")
ax.set_ylabel("Importance (mean ΔR²)")
plt.tight_layout()
plt.savefig(FIG_DIR / "feature_importance_permutation.png", dpi=150)
plt.close()
print("Saved:", FIG_DIR / "feature_importance_permutation.png")

# Quick preview
if builtin_imp is not None:
    display(builtin_imp.head(12))
display(perm_imp.head(12))

Saved: c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\figures\feature_importance_builtin.png
Saved: c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\figures\feature_importance_permutation.png


Unnamed: 0,feature,importance
0,f33,0.512114
1,f38,0.126428
2,f3,0.065747
3,f63,0.047259
4,f1,0.036277
5,f32,0.032433
6,f4,0.03034
7,f22,0.022865
8,f0,0.022375
9,f45,0.018183


Unnamed: 0,feature,importance,std
0,size_m2,0.911539,0.023997
1,distance_to_center_km,0.199011,0.006644
2,zone,0.077972,0.002329
3,region,0.077057,0.003963
4,city,0.064862,0.004178
5,energy_class,0.064219,0.002874
6,location,0.062918,0.004143
7,region_index_prior,0.050719,0.002218
8,age_years,0.022628,0.001173
9,year_built,0.021321,0.001114


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

### Segment Valuations & Predictions Save

In [9]:
# 08) Segment valuations & save predictions — robust, single-pipeline
from __future__ import annotations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

from shared.common.constants import VALUATION_K, ASSET_ID, LOCATION

FIG_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# ---------------- helpers ----------------
def _expm1_safe(z, cap: float = 12.0):
    z = np.asarray(z, dtype=np.float64)
    z = np.clip(z, -20.0, cap)  # cap to avoid overflow on extreme log-preds
    out = np.expm1(z)
    out[out < 0] = 0.0
    return out

def _to_parquet_optional(df: pd.DataFrame, path: Path):
    """Try Parquet; skip quietly if engine is missing."""
    try:
        df.to_parquet(path, index=False)
    except Exception as e:
        logger.info("Parquet export skipped for %s: %s", path, e)

def _ensure_cols(df_part: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    dfp = df_part.copy()
    miss = [c for c in cols if c not in dfp.columns]
    if miss:
        for c in miss:
            dfp[c] = np.nan
        logger.info("Added %d missing columns to TEST (imputed as NaN): %s", len(miss), miss[:10])
    return dfp[cols]

def _extract_parts(model_like):
    """
    Return (inner_pipe, preproc, final_est, final_step_name, is_ttr)
    - inner_pipe: sklearn Pipeline with .named_steps
    - preproc   : ColumnTransformer in step 'prep' (if present)
    - final_est : last estimator (e.g. RandomForestRegressor)
    - final_step_name: name of last step
    - is_ttr    : True if model_like is a TransformedTargetRegressor
    """
    is_ttr = isinstance(model_like, TransformedTargetRegressor)
    inner = (
        model_like.regressor_ if is_ttr and hasattr(model_like, "regressor_")
        else (model_like.regressor if is_ttr and hasattr(model_like, "regressor") else model_like)
    )
    if not isinstance(inner, Pipeline):
        raise RuntimeError("Expected a Pipeline with a 'prep' step and a final estimator.")
    preproc = inner.named_steps.get("prep", None)
    # choose last step as final estimator unless a common alias is present
    final_step_name = None
    for cand in ("model", "rf", "regressor"):
        if cand in inner.named_steps:
            final_step_name = cand
            break
    if final_step_name is None:
        final_step_name = list(inner.named_steps.keys())[-1]
    final_est = inner.named_steps[final_step_name]
    return inner, preproc, final_est, final_step_name, is_ttr

def _feature_names_from_ct(preproc, fallback_cat: list[str], fallback_num: list[str]) -> list[str]:
    """Extract feature names in ColumnTransformer order (cat then num)."""
    if preproc is None or not hasattr(preproc, "transformers"):
        seen, ordered = set(), []
        for c in list(fallback_cat) + list(fallback_num):
            if c not in seen:
                seen.add(c); ordered.append(c)
        return ordered

    cat_cols_ct, num_cols_ct = [], []
    for name, est, cols in preproc.transformers:
        if name == "cat":
            cat_cols_ct = list(cols) if isinstance(cols, (list, tuple, np.ndarray, pd.Index)) else list(fallback_cat)
        elif name == "num":
            num_cols_ct = list(cols) if isinstance(cols, (list, tuple, np.ndarray, pd.Index)) else list(fallback_num)

    seen, ordered = set(), []
    for c in list(cat_cols_ct) + list(num_cols_ct):
        if c not in seen:
            seen.add(c); ordered.append(c)
    return ordered

def _predict_nat_from_champion(champ, X_df: pd.DataFrame, cols: list[str]) -> np.ndarray:
    """
    Predict on TEST in natural scale (k€), handling:
      - TransformedTargetRegressor (natural output)
      - Plain pipeline trained on log1p(y) (expm1 with safety)
    """
    LOG_CAP = float(TRAIN_CFG.get("log_cap_clip", 12.0)) if "TRAIN_CFG" in globals() else 12.0
    X_use = _ensure_cols(X_df, cols)
    if isinstance(champ, TransformedTargetRegressor):
        return np.asarray(champ.predict(X_use), dtype=np.float64)
    # pipeline trained on log1p target
    log_pred = champ.predict(X_use)
    return _expm1_safe(log_pred, cap=LOG_CAP)

def _rmse(y_true, y_pred):
    try:
        return mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def _group_apply_safe(df: pd.DataFrame, key: str, func):
    gb = df.groupby(key, observed=True)
    try:
        out = gb.apply(func, include_groups=False)  # pandas >= 2.2
    except TypeError:
        out = gb[["y_true", "y_pred"]].apply(func)  # pandas < 2.2
    return out.reset_index()

# ---------------- choose champion & feature order ----------------
if "champion" not in globals() or champion is None:
    raise RuntimeError("Missing trained 'champion' pipeline from the training cell.")

if "feature_spec" not in globals():
    raise RuntimeError("feature_spec is missing (expected from the Feature Preparation cell).")

inner_pipe, preproc, final_est, final_step, is_ttr = _extract_parts(champion)
cat_fallback = list(feature_spec.get("categorical", []))
num_fallback = list(feature_spec.get("numeric", []))
feat_in_use  = _feature_names_from_ct(preproc, cat_fallback, num_fallback)

# ---------------- 1) TEST predictions & residual plots ----------------
if "df_test" not in globals():
    raise RuntimeError("df_test is missing — cannot compute residuals.")

y_true_t = df_test[VALUATION_K].to_numpy(dtype="float64", copy=False)
X_tst_use = _ensure_cols(df_test, feat_in_use)
y_pred_t = _predict_nat_from_champion(champion, X_tst_use, feat_in_use)

# sanitize
mask = np.isfinite(y_true_t) & np.isfinite(y_pred_t)
valid_n = int(mask.sum())
if valid_n < max(30, int(0.3 * len(y_true_t))):
    bad_idx = np.where(~mask)[0][:10].tolist()
    raise RuntimeError(
        f"Non-finite TEST predictions/targets: valid {valid_n}/{len(mask)}. "
        f"Example bad idx: {bad_idx}. Check feature alignment and prediction scale."
    )

y_true_t = y_true_t[mask]
y_pred_t = y_pred_t[mask]
residuals = y_true_t - y_pred_t

# residual hist
plt.figure(figsize=(8, 5))
plt.hist(residuals, bins=60, density=True)
plt.title("Residuals (TEST)")
plt.xlabel("y − ŷ (k€)")
plt.ylabel("Density")
plt.tight_layout()
out_res = FIG_DIR / "residuals_test_hist.png"
plt.savefig(out_res, dpi=150, bbox_inches="tight")
plt.close()
print("Saved:", out_res)

# residual vs pred
plt.figure(figsize=(8, 5))
plt.scatter(y_pred_t, residuals, s=10, alpha=0.6)
plt.axhline(0.0, linestyle="--")
plt.title("Residuals vs Predictions (TEST)")
plt.xlabel("ŷ (k€)")
plt.ylabel("y − ŷ (k€)")
plt.tight_layout()
out_sc = FIG_DIR / "residuals_vs_pred_test.png"
plt.savefig(out_sc, dpi=150, bbox_inches="tight")
plt.close()
print("Saved:", out_sc)

# ---------------- 2) Segmented metrics & predictions export ----------------
# Build clean metric dataframe on valid rows
idx_valid = df_test.index[mask]
cols_keep = []
if ASSET_ID in df_test.columns: cols_keep.append(ASSET_ID)
if LOCATION in df_test.columns: cols_keep.append(LOCATION)

dfm = df_test.loc[idx_valid, cols_keep].copy()
dfm["y_true"] = y_true_t
dfm["y_pred"] = y_pred_t

# deciles on natural target
try:
    dfm["decile"] = pd.qcut(dfm["y_true"], q=10, labels=False, duplicates="drop")
except Exception:
    dfm["decile"] = 0

def _agg_metrics(g: pd.DataFrame) -> pd.Series:
    return pd.Series({
        "n": int(len(g)),
        "MAE": float(mean_absolute_error(g["y_true"], g["y_pred"])),
        "RMSE": float(_rmse(g["y_true"], g["y_pred"])),
        "R2": float(r2_score(g["y_true"], g["y_pred"])) if len(g) > 1 else np.nan,
    })

# by decile
dec_rep = _group_apply_safe(dfm, "decile", _agg_metrics)
dec_rep.to_csv(MODEL_DIR / "metrics_by_decile.csv", index=False)
_to_parquet_optional(dec_rep, MODEL_DIR / "metrics_by_decile.parquet")
print("Saved:", MODEL_DIR / "metrics_by_decile.csv")

# by location (if present)
if LOCATION in dfm.columns:
    loc_rep = _group_apply_safe(dfm, LOCATION, _agg_metrics)
else:
    loc_rep = pd.DataFrame([{
        LOCATION: "NA",
        "n": int(len(dfm)),
        "MAE": float(mean_absolute_error(dfm["y_true"], dfm["y_pred"])) if len(dfm) else np.nan,
        "RMSE": float(_rmse(dfm["y_true"], dfm["y_pred"])) if len(dfm) else np.nan,
        "R2": float(r2_score(dfm["y_true"], dfm["y_pred"])) if len(dfm) > 1 else np.nan,
    }])

loc_rep.to_csv(MODEL_DIR / "metrics_by_location.csv", index=False)
_to_parquet_optional(loc_rep, MODEL_DIR / "metrics_by_location.parquet")
print("Saved:", MODEL_DIR / "metrics_by_location.csv")

# predictions export
pred_cols = []
if ASSET_ID in dfm.columns: pred_cols.append(ASSET_ID)
if LOCATION in dfm.columns: pred_cols.append(LOCATION)
pred_df = dfm[pred_cols + ["y_true", "y_pred"]].rename(columns={"y_true": VALUATION_K})

pred_df.to_csv(MODEL_DIR / "predictions_test.csv", index=False, encoding="utf-8")
_to_parquet_optional(pred_df, MODEL_DIR / "predictions_test.parquet")
print("Saved:", MODEL_DIR / "predictions_test.csv")

display(pred_df.head(10))

[2025-10-07 17:06:28,280] INFO model_trainer: Added 1 missing columns to TEST (imputed as NaN): ['severity_score']


Saved: c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\figures\residuals_test_hist.png
Saved: c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\figures\residuals_vs_pred_test.png
Saved: c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\metrics_by_decile.csv
Saved: c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\metrics_by_location.csv
Saved: c:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\predictions_test.csv


Unnamed: 0,asset_id,location,valuation_k,y_pred
0,asset_000000,Venice,237.570007,208.332203
2,asset_000002,Bari,374.720001,318.62558
3,asset_000003,Genoa,572.309998,677.990431
13,asset_000013,Verona,176.839996,217.349668
22,asset_000022,Cagliari,118.82,152.746103
35,asset_000035,Milan,794.280029,816.461326
51,asset_000051,Milan,224.869995,186.147949
58,asset_000058,Milan,1378.780029,1188.465781
79,asset_000079,Venice,467.339996,398.150185
88,asset_000088,Milan,1228.660034,1205.232037


### Model Persistence & Manifest Training

In [10]:
# 09) Model persistence & training manifest — single champion, no shim
from __future__ import annotations

import os, json, hashlib
from datetime import datetime, timezone
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.utils.validation import check_is_fitted

# NOTE: path standardizzato (no "notebooks.shared")
from shared.n03_train_model import metrics
from shared.common.constants import VALUATION_K, ASSET_ID, LOCATION
from shared.common.utils import canonical_json_dumps
from shared.common.serving_transformers import GeoCanonizer, PriorsGuard, EnsureDerivedFeatures
from shared.n03_train_model.preprocessing import list_required_serving_derivatives

# ---------------------------------------------------------------------------
# Thread limits (stability on some Windows/Python builds)
# ---------------------------------------------------------------------------
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")

# ---------------------------------------------------------------------------
# Pre-flight checks & helpers
# ---------------------------------------------------------------------------
if "champion" not in globals() or champion is None:
    raise RuntimeError("Missing trained 'champion' pipeline. Run the training cell first.")

def _extract_parts(model_like):
    """Return (inner_pipe, preproc, final_est, final_step_name, is_ttr)."""
    from sklearn.compose import TransformedTargetRegressor
    is_ttr = isinstance(model_like, TransformedTargetRegressor)
    inner = (
        model_like.regressor_ if is_ttr and hasattr(model_like, "regressor_")
        else (model_like.regressor if is_ttr and hasattr(model_like, "regressor") else model_like)
    )
    if not isinstance(inner, Pipeline):
        raise RuntimeError("Expected champion to be a Pipeline (possibly wrapped by TTR).")
    preproc = inner.named_steps.get("prep", None)
    # pick the last step unless a common alias exists
    final_step_name = None
    for cand in ("model", "rf", "regressor"):
        if cand in inner.named_steps:
            final_step_name = cand
            break
    if final_step_name is None:
        final_step_name = list(inner.named_steps.keys())[-1]
    final_est = inner.named_steps[final_step_name]
    return inner, preproc, final_est, final_step_name, is_ttr

def _feature_names_from_ct(preproc, fallback_cat: list[str], fallback_num: list[str]) -> list[str]:
    """ColumnTransformer order (cat then num)."""
    if preproc is None or not hasattr(preproc, "transformers"):
        seen, ordered = set(), []
        for c in list(fallback_cat) + list(fallback_num):
            if c not in seen:
                seen.add(c); ordered.append(c)
        return ordered

    cat_cols_ct, num_cols_ct = [], []
    for name, est, cols in preproc.transformers:
        if name == "cat":
            cat_cols_ct = list(cols) if isinstance(cols, (list, tuple, np.ndarray, pd.Index)) else list(fallback_cat)
        elif name == "num":
            num_cols_ct = list(cols) if isinstance(cols, (list, tuple, np.ndarray, pd.Index)) else list(fallback_num)

    seen, ordered = set(), []
    for c in list(cat_cols_ct) + list(num_cols_ct):
        if c not in seen:
            seen.add(c); ordered.append(c)
    return ordered

def _sha256_file(p: Path, chunk: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with p.open("rb") as f:
        for ch in iter(lambda: f.read(chunk), b""):
            h.update(ch)
    return h.hexdigest()

def _to_parquet_optional(df: pd.DataFrame, path: Path):
    try:
        df.to_parquet(path, index=False)
    except Exception as e:
        logger.info("Parquet export skipped for %s: %s", path, e)

# ---------------------------------------------------------------------------
# Resolve champion parts & expected feature order
# ---------------------------------------------------------------------------
inner_pipe, preproc, final_est, final_step_name, is_ttr = _extract_parts(champion)

# Prefer explicit feature_spec from the “Feature Preparation” cell
if "feature_spec" not in globals():
    raise RuntimeError("feature_spec is missing (expected from the Feature Preparation cell).")

cat_cols = list(feature_spec.get("categorical", []))
num_cols = list(feature_spec.get("numeric", []))
feature_order = _feature_names_from_ct(preproc, cat_cols, num_cols)

# sanity: check the champion is fitted
try:
    check_is_fitted(inner_pipe)
except Exception as e:
    raise RuntimeError(f"Champion pipeline does not look fitted: {e}")

# ---------------------------------------------------------------------------
# Build serving pipeline (no refit) — Geo → Priors → Derived → core
# ---------------------------------------------------------------------------
# Pull priors from CONFIG (same source used in generation & FE)
GEN_CFG = CONFIG.get("generation", {}) if isinstance(CONFIG, dict) else {}
_city_base_raw = GEN_CFG.get("city_base_prices", {}) or {}
region_index_defaults = {"north": 1.05, "center": 1.00, "south": 0.92}
_region_index = GEN_CFG.get("region_index", {}) or region_index_defaults

CITY_BASE = {
    str(c).strip().lower(): {str(z).strip().lower(): float(v) for z, v in d.items()}
    for c, d in _city_base_raw.items()
}
_ZONE_KEYS = set(z for d in CITY_BASE.values() for z in d.keys())
_ZONE_MED = {z: float(np.nanmedian([d.get(z, np.nan) for d in CITY_BASE.values()])) for z in _ZONE_KEYS} if CITY_BASE else {}
_GLOBAL_CITYZONE_MED = float(np.nanmedian([v for d in CITY_BASE.values() for v in d.values()])) if CITY_BASE else 0.0

# quali derivate servono in serving (train==serve)
REQUIRED_DERIVED = globals().get("REQUIRED_DERIVED") or list_required_serving_derivatives()

serving_pipe = Pipeline(steps=[
    ("canon_geo",   GeoCanonizer()),
    ("priors_guard",PriorsGuard(
        city_base=CITY_BASE,
        region_index=_region_index,
        zone_medians=_ZONE_MED,
        global_cityzone_median=_GLOBAL_CITYZONE_MED,
    )),
    ("derive",      EnsureDerivedFeatures(
        city_base=CITY_BASE,
        region_index=_region_index,
        required_cols=list(REQUIRED_DERIVED),
    )),
    ("core",        champion),  # <- trained model/pipeline (TTR log1p/expm1 → output in k€)
])

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
BASE_OUT = NB_ROOT / "outputs" if "NB_ROOT" in globals() else Path("outputs")
MODEL_DIR = BASE_OUT / "modeling"
PROP_DIR  = MODEL_DIR / "property"
FIG_DIR   = MODEL_DIR / "figures"
ART_DIR   = MODEL_DIR / "artifacts"
for d in (MODEL_DIR, PROP_DIR, FIG_DIR, ART_DIR):
    d.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# Save artifacts: feature_order, pipeline, meta, manifest
# ---------------------------------------------------------------------------
FEATURES_FILE = PROP_DIR / "feature_order.json"
FEATURES_FILE.write_text(json.dumps(feature_order, ensure_ascii=False, separators=(",", ":")), encoding="utf-8")
feature_order_sha256 = hashlib.sha256(FEATURES_FILE.read_bytes()).hexdigest()

pipe_path = PROP_DIR / "value_regressor_v2.joblib"
joblib.dump(serving_pipe, pipe_path)
_loaded = joblib.load(pipe_path)
try:
    # check that the reloaded serving pipeline has a fitted core
    _inner_loaded, *_ = _extract_parts(_loaded.named_steps["core"])
    check_is_fitted(_inner_loaded)
except Exception as e:
    raise RuntimeError(f"Saved pipeline failed fitted check: {e}")
pipeline_sha = _sha256_file(pipe_path)

meta_path = PROP_DIR / "value_regressor_v2_meta.json"
model_meta = {
    "asset_type": "property",
    "task": "value_regressor",
    "model_version": "v2",
    "model_class": type(final_est).__name__,
    "trained_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "schema_version": "2.0",
    "pipeline_sha256": pipeline_sha,
    "feature_order_sha256": feature_order_sha256,
    "n_features": int(len(feature_order)),
    "features_categorical": [c for c in feature_order if c in set(cat_cols)],
    "features_numeric": [c for c in feature_order if c in set(num_cols)],
    "target_name": VALUATION_K,
    "unit": "k_eur",
    # MUST: esplicita TTR (train in log1p, serve in expm1 → output in k€)
    "ttr": {"forward": "log1p", "inverse": "expm1"},
    # MUST: il pipeline include i transformers di serving
    "includes_preprocessing": True,
    # MUST: derivate che il serving garantirà
    "required_derived_features": list(REQUIRED_DERIVED),
    "feature_order_path": str(FEATURES_FILE.resolve().as_posix()),
    "pipeline_path": str(pipe_path.resolve().as_posix()),
    # (utile per debugging/telemetria)
    "serving_stages": ["GeoCanonizer", "PriorsGuard", "EnsureDerivedFeatures", "core_model"],
}
meta_path.write_text(json.dumps(model_meta, ensure_ascii=False, separators=(",", ":")), encoding="utf-8")

manifest_path = PROP_DIR / "training_manifest.json"
paths = {
    "pipeline": str(pipe_path.resolve().as_posix()),
    "manifest": str(manifest_path.resolve().as_posix()),
    "feature_order": str(FEATURES_FILE.resolve().as_posix()),
}
# include metrics if a dict named `metrics` was produced in the Train & Validation cell
metrics_blob = (metrics if "metrics" in globals() and isinstance(metrics, dict) else {})

manifest = {
    "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "schema_version": "2.0",
    "asset_type": "property",
    "task": "value_regressor",
    "paths": paths,
    "model_meta": {
        "model_version": model_meta["model_version"],
        "model_class": model_meta["model_class"],
        "pipeline_sha256": model_meta["pipeline_sha256"],
        "feature_order_sha256": feature_order_sha256,
        # duplicate criticals for BE
        "target_name": model_meta["target_name"],
        "unit": model_meta["unit"],
        "includes_preprocessing": model_meta["includes_preprocessing"],
        "ttr": model_meta["ttr"],
        "required_derived_features": model_meta["required_derived_features"],
    },
    "metrics": metrics_blob,
    "feature_order": feature_order,
    "expected_features": {
        "categorical": model_meta["features_categorical"],
        "numeric": model_meta["features_numeric"],
    },
}
manifest_path.write_text(json.dumps(manifest, ensure_ascii=False, separators=(",", ":")), encoding="utf-8")

print("✅ Saved serving pipeline (no-refit):", pipe_path.name)
print("✅ Saved feature_order:", FEATURES_FILE.name)
print("✅ Saved meta:", meta_path.name)
print("✅ Saved manifest:", manifest_path.name)

# ---------------------------------------------------------------------------
# Worst-k slice (10%) on final TEST dataframe `dfm` with y_true/y_pred
# ---------------------------------------------------------------------------
def _worst_k(y_true: np.ndarray, y_pred: np.ndarray, k: float = 0.10) -> dict:
    err = np.abs(y_true - y_pred).astype(float)
    n = max(1, int(len(err) * k))
    top = np.partition(err, -n)[-n:]
    return {
        "worst_k": float(k),
        "worst_k_mean_abs_err": float(top.mean()) if len(top) else float("nan"),
        "worst_k_max_abs_err": float(top.max()) if len(top) else float("nan"),
        "worst_k_count": int(n),
    }

# Prefer using dfm from the previous “Segment metrics & predictions” cell
if "dfm" in globals() and isinstance(dfm, pd.DataFrame) and {"y_true","y_pred"} <= set(dfm.columns):
    y_true_np = dfm["y_true"].to_numpy()
    y_pred_np = dfm["y_pred"].to_numpy()
else:
    # Minimal fallback: recompute natural predictions on TEST (aligned with champion)
    if "df_test" not in globals():
        raise RuntimeError("dfm is missing and df_test not available to recompute worst-k.")
    from sklearn.compose import TransformedTargetRegressor
    def _ensure_cols(df_part: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
        dfp = df_part.copy()
        miss = [c for c in cols if c not in dfp.columns]
        for c in miss: dfp[c] = np.nan
        return dfp[cols]
    X_use = _ensure_cols(df_test, feature_order)
    if is_ttr:
        y_pred_np = np.asarray(champion.predict(X_use), dtype=np.float64)
    else:
        LOG_CAP = float(TRAIN_CFG.get("log_cap_clip", 12.0)) if "TRAIN_CFG" in globals() else 12.0
        z = np.asarray(champion.predict(X_use), dtype=np.float64)
        z = np.clip(z, -20.0, LOG_CAP)
        y_pred_np = np.expm1(z); y_pred_np[y_pred_np < 0] = 0.0
    y_true_np = df_test[VALUATION_K].to_numpy(dtype=np.float64, copy=False)

wk = _worst_k(y_true_np, y_pred_np, k=0.10)
wk_path = MODEL_DIR / "worst_k.json"
wk_path.write_text(canonical_json_dumps(wk), encoding="utf-8")

# update manifest with worst-k
m = json.loads(manifest_path.read_text(encoding="utf-8"))
m.setdefault("metrics", {})["worst_k_10pct"] = wk
manifest_path.write_text(canonical_json_dumps(m), encoding="utf-8")
print("✅ Saved worst-k:", wk_path.name)

# ---------------------------------------------------------------------------
# Post-training location drift (baseline vs expected OR train vs test)
# ---------------------------------------------------------------------------
try:
    from shared.n03_train_model.metrics import compute_location_drift
    _HAS_CLD = True
except Exception:
    _HAS_CLD = False

TRAIN_CFG = globals().get("TRAIN_CFG", {}) or {}
TOL = float(TRAIN_CFG.get("drift_tolerance", 0.05))  # default 5%

def _norm_weights(d: dict) -> dict[str, float]:
    clean = {str(k): float(v) for k, v in (d or {}).items() if pd.api.types.is_number(v) and float(v) >= 0.0}
    s = float(sum(clean.values()))
    if s <= 0:
        return {k: 0.0 for k in clean}
    return {k: v / s for k, v in clean.items()}

def _empirical_weights(df_like: pd.DataFrame, col: str) -> dict[str, float]:
    if not isinstance(df_like, pd.DataFrame) or col not in df_like.columns:
        return {}
    vc = df_like[col].dropna().astype(str).value_counts(normalize=True)
    return {k: float(v) for k, v in vc.items()}

def _fallback_drift(df_like: pd.DataFrame, target_w: dict[str, float]) -> dict:
    emp = _empirical_weights(df_like, LOCATION)
    keys = sorted(set(emp) | set(target_w))
    p = np.array([emp.get(k, 0.0) for k in keys], dtype=float)
    q = np.array([target_w.get(k, 0.0) for k in keys], dtype=float)
    eps = 1e-12
    p = np.clip(p, eps, 1.0); q = np.clip(q, eps, 1.0)
    p /= p.sum(); q /= q.sum()
    m_mid = 0.5 * (p + q)
    jsd = float(0.5 * (np.sum(p * (np.log(p) - np.log(m_mid))) + np.sum(q * (np.log(q) - np.log(m_mid)))))
    tvd = float(0.5 * np.abs(p - q).sum())
    report = {"method": "fallback_jsd_tvd", "JSD": jsd, "TVD": tvd, "per_location": {}}
    for k in keys:
        emp_k = emp.get(k, 0.0); tgt_k = target_w.get(k, 0.0)
        diff = emp_k - tgt_k
        report["per_location"][k] = {
            "target_weight": tgt_k,
            "empirical_weight": emp_k,
            "difference": diff,
            "drifted": bool(abs(diff) > TOL),
            "ratio": (emp_k / tgt_k) if tgt_k > 0 else float("inf")
        }
    return report

baseline_cfg = (TRAIN_CFG.get("expected_profile", {}) or {}).get("location_distribution", {}) or None

if baseline_cfg and "df" in globals():
    # Compare entire dataset vs expected baseline
    target_w = _norm_weights(baseline_cfg)
    drift_result = compute_location_drift(df, target_w, TOL) if _HAS_CLD else _fallback_drift(df, target_w)
    out_path = MODEL_DIR / "location_drift_vs_expected.json"
    out_key  = "location_drift_vs_expected"
elif "df_train" in globals() and "df_test" in globals():
    # Train vs Test — use TEST distribution as target
    tgt = _empirical_weights(df_test, LOCATION)
    target_w = _norm_weights(tgt)
    drift_result = compute_location_drift(df_train, target_w, TOL) if _HAS_CLD else _fallback_drift(df_train, target_w)
    out_path = MODEL_DIR / "location_drift_train_vs_test.json"
    out_key  = "location_drift_train_vs_test"
else:
    raise RuntimeError("No data available to compute post-training location drift.")

out_path.write_text(canonical_json_dumps(drift_result), encoding="utf-8")

m = json.loads(manifest_path.read_text(encoding="utf-8"))
m.setdefault("metrics", {})[out_key] = drift_result
manifest_path.write_text(canonical_json_dumps(m), encoding="utf-8")
print(f"✅ Saved drift metrics → {out_path.name}  (manifest updated)")

✅ Saved serving pipeline (no-refit): value_regressor_v2.joblib
✅ Saved feature_order: feature_order.json
✅ Saved meta: value_regressor_v2_meta.json
✅ Saved manifest: training_manifest.json
✅ Saved worst-k: worst_k.json
✅ Saved drift metrics → location_drift_train_vs_test.json  (manifest updated)


### ModelReportRunner

In [11]:
# 10) Model Report Runner — consistent with training (OrdinalEncoder + TTR) + manifest signing + encoder introspection
from __future__ import annotations

import json
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# ------------------------------ helpers -------------------------------------
def _rmse(y_true, y_pred):
    try:
        return mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        # ancient sklearn fallback
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def _to_parquet_optional(df: pd.DataFrame, path: Path):
    try:
        df.to_parquet(path, index=False)
    except Exception as e:
        print(f"[info] Parquet export skipped for {path.name}: {e}")

def _unique_preserve_order(seq):
    out, seen = [], set()
    for x in seq:
        x = str(x)
        if x not in seen:
            seen.add(x); out.append(x)
    return out

# -------------------------- 0) Load v2 manifest -----------------------------
PROP_DIRS = [Path("notebooks/outputs/modeling/property"), Path("outputs/modeling/property")]
PROP_DIR = next((d for d in PROP_DIRS if d.exists()), PROP_DIRS[0])
MF_PATH = PROP_DIR / "training_manifest.json"
assert MF_PATH.exists(), f"training_manifest.json not found at {MF_PATH}"

mf = json.loads(MF_PATH.read_text(encoding="utf-8"))
ef = mf.get("expected_features") or {}

cat_cols = list(ef.get("categorical") or [])
num_cols = list(ef.get("numeric") or [])

# Derived features that might appear in manifest but not be present in the raw dataset
DERIVED = {
    # legacy variants
    "log_size_m2", "sqm_per_room", "baths_per_100sqm", "elev_x_floor",
    # current engineered features (from our FE cell)
    "rooms_per_100sqm", "no_elev_high_floor", "floor_ratio",
    "pt_importance", "pt_x_periphery", "pt_x_semi_center", "pt_x_center",
    "city_zone_prior", "region_index_prior",
}
# For the runner we only use features that truly exist in the chosen base DF.
# (We’re not re-deriving here—this is a quick report runner.)
num_raw = [c for c in num_cols if c not in DERIVED]

# ------------------ 1) Choose base dataframe (df → df_train → disk) --------
if "df" in globals() and isinstance(df, pd.DataFrame):
    base_df = df.copy()
elif "df_train" in globals() and isinstance(df_train, pd.DataFrame):
    base_df = df_train.copy()
else:
    CAND = [
        Path("notebooks/outputs/dataset_generated.parquet"),
        Path("notebooks/outputs/dataset_generated.csv"),
        Path("outputs/dataset_generated.parquet"),
        Path("outputs/dataset_generated.csv"),
    ]
    src = next((p for p in CAND if p.exists()), None)
    if not src:
        raise RuntimeError("No dataframe found: define df/df_train or ensure dataset_generated.* exists.")
    base_df = pd.read_parquet(src) if src.suffix.lower() in (".parquet", ".pq") else pd.read_csv(src)

# --- Deduplicate duplicated column names (keep first) ---
dup_mask = base_df.columns.duplicated(keep="first")
if dup_mask.any():
    dups = base_df.columns[dup_mask].tolist()
    print(f"⚠️ Dropping duplicated columns (keeping first occurrence): {dups[:12]}{'…' if len(dups)>12 else ''}")
    base_df = base_df.loc[:, ~dup_mask]

# Keep only features that actually exist in the DF and make them unique (order preserved)
cat_cols = _unique_preserve_order([c for c in cat_cols if c in base_df.columns])
num_raw  = _unique_preserve_order([c for c in num_raw  if c in base_df.columns])

# Remove overlaps (prefer numeric if a name appears in both)
overlap = set(cat_cols) & set(num_raw)
if overlap:
    cat_cols = [c for c in cat_cols if c not in overlap]
    print(f"ℹ️ Removed overlaps from categorical (kept numeric priority): {sorted(overlap)[:12]}{'…' if len(overlap)>12 else ''}")

ALL = cat_cols + num_raw

print(f"[manifest] cat={len(cat_cols)}  num(total)={len(num_cols)}  num(raw used here)={len(num_raw)}")
print(f"[df] rows={len(base_df)}  cols={len(base_df.columns)}; usable features in df={len(ALL)}")
print(f"[sample features] {ALL[:12]}")

assert "valuation_k" in base_df.columns, "Target 'valuation_k' missing in dataframe"
assert len(ALL) > 0, "No usable features found (cat+num_raw) for the runner."

df_runner = base_df.copy()

# --------- 2) Pipeline consistent with training (Ordinal + imputers) + TTR ---
pre_all = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("enc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
        ]), cat_cols) if cat_cols else ("cat", "drop", []),
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
        ]), num_raw) if num_raw else ("num", "drop", []),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

rf_all = RandomForestRegressor(
    n_estimators=300, random_state=42, n_jobs=-1, max_depth=None, min_samples_leaf=2
)

pipe_all = Pipeline([
    ("prep", pre_all),
    ("ttr", TransformedTargetRegressor(
        regressor=rf_all,
        func=np.log1p, inverse_func=np.expm1, check_inverse=False
    )),
])

# ---------------- 3) Simple random split evaluation (raw features) ----------
X_all = df_runner[ALL].copy()
y_nat = df_runner["valuation_k"].astype(float).to_numpy()

X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_nat, test_size=0.20, random_state=42)
pipe_all.fit(X_tr, y_tr)
y_hat = pipe_all.predict(X_te)

r2_all  = r2_score(y_te, y_hat)
mae_all = mean_absolute_error(y_te, y_hat)
rmse_all = _rmse(y_te, y_hat)
print(f"Random split → R²(all)={r2_all:.4f}  MAE={mae_all:.2f}  RMSE={rmse_all:.2f}")

# ---------------- 4) Numeric-only baseline (optional comparison) ------------
if num_raw:
    pre_num = ColumnTransformer(
        [("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_raw)],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    rf_num = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1, min_samples_leaf=2)
    pipe_num = Pipeline([
        ("prep", pre_num),
        ("ttr", TransformedTargetRegressor(
            regressor=rf_num, func=np.log1p, inverse_func=np.expm1, check_inverse=False
        )),
    ])
    Xn_tr, Xn_te, yn_tr, yn_te = train_test_split(df_runner[num_raw].copy(), y_nat, test_size=0.20, random_state=42)
    pipe_num.fit(Xn_tr, yn_tr)
    y_num = pipe_num.predict(Xn_te)

    r2_num  = r2_score(yn_te, y_num)
    mae_num = mean_absolute_error(yn_te, y_num)
    rmse_num = _rmse(yn_te, y_num)
    print(f"Random split → R²(num_raw)={r2_num:.4f}  MAE={mae_num:.2f}  RMSE={rmse_num:.2f}")
    print(f"ΔR² (all − num_raw): {r2_all - r2_num:+.4f}")

# ----------- 5) More robust estimate: GroupShuffleSplit by location ---------
if "location" in df_runner.columns and len(ALL) > 0:
    gss = GroupShuffleSplit(n_splits=5, test_size=0.20, random_state=42)
    r2s, maes, rmses = [], [], []
    groups = df_runner["location"].astype(str).to_numpy()
    for tr_idx, te_idx in gss.split(df_runner[ALL], y_nat, groups=groups):
        pipe_all.fit(df_runner.iloc[tr_idx][ALL], y_nat[tr_idx])
        y_g = pipe_all.predict(df_runner.iloc[te_idx][ALL])
        r2s.append(r2_score(y_nat[te_idx], y_g))
        maes.append(mean_absolute_error(y_nat[te_idx], y_g))
        rmses.append(_rmse(y_nat[te_idx], y_g))
    print("GSS 5× (group=location) → "
          f"R²={np.mean(r2s):.4f}±{np.std(r2s):.4f}  "
          f"MAE={np.mean(maes):.2f}±{np.std(maes):.2f}  "
          f"RMSE={np.mean(rmses):.2f}±{np.std(rmses):.2f}")

# ------------- 6) RF feature importance from the TTR-internal RF ------------
try:
    try:
        feat_names = list(pipe_all.named_steps["prep"].get_feature_names_out())
    except Exception:
        feat_names = [*cat_cols, *num_raw]

    rf_fitted = pipe_all.named_steps["ttr"].regressor_
    importances = getattr(rf_fitted, "feature_importances_", None)
    if importances is None:
        raise RuntimeError("feature_importances_ not available on the regressor.")

    imp = np.asarray(importances, dtype=float)
    if len(feat_names) != len(imp):
        feat_names = [f"f{i}" for i in range(len(imp))]

    fi_df = (pd.DataFrame({"feature": feat_names, "importance": imp})
               .sort_values("importance", ascending=False)
               .reset_index(drop=True))

    print("\nTop 10 feature importance (Ordinal+CT):")
    print(fi_df.head(10).to_string(index=False))

    # optional save next to modeling dir
    MODEL_DIRS = [Path("notebooks/outputs/modeling"), Path("outputs/modeling")]
    MODEL_DIR = next((d for d in MODEL_DIRS if d.exists()), MODEL_DIRS[-1])
    fi_csv = MODEL_DIR / "report_runner_feature_importance.csv"
    fi_parq = MODEL_DIR / "report_runner_feature_importance.parquet"
    fi_df.to_csv(fi_csv, index=False)
    _to_parquet_optional(fi_df, fi_parq)
    print("Saved:", fi_csv)
except Exception as e:
    print("Feature importance not available:", e)

# ---------------- Manifest signing (SHA-256 + created_utc) -------------------
from shared.common.utils import canonical_json_dumps, sha256_hex
from datetime import datetime, timezone

manifest_path = MF_PATH
m = json.loads(manifest_path.read_text(encoding="utf-8")) if manifest_path.exists() else {}
manifest_canon = canonical_json_dumps(m)
m["manifest_sha256"] = sha256_hex(manifest_canon)
if "created_utc" not in m:
    m["created_utc"] = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00","Z")

manifest_path.write_text(canonical_json_dumps(m), encoding="utf-8")
print("✅ training_manifest signed:", m["manifest_sha256"], "| created_utc:", m["created_utc"])

# -------- Encoder introspection (robust: finds CT even inside serving pipe) --
import joblib

PIPE_PATHS = [
    PROP_DIR / "value_regressor_v2.joblib",
    Path("outputs/modeling/property/value_regressor_v2.joblib"),
]
PIPE_PATH = next((p for p in PIPE_PATHS if p.exists()), None)
assert PIPE_PATH is not None, "value_regressor_v2.joblib not found"
pipe_saved = joblib.load(PIPE_PATH)

def find_ct(obj):
    # direct CT
    if isinstance(obj, ColumnTransformer):
        return obj
    # TTR → inspect regressor/regressor_
    if obj.__class__.__name__ == "TransformedTargetRegressor":
        reg = getattr(obj, "regressor_", None) or getattr(obj, "regressor", None)
        if reg is not None:
            ct = find_ct(reg); 
            if ct is not None: return ct
    # Pipeline → walk steps (handles 'core' wrapper)
    if isinstance(obj, Pipeline):
        for _, st in obj.steps:
            ct = find_ct(st)
            if ct is not None: return ct
    # Named attributes commonly used
    for attr in ("core","prep","preprocessor","feature_processor","processor"):
        if hasattr(obj, attr):
            ct = find_ct(getattr(obj, attr))
            if ct is not None: return ct
    return None

prep = find_ct(pipe_saved)
assert isinstance(prep, ColumnTransformer), "No ColumnTransformer found inside the saved pipeline."

# Build a small fit sample if CT wasn't fitted yet (rare, but safe)
def build_fit_sample(prep: ColumnTransformer) -> pd.DataFrame:
    # prefer df_train / df; else fallback to dataset on disk
    if "df_train" in globals() and isinstance(df_train, pd.DataFrame):
        base = df_train.copy()
    elif "df" in globals() and isinstance(df, pd.DataFrame):
        base = df.copy()
    else:
        ds_candidates = [
            Path("notebooks/outputs/dataset_generated.parquet"),
            Path("notebooks/outputs/dataset_generated.csv"),
            Path("outputs/dataset_generated.parquet"),
            Path("outputs/dataset_generated.csv"),
        ]
        ds_path = next((p for p in ds_candidates if p.exists()), None)
        assert ds_path is not None, "Dataset for mini-fit not found"
        base = pd.read_parquet(ds_path) if ds_path.suffix.lower() in {".parquet",".pq"} else pd.read_csv(ds_path)

    req = []
    for _, _, cols in getattr(prep, "transformers", []):
        if cols is None or cols == "drop":
            continue
        req.extend(list(cols) if isinstance(cols,(list,tuple,np.ndarray,pd.Index)) else [cols])
    req = list(dict.fromkeys(map(str, req)))

    X = base.reindex(columns=req)
    if len(X) > 2000:
        X = X.sample(n=2000, random_state=42)
    return X

if not hasattr(prep, "transformers_"):
    X_small = build_fit_sample(prep)
    try:
        prep.fit(X_small)
    except Exception as e:
        print("[prep] fit failed on mini sample:", e)
else:
    X_small = build_fit_sample(prep)

def get_cat_branch_and_encoder(prep: ColumnTransformer):
    cat_branch, cat_cols_in = None, None
    if hasattr(prep, "transformers_"):
        for name, trans, cols in prep.transformers_:
            if name == "cat":
                cat_branch = trans
                cat_cols_in = list(cols) if isinstance(cols,(list,tuple,np.ndarray,pd.Index)) else ([cols] if cols is not None else [])
                break
        # fallback: first pipeline with a known encoder
        if cat_branch is None:
            for _, trans, cols in prep.transformers_:
                if isinstance(trans, (Pipeline, OneHotEncoder, OrdinalEncoder)):
                    cat_branch = trans
                    cat_cols_in = list(cols) if isinstance(cols,(list,tuple,np.ndarray,pd.Index)) else [cols]
                    break
    enc = None
    if isinstance(cat_branch, Pipeline):
        for key in ("enc","encode","ohe","ordinal","encoder"):
            if key in cat_branch.named_steps and isinstance(cat_branch.named_steps[key], (OneHotEncoder, OrdinalEncoder)):
                enc = cat_branch.named_steps[key]
                break
        if enc is None:
            for _, st in cat_branch.named_steps.items():
                if isinstance(st, (OneHotEncoder, OrdinalEncoder)):
                    enc = st; break
    elif isinstance(cat_branch, (OneHotEncoder, OrdinalEncoder)):
        enc = cat_branch
    return cat_branch, enc, (cat_cols_in or [])

cat_branch, enc, cat_cols_in = get_cat_branch_and_encoder(prep)
assert enc is not None, "Categorical encoder not found in the 'cat' branch."

def derive_categories_from_data(X: pd.DataFrame, cols: list[str]) -> dict[str, list]:
    out = {}
    for c in cols:
        if c in X.columns:
            vals = pd.Series(X[c]).astype("object")
            out[c] = sorted(pd.unique(vals[vals.notna()]).tolist())
        else:
            out[c] = []
    return out

if isinstance(enc, OneHotEncoder):
    if hasattr(enc, "categories_"):
        cats_map = { (cat_cols_in[i] if i < len(cat_cols_in) else f"cat_{i}"): list(c)
                     for i, c in enumerate(enc.categories_) }
    else:
        cats_map = derive_categories_from_data(X_small, cat_cols_in)
    print("Encoder: OneHotEncoder")
    print("handle_unknown:", getattr(enc, "handle_unknown", None))
    # light preview
    for probe in ("region","Region","zone","Zone"):
        if probe in cats_map:
            print(f"categories — {probe} (sample):", cats_map[probe][:12]); break

elif isinstance(enc, OrdinalEncoder):
    if hasattr(enc, "categories_"):
        cats = enc.categories_
        names = cat_cols_in or [f"cat_{i}" for i in range(len(cats))]
        print("Encoder: OrdinalEncoder")
        print("handle_unknown:", getattr(enc, "handle_unknown", None))
        for i, cat in enumerate(cats[:min(5, len(cats))]):
            cname = names[i] if i < len(names) else f"cat_{i}"
            print(f"  {cname}: {list(cat)[:12]}")
    else:
        cats_map = derive_categories_from_data(X_small, cat_cols_in)
        print("Encoder: OrdinalEncoder (fallback categories inferred from data)")
        for k, v in list(cats_map.items())[:5]:
            print(f"  {k}: {v[:12]}")

print("\n✅ Model Report Runner completed.")

ℹ️ Removed overlaps from categorical (kept numeric priority): ['attic', 'cellar', 'concierge', 'garage', 'has_balcony', 'has_elevator', 'has_garden', 'listing_month', 'owner_occupied', 'parking_spot', 'public_transport_nearby']
[manifest] cat=11  num(total)=41  num(raw used here)=30
[df] rows=15000  cols=43; usable features in df=41
[sample features] ['location', 'region', 'urban_type', 'zone', 'energy_class', 'orientation', 'view', 'condition', 'heating', 'asset_id', 'asset_type', 'has_elevator']
Random split → R²(all)=0.9300  MAE=52.95  RMSE=75.85
Random split → R²(num_raw)=0.7015  MAE=113.71  RMSE=156.60
ΔR² (all − num_raw): +0.2285
GSS 5× (group=location) → R²=0.8270±0.0596  MAE=69.09±19.84  RMSE=100.07±26.79

Top 10 feature importance (Ordinal+CT):
              feature  importance
              size_m2    0.499311
distance_to_center_km    0.146674
               region    0.089758
             location    0.087149
                 zone    0.060673
         energy_class    0.03194