### Imports & Config

In [None]:
# 00) Import & setup — esecuzione da dentro `notebooks/`
from __future__ import annotations

import os
import sys
import json
from pathlib import Path

import numpy as np
import pandas as pd

# The notebook is executed from inside notebooks/, so `shared/` is a sibling package.
NB_ROOT = Path.cwd()                 # .../notebooks
PROJ_ROOT = NB_ROOT.parent           # project root

# Put both roots at the front of sys.path, skipping entries that are already
# there (presumably so the `notebooks.shared...` imports below can resolve).
if str(NB_ROOT) not in sys.path:
    sys.path.insert(0, str(NB_ROOT))
if str(PROJ_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJ_ROOT))

# ── shared imports (coerenti con i tuoi notebook)
from notebooks.shared.common.utils import (
    NumpyJSONEncoder,
    optimize_dtypes,
    log_basic_diagnostics,
    set_global_seed,
)
from notebooks.shared.common.config import load_config, configure_logger
from notebooks.shared.common.constants import (
    VALUATION_K,
    LAST_VERIFIED_TS, PREDICTION_TS, LAG_HOURS,
    CONDITION_SCORE, RISK_SCORE,
)

# sklearn (alcuni import usati in celle successive)
from sklearn.ensemble import RandomForestRegressor  # noqa: F401

# ── Logger (level read from the NB_LOG_LEVEL environment variable, default "INFO")
LOG_LEVEL = os.getenv("NB_LOG_LEVEL", "INFO")
logger = configure_logger(name="model_trainer", level=LOG_LEVEL)

# ── Config (optional): when the YAML file does not exist, work with an empty fallback
CFG_PATH = NB_ROOT / "dataset_config.yaml"
if CFG_PATH.exists():
    CONFIG = load_config(str(CFG_PATH))
    logger.info("Config YAML caricato: %s", CFG_PATH.as_posix())
else:
    CONFIG = {}
    logger.warning("dataset_config.yaml NON trovato: uso fallback di default.")

# `or {}` also covers a "training" key that is present but empty/None.
TRAIN_CFG = CONFIG.get("training", {}) or {}

# ── Global seed: training.seed, else top-level seed, else 42
SEED = int(TRAIN_CFG.get("seed", CONFIG.get("seed", 42)))
set_global_seed(SEED)

# ── Output folders (ALWAYS relative to `notebooks/`)
BASE_OUT = NB_ROOT / "outputs"                  # NB_ROOT already is notebooks/
MODEL_DIR = BASE_OUT / "modeling"
FIG_DIR   = MODEL_DIR / "figures"
ART_DIR   = MODEL_DIR / "artifacts"
PROP_DIR  = MODEL_DIR / "property"              # folder "served" by the backend/registry (per the original note)

# Create every output folder up front; existing folders are left untouched.
for d in (BASE_OUT, MODEL_DIR, FIG_DIR, ART_DIR, PROP_DIR):
    d.mkdir(parents=True, exist_ok=True)

# ── Canonical dataset path (overridden by training.dataset_path when present)
DATASET_PATH = Path(TRAIN_CFG.get("dataset_path", BASE_OUT / "dataset_generated.csv"))

# QoL: display settings only
pd.set_option("display.max_columns", 200)
np.set_printoptions(suppress=True)

logger.info("Setup OK | seed=%s | outputs_dir=%s", SEED, BASE_OUT.as_posix())

### Load Dataset

In [None]:
# 01) Carica dataset dal manifest di nb01 (robusto) + ottimizza + valida
from __future__ import annotations

import json
from pathlib import Path

import pandas as pd

# Schema validation helper: prefer the `notebooks.shared` package and fall back
# to the sibling `shared` package (importable because NB_ROOT is on sys.path).
# The fallback previously re-imported the very same module, so it could never
# succeed when the first attempt failed.
try:
    from notebooks.shared.common.sanity_checks import validate_dataset
except Exception:
    from shared.common.sanity_checks import validate_dataset  # type: ignore

# --- helper: risolvi path relativo verso posizioni note ---
def _resolve_path(p: str | Path) -> Path | None:
    cand = Path(p)
    if cand.exists():
        return cand
    # se è relativo, prova sotto base output e progetto
    for base in [BASE_OUT, NB_ROOT, PROJ_ROOT]:
        q = (base / str(p)).resolve()
        if q.exists():
            return q
    return None

# 1) Find the latest nb01 manifest (last one in lexicographic file-name order)
snap_dir = BASE_OUT / "snapshots"            # notebooks/outputs/snapshots
snap_dir.mkdir(parents=True, exist_ok=True)
manifests = sorted(snap_dir.glob("manifest_*.json"))
manifest01 = None
if manifests:
    try:
        manifest01 = json.loads(manifests[-1].read_text(encoding="utf-8"))
        logger.info("Manifest nb01 trovato: %s", manifests[-1].as_posix())
    except Exception as e:
        # An unreadable manifest is not fatal: the fallbacks in step 3 still apply.
        logger.warning("Impossibile leggere il manifest più recente: %s", e)

# 2) Determine data_path from the manifest (several key names are supported);
#    the first key whose path resolves to an existing file wins.
data_path: Path | None = None
if isinstance(manifest01, dict):
    paths = (manifest01.get("paths") or {})  # type: ignore
    for k in ("dataset_path", "dataset", "output_path"):
        p = paths.get(k)
        if p:
            rp = _resolve_path(p)
            if rp:
                data_path = rp
                break

# 3) Fallback: use DATASET_PATH (from the setup cell) or look inside BASE_OUT
if data_path is None or not data_path.exists():
    candidates = [
        Path(DATASET_PATH) if isinstance(DATASET_PATH, (str, Path)) else None,
        BASE_OUT / "dataset_generated.parquet",
        BASE_OUT / "dataset_generated.csv",
    ]
    # extend with any similarly named files
    candidates += sorted(BASE_OUT.glob("dataset_*.parquet"))
    candidates += sorted(BASE_OUT.glob("dataset_*.csv"))
    # first candidate that is not None and exists on disk
    data_path = next((c for c in candidates if c and c.exists()), None)

if not data_path or not data_path.exists():
    raise FileNotFoundError(
        "Dataset non trovato. Verifica manifest nb01 in notebooks/outputs/snapshots "
        "oppure che esista notebooks/outputs/dataset_generated.(csv|parquet)."
    )

logger.info("📄 Caricamento dataset da: %s", data_path.as_posix())

# 4) Load parquet/csv, chosen by file extension
if data_path.suffix.lower() in {".parquet", ".pq"}:
    df = pd.read_parquet(data_path)
else:
    df = pd.read_csv(data_path)

# 5) Dtype optimization (logs the memory saved, in MB)
mem_before = df.memory_usage(deep=True).sum() / 1024**2
df = optimize_dtypes(df)
mem_after = df.memory_usage(deep=True).sum() / 1024**2
logger.info(
    "✅ Dtypes optimized: %.2f MB → %.2f MB (−%.2f MB, %.1f%%)",
    mem_before, mem_after, mem_before - mem_after,
    0.0 if mem_before == 0 else (mem_before - mem_after) / mem_before * 100.0
)

# 6) Quick diagnostics
log_basic_diagnostics(df, logger)

# 7) Schema validation (asset_type taken from the config, default "property").
#    `or {}` keeps this working when the YAML has a `generation:` key with no
#    content (None), matching how the "training" section is read.
asset_type = str((CONFIG.get("generation") or {}).get("asset_type", "property"))
try:
    val_report = validate_dataset(df, asset_type=asset_type, raise_on_failure=True)
    logger.info("✅ Schema validation passed")
except Exception as e:
    # Deliberately best-effort: a failed validation is reported, not fatal.
    logger.warning("Schema validation warning: %s", e)
    val_report = {"overall_passed": False, "error": str(e)}

# 8) Persist the report next to the modeling outputs
(MODEL_DIR / "validation_nb03.json").write_text(
    json.dumps(val_report, cls=NumpyJSONEncoder, indent=2, ensure_ascii=False),
    encoding="utf-8"
)

In [None]:
# 02) PULIZIA LEAKAGE IMMEDIATA (robusta)
from __future__ import annotations

import re

# --- constants: prefer notebooks.shared, fall back to the sibling `shared`
# --- package, then to literal strings.
# The middle fallbacks used to repeat the primary import verbatim, so they could
# never succeed; they now target `shared.*`, as the original comment intended.
try:
    from notebooks.shared.common.constants import VALUATION_K as _VAL_K
except Exception:
    try:
        from shared.common.constants import VALUATION_K as _VAL_K  # type: ignore
    except Exception:
        _VAL_K = "valuation_k"

try:
    from notebooks.shared.common.constants import PRICE_PER_SQM as _PPS, PRICE_PER_SQM_CAPPED_VIOLATED as _PPSV
except Exception:
    try:
        from shared.common.constants import PRICE_PER_SQM as _PPS, PRICE_PER_SQM_CAPPED_VIOLATED as _PPSV  # type: ignore
    except Exception:
        _PPS = "price_per_sqm"
        _PPSV = "price_per_sqm_capped_violated"

VALUTION_K = _VAL_K  # compat alias (spelling kept as-is) used further down
PRICE_PER_SQM = _PPS
PRICE_PER_SQM_CAPPED_VIOLATED = _PPSV

# --- central list of leaky features (when available) ---
try:
    from notebooks.shared.n03_train_model.preprocessing import ML_LEAKY_FEATURES as _ML_LEAKY
except Exception:
    try:
        from shared.n03_train_model.preprocessing import ML_LEAKY_FEATURES as _ML_LEAKY  # type: ignore
    except Exception:
        _ML_LEAKY = {
            "price_per_sqm", "price_per_sqm_vs_region_avg", "price_per_sqm_capped",
            "valuation_k_log", "_viz_price_per_sqm", "valuation_k_decile",
            "valuation_rank", "is_top_valuation"
        }

# --- 1) Explicit removal (case-insensitive) ---
explicit_leaky = {
    PRICE_PER_SQM,
    PRICE_PER_SQM_CAPPED_VIOLATED,
    "price_per_sqm",
    "price_per_sqm_vs_region_avg",
    "price_per_sqm_capped",
    "valuation_k_log",
    "strongly_incoherent",
    "valuation_k_decile",
    "valuation_rank",
    "is_top_valuation",
}.union(str(name) for name in _ML_LEAKY)

# lowercase name -> original column name
lower_map = {}
for column in df.columns:
    lower_map[column.lower()] = column

present_explicit = []
for name in explicit_leaky:
    if not name:
        continue
    key = name.lower()
    if key in lower_map:
        present_explicit.append(lower_map[key])

# --- 2) Pattern-based removal (regex, case-insensitive) ---
regex_patterns = [
    r"price_per_sqm",       # any column containing price_per_sqm
    r"^valuation_k_.+$",    # columns derived from the target
]
# The target itself is never a candidate for removal.
present_regex = [
    col
    for col in df.columns
    if col != VALUTION_K
    and any(re.search(pat, col, flags=re.IGNORECASE) for pat in regex_patterns)
]

# --- 3) Apply the removal ---
to_drop = sorted({*present_explicit, *present_regex})
if not to_drop:
    logger.info("✅ Nessuna feature leaky trovata nel dataset")
else:
    logger.warning("🔴 RIMOZIONE FEATURES LEAKY: %s", to_drop)
    df.drop(columns=to_drop, inplace=True, errors="ignore")
    logger.info("✅ Dataset pulito: %d colonne rimanenti", df.shape[1])

# --- 4) Final checks ---
assert all("price_per_sqm" not in c.lower() for c in df.columns), "ERRORE: colonne 'price_per_sqm*' ancora presenti!"
assert all(
    not c.lower().startswith("valuation_k_") for c in df.columns if c != VALUTION_K
), "ERRORE: derivate 'valuation_k_*' ancora presenti!"

# Essential debug output
logger.debug("Colonne rimanenti: %s", list(df.columns))
print(f"Shape dopo pulizia: {df.shape}")
print(f"Colonne numeriche: {df.select_dtypes(include='number').columns.tolist()}")

### Preparation (derivations)

In [None]:
# 04) FEATURE DERIVATE
from __future__ import annotations
import pandas as pd

try:
    from notebooks.shared.common.constants import PRICE_PER_SQM  # type: ignore
except Exception:
    PRICE_PER_SQM = "price_per_sqm"

# 1) LAG_HOURS when missing (derived from the two timestamp columns)
if (LAG_HOURS not in df.columns) and ({LAST_VERIFIED_TS, PREDICTION_TS} <= set(df.columns)):
    # tolerant parsing: utc=True, unparseable values become NaT
    df[LAST_VERIFIED_TS] = pd.to_datetime(df[LAST_VERIFIED_TS], utc=True, errors="coerce")
    df[PREDICTION_TS]   = pd.to_datetime(df[PREDICTION_TS],   utc=True, errors="coerce")

    # difference in hours (seconds / 3600)
    lag = (df[PREDICTION_TS] - df[LAST_VERIFIED_TS]).dt.total_seconds().div(3600)
    # negative values become missing; then cast to the nullable Float32 dtype
    lag = lag.where(lag >= 0, other=pd.NA)
    df[LAG_HOURS] = lag.astype("Float32")
    logger.info("Creato %s da %s & %s", LAG_HOURS, LAST_VERIFIED_TS, PREDICTION_TS)

elif LAG_HOURS in df.columns:
    # column already present: only coerce it to numeric Float32
    df[LAG_HOURS] = pd.to_numeric(df[LAG_HOURS], errors="coerce").astype("Float32")
else:
    logger.warning("Impossibile derivare %s: mancano %s o %s", LAG_HOURS, LAST_VERIFIED_TS, PREDICTION_TS)

# 2) condition_minus_risk (described by the author as useful and non-leaky)
if (CONDITION_SCORE in df.columns) and (RISK_SCORE in df.columns):
    df[CONDITION_SCORE] = pd.to_numeric(df[CONDITION_SCORE], errors="coerce").astype("Float32")
    df[RISK_SCORE]      = pd.to_numeric(df[RISK_SCORE],      errors="coerce").astype("Float32")
    df["condition_minus_risk"] = (df[CONDITION_SCORE] - df[RISK_SCORE]).astype("Float32")
    logger.info("Creato feature derivata: condition_minus_risk")
else:
    logger.debug("condition_minus_risk non creato: mancano %s o %s", CONDITION_SCORE, RISK_SCORE)

# 3) listing_month from the prediction timestamp; only attempted when the column
#    is datetime-like or object dtype (other dtypes are skipped without a log)
if ("listing_month" not in df.columns) and (PREDICTION_TS in df.columns):
    if pd.api.types.is_datetime64_any_dtype(df[PREDICTION_TS]) or pd.api.types.is_object_dtype(df[PREDICTION_TS]):
        try:
            ts = pd.to_datetime(df[PREDICTION_TS], utc=True, errors="coerce")
            df["listing_month"] = ts.dt.month.astype("Int16")
            logger.info("Creato listing_month da %s", PREDICTION_TS)
        except Exception:
            logger.debug("listing_month non creato (parse fallita)")

# 4) Target: presence check + numeric coercion
if VALUATION_K not in df.columns:
    raise ValueError(f"{VALUATION_K} mancante: impossibile allenare.")
df[VALUATION_K] = pd.to_numeric(df[VALUATION_K], errors="coerce").astype("Float32")

# 5) Final check that no leaky column is present
assert not any("price_per_sqm" in c.lower() for c in df.columns), "LEAKAGE: colonne 'price_per_sqm*' presenti!"
assert not any(c.lower().startswith("valuation_k_") for c in df.columns if c != VALUATION_K), \
       "LEAKAGE: derivate 'valuation_k_*' ancora presenti!"

# 6) Snapshot (`display` is the notebook's rich-output helper)
print("=" * 60)
print("DATASET PULITO - PRIME 3 RIGHE")
print("=" * 60)
display(df.head(3))
print(f"\nShape: {df.shape}")
print(f"Target (valuation_k) range: [{df[VALUATION_K].min():.2f}, {df[VALUATION_K].max():.2f}]")

In [None]:
# 05) ANALISI CORRELAZIONI CON IL TARGET (no-leakage, robusta)
from __future__ import annotations

import json
import numpy as np
import pandas as pd
from pathlib import Path

# Features whose |Pearson r| with the target exceeds this are flagged as suspicious.
SUSPICIOUS_THR = 0.95
corr_json_path = ART_DIR / "target_correlations.json"
corr_csv_path  = ART_DIR / "target_correlations.csv"

# 0) Safety: the target must exist (it is coerced to numeric in the derivations cell)
if VALUATION_K not in df.columns:
    logger.error("Target %s non trovato nel dataset", VALUATION_K)
    corr_json_path.write_text(json.dumps({"error": "target missing"}), encoding="utf-8")
else:
    # 1) Numeric columns (post-cleanup) + sanity
    numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    if VALUATION_K not in numeric_cols:
        logger.warning("Il target non risulta numerico: provo a forzare il cast.")
        df[VALUATION_K] = pd.to_numeric(df[VALUATION_K], errors="coerce")
        if not pd.api.types.is_numeric_dtype(df[VALUATION_K]):
            raise TypeError(f"{VALUATION_K} non numerico; impossibile calcolare correlazioni.")
        # The list was built before the cast: without this the target would be
        # absent from the correlation matrix and the step below would raise a
        # misleading "all NaN?" error.
        numeric_cols.append(VALUATION_K)

    if len(numeric_cols) < 2:
        logger.warning("Poche colonne numeriche per calcolare correlazioni.")
        corr_json_path.write_text(json.dumps({"error": "not enough numeric columns"}), encoding="utf-8")
    else:
        # 2) Pearson
        corr_mat = df[numeric_cols].corr(method="pearson")
        if VALUATION_K not in corr_mat.columns:
            raise RuntimeError("Correlazione Pearson non calcolabile sul target (tutti NaN?).")

        # target column of the matrix, minus the target's self-correlation
        correlations = corr_mat[VALUATION_K].drop(labels=[VALUATION_K], errors="ignore").sort_values(ascending=False)

        print("=" * 60)
        print("TOP 15 CORRELAZIONI POSITIVE (Pearson) CON IL TARGET")
        print("=" * 60)
        print(correlations.head(15))

        print("\n" + "=" * 60)
        print("TOP 15 CORRELAZIONI NEGATIVE (Pearson) CON IL TARGET")
        print("=" * 60)
        print(correlations.tail(15))

        suspicious = correlations[correlations.abs() > SUSPICIOUS_THR]
        if not suspicious.empty:
            print("\n🔴 ATTENZIONE: Correlazioni sospette |r| >", SUSPICIOUS_THR)
            for feat, corr in suspicious.items():
                print(f"  - {feat}: {corr:.4f}")
            logger.warning("Possibile leakage o duplicati semantici: %s", list(suspicious.index))
        else:
            print("\n✅ Nessuna correlazione sospetta (>|r| >", SUSPICIOUS_THR, ")")

        # 3) Structured report (JSON) + CSV
        corr_df = pd.DataFrame(
            {"feature": correlations.index, "correlation_pearson": correlations.values}
        )
        payload = {
            "meta": {
                "method": "pearson",
                "n_numeric_features": int(len(numeric_cols) - 1),  # target excluded
                "target": VALUATION_K,
                "suspicious_threshold": SUSPICIOUS_THR,
            },
            "correlations": corr_df.to_dict("records"),
            "suspicious": suspicious.to_dict() if not suspicious.empty else {},
        }
        corr_json_path.write_text(
            json.dumps(payload, indent=2, ensure_ascii=False, cls=NumpyJSONEncoder),
            encoding="utf-8",
        )
        corr_df.to_csv(corr_csv_path, index=False)
        logger.info("Correlations saved: %s (JSON) | %s (CSV)", corr_json_path, corr_csv_path)

        # 4) Spearman (best-effort; only printed, never persisted)
        try:
            spearman = df[numeric_cols].corr(method="spearman")[VALUATION_K].drop(labels=[VALUATION_K], errors="ignore")
            print("\n" + "=" * 60)
            print("TOP 10 CORRELAZIONI SPEARMAN (assolute) CON IL TARGET")
            print("=" * 60)
            # order by absolute value while printing the signed coefficients
            print(spearman.reindex(spearman.abs().sort_values(ascending=False).index).head(10))
        except Exception as e:
            logger.debug("Spearman correlation failed: %s", e)

In [None]:
# 06) SPLIT TRAIN/VALID/TEST (strat. su decili) + blocco duplicati per gruppo (default: ASSET_ID)
from __future__ import annotations

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# costanti (PRICE_PER_SQM può non essere importato in questo notebook)
try:
    from notebooks.shared.common.constants import VALUATION_K, ASSET_ID, PRICE_PER_SQM  # type: ignore
except Exception:
    from notebooks.shared.common.constants import VALUATION_K, ASSET_ID  # type: ignore
    PRICE_PER_SQM = "price_per_sqm"

TARGET = VALUATION_K

# --- Preliminary check
print(f"Dataset shape prima dello split: {df.shape}")
print(f"Colonne totali: {len(df.columns)}")

# --- 0) Parameters from config (with fallbacks)
# Guard for the case where this cell runs without the setup cell's TRAIN_CFG.
if "TRAIN_CFG" not in globals() or not isinstance(TRAIN_CFG, dict):
    TRAIN_CFG = {}
TEST_SIZE = float(TRAIN_CFG.get("test_size", 0.15))
VAL_SIZE  = float(TRAIN_CFG.get("val_size",  0.15))
N_DECILES = int(TRAIN_CFG.get("n_deciles",   10))
GROUP_COL = str(TRAIN_CFG.get("group_col", ASSET_ID))  # e.g. 'location' can be set in the config instead

# Each fraction must lie in [0.01, 0.9]; otherwise both are reset to 0.15.
if not (0.01 <= TEST_SIZE <= 0.9) or not (0.01 <= VAL_SIZE <= 0.9):
    logger.warning("test_size/val_size fuori range → fallback 0.15/0.15")
    TEST_SIZE, VAL_SIZE = 0.15, 0.15

# --- 1) Target cleanup for the split: rows whose target is not numeric are dropped
mask_y = pd.to_numeric(df[TARGET], errors="coerce").notna()
if not mask_y.all():
    logger.warning("Righe senza target rimosse dallo split: %d", (~mask_y).sum())
df_clean = df.loc[mask_y].copy()

# --- helper: robust quantile binning of the target for stratification
def _strat_bins(y: pd.Series, q: int = 10) -> pd.Series:
    """Assign each value of *y* to one of up to *q* quantile bins.

    Binning is done on first-occurrence ranks, so tied values still spread over
    distinct bins. With fewer than two ranked values, or if binning fails, every
    row gets bin 0. Rows that end up without a bin (non-numeric or missing
    input) receive the most frequent bin. The result is an integer Series
    indexed like *y*.
    """
    ranks = pd.to_numeric(y, errors="coerce").rank(method="first")
    n_distinct = int(ranks.nunique())
    if n_distinct < 2:
        return pd.Series(0, index=y.index, dtype=int)

    # At least 2 bins, never more bins than distinct ranks.
    n_bins = max(2, min(int(q), n_distinct))
    try:
        bins = pd.qcut(ranks, q=n_bins, labels=False, duplicates="drop")
    except Exception:
        bins = pd.Series(0, index=y.index, dtype=int)

    unbinned = bins.isna()
    if unbinned.any():
        observed = bins[~unbinned]
        fill_value = 0 if observed.empty else int(observed.mode().iat[0])
        bins = bins.fillna(fill_value)
    return bins.astype(int)

def _safe_stratify(labels: pd.Series | np.ndarray, min_per_class: int = 2):
    """Ritorna labels se idonee alla stratificazione, altrimenti None."""
    lab = pd.Series(labels)
    vc = lab.value_counts()
    if len(vc) < 2 or (vc < min_per_class).any():
        return None
    return lab.values

# --- 2) Split that keeps every GROUP_COL value (default: ASSET_ID) inside a single
# ---    partition; classic row-level split as fallback when the column is unusable
if GROUP_COL in df_clean.columns and df_clean[GROUP_COL].notna().any():
    # median target per group → quantile bins computed at group level
    df_clean[GROUP_COL] = df_clean[GROUP_COL].astype(str)
    gstats = (
        df_clean[[GROUP_COL, TARGET]]
        .groupby(GROUP_COL, as_index=False)[TARGET]
        .median()
        .rename(columns={TARGET: f"{TARGET}__group_median"})
    )

    g_all = gstats[GROUP_COL].values
    g_bins_all = _strat_bins(gstats[f"{TARGET}__group_median"], q=N_DECILES).values
    # None disables stratification when some bin is too small
    strat_all = _safe_stratify(g_bins_all)

    # first split: carve out the test groups
    g_tmp, g_test = train_test_split(
        g_all,
        test_size=TEST_SIZE,
        random_state=SEED,
        stratify=strat_all,
    )

    # second split: validation taken from the remainder; VAL_SIZE is rescaled to
    # the remaining fraction and clamped to [0.05, 0.8]
    val_rel = VAL_SIZE / max(1e-9, (1.0 - TEST_SIZE))
    val_rel = float(min(max(val_rel, 0.05), 0.8))

    tmp_mask = np.isin(gstats[GROUP_COL].values, g_tmp)
    gstats_tmp = gstats.loc[tmp_mask].copy()
    # bins recomputed only on the remaining groups
    bins_tmp = _strat_bins(gstats_tmp[f"{TARGET}__group_median"], q=N_DECILES).values
    # group → bin map, re-ordered to follow g_tmp for stratify
    bin_map_tmp = dict(zip(gstats_tmp[GROUP_COL].values, bins_tmp))
    y_tmp_bins = np.array([bin_map_tmp.get(g, 0) for g in g_tmp])
    strat_tmp = _safe_stratify(y_tmp_bins)

    g_train, g_valid = train_test_split(
        g_tmp,
        test_size=val_rel,
        random_state=SEED,
        stratify=strat_tmp,
    )

    # materialize the row-level partitions from the group assignments
    G_TRAIN, G_VALID, G_TEST = set(g_train), set(g_valid), set(g_test)
    df_train = df_clean[df_clean[GROUP_COL].isin(G_TRAIN)].copy()
    df_valid = df_clean[df_clean[GROUP_COL].isin(G_VALID)].copy()
    df_test  = df_clean[df_clean[GROUP_COL].isin(G_TEST)].copy()

    # overlap check: no group value may appear in two partitions
    def _overlap(a, b):
        return set(a[GROUP_COL].astype(str)) & set(b[GROUP_COL].astype(str))
    ov_tv = _overlap(df_train, df_valid)
    ov_tt = _overlap(df_train, df_test)
    ov_vt = _overlap(df_valid, df_test)
    assert len(ov_tv) == 0 and len(ov_tt) == 0 and len(ov_vt) == 0, (
        f"Overlap {GROUP_COL} tra split! "
        f"train∩valid={list(ov_tv)[:5]}, train∩test={list(ov_tt)[:5]}, valid∩test={list(ov_vt)[:5]}"
    )

else:
    logger.warning(
        "%s assente/non valido: uso fallback senza grouping (possibile leakage se ci sono duplicati).",
        GROUP_COL,
    )

    # row-level stratification on target quantile bins
    bins_all = _strat_bins(df_clean[TARGET], q=N_DECILES)
    df_tmp, df_test = train_test_split(
        df_clean,
        test_size=TEST_SIZE,
        random_state=SEED,
        stratify=_safe_stratify(bins_all),
    )

    # same rescaling/clamping of the validation fraction as in the grouped branch
    val_rel = VAL_SIZE / max(1e-9, (1.0 - TEST_SIZE))
    val_rel = float(min(max(val_rel, 0.05), 0.8))
    bins_tmp = _strat_bins(df_tmp[TARGET], q=N_DECILES)
    df_train, df_valid = train_test_split(
        df_tmp,
        test_size=val_rel,
        random_state=SEED,
        stratify=_safe_stratify(bins_tmp),
    )

# --- 3) Logging and general checks
for name, part in (("train", df_train), ("valid", df_valid), ("test", df_test)):
    logger.info("%s: %d rows, %d cols", name, len(part), part.shape[1])

# partitions must be disjoint by index
assert len(set(df_train.index) & set(df_valid.index)) == 0
assert len(set(df_train.index) & set(df_test.index)) == 0
assert len(set(df_valid.index) & set(df_test.index)) == 0

# --- 4) Anti-leakage safety net on the splits (logs only, does not raise)
for split_name, split_df in [("train", df_train), ("valid", df_valid), ("test", df_test)]:
    if any("price_per_sqm" in c.lower() for c in split_df.columns):
        logger.error("🔴 LEAKAGE: colonne 'price_per_sqm*' in df_%s!", split_name)

print("\n✅ Split completato:")
print(f"  Train: {df_train.shape}")
print(f"  Valid: {df_valid.shape}")
print(f"  Test:  {df_test.shape}")
print(f"  Group column: {GROUP_COL}")

In [None]:
# A) PRE-CHAIN GLOBALE: canonizza geo + crea/riempi prior/derivate minime
from notebooks.shared.common.constants import SIZE_M2


# Lookup tables for the priors, read from the "property" section of ASSET_CONFIG.
# Keys are lower-cased so they match the normalized geo columns. Any failure
# (import or lookup) falls back to an empty city table and default region indices.
try:
    from notebooks.shared.common.config import ASSET_CONFIG  # type: ignore
    _PROP = ASSET_CONFIG["property"]
    # city -> zone -> value, all keys lower-cased
    _CITY_BASE = {c.lower(): {z.lower(): v for z, v in d.items()}
                  for c, d in (_PROP.get("city_base_prices") or {}).items()}
    # region -> index; the inline defaults apply when "region_index" is missing/empty
    _REGION_INDEX = {k.lower(): v for k, v in (_PROP.get("region_index") or {
        "north": 1.05, "center": 1.00, "south": 0.92
    }).items()}
except Exception:
    _CITY_BASE = {}
    _REGION_INDEX = {"north": 1.05, "center": 1.00, "south": 0.92}

# fallback medians: per zone (across cities, ignoring cities without that zone)
# and global (across all city/zone values); empty / 0.0 when there is no city table
_ZONE_KEYS = set(z for d in _CITY_BASE.values() for z in d.keys())
_ZONE_MED = {z: float(np.nanmedian([d.get(z, np.nan) for d in _CITY_BASE.values()])) for z in _ZONE_KEYS} if _CITY_BASE else {}
_GLOBAL_CITYZONE_MED = float(np.nanmedian([v for d in _CITY_BASE.values() for v in d.values()])) if _CITY_BASE else 0.0

def _canon_geo(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalized ``city`` / ``zone`` / ``region`` columns.

    ``city`` is copied from ``location`` when absent (and ``location`` exists);
    ``zone`` and ``region`` default to ``"semi_center"`` and ``"center"``. Every
    geo column present is then cast to ``str``, stripped and lower-cased. The
    input frame is not modified.
    """
    out = df.copy()

    if "location" in out.columns and "city" not in out.columns:
        out["city"] = out["location"]

    for name, default in (("zone", "semi_center"), ("region", "center")):
        if name not in out.columns:
            out[name] = default

    for name in ("city", "zone", "region"):
        if name not in out.columns:
            continue
        as_text = out[name].astype(str)
        out[name] = as_text.str.strip().str.lower()
    return out

def _ensure_priors_and_min_derivatives(df: pd.DataFrame) -> pd.DataFrame:
    """Return a geo-normalized copy of *df* with four ex-ante features added.

    Columns created or overwritten (all float64): ``no_elev_high_floor``,
    ``rooms_per_100sqm``, ``city_zone_prior`` and ``region_index_prior``.
    None of them reads the target.

    The source columns (``floor``, ``has_elevator``, ``rooms`` and the
    ``SIZE_M2`` column) are optional: a missing column is treated as all-NaN
    instead of raising, and a missing ``has_elevator`` counts as 0, like any
    other missing elevator value.
    """
    out = _canon_geo(df)

    def _numeric(col: str) -> pd.Series:
        # `out.get(col)` would yield None for a missing column, which
        # `pd.to_numeric` turns into a scalar NaN; the later `.fillna` /
        # `.replace` calls would then fail. Always hand back a Series.
        if col in out.columns:
            return pd.to_numeric(out[col], errors="coerce")
        return pd.Series(np.nan, index=out.index, dtype="float64")

    # no_elev_high_floor = (1 - has_elevator) * max(floor - 1, 0)
    floor = _numeric("floor")
    elevator = _numeric("has_elevator").fillna(0)
    out["no_elev_high_floor"] = ((1 - elevator) * np.maximum(floor - 1, 0)).astype("float64")

    # rooms_per_100sqm; a size of 0 becomes NaN to avoid dividing by zero
    size = _numeric(SIZE_M2).replace(0, np.nan)
    rooms = _numeric("rooms")
    out["rooms_per_100sqm"] = (100.0 * rooms / size).astype("float64")

    # city_zone_prior: city/zone table value, else the zone median, else the global median
    cities = out.get("city", pd.Series(index=out.index, dtype=str)).astype(str).str.lower()
    zones = out.get("zone", pd.Series(index=out.index, dtype=str)).astype(str).str.lower()

    def _prior(city: str, zone: str) -> float:
        value = _CITY_BASE.get(city, {}).get(zone, np.nan)
        if pd.isna(value):
            value = _ZONE_MED.get(zone, _GLOBAL_CITYZONE_MED)
        return value

    out["city_zone_prior"] = np.asarray(
        [_prior(c, z) for c, z in zip(cities, zones)], dtype="float64"
    )

    # region_index_prior: unknown regions map to NaN
    out["region_index_prior"] = out["region"].astype(str).str.lower().map(_REGION_INDEX).astype("float64")

    return out

# Apply to every split that exists (per the author this is not leakage: the
# features are ex-ante and never read the target).
for _name in ("df_train","df_valid","df_test"):
    if _name in globals() and isinstance(globals()[_name], pd.DataFrame):
        globals()[_name] = _ensure_priors_and_min_derivatives(globals()[_name])

# Best-effort diagnostic: count the NaN left in the new train columns and write
# them to a small text artifact. A failure must not stop the notebook, but it is
# now logged instead of being swallowed silently.
try:
    n_nan_cz = int(pd.to_numeric(df_train["city_zone_prior"], errors="coerce").isna().sum())
    n_nan_ri = int(pd.to_numeric(df_train["region_index_prior"], errors="coerce").isna().sum())
    n_nan_ne = int(pd.to_numeric(df_train["no_elev_high_floor"], errors="coerce").isna().sum())
    n_nan_rr = int(pd.to_numeric(df_train["rooms_per_100sqm"], errors="coerce").isna().sum())
    (ART_DIR / "prechain_checks.txt").write_text(
        f"train NaN city_zone_prior={n_nan_cz}, region_index_prior={n_nan_ri}, "
        f"no_elev_high_floor={n_nan_ne}, rooms_per_100sqm={n_nan_rr}\n",
        encoding="utf-8"
    )
except Exception as exc:
    logger.warning("Pre-chain NaN check skipped: %s", exc)

### Anomaly Flags (train only)

In [None]:
# 04) Flags di outlier/anomalie SOLO sul TRAIN → feature/pesi (no leakage)
from __future__ import annotations
import json
import numpy as np
import pandas as pd

# --- costanti con import robusto
try:
    from notebooks.shared.common.constants import ENV_SCORE, LUXURY_SCORE, SIZE_M2, VALUATION_K  # type: ignore
except Exception:
    ENV_SCORE, LUXURY_SCORE, SIZE_M2, VALUATION_K = "env_score", "luxury_score", "size_m2", "valuation_k"

try:
    from notebooks.shared.common.constants import LAG_HOURS  # type: ignore
except Exception:
    LAG_HOURS = "lag_hours"

# feature “leaky”/derivate definite nell’EDA (se presenti)
try:
    from notebooks.shared.n02_explore_dataset.eda_core import AnomalyDetector, LEAKY_FEATURES, TARGET_DERIVED_FEATURES  # type: ignore
except Exception:
    AnomalyDetector = None  # fallback sotto
    LEAKY_FEATURES = {"price_per_sqm", "price_per_sqm_capped", "price_per_sqm_vs_region_avg", "valuation_k_log"}
    TARGET_DERIVED_FEATURES = {"_viz_price_per_sqm", "valuation_k_decile", "valuation_rank", "is_top_valuation"}

# Parameters, overridable from the config
# Guard for the case where this cell runs without the setup cell's TRAIN_CFG.
if "TRAIN_CFG" not in globals() or not isinstance(TRAIN_CFG, dict):
    TRAIN_CFG = {}
contamination       = float(TRAIN_CFG.get("anomaly_contamination", 0.03))
strong_z_threshold  = float(TRAIN_CFG.get("anomaly_strong_z", 2.5))
severity_percentile = float(TRAIN_CFG.get("anomaly_severity_pct", 90.0))
n_estimators        = int(TRAIN_CFG.get("anomaly_n_estimators", 200))

# 4.1 Candidate features (numeric; target, leaky and target-derived columns excluded)
num_cols = [c for c in df_train.columns if pd.api.types.is_numeric_dtype(df_train[c])]
exclude  = set(LEAKY_FEATURES) | set(TARGET_DERIVED_FEATURES) | {VALUATION_K, "price_per_sqm"}

# Preferred features, kept in this order when present, numeric and not excluded.
prefer   = [
    "condition_minus_risk", SIZE_M2, LUXURY_SCORE, ENV_SCORE,
    "building_age_years", "distance_to_center_km", LAG_HOURS,
    "air_quality_index", "noise_level", "humidity_level", "temperature_avg",
]
feat_cand = [c for c in prefer if c in df_train.columns and c in num_cols and c not in exclude]

# Fallback when fewer than 3 preferred features are usable: take the numeric
# columns with variance > 0 and at least 10 distinct values, highest variance first.
if len(feat_cand) < 3:
    cand_pool = []
    for c in num_cols:
        if c in exclude or c == "sample_weight":
            continue
        s = pd.to_numeric(df_train[c], errors="coerce")
        if s.nunique(dropna=True) >= 10 and np.nanvar(s.values) > 0:
            cand_pool.append((c, float(np.nanvar(s.values))))
    cand_pool.sort(key=lambda x: x[1], reverse=True)
    feat_cand = [c for c, _ in cand_pool[:8]]  # at most 8

if feat_cand:
    logger.info("Anomaly features (train only): %s", feat_cand)

    # 4.2 Anomaly detection (project AnomalyDetector when importable → z-score fallback)
    if AnomalyDetector is not None:
        anom = AnomalyDetector(
            contamination=contamination,
            strong_z_threshold=strong_z_threshold,
            severity_percentile=severity_percentile,
            n_estimators=n_estimators,
            random_state=SEED,
        )
        # NOTE(review): expected to return (annotated frame, report dict) — confirm against eda_core
        df_train_anom, anom_rep = anom.detect_anomalies(
            df_train,
            feature_candidates=feat_cand,
            exclude_features=set(),  # already excluded upstream
        )
    else:
        # --- Simple fallback: mean absolute z-score + percentile over the candidate features
        X = df_train[feat_cand].copy()
        X = X.apply(pd.to_numeric, errors="coerce")
        mu = X.mean(axis=0)
        sd = X.std(axis=0).replace(0, np.nan)  # constant columns give NaN z-scores instead of inf
        z  = (X - mu) / sd
        z_abs = z.abs()
        z_mean = z_abs.mean(axis=1)  # mean severity per row
        thr = np.nanpercentile(z_mean.dropna().values, severity_percentile)
        flags_raw = (z_abs > strong_z_threshold).any(axis=1)  # any single feature beyond the threshold
        flags_ref = (z_mean >= thr)                           # row severity at/above the percentile

        df_train_anom = df_train.copy()
        df_train_anom["anomaly_flag"]    = flags_raw.astype(np.int8)
        df_train_anom["anomaly_refined"] = flags_ref.astype(np.int8)
        df_train_anom["severity_score"]  = z_mean.fillna(0).astype("float32")

        n_raw = int(flags_raw.sum())
        n_ref = int(flags_ref.sum())
        anom_rep = {
            "method": "fallback_zscore",
            "features": feat_cand,
            "strong_z_threshold": strong_z_threshold,
            "severity_percentile": severity_percentile,
            "n_anomalies_raw": n_raw,
            "n_anomalies_refined": n_ref,
        }

    logger.info("Anomalie raw: %s | refined: %s",
                anom_rep.get("n_anomalies_raw"), anom_rep.get("n_anomalies_refined"))

    # 4.3 copy the useful columns onto the train split ONLY (valid/test are not touched)
    for col in ("anomaly_flag", "anomaly_refined", "severity_score"):
        if col in df_train_anom.columns:
            df_train.loc[df_train_anom.index, col] = df_train_anom[col]

    # 4.4 save the report
    (ART_DIR / "anomaly_train_report.json").write_text(
        json.dumps(anom_rep, cls=NumpyJSONEncoder, indent=2, ensure_ascii=False),
        encoding="utf-8"
    )
else:
    logger.info("Anomaly detection skipped: nessuna feature candidata valida.")

# 4.5 sample_weight (fallback = 1.0) — TRAIN ONLY
# Weights must stay finite: a NaN score used to propagate into a NaN weight,
# which scikit-learn's sample_weight validation rejects at fit time. Rows with
# a missing score therefore get a neutral value before normalisation.
if "severity_score" in df_train.columns and df_train["severity_score"].notna().any():
    sev = pd.to_numeric(df_train["severity_score"], errors="coerce")
    # missing severity counts as 0 ("not anomalous"), as in the z-score fallback
    sev = sev.fillna(0).clip(lower=0).astype("float32")
    w   = 1.0 / (1.0 + sev)              # decreases with severity
    w   = w.clip(lower=0.2, upper=1.0)   # avoids very small weights
    w   = w * (1.0 / max(w.mean(), 1e-6))  # normalize mean≈1.0
    df_train["sample_weight"] = w.astype("float32")
    logger.info("sample_weight da severity_score (mean=%.3f, min=%.3f, max=%.3f)",
                float(w.mean()), float(w.min()), float(w.max()))
elif "confidence_score" in df_train.columns and df_train["confidence_score"].notna().any():
    w = pd.to_numeric(df_train["confidence_score"], errors="coerce").clip(0.2, 1.0).astype("float32")
    # missing/unparseable confidence gets the mean observed weight (1.0 if none parses)
    w = w.fillna(w.mean() if w.notna().any() else 1.0)
    w = w * (1.0 / max(w.mean(), 1e-6))
    df_train["sample_weight"] = w
    logger.info("sample_weight da confidence_score (mean=%.3f, min=%.3f, max=%.3f)",
                float(w.mean()), float(w.min()), float(w.max()))
else:
    df_train["sample_weight"] = np.float32(1.0)
    logger.info("sample_weight uniforme (1.0) — nessuna metrica disponibile.")

### Feature Preparation & Pipelines A/B

In [None]:
# === FEATURE PREP + ANALISI + AUTO-UPDATE (UNIFICATA) ========================
from __future__ import annotations
import os, re, json
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# ── utilities / minimal constants
try:
    from notebooks.shared.n03_train_model.preprocessing import ML_LEAKY_FEATURES  # type: ignore
except Exception:
    # Shared preprocessing module not importable: use a local set of the
    # column names treated as target leakage.
    ML_LEAKY_FEATURES = {
        "price_per_sqm",
        "price_per_sqm_vs_region_avg",
        "price_per_sqm_capped",
        "_viz_price_per_sqm",
        "valuation_k_log",
        "valuation_k_decile",
        "valuation_rank",
        "is_top_valuation",
    }

try:
    NumpyJSONEncoder
except NameError:
    class NumpyJSONEncoder(json.JSONEncoder):
        """Fallback JSON encoder used only when the shared one is not in scope.

        Converts NumPy booleans, integers, floats and arrays to their plain
        Python equivalents; every other type is delegated to the base class,
        which raises TypeError for unsupported objects.
        """

        def default(self, obj):
            import numpy as _np
            # np.bool_ is not an np.integer subclass, so it needs its own branch.
            if isinstance(obj, _np.bool_):
                return bool(obj)
            if isinstance(obj, _np.integer):
                return int(obj)
            if isinstance(obj, _np.floating):
                return float(obj)
            if isinstance(obj, _np.ndarray):
                return obj.tolist()
            return super().default(obj)

try:
    _ensure_columns
except NameError:
    def _ensure_columns(df_part: pd.DataFrame, required: list[str]) -> pd.DataFrame:
        """Return df_part restricted to `required`, in that order.

        Columns listed in `required` but absent from df_part are first added
        to df_part itself (in place) filled with NaN.
        """
        for name in required:
            if name not in df_part.columns:
                df_part[name] = np.nan
        return df_part[required]

from notebooks.shared.common.constants import VALUATION_K, ASSET_ID
from notebooks.shared.common.config import ASSET_CONFIG

# ── helpers
def _uniq(xs: list[str]) -> list[str]:
    """Return the items of xs without duplicates, keeping first-occurrence order."""
    seen: dict = {}
    for item in xs:
        seen.setdefault(item, None)
    return list(seen)
def _matches_any(col: str, pats: list[str]) -> bool:
    """Return True when at least one regex in pats matches col (case-insensitive search)."""
    for pattern in pats:
        if re.search(pattern, col, re.I):
            return True
    return False
def _is_numeric(s: pd.Series) -> bool:
    """Tell whether s has a numeric dtype, as judged by pandas."""
    check = pd.api.types.is_numeric_dtype
    return check(s)

# ==== 0) Allowlist policy + exclusions (guided by config) =====================
# Categorical / numeric / excluded column names declared for the "property" asset.
_cfg = ASSET_CONFIG["property"]
CFG_CAT = list(_cfg.get("categorical", []))
CFG_NUM = list(_cfg.get("numeric", []))
CFG_EXC = set(_cfg.get("exclude", []))

# Extra columns always allowed on top of the configured ones.
EXTRA_KEEP = ["rooms","has_elevator","has_garage","has_garden","has_balcony",
              "is_top_floor","listing_month","city","zone","urban_type"]
FEATURE_ALLOWLIST = _uniq([*(CFG_CAT + CFG_NUM), *EXTRA_KEEP])

# Columns never used as model inputs: target, identifiers, provenance,
# timestamps, weights/quality scores, predictions and target-derived columns.
ALWAYS_EXCLUDE = set(CFG_EXC) | {
    VALUATION_K,
    ASSET_ID, "record_id","listing_id","asset_type_id",
    "source","source_name","source_url","dataset_version",
    "prediction_ts","last_verified_ts","ingestion_ts",
    "created_at","updated_at","listing_ts","lag_hours",
    "sample_weight","weight","severity_score",
    "outlier_count","n_outlier_sources","outlier_source",
    "confidence_score",
    "y_pred","predicted_valuation_k","valuation_k_hat",
    "valuation_k_log","valuation_k_decile","valuation_rank","is_top_valuation",
    "strongly_incoherent","price_per_sqm_capped_violated",
}

# Regexes (searched case-insensitively) that exclude columns by name.
EXCLUDE_PATTERNS = [
    r"price_per_sqm",
    r"^valuation_k_.+",
    r"(?:^|_)id$",
    r"(?:^|_)(created|updated|ingestion|prediction|last_verified|listing)_ts$",
    r"_url$|_hash$",
    r"^y_pred$|_pred(?:iction)?_",
    r"(?:^|_)(avg|mean|median|benchmark|zscore|rank|decile)(?:_|$).*?(price|valuation)",
    r"(price|valuation).*?(avg|mean|median|benchmark|zscore|rank|decile)",
    # The last alternative used to be `vs_(?:_|$)`, which only matched names
    # containing "vs__" or ending in "vs_"; it now matches the "vs" token
    # itself, so "<a>_vs_<b>" comparison columns are caught.
    r"(?:^|_)drift(?:_|$)|(?:^|_)caps?(?:_|$)|(?:^|_)vs(?:_|$)",
]

# Leakage hard-stop: abort if any known leaky column is present in the training frame.
leaky_check = [c for c in ML_LEAKY_FEATURES if c in df_train.columns]
if leaky_check:
    raise ValueError(f"Leakage detected in training set: {leaky_check}")

# Pattern-based exclusions are resolved against the columns actually present.
DYNAMIC_EXCLUDE = {c for c in df_train.columns if _matches_any(c, EXCLUDE_PATTERNS)}
EXCLUDE_ALL = set(ALWAYS_EXCLUDE) | set(DYNAMIC_EXCLUDE)

# ==== 1) Initial cat / num split (allowlist-aware) ============================
# Note: to avoid over-weighting the macro-area, 'region' is NOT forced in as a categorical.
CATEGORICAL_FEATURES = _uniq(["city","zone","urban_type"] + [c for c in CFG_CAT if c not in {"location"}])
cat_cols = [c for c in CATEGORICAL_FEATURES if c in df_train.columns and c not in EXCLUDE_ALL]
num_cols = [c for c in df_train.columns if (c not in EXCLUDE_ALL) and _is_numeric(df_train[c])]

# Keep only allowlisted columns that exist in the training frame; a column
# already chosen as categorical is not repeated among the numeric ones.
allow = set([c for c in FEATURE_ALLOWLIST if c in df_train.columns])
cat_cols = [c for c in _uniq(cat_cols) if c in allow]
num_cols = [c for c in _uniq(num_cols) if (c not in set(cat_cols)) and (c in allow)]

# If the allowlist leaves fewer than MIN_FEATS features (env MIN_FEATS_ALLOWLIST,
# default 12), widen the selection with non-excluded columns outside the allowlist.
MIN_FEATS = int(os.getenv("MIN_FEATS_ALLOWLIST", "12"))
if len(cat_cols) + len(num_cols) < MIN_FEATS:
    more_cat = [c for c in CATEGORICAL_FEATURES if c in df_train.columns and c not in EXCLUDE_ALL and c not in cat_cols]
    more_num = []
    for c in df_train.columns:
        if c in EXCLUDE_ALL or c in cat_cols or c in num_cols: 
            continue
        # Accept columns that coerce to numeric with at least 10 distinct values and non-zero variance.
        s = pd.to_numeric(df_train[c], errors="coerce")
        if _is_numeric(s) and s.nunique(dropna=True) >= 10 and np.nanvar(s.values) > 0:
            more_num.append(c)
    # Cap the widened lists at 15 categorical and 25 numeric columns.
    cat_cols = _uniq(cat_cols + more_cat)[:15]
    num_cols = _uniq(num_cols + more_num)[:25]

# Drop numeric columns with at most one distinct non-null value in TRAIN.
constant_cols = [c for c in num_cols if df_train[c].nunique(dropna=True) <= 1]
if constant_cols:
    num_cols = [c for c in num_cols if c not in constant_cols]

# ==== 2) (optional) derivation step — ONLY via an importable transformer ======
# No ad-hoc logic/priors here: the transformer is used when it can be imported
# and instantiated, otherwise the pipelines are built without a derive step.
DERIVED_FEATURES = [
    "log_size_m2","sqm_per_room","baths_per_100sqm",
    "elev_x_floor","no_elev_high_floor","rooms_per_100sqm",
    "city_zone_prior","region_index_prior",
]
feature_deriver = "passthrough"
_include_derive = False
try:
    # The previous nested fallback re-imported this very same module path, so a
    # single attempt is equivalent.
    from notebooks.shared.common.transformers import PropertyDerivedFeatures  # type: ignore
    feature_deriver = PropertyDerivedFeatures()  # relies on the transformer's own defaults
    _include_derive = True
except Exception as _derive_err:
    # Best-effort by design, but leave a trace instead of failing silently.
    logger.info("PropertyDerivedFeatures unavailable (%s): derive step disabled.", _derive_err)

# When the deriver is active, declare the derived columns among the numeric
# ones (they are created by the derive step that runs before the prep step).
if _include_derive:
    num_cols = _uniq(num_cols + DERIVED_FEATURES)

# ==== 3) Preprocessori (OHE compat) =========================================
def _build_ohe(min_freq=None, as_sparse=True):
    """Build a OneHotEncoder(handle_unknown="ignore") across sklearn versions.

    Args:
        min_freq: passed as ``min_frequency`` when it is an int or a float;
            any other value (including None) is ignored.
        as_sparse: forwarded as ``sparse_output`` (newer sklearn) or
            ``sparse`` (older sklearn).

    Returns:
        The first encoder whose constructor accepts the keyword combination.
        If ``min_frequency`` is rejected under both sparse keyword spellings,
        the encoder is built without it.

    Raises:
        TypeError: when no keyword combination is accepted.
    """
    base_kw = {"handle_unknown": "ignore"}
    # The constructor call is what raises TypeError for an unsupported keyword,
    # so the fallback must wrap the call rather than the dict assignment.
    extra_options = [{}]
    if isinstance(min_freq, (int, float)):
        extra_options.insert(0, {"min_frequency": min_freq})

    last_err = None
    for extra in extra_options:
        for sparse_kw in ("sparse_output", "sparse"):
            try:
                return OneHotEncoder(**base_kw, **extra, **{sparse_kw: as_sparse})
            except TypeError as err:
                last_err = err
    raise last_err

# Optional minimum category frequency for the one-hot encoder, read from the training config.
min_freq = TRAIN_CFG.get("ohe_min_frequency", None)

# Categorical branch: fill missing values with a constant token, then one-hot encode.
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="constant", fill_value="__MISSING__")),
    ("encode", _build_ohe(min_freq)),
])
# Numeric branch: median imputation only.
num_pipe = Pipeline([("impute", SimpleImputer(strategy="median"))])

# ==== 4) Columns & pipelines A/B ==============================================
# Variant B uses the same numeric columns as A minus "confidence_score".
num_cols_B = [c for c in num_cols if c != "confidence_score"]

preproc_A = ColumnTransformer([("cat", cat_pipe, cat_cols), ("num", num_pipe, num_cols)], remainder="drop")
preproc_B = ColumnTransformer([("cat", cat_pipe, cat_cols), ("num", num_pipe, num_cols_B)], remainder="drop")

features_A = _uniq(cat_cols + num_cols)
features_B = _uniq(cat_cols + num_cols_B)

# Per-split frames restricted to each feature list; columns missing from a
# split are added as NaN by _ensure_columns (copies keep df_* untouched).
df_train_A = _ensure_columns(df_train.copy(), features_A)
df_valid_A = _ensure_columns(df_valid.copy(), features_A)
df_test_A  = _ensure_columns(df_test.copy(),  features_A)

df_train_B = _ensure_columns(df_train.copy(), features_B)
df_valid_B = _ensure_columns(df_valid.copy(), features_B)
df_test_B  = _ensure_columns(df_test.copy(),  features_B)

# Design matrices for variant A ...
X_train = df_train_A[features_A].copy()
X_valid = df_valid_A[features_A].copy()
X_test  = df_test_A[features_A].copy()

# ... and for variant B.
Xtr_B = df_train_B[features_B].copy()
Xva_B = df_valid_B[features_B].copy()
Xte_B = df_test_B[features_B].copy()

# Model family: config key "model" wins over env MODEL_KIND; default is "rf".
MODEL_KIND = str(TRAIN_CFG.get("model", os.getenv("MODEL_KIND","rf"))).lower()
ModelA = RandomForestRegressor; ModelB = RandomForestRegressor
MODEL_FAMILY_A = MODEL_FAMILY_B = "RandomForest"

# XGBoost is used only when requested AND importable; otherwise RandomForest stays.
if MODEL_KIND in {"xgb","xgboost"}:
    try:
        from xgboost import XGBRegressor  # type: ignore
        ModelA = ModelB = XGBRegressor
        MODEL_FAMILY_A = MODEL_FAMILY_B = "XGBRegressor"
    except Exception:
        MODEL_FAMILY_A = MODEL_FAMILY_B = "RandomForest"

# A and B get identical hyper-parameters; they differ only in their input columns.
if MODEL_FAMILY_A == "RandomForest":
    model_A = ModelA(n_estimators=400, random_state=SEED, n_jobs=-1, max_depth=None, min_samples_leaf=2)
    model_B = ModelB(n_estimators=400, random_state=SEED, n_jobs=-1, max_depth=None, min_samples_leaf=2)
else:
    model_A = ModelA(n_estimators=500, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8,
                     reg_alpha=0.0, reg_lambda=1.0, random_state=SEED, tree_method="hist")
    model_B = ModelB(n_estimators=500, max_depth=6, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8,
                     reg_alpha=0.0, reg_lambda=1.0, random_state=SEED, tree_method="hist")

steps_A = [("prep", preproc_A), ("model", model_A)]
steps_B = [("prep", preproc_B), ("model", model_B)]
# NOTE(review): the same feature_deriver instance is placed in both pipelines;
# if it keeps fitted state, fitting one pipeline also affects the other — confirm.
if _include_derive:
    steps_A = [("derive", feature_deriver)] + steps_A
    steps_B = [("derive", feature_deriver)] + steps_B

pipe_A = Pipeline(steps_A)
pipe_B = Pipeline(steps_B)

# ==== 5) Simple analysis & artifacts ==========================================
# One row per variant-A feature present in TRAIN: dtype, cardinality, missing
# percentage and two type flags; the table is written as CSV into ART_DIR.
ART_DIR.mkdir(parents=True, exist_ok=True)
card_rows = []
for c in features_A:
    if c in df_train.columns:
        s = df_train[c]
        card_rows.append({
            "feature": c,
            "dtype": str(s.dtype),
            "n_unique": int(s.nunique(dropna=True)),
            "pct_missing": float(s.isna().mean()*100.0),
            "is_categorical_like": bool(s.dtype.name in ("object","category","bool")),
            "is_numeric": bool(pd.api.types.is_numeric_dtype(s)),
        })
_card_report = pd.DataFrame(card_rows).sort_values(
    ["is_categorical_like","n_unique"], ascending=[False,True]
)
_card_report.to_csv(ART_DIR / "mlprep_cardinality_missing.csv", index=False)

# ==== 6) Align targets/weights (without altering them here) ===================
def _align_targets_and_weights():
    """Publish targets and train weights as globals, aligned to the X_* indices.

    Sets y_train / y_valid / y_test from the VALUATION_K column, keeps copies
    of the valid/test targets in y_val_orig / y_test_orig and, when Xtr_B
    exists, sets w_train from "sample_weight" (all ones if the column is
    absent). Asserts that each X split and its target have the same length.
    """
    global y_train, y_valid, y_test, y_val_orig, y_test_orig, w_train
    y_train = df_train.loc[X_train.index, VALUATION_K].to_numpy()
    y_valid = df_valid.loc[X_valid.index, VALUATION_K].to_numpy()
    y_test  = df_test.loc[X_test.index,  VALUATION_K].to_numpy()
    y_val_orig = y_valid.copy()
    y_test_orig = y_test.copy()
    if "Xtr_B" in globals():
        has_weights = "sample_weight" in df_train.columns
        w_train = (
            df_train.loc[Xtr_B.index, "sample_weight"].astype("float32").to_numpy()
            if has_weights
            else np.ones(len(Xtr_B), dtype="float32")
        )
    assert len(X_train) == len(y_train)
    assert len(X_valid) == len(y_val_orig)
    assert len(X_test) == len(y_test_orig)
_align_targets_and_weights()

# Console summary of the selected feature lists, deriver status and model families.
print("\n===== VERIFICA FEATURES =====")
print("Categoriche:", cat_cols)
print("Numeriche  :", num_cols)
print("Deriver attivo?:", _include_derive)
print("Model     A:", MODEL_FAMILY_A, "| n_features:", len(features_A))
print("Model     B:", MODEL_FAMILY_B, "| n_features:", len(features_B))

In [None]:
# === PATCH EVAL: assicurati che le derivate/priors ci siano (no all-NaN) ===
from __future__ import annotations
import numpy as np, pandas as pd

# Try to read the prior mappings from config (with safe fallbacks).
# Keys are lower-cased; base prices are cast to float, like the region index,
# so the medians below work on numeric values.
try:
    from notebooks.shared.common.config import ASSET_CONFIG  # type: ignore
    _PROP = ASSET_CONFIG["property"]
    _CITY_BASE = {c.lower(): {z.lower(): float(v) for z, v in d.items()}
                  for c, d in (_PROP.get("city_base_prices") or {}).items()}
    _REGION_INDEX = {k.lower(): float(v) for k, v in (_PROP.get("region_index") or {
        "north": 1.05, "center": 1.00, "south": 0.92
    }).items()}
except Exception:
    # Config missing or malformed: no city priors, default region index.
    _CITY_BASE = {}
    _REGION_INDEX = {"north": 1.05, "center": 1.00, "south": 0.92}

# Fallback medians: per zone (across cities) and global (across every city×zone value).
_ZONE_KEYS = set(z for d in _CITY_BASE.values() for z in d.keys())
_ZONE_MED = {z: float(np.nanmedian([d.get(z, np.nan) for d in _CITY_BASE.values()]))
             for z in _ZONE_KEYS} if _CITY_BASE else {}
_GLOBAL_CITYZONE_MED = (float(np.nanmedian([v for d in _CITY_BASE.values() for v in d.values()]))
                        if _CITY_BASE else 0.0)

# Names of the derived/prior features this cell knows about.
_DERIVED_ALL = [
    "log_size_m2","sqm_per_room","baths_per_100sqm","elev_x_floor",
    "no_elev_high_floor","rooms_per_100sqm","city_zone_prior","region_index_prior",
]

def _needs(col: str, df: pd.DataFrame) -> bool:
    """Tell whether `col` must be computed for `df`.

    True only when `col` is listed in the global features_A and is either
    absent from df or has no value that coerces to a number. Returns False
    when features_A is not defined.
    """
    expected = 'features_A' in globals() and col in features_A
    if not expected:
        return False
    if col not in df.columns:
        return True
    as_number = pd.to_numeric(df[col], errors="coerce")
    return bool(as_number.isna().all())

def _ensure_eval_derivatives(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of df with the expected derived/prior features filled in.

    A feature is (re)computed only when ``_needs`` reports it as expected and
    absent/all-NaN, and only when its source columns exist. Source columns
    that are absent are treated as all-NaN Series, so a frame without e.g.
    "has_elevator" no longer fails on the ``.fillna`` call.
    """
    out = df.copy()

    def _num(col: str) -> pd.Series:
        """Numeric view of out[col]; an all-NaN Series when the column is absent."""
        if col in out.columns:
            return pd.to_numeric(out[col], errors="coerce")
        return pd.Series(np.nan, index=out.index, dtype="float64")

    # convenient numeric bases
    size = _num("size_m2")
    rooms = _num("rooms")
    baths = _num("bathrooms")
    floor = _num("floor")
    elev  = _num("has_elevator").fillna(0)   # missing elevator flag counts as 0

    # 1) geometric/functional derived features (zero denominators become NaN)
    if _needs("log_size_m2", out) and "size_m2" in out.columns:
        out["log_size_m2"] = np.log1p(size)

    if _needs("sqm_per_room", out) and {"size_m2","rooms"}.issubset(out.columns):
        out["sqm_per_room"] = size / rooms.replace(0, np.nan)

    if _needs("baths_per_100sqm", out) and {"bathrooms","size_m2"}.issubset(out.columns):
        out["baths_per_100sqm"] = 100.0 * baths / size.replace(0, np.nan)

    if _needs("elev_x_floor", out) and {"has_elevator","floor"}.issubset(out.columns):
        out["elev_x_floor"] = (elev * np.maximum(floor - 1, 0)).astype("float64")

    if _needs("no_elev_high_floor", out) and {"has_elevator","floor"}.issubset(out.columns):
        out["no_elev_high_floor"] = ((1 - elev) * np.maximum(floor - 1, 0)).astype("float64")

    if _needs("rooms_per_100sqm", out) and {"rooms","size_m2"}.issubset(out.columns):
        out["rooms_per_100sqm"] = (100.0 * rooms / size.replace(0, np.nan)).astype("float64")

    # 2) city×zone and macro-area priors
    if _needs("city_zone_prior", out):
        if "city" not in out.columns and "location" in out.columns:
            out["city"] = out["location"]
        if "zone" not in out.columns:
            out["zone"] = "semi_center"
        if "city" in out.columns:
            ci = out["city"].astype(str).str.strip().str.lower()
        else:
            ci = pd.Series("", index=out.index)
        zo = out["zone"].astype(str).str.strip().str.lower()
        vals = []
        for c, z in zip(ci, zo):
            # exact city×zone price, else the zone median, else the global median
            v = _CITY_BASE.get(c, {}).get(z, np.nan)
            if pd.isna(v):
                v = _ZONE_MED.get(z, _GLOBAL_CITYZONE_MED)
            vals.append(v)
        out["city_zone_prior"] = np.asarray(vals, dtype="float64")

    if _needs("region_index_prior", out):
        if "region" not in out.columns:
            out["region"] = "center"
        # regions missing from _REGION_INDEX map to NaN
        out["region_index_prior"] = out["region"].astype(str).str.strip().str.lower().map(_REGION_INDEX).astype("float64")

    return out

# Run the helper on every split frame that exists as a global DataFrame; inside
# it, a column is only computed when _needs() reports it as required.
for _name in ("df_train","df_valid","df_test"):
    if _name in globals() and isinstance(globals()[_name], pd.DataFrame):
        globals()[_name] = _ensure_eval_derivatives(globals()[_name])

# Re-align the matrices used by the evaluation cells to features_A; a matrix is
# left as it was when its source frame is not defined.
if "features_A" in globals():
    X_train = _ensure_columns(df_train.copy(), features_A) if "df_train" in globals() else X_train
    X_valid = _ensure_columns(df_valid.copy(), features_A) if "df_valid" in globals() else X_valid
    X_test  = _ensure_columns(df_test.copy(),  features_A) if "df_test"  in globals() else X_test

In [None]:
# === Leakage sentinel + scan proxy (TTR, target in scala naturale) ===
from __future__ import annotations
import re, numpy as np
from sklearn.base import clone
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

def _report(y_true, y_pred, tag: str):
    """Print MAE, RMSE and R2 of y_pred against y_true on one line prefixed by tag."""
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"{tag}  MAE={mae:.2f}  RMSE={rmse:.2f}  R2={r2:.4f}")

# y (natural scale) for VALID — fallback when it is not already defined
try:
    y_val_orig
except NameError:
    y_val_orig = df_valid.loc[X_valid.index, VALUATION_K].to_numpy(dtype=np.float64)

# 1) Sentinel: shuffle the target (natural scale) so it is decoupled from the features
y_train_shuf = shuffle(y_train, random_state=SEED).astype(np.float64)

# Rebuild an estimator consistent with the prep: clone model_A, or fall back to
# a RandomForest with the same settings if cloning fails.
try:
    base_model = clone(model_A)
except Exception:
    base_model = RandomForestRegressor(n_estimators=400, max_depth=None,
                                       min_samples_leaf=2, random_state=SEED, n_jobs=-1)

# Inner pipeline has no derive step: only preproc_A followed by the estimator.
inner_pipe = Pipeline([("prep", preproc_A), ("model", base_model)])

def _log1p64(y):
    """Return log1p of y after casting it to a float64 array."""
    y64 = np.asarray(y, dtype=np.float64)
    return np.log1p(y64)


def _expm164(y):
    """Return expm1 of y after casting it to a float64 array (inverse of _log1p64)."""
    y64 = np.asarray(y, dtype=np.float64)
    return np.expm1(y64)

# Target is modelled in log1p space; predictions come back in natural scale via expm1.
sentinel_reg = TransformedTargetRegressor(
    regressor=inner_pipe,
    func=_log1p64,
    inverse_func=_expm164,
    check_inverse=False,
)

# Pass sample weights only when w_train exists, is an ndarray and matches X_train's length.
fit_params = {}
if "w_train" in globals() and isinstance(w_train, np.ndarray) and len(w_train) == len(X_train):
    # ✅ key is relative to the inner Pipeline (step "model")
    fit_params = {"model__sample_weight": w_train}

# Fit on the shuffled target and score against the real VALID target;
# predictions are clipped at 0 before computing the metrics.
sentinel_reg.fit(X_train, y_train_shuf, **fit_params)
pred_val_shuf = np.clip(sentinel_reg.predict(X_valid), 0, None)
_report(y_val_orig, pred_val_shuf, "Leakage sentinel (VALID, shuffled y)")

# 2) Scan the in-use features for suspicious proxy columns
# Name patterns for aggregates/rankings of price or valuation, plus the
# "drift", "cap(s)" and "vs" tokens. The "vs" pattern used to be
# `vs_(?:_|$)`, which only matched "vs__" or a trailing "vs_"; it now matches
# the token itself, so "<a>_vs_<b>" comparison columns are reported.
sus_patterns = [
    r"(?:^|_)(avg|mean|median|benchmark|zscore|rank|decile)(?:_|$).*?(price|valuation)",
    r"(price|valuation).*?(avg|mean|median|benchmark|zscore|rank|decile)",
    r"(?:^|_)drift(?:_|$)", r"(?:^|_)caps?(?:_|$)", r"(?:^|_)vs(?:_|$)"
]
def _is_susp(c: str) -> bool:
    """Return True when column name c matches any suspicious pattern (case-insensitive)."""
    return any(re.search(p, c, re.I) for p in sus_patterns)

# Report the X_train columns flagged by _is_susp ("—" when there are none).
in_use_cols = list(X_train.columns)
print("Colonne proxy sospette in uso:", [c for c in in_use_cols if _is_susp(c)] or "—")

In [None]:
# === PATCH CV: pre-trasformazione robusta + fast pipe clone-safe ==============
from __future__ import annotations
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import clone

# 1) load the prior mappings (robust to a missing/malformed config)
try:
    from notebooks.shared.common.config import ASSET_CONFIG  # type: ignore
    _PROP = ASSET_CONFIG["property"]
    _CITY_BASE = {
        city.lower(): {zone.lower(): float(price) for zone, price in zones.items()}
        for city, zones in (_PROP.get("city_base_prices") or {}).items()
    }
    _REGION_INDEX = {
        region.lower(): float(index)
        for region, index in (
            _PROP.get("region_index") or {"north": 1.05, "center": 1.00, "south": 0.92}
        ).items()
    }
except Exception:
    _CITY_BASE = {}
    _REGION_INDEX = {"north": 1.05, "center": 1.00, "south": 0.92}

# fallback medians: per zone across cities, and over every city×zone value
_ZONE_KEYS = {zone for zones in _CITY_BASE.values() for zone in zones}
if _CITY_BASE:
    _ZONE_MED = {
        zone: float(np.nanmedian([zones.get(zone, np.nan) for zones in _CITY_BASE.values()]))
        for zone in _ZONE_KEYS
    }
    _GLOBAL_CITYZONE_MED = float(
        np.nanmedian([price for zones in _CITY_BASE.values() for price in zones.values()])
    )
else:
    _ZONE_MED = {}
    _GLOBAL_CITYZONE_MED = 0.0

def _required_cols_from_prep(prep: "ColumnTransformer") -> list[str]:
    """List the input columns declared by prep.transformers, without duplicates.

    Column specs that are None or the string "drop" are skipped. Sequences
    (list, tuple, ndarray, pandas Index) contribute one name per element; any
    other spec contributes its str() form. Order of first appearance is kept.
    An object without a ``transformers`` attribute yields an empty list.
    """
    req: list[str] = []
    for _name, _est, cols in getattr(prep, "transformers", []):
        if cols is None:
            continue
        if isinstance(cols, str):
            # Compare only real strings: `array == "drop"` is element-wise and
            # has no single truth value.
            if cols != "drop":
                req.append(cols)
            continue
        if isinstance(cols, (list, tuple, np.ndarray, pd.Index)):
            req.extend(str(c) for c in cols)
        else:
            req.append(str(cols))
    # dedup while preserving order
    return list(dict.fromkeys(req))

def _ensure_columns(df_part: pd.DataFrame, required: list[str]) -> pd.DataFrame:
    """Select `required` from df_part, creating absent columns as NaN first.

    The NaN columns are added to df_part in place; the returned frame holds
    exactly the `required` columns in the given order.
    """
    for col in required:
        if col not in df_part.columns:
            df_part[col] = np.nan
    return df_part[required]

def _fill_priors_for_cv(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of df with canonical city/zone/region and both prior columns.

    "city" falls back to "location", "zone" to "semi_center" and "region" to
    "center" when absent; the three are then stripped and lower-cased as
    strings. A prior column that is absent or entirely NaN is rebuilt from the
    module-level mappings, otherwise it is coerced to float64. If a prior is
    still entirely NaN afterwards, it is set to a single fallback value.
    """
    out = df.copy()

    # make sure the base fields exist
    if "city" not in out.columns and "location" in out.columns:
        out["city"] = out["location"]
    for name, default in (("zone", "semi_center"), ("region", "center")):
        if name not in out.columns:
            out[name] = default

    # canonicalize
    for name in ("city", "zone", "region"):
        if name in out.columns:
            out[name] = out[name].astype(str).str.strip().str.lower()

    def _all_missing(name: str) -> bool:
        """True when the column is absent or holds only NaN."""
        return name not in out.columns or pd.isna(out[name]).all()

    # city_zone_prior
    if _all_missing("city_zone_prior"):
        cities = out.get("city", pd.Series(index=out.index, dtype=str)).astype(str)
        zones = out.get("zone", pd.Series(index=out.index, dtype=str)).astype(str)
        priors = []
        for city, zone in zip(cities, zones):
            # exact city×zone price, else the zone median, else the global median
            prior = _CITY_BASE.get(city, {}).get(zone, np.nan)
            if pd.isna(prior):
                prior = _ZONE_MED.get(zone, _GLOBAL_CITYZONE_MED)
            priors.append(prior)
        out["city_zone_prior"] = np.asarray(priors, dtype="float64")
    else:
        out["city_zone_prior"] = pd.to_numeric(out["city_zone_prior"], errors="coerce").astype("float64")

    # region_index_prior
    if _all_missing("region_index_prior"):
        out["region_index_prior"] = out["region"].map(_REGION_INDEX).astype("float64")
    else:
        out["region_index_prior"] = pd.to_numeric(out["region_index_prior"], errors="coerce").astype("float64")

    # last-resort fallback when a prior is still entirely NaN
    if pd.isna(out["city_zone_prior"]).all():
        out["city_zone_prior"] = float(_GLOBAL_CITYZONE_MED)
    if pd.isna(out["region_index_prior"]).all():
        out["region_index_prior"] = float(np.nanmean(list(_REGION_INDEX.values())))

    return out

def _build_fast_pipe_and_prefn(base_pipe: Pipeline):
    """
    Build a derive-free copy of base_pipe plus a function that pre-transforms X.

    Returns (fast_pipe, pre_fn):
      - fast_pipe: Pipeline(prep, final estimator), built from clones of the
        base steps whenever cloning succeeds
      - pre_fn(X): applies derive.transform(X) if a 'derive' step exists, then
        _fill_priors_for_cv, then aligns to the columns the prep declares

    Raises RuntimeError when base_pipe lacks a 'prep' step or an 'rf'/'model' step.
    """
    named = getattr(base_pipe, "named_steps", {})
    prep = named.get("prep", None)
    derive = named.get("derive", None)
    reg = named.get("rf", named.get("model", None))
    if reg is None or prep is None:
        raise RuntimeError("Pipeline base priva di 'prep' o step finale (rf/model).")

    # Clone the prep; if cloning fails, the original object is reused as is.
    try:
        prep_fast = clone(prep)
    except Exception:
        prep_fast = prep

    # Clone the estimator; if cloning fails, fall back to a fresh RandomForest.
    try:
        reg_fast = clone(reg)
    except Exception:
        from sklearn.ensemble import RandomForestRegressor
        reg_fast = RandomForestRegressor(n_estimators=400, random_state=SEED, n_jobs=-1, min_samples_leaf=2)

    # Keep the final step's original name so step-prefixed fit params still apply.
    if "rf" in named:
        last_name = "rf"
    elif "model" in named:
        last_name = "model"
    else:
        last_name = "est"
    fast_pipe = Pipeline([("prep", prep_fast), (last_name, reg_fast)])

    # columns the prep expects as input
    required = _required_cols_from_prep(prep)

    def pre_fn(Xdf: pd.DataFrame) -> pd.DataFrame:
        """Derive (if available), fill priors, then align to the prep's columns."""
        frame = Xdf.copy()
        if derive is not None and hasattr(derive, "transform"):
            frame = derive.transform(frame)
        frame = _fill_priors_for_cv(frame)
        return _ensure_columns(frame, required)

    return fast_pipe, pre_fn

# build the fast pipe + the pre-transformation function from pipeline A
fast_pipe, _pre_fn = _build_fast_pipe_and_prefn(pipe_A)

# pre-transform X for the CV
X_train_cv = _pre_fn(X_train)
# (if VALID/TEST are also needed in this cell, do the same)
# X_valid_cv = _pre_fn(X_valid); X_test_cv = _pre_fn(X_test)

# weights (when the column exists), selected by X_train's index; None otherwise
w_full = None
if "sample_weight" in df_train.columns:
    w_full = df_train.loc[X_train.index, "sample_weight"].astype("float64").to_numpy()

In [None]:
# === Group-aware CV: GroupShuffleSplit 5× e LOLO Top-K (TTR, NO expm1 manuale) ===
from __future__ import annotations
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, LeaveOneGroupOut
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import TransformedTargetRegressor

# 0) Numerically stable wrappers for the TTR (everything is computed in float64)
def _log1p64(y):
    """Cast y to a float64 array and return its log1p."""
    return np.log1p(np.array(y, dtype=np.float64, copy=False) if False else np.asarray(y, dtype=np.float64))


def _expm164(y):
    """Cast y to a float64 array and return its expm1."""
    values = np.asarray(y, dtype=np.float64)
    return np.expm1(values)

# 1) Pick the group column: first candidate present in df_train
GROUP_CANDIDATES = ["location", "region", "zone", "urban_type"]
group_col = next((c for c in GROUP_CANDIDATES if c in df_train.columns), None)

if group_col is None:
    print("⛔ Nessuna colonna di gruppo disponibile (location/region/zone/urban_type). Salto GSS/LOLO.")
else:
    # y_train MUST be in natural scale (k€): the TTR applies log1p/expm1 itself
    y_train_nat = np.asarray(y_train, dtype=np.float64)
    groups_full = df_train.loc[X_train_cv.index, group_col].astype(str).to_numpy()

    # helper: fit + metrics on one split (uses a TTR and passes weights to the Pipeline's final step)
    def _fit_eval_on_split(X, y, tr_idx, va_idx, base_pipe, w: np.ndarray | None):
        """Fit a clone of base_pipe on the train rows and return (mae, rmse, r2) on the validation rows.

        tr_idx / va_idx are positional indices into X, y and w. Weights are
        used only when w is not None and has one entry per row of X.
        Predictions are clipped at 0 before scoring.
        """
        inner = clone(base_pipe)

        # TTR: log1p/expm1 are handled here; no manual expm1 elsewhere
        reg = TransformedTargetRegressor(
            regressor=inner,
            func=_log1p64,
            inverse_func=_expm164,
            check_inverse=False,
        )

        # find the name of the final step that receives sample_weight
        if "rf" in inner.named_steps:
            last_step = "rf"
        elif "model" in inner.named_steps:
            last_step = "model"
        else:
            last_step = list(inner.named_steps.keys())[-1]

        fit_params = {}
        if w is not None and len(w) == len(X):
            # ✅ keys are relative to the inner Pipeline (no 'regressor__' prefix)
            fit_params = {f"{last_step}__sample_weight": w[tr_idx]}

        reg.fit(X.iloc[tr_idx], y[tr_idx], **fit_params)
        pred = np.clip(reg.predict(X.iloc[va_idx]), 0, None)  # already in k€
        true = y[va_idx]

        mae = mean_absolute_error(true, pred)
        rmse = np.sqrt(mean_squared_error(true, pred))
        r2 = r2_score(true, pred)
        return mae, rmse, r2

    # NOTE(review): a fast_pipe built from pipe_A names its final step "model",
    # so this reduction only applies to pipelines whose final step is "rf".
    if "rf" in fast_pipe.named_steps:
        fast_pipe.named_steps["rf"].set_params(n_estimators=200)

    # weights (when the column exists) aligned to X_train_cv
    w_full = None
    if "sample_weight" in df_train.columns:
        w_full = df_train.loc[X_train_cv.index, "sample_weight"].astype("float64").to_numpy()

    # --- A) GroupShuffleSplit 5×
    gss = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    maeL, rmseL, r2L = [], [], []
    for tr_idx, va_idx in gss.split(X_train_cv, y_train_nat, groups=groups_full):
        mae, rmse, r2 = _fit_eval_on_split(X_train_cv, y_train_nat, tr_idx, va_idx, fast_pipe, w_full)
        maeL.append(mae); rmseL.append(rmse); r2L.append(r2)

    print(
        f"GroupShuffleSplit (5×, group={group_col}) → "
        f"MAE={np.mean(maeL):.2f}±{np.std(maeL):.2f}  "
        f"RMSE={np.mean(rmseL):.2f}±{np.std(rmseL):.2f}  "
        f"R2={np.mean(r2L):.4f}±{np.std(r2L):.4f}"
    )

    # --- B) LOLO Top-K (faster than a full leave-one-group-out)
    K = 10
    vc = pd.Series(groups_full).value_counts()
    top_groups = vc.index[:min(K, len(vc))]
    mask = pd.Series(groups_full).isin(top_groups).to_numpy()

    # fast_pipe has no derive step, so it must be fed the pre-transformed
    # matrix (as in the GSS loop above), not the raw X_train.
    Xk = X_train_cv.loc[mask]
    yk = y_train_nat[mask]
    gk = pd.Series(groups_full)[mask].to_numpy()
    wk = None if w_full is None else w_full[mask]

    if len(np.unique(gk)) < 2 or len(Xk) < 10:
        print("LOLO Top-K: gruppi insufficienti. Salto.")
    else:
        logo = LeaveOneGroupOut()
        maeL2, rmseL2, r2L2 = [], [], []
        fast_pipe2 = clone(fast_pipe)

        for tr_idx, va_idx in logo.split(Xk, yk, groups=gk):
            mae, rmse, r2 = _fit_eval_on_split(Xk, yk, tr_idx, va_idx, fast_pipe2, wk)
            maeL2.append(mae); rmseL2.append(rmse); r2L2.append(r2)

        print(
            f"LOLO Top-{len(np.unique(gk))} (group={group_col}) → "
            f"MAE={np.mean(maeL2):.2f}±{np.std(maeL2):.2f}  "
            f"RMSE={np.mean(rmseL2):.2f}±{np.std(rmseL2):.2f}  "
            f"R2={np.mean(r2L2):.4f}±{np.std(r2L2):.4f}"
        )

In [None]:
# === Minimal features (TTR, chiavi fit_params corrette) ===
from __future__ import annotations
import numpy as np
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# numerically stable wrappers (inputs are cast to float64 first)
def _log1p64(y):
    """log1p of y computed on a float64 array."""
    as_f64 = np.asarray(y, dtype=np.float64)
    return np.log1p(as_f64)


def _expm164(y):
    """expm1 of y computed on a float64 array."""
    as_f64 = np.asarray(y, dtype=np.float64)
    return np.expm1(as_f64)

# Candidate "base" features for the minimal model; only those present in TRAIN are kept.
base_keep_all = [
    "size_m2","rooms","bathrooms","year_built","age_years",
    "floor","building_floors","is_top_floor","is_ground_floor","has_elevator",
    "garage","parking_spot","has_garden","has_balcony",
    "distance_to_center_km","orientation","view","region","zone","urban_type","location"
]
base_keep = [c for c in base_keep_all if c in df_train.columns]

if len(base_keep) < 2:
    print("⛔ Minimal: meno di 2 feature base disponibili.")
else:
    # cat/num split based on the column dtype in TRAIN
    cat_b = [c for c in base_keep if df_train[c].dtype.name in ("object","category","bool")]
    num_b = [c for c in base_keep if c not in set(cat_b)]

    # Categorical: most-frequent imputation + ordinal codes (unseen categories -> -1).
    # Numeric: median imputation.
    prep_b = ColumnTransformer([
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("enc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
        ]), cat_b),
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
        ]), num_b),
    ], remainder="drop")

    pipe_inner = Pipeline([
        ("prep", prep_b),
        ("rf", RandomForestRegressor(
            n_estimators=400, random_state=SEED, n_jobs=-1, min_samples_leaf=2
        )),
    ])

    # TTR: log1p/expm1 are handled here, no manual expm1
    pipe_min = TransformedTargetRegressor(
        regressor=pipe_inner,
        func=_log1p64, inverse_func=_expm164,
        check_inverse=False,
    )

    # align columns: add absent ones as NaN (in place), then select `cols` in order
    def _ensure(df_part, cols):
        miss = [c for c in cols if c not in df_part.columns]
        if miss:
            for c in miss: df_part[c] = np.nan
        return df_part[cols]

    # Same rows as the main X_train / X_valid matrices.
    Xtr_min = _ensure(df_train.copy(), base_keep).loc[X_train.index]
    Xva_min = _ensure(df_valid.copy(), base_keep).loc[X_valid.index]

    # y in natural scale (k€)
    y_tr_nat = df_train.loc[Xtr_min.index, VALUATION_K].astype("float64").to_numpy()
    y_va_nat = df_valid.loc[Xva_min.index, VALUATION_K].astype("float64").to_numpy()

    # fit_params: keys relative to the inner pipeline, **no 'regressor__' prefix**
    fit_params = {}
    if "sample_weight" in df_train.columns:
        w = df_train.loc[Xtr_min.index, "sample_weight"].astype("float64").to_numpy()
        fit_params = {"rf__sample_weight": w}

    pipe_min.fit(Xtr_min, y_tr_nat, **fit_params)
    pred_min = np.clip(pipe_min.predict(Xva_min), 0, None)  # already in k€

    mae = mean_absolute_error(y_va_nat, pred_min)
    rmse = np.sqrt(mean_squared_error(y_va_nat, pred_min))
    r2 = r2_score(y_va_nat, pred_min)

    print(f"Minimal VALID → MAE={mae:.2f}  RMSE={rmse:.2f}  R2={r2:.6f}  (features: {len(base_keep)})")

In [None]:
# === Ablation sicura: rimuovi alcune colonne e valuta su VALID (e opz. GSS) ===
from __future__ import annotations
import numpy as np
from sklearn.base import clone
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupShuffleSplit

# -- wrapper numericamente stabili (se non già definiti)
try:
    _log1p64
    _expm164
except NameError:
    def _log1p64(y):  return np.log1p(np.asarray(y, dtype=np.float64))
    def _expm164(y):  return np.expm1(np.asarray(y, dtype=np.float64))

def _build_preproc_from(base_prep: ColumnTransformer,
                        keep_cat: list[str],
                        keep_num: list[str]) -> ColumnTransformer:
    """
    Rebuild a ColumnTransformer restricted to the kept columns.

    The transformers registered as "cat" and "num" in ``base_prep`` are reused
    (as unfitted clones) when present; otherwise default pipelines are created
    (most-frequent imputation + ordinal encoding for categoricals, median
    imputation for numerics).

    Parameters
    ----------
    base_prep : object exposing ``transformers`` as (name, estimator, columns) triples.
    keep_cat, keep_num : column names to route to the "cat" / "num" transformer.

    Raises
    ------
    ValueError
        If both ``keep_cat`` and ``keep_num`` are empty.
    """
    def _fresh(est):
        # String specs ("passthrough" / "drop") are not estimators: clone() would
        # raise TypeError on them, so they are reused as-is.
        return est if isinstance(est, str) else clone(est)

    cat_est, num_est = None, None
    if hasattr(base_prep, "transformers"):
        for name, est, _cols in base_prep.transformers:
            if name == "cat":
                cat_est = _fresh(est)
            elif name == "num":
                num_est = _fresh(est)

    if cat_est is None:
        cat_est = Pipeline([
            ("impute", SimpleImputer(strategy="most_frequent")),
            ("encode", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
        ])
    if num_est is None:
        num_est = Pipeline([
            ("impute", SimpleImputer(strategy="median")),
        ])

    # Only register a transformer when it has at least one column to work on.
    transformers = []
    if keep_cat:
        transformers.append(("cat", cat_est, keep_cat))
    if keep_num:
        transformers.append(("num", num_est, keep_num))
    if not transformers:
        raise ValueError("Nessuna feature rimanente per l'ablation.")

    return ColumnTransformer(transformers=transformers, remainder="drop")

def _ensure(df_part, cols):
    """Garantisce che tutte le colonne esistano (aggiunge NaN se mancano) e restituisce solo quelle in 'cols'."""
    miss = [c for c in cols if c not in df_part.columns]
    if miss:
        for c in miss: df_part[c] = np.nan
    return df_part[cols]

def _make_fit_params_for_ttr_regressor(pipe_base: Pipeline, mask=None):
    """
    Build the fit-params dict that carries sample weights to the last step of the inner Pipeline.

    The key is relative to the inner pipeline (``rf__sample_weight`` /
    ``model__sample_weight`` / ``<last step>__sample_weight``), i.e. without a
    ``regressor__`` prefix. Weights are read from the global ``df_train`` aligned
    on ``X_train.index`` and optionally filtered by ``mask``. Returns ``{}`` when
    ``df_train`` has no ``sample_weight`` column or ``pipe_base`` has no
    ``named_steps``.
    """
    if "sample_weight" not in df_train.columns or not hasattr(pipe_base, "named_steps"):
        return {}

    steps = pipe_base.named_steps
    target = next((step for step in ("rf", "model") if step in steps), None)
    if target is None:
        # No conventional name found: fall back to whatever step comes last.
        target = list(steps.keys())[-1]

    weights = df_train.loc[X_train.index, "sample_weight"].astype("float64").to_numpy()
    if mask is not None:
        weights = weights[mask]
    return {f"{target}__sample_weight": weights}

def eval_drop(cols_to_drop, base_pipe=pipe_A):
    """
    Run one ablation: drop ``cols_to_drop`` from ``features_A``, refit a random
    forest wrapped in a log1p/expm1 TransformedTargetRegressor and print
    MAE / RMSE / R2 on the validation split.

    Returns
    -------
    (fitted TransformedTargetRegressor, list of kept feature names)
    """
    dropped = set(cols_to_drop)
    keep = [col for col in features_A if col not in dropped]
    cat_kept = [col for col in cat_cols if col in keep]
    num_kept = [col for col in num_cols if col in keep]

    # Inner pipeline: preprocessing rebuilt for the surviving columns + RF.
    inner = Pipeline([
        ("prep", _build_preproc_from(base_pipe.named_steps["prep"], cat_kept, num_kept)),
        ("rf", RandomForestRegressor(
            n_estimators=400, random_state=SEED, n_jobs=-1, max_depth=None, min_samples_leaf=2
        )),
    ])

    # The TTR applies log1p to the target before fitting and expm1 to predictions.
    ttr = TransformedTargetRegressor(
        regressor=inner,
        func=_log1p64,
        inverse_func=_expm164,
        check_inverse=False,
    )

    # Design matrices and untransformed targets, aligned on the X_train / X_valid indices.
    X_fit = _ensure(df_train.copy(), keep).loc[X_train.index]
    X_val = _ensure(df_valid.copy(), keep).loc[X_valid.index]
    y_fit = df_train.loc[X_fit.index, VALUATION_K].astype("float64").to_numpy()
    y_val = df_valid.loc[X_val.index, VALUATION_K].astype("float64").to_numpy()

    # Sample weights (when df_train provides them) travel as fit params.
    ttr.fit(X_fit, y_fit, **_make_fit_params_for_ttr_regressor(inner))

    # Predictions come back on the untransformed scale; negatives are clipped to 0.
    val_pred = np.clip(ttr.predict(X_val), 0, None)
    mae = mean_absolute_error(y_val, val_pred)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    r2 = r2_score(y_val, val_pred)

    print(f"DROP {sorted(dropped)} → VALID  MAE={mae:.2f}  RMSE={rmse:.2f}  R2={r2:.4f}  (kept {len(keep)} cols)")
    return ttr, keep

# Usage examples: three ablations (both columns, size only, distance only)
p_drop_both, keep_cols_both = eval_drop(["size_m2", "distance_to_center_km"])
p_drop_size, keep_cols_size = eval_drop(["size_m2"])
p_drop_dist, keep_cols_dist = eval_drop(["distance_to_center_km"])

# --- Quick GroupShuffleSplit (3 splits) on the reduced feature set of p_drop_both ---
# The first candidate column found in df_train is used as the grouping key.
GROUP_CANDIDATES = ["location","region","zone","urban_type"]
group_col = next((c for c in GROUP_CANDIDATES if c in df_train.columns), None)

if group_col:
    # groups aligned to X_train
    groups = df_train.loc[X_train.index, group_col].astype(str).to_numpy()

    # rebuild a "fast" copy of the inner pipeline with fewer trees
    fast_inner = clone(p_drop_both.regressor)   # Pipeline(prep, rf)
    if "rf" in fast_inner.named_steps:
        fast_inner.named_steps["rf"].set_params(n_estimators=200)

    # fast TTR template (cloned per split below)
    ttr_fast = TransformedTargetRegressor(
        regressor=fast_inner,
        func=_log1p64,
        inverse_func=_expm164,
        check_inverse=False,
    )

    # X and untransformed y for the folds
    X_full = _ensure(df_train.copy(), keep_cols_both).loc[X_train.index]
    y_full = df_train.loc[X_train.index, VALUATION_K].astype("float64").to_numpy()

    # aligned sample weights, or None when df_train has no "sample_weight" column
    w_full = df_train.loc[X_train.index, "sample_weight"].astype("float64").to_numpy() if "sample_weight" in df_train.columns else None

    gss = GroupShuffleSplit(n_splits=3, test_size=0.2, random_state=SEED)
    maeL, rmseL, r2L = [], [], []

    for tr, va in gss.split(X_full, y_full, groups=groups):
        # fit params for this split (keys are relative to the TTR's inner Pipeline)
        fit_params = {}
        if w_full is not None:
            # route the weights of the training rows to the 'rf' step
            fit_params = {"rf__sample_weight": w_full[tr]}

        q = clone(ttr_fast)
        q.fit(X_full.iloc[tr], y_full[tr], **fit_params)
        pred = np.clip(q.predict(X_full.iloc[va]), 0, None)  # already on the untransformed target scale
        true = y_full[va]
        maeL.append(mean_absolute_error(true, pred))
        rmseL.append(np.sqrt(mean_squared_error(true, pred)))
        r2L.append(r2_score(true, pred))

    # mean ± std of each metric across the 3 splits
    print(
        f"GSS 3× (drop size_m2 & distance_to_center_km) → "
        f"MAE={np.mean(maeL):.2f}±{np.std(maeL):.2f}  "
        f"RMSE={np.mean(rmseL):.2f}±{np.std(rmseL):.2f}  "
        f"R2={np.mean(r2L):.4f}±{np.std(r2L):.4f}"
    )
else:
    print("GSS: nessuna colonna di gruppo disponibile.")

In [None]:
# === GSS 5× senza distance_to_center_km (robusto ai nomi step + TTR + pesi) ===
from __future__ import annotations
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.base import clone
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

# wrapper numericamente stabili per TTR
def _log1p64(y):  return np.log1p(np.asarray(y, dtype=np.float64))
def _expm164(y):  return np.expm1(np.asarray(y, dtype=np.float64))

def _extract_base_regressor(base_pipe) -> RandomForestRegressor:
    """
    Return an unfitted clone of the final estimator of ``base_pipe``.

    The step named "rf" is preferred, then "model". If ``base_pipe`` has no such
    step, or cloning fails, a default RandomForestRegressor is returned instead.
    This is best-effort by design: a failure is logged, never raised, so the
    substitution is visible without aborting the notebook.
    """
    try:
        steps = getattr(base_pipe, "named_steps", {})
        for step_name in ("rf", "model"):
            if step_name in steps:
                return clone(steps[step_name])
    except Exception:
        # Surface the fallback instead of silently swapping the model.
        logger.warning(
            "Could not clone the base regressor from the given pipeline; "
            "falling back to the default RandomForestRegressor.",
            exc_info=True,
        )
    return RandomForestRegressor(
        n_estimators=400, random_state=SEED, n_jobs=-1, min_samples_leaf=2
    )

def _fit_params_key_for(pipe_inner: Pipeline) -> str:
    """Ritorna la chiave corretta per passare sample_weight allo step finale."""
    if "rf" in pipe_inner.named_steps:
        return "rf__sample_weight"
    if "model" in pipe_inner.named_steps:
        return "model__sample_weight"
    last = list(pipe_inner.named_steps.keys())[-1]
    return f"{last}__sample_weight"

# Ablation target: every feature of setup A except `drop_col`
drop_col = "distance_to_center_km"
keep_cols = [c for c in features_A if c != drop_col]
keep_cat  = [c for c in cat_cols if c in keep_cols]
keep_num  = [c for c in num_cols if c in keep_cols]

# minimal preprocessing for the kept columns
cat_est = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])
num_est = Pipeline([("impute", SimpleImputer(strategy="median"))])
prep = ColumnTransformer([("cat", cat_est, keep_cat), ("num", num_est, keep_num)], remainder="drop")

# base regressor cloned from pipe_A when possible (default RF otherwise)
base_reg = _extract_base_regressor(pipe_A)

# inner pipeline + TTR (log1p on fit, expm1 on predict)
inner = Pipeline([("prep", prep), ("rf", base_reg)])
ttr_template = TransformedTargetRegressor(
    regressor=inner, func=_log1p64, inverse_func=_expm164, check_inverse=False
)

# align X, untransformed y, weights and groups on the X_train index
X_keep = X_train[keep_cols]
y_nat  = df_train.loc[X_keep.index, VALUATION_K].astype("float64").to_numpy()
w_full = None
if "sample_weight" in df_train.columns:
    w_full = df_train.loc[X_keep.index, "sample_weight"].astype("float64").to_numpy()

# first available grouping column; unlike the 3× variant, this cell fails hard without one
group_col = next((c for c in ["location","region","zone","urban_type"] if c in df_train.columns), None)
if group_col is None:
    raise RuntimeError("Nessuna colonna di gruppo disponibile (location/region/zone/urban_type).")
groups = df_train.loc[X_keep.index, group_col].astype(str).to_numpy()

# GSS 5×
gss = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
maeL, rmseL, r2L = [], [], []

# fit-param key for the weights (the final step is named "rf" above)
fit_key = _fit_params_key_for(inner)

for tr, va in gss.split(X_keep, y_nat, groups=groups):
    ttr = clone(ttr_template)

    fit_params = {}
    if w_full is not None:
        # with a TTR the keys address the inner pipeline's step directly (no 'regressor__' prefix)
        fit_params = {fit_key: w_full[tr]}

    ttr.fit(X_keep.iloc[tr], y_nat[tr], **fit_params)
    pred = np.clip(ttr.predict(X_keep.iloc[va]), 0, None)  # already on the untransformed target scale
    true = y_nat[va]

    maeL.append(mean_absolute_error(true, pred))
    rmseL.append(np.sqrt(mean_squared_error(true, pred)))
    r2L.append(r2_score(true, pred))

# mean ± std of each metric across the 5 splits
print(
    f"GSS 5× (drop {drop_col}) → "
    f"MAE={np.mean(maeL):.2f}±{np.std(maeL):.2f}  "
    f"RMSE={np.mean(rmseL):.2f}±{np.std(rmseL):.2f}  "
    f"R2={np.mean(r2L):.4f}±{np.std(r2L):.4f}"
)

### Train & Validation (A vs B) & Champion Selection

In [None]:
# 06) Fit & validation (RF baseline A/B) con TTR, pesi corretti e metriche robuste — SAFE

from __future__ import annotations
import time, json, numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

def _metrics(y_true, y_pred):
    """Return MAE, RMSE and R2 of ``y_pred`` against ``y_true`` as plain floats."""
    return {
        "MAE": float(mean_absolute_error(y_true, y_pred)),
        "RMSE": float(np.sqrt(mean_squared_error(y_true, y_pred))),
        "R2": float(r2_score(y_true, y_pred)),
    }

# --- 0) Sanitize liste features: uniche e disgiunte
def _uniq(xs):
    return list(dict.fromkeys(xs))

# De-duplicate both lists, then make them disjoint (categorical wins on overlap).
cat_cols = _uniq(cat_cols)
_num_all = _uniq(num_cols)

# The overlap must be taken against the UNFILTERED numeric list: computed after
# the filtering below it would always be empty and the warning could never fire.
_inter = set(cat_cols) & set(_num_all)
num_cols = [c for c in _num_all if c not in _inter]
num_cols_B = [c for c in num_cols if c != "confidence_score"]  # B: confidence is not a feature

if _inter:
    logger.warning("Colonne presenti sia in cat che num (rimosse da num): %s", sorted(_inter))

# --- 1) Preprocessors built from the sanitised column lists
# Categoricals: missing values become the literal "__MISSING__", then the encoder
# returned by _build_ohe(min_freq) is applied (helper and min_freq are defined
# elsewhere in the notebook).
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="constant", fill_value="__MISSING__")),
    ("encode", _build_ohe(min_freq)),
])
# Numerics: median imputation only.
num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
])

# Setup A uses every numeric column; setup B uses num_cols_B (no "confidence_score").
preproc_A = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_cols),
        ("num", num_pipe, num_cols),
    ],
    remainder="drop",
)
preproc_B = ColumnTransformer(
    transformers=[
        ("cat", cat_pipe, cat_cols),
        ("num", num_pipe, num_cols_B),
    ],
    remainder="drop",
)

# --- 2) Reallinea le colonne tra split (se mancano, NaN → imputazione)
def _ensure_columns(df_part, required):
    missing = [c for c in required if c not in df_part.columns]
    if missing:
        for c in missing:
            df_part[c] = np.nan
        logger.info("Aggiunte colonne mancanti allo split: %s", missing)
    return df_part[required]

# Ordered feature lists: categoricals first, then numerics
features_A = cat_cols + num_cols
features_B = cat_cols + num_cols_B

# Each split is copied first because _ensure_columns adds missing columns in place.
df_train_A = _ensure_columns(df_train.copy(), features_A)
df_valid_A = _ensure_columns(df_valid.copy(), features_A)
df_test_A  = _ensure_columns(df_test.copy(),  features_A)

df_train_B = _ensure_columns(df_train.copy(), features_B)
df_valid_B = _ensure_columns(df_valid.copy(), features_B)
df_test_B  = _ensure_columns(df_test.copy(),  features_B)

# Design matrices for setup A (these X_* names are reused by other cells)
X_train = df_train_A[features_A].copy()
X_valid = df_valid_A[features_A].copy()
X_test  = df_test_A[features_A].copy()

# Design matrices for setup B
Xtr_B = df_train_B[features_B].copy()
Xva_B = df_valid_B[features_B].copy()
Xte_B = df_test_B[features_B].copy()

# --- 3) Targets on the untransformed scale — the TTR applies log1p/expm1
y_train_nat = df_train[VALUATION_K].astype("float64").to_numpy()
y_val_nat   = df_valid[VALUATION_K].astype("float64").to_numpy()
y_tst_nat   = df_test[VALUATION_K].astype("float64").to_numpy()

# --- 4) Base models (identical hyper-parameters for A and B)
model_A = RandomForestRegressor(
    n_estimators=400, random_state=SEED, n_jobs=-1, max_depth=None, min_samples_leaf=2
)
model_B = RandomForestRegressor(
    n_estimators=400, random_state=SEED, n_jobs=-1, max_depth=None, min_samples_leaf=2
)

# Inner pipelines whose final step is named "model"
pipe_A_inner = Pipeline([("prep", preproc_A), ("model", model_A)])
pipe_B_inner = Pipeline([("prep", preproc_B), ("model", model_B)])

# TTR: log1p applied to the target on fit, expm1 applied to predictions
ttr_A = TransformedTargetRegressor(regressor=pipe_A_inner, func=np.log1p, inverse_func=np.expm1)
ttr_B = TransformedTargetRegressor(regressor=pipe_B_inner, func=np.log1p, inverse_func=np.expm1)

# --- 5) Fit A (confidence as a feature, no sample weights)
t0 = time.perf_counter()
ttr_A.fit(X_train, y_train_nat)
tA = time.perf_counter() - t0

# predictions clipped at 0 (no negative valuations)
pred_val_A = np.clip(ttr_A.predict(X_valid), 0, None)
pred_tst_A = np.clip(ttr_A.predict(X_test),  0, None)

# --- 6) Fit B (confidence excluded from the features; sample_weight used as weight when present)
fit_params_B = {}
if "sample_weight" in df_train.columns:
    w_train = df_train.loc[Xtr_B.index, "sample_weight"].astype("float64").to_numpy()
    # The key is relative to the inner Pipeline: "model__sample_weight",
    # NOT "regressor__model__sample_weight" (same convention as the ablation cells).
    fit_params_B = {"model__sample_weight": w_train}  # <-- key relative to the inner Pipeline

t0 = time.perf_counter()
ttr_B.fit(Xtr_B, y_train_nat, **fit_params_B)
tB = time.perf_counter() - t0

pred_val_B = np.clip(ttr_B.predict(Xva_B), 0, None)
pred_tst_B = np.clip(ttr_B.predict(Xte_B), 0, None)

# --- 7) Metrics on the untransformed scale
mA_val = _metrics(y_val_nat, pred_val_A)
mA_tst = _metrics(y_tst_nat, pred_tst_A)
mB_val = _metrics(y_val_nat, pred_val_B)
mB_tst = _metrics(y_tst_nat, pred_tst_B)

print(f"A (conf as feature)  VALID: {mA_val}  TEST: {mA_tst}   (fit {tA:.2f}s)")
print(f"B (conf as weight)   VALID: {mB_val}  TEST: {mB_tst}   (fit {tB:.2f}s)")

# --- 8) Selezione champion su VALID (MAE, tie-break RMSE)
def _champ(mA, mB):
    if mA["MAE"] < mB["MAE"]:
        return "A"
    if mA["MAE"] > mB["MAE"]:
        return "B"
    return "A" if mA["RMSE"] <= mB["RMSE"] else "B"

champion = _champ(mA_val, mB_val)
print("Champion:", champion)

# --- 9) Save mini-report
summary = {
    "timing_sec": {"A": round(tA, 3), "B": round(tB, 3)},
    "A": {"VALID": mA_val, "TEST": mA_tst, "n_features": len(features_A)},
    "B": {"VALID": mB_val, "TEST": mB_tst, "n_features": len(features_B)},
    "champion": champion,
}
# Make sure the artifact directory exists before writing, so this cell does not
# depend on another cell having created it.
ART_DIR.mkdir(parents=True, exist_ok=True)
_summary_path = ART_DIR / "rf_baselines_summary.json"
_summary_path.write_text(
    json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8"
)
logger.info("Saved RF baselines summary → %s", _summary_path)

### (RF A/B): significatività & blending

In [None]:
# 06.1) RF A vs B: bootstrap ΔMAE + breakdown per decile + blending VALID→TEST (robusto)
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# === Guard-rails
# The A/B predictions must already exist as notebook globals.
for name in ["pred_val_A", "pred_val_B", "pred_tst_A", "pred_tst_B"]:
    if name not in globals():
        raise RuntimeError(f"Variabile mancante: {name}")

# Neither evaluation split may be empty.
if len(df_test) == 0 or len(df_valid) == 0:
    raise RuntimeError("Split vuoti: df_valid/df_test non possono essere vuoti.")

# Artifacts of this cell are written under ART_DIR.
ART_DIR.mkdir(parents=True, exist_ok=True)

def _as_1d_float(x):
    a = np.asarray(x, dtype="float64").reshape(-1)
    # sostituisci inf con nan e poi imputiamo con mediana
    a[~np.isfinite(a)] = np.nan
    if np.isnan(a).any():
        med = np.nanmedian(a)
        a = np.where(np.isnan(a), med, a)
    return a

# === Clean 1-D arrays (TEST)
y_true_t = _as_1d_float(df_test[VALUATION_K].to_numpy())
yhatA_t  = _as_1d_float(pred_tst_A)
yhatB_t  = _as_1d_float(pred_tst_B)

# truncate to the shortest length if the three arrays disagree (extreme defects only)
n = min(len(y_true_t), len(yhatA_t), len(yhatB_t))
y_true_t, yhatA_t, yhatB_t = y_true_t[:n], yhatA_t[:n], yhatB_t[:n]

# === Paired bootstrap of ΔMAE on TEST (A−B)
# B resamples of n row indices drawn with replacement; each row of idx_mat is one
# resample, applied identically to A and B so the comparison stays paired.
B = 1000
rng_boot = np.random.default_rng(SEED + 101)
idx_mat = rng_boot.integers(0, n, size=(B, n))

def _mae(a, b):  # veloce e robusto
    return np.mean(np.abs(a - b))

# ΔMAE (A−B) for every bootstrap resample
deltas = np.fromiter(
    (_mae(y_true_t[idx], yhatA_t[idx]) - _mae(y_true_t[idx], yhatB_t[idx]) for idx in idx_mat),
    dtype="float64", count=B
)

ci_lo, ci_hi = np.percentile(deltas, [2.5, 97.5])
# Two-sided p-value against 0. Both tails include deltas == 0, so when the
# predictions coincide on many resamples 2*min(...) can exceed 1: cap it at 1.
p_two_sided = min(1.0, 2.0 * min((deltas <= 0).mean(), (deltas >= 0).mean()))

print(f"ΔMAE (A−B) bootstrap: mean={deltas.mean():.4f}, 95% CI=({ci_lo:.4f},{ci_hi:.4f}), p≈{p_two_sided:.3f}")

# JSON-serialisable summary of the paired bootstrap (NumPy scalars cast to Python types)
ab_summary = {
    "delta_mae_mean": float(deltas.mean()),
    "ci_95": [float(ci_lo), float(ci_hi)],
    "p_two_sided": float(p_two_sided),
    "B": int(B),
    "n_test": int(n),
    "mae_A_test": float(_mae(y_true_t, yhatA_t)),
    "mae_B_test": float(_mae(y_true_t, yhatB_t)),
}
(ART_DIR / "ab_bootstrap_summary.json").write_text(json.dumps(ab_summary, indent=2), encoding="utf-8")

# === Breakdown by target decile (TEST)
# duplicates="drop" merges identical bin edges, so fewer than 10 bins may result;
# if qcut fails altogether, every row falls into a single bin 0.
try:
    dec = pd.qcut(y_true_t, q=10, labels=False, duplicates="drop")
except Exception:
    dec = pd.Series(np.zeros_like(y_true_t, dtype=int))

# One row per bin: size, MAE of A and B, their difference, and R2 (NaN when the
# bin holds a single distinct target value).
rep = (
    pd.DataFrame({"y": y_true_t, "yA": yhatA_t, "yB": yhatB_t, "dec": dec})
    .groupby("dec", observed=True, sort=True)
    .apply(lambda g: pd.Series({
        "n": int(len(g)),
        "MAE_A": float(mean_absolute_error(g["y"], g["yA"])),
        "MAE_B": float(mean_absolute_error(g["y"], g["yB"])),
        "ΔMAE_AminusB": float(mean_absolute_error(g["y"], g["yA"]) - mean_absolute_error(g["y"], g["yB"])),
        "R2_A": float(r2_score(g["y"], g["yA"])) if g["y"].nunique() > 1 else np.nan,
        "R2_B": float(r2_score(g["y"], g["yB"])) if g["y"].nunique() > 1 else np.nan,
    }))
    .reset_index()
)

# Persist the table in both CSV and Parquet
rep_csv  = ART_DIR / "ab_compare_by_decile.csv"
rep_parq = ART_DIR / "ab_compare_by_decile.parquet"
rep.to_csv(rep_csv, index=False)
rep.to_parquet(rep_parq, index=False)
print("Saved:", rep_csv, ",", rep_parq)

# === Simple blending, α∈[0,1]: α is chosen on VALID, TEST metrics are reported alongside
y_true_v = _as_1d_float(df_valid[VALUATION_K].to_numpy())
yhatA_v  = _as_1d_float(pred_val_A)
yhatB_v  = _as_1d_float(pred_val_B)

# truncate to the shortest length if the three arrays disagree
m = min(len(y_true_v), len(yhatA_v), len(yhatB_v))
y_true_v, yhatA_v, yhatB_v = y_true_v[:m], yhatA_v[:m], yhatB_v[:m]

# α is the weight of model B: blend = α·B + (1−α)·A, on a 0.05 grid
alphas = np.linspace(0.0, 1.0, 21)
rows = []
for a in alphas:
    y_blend_v = a * yhatB_v + (1.0 - a) * yhatA_v
    y_blend_t = a * yhatB_t + (1.0 - a) * yhatA_t
    mae_v = mean_absolute_error(y_true_v, y_blend_v)
    mae_t = mean_absolute_error(y_true_t, y_blend_t)
    rmse_t = np.sqrt(mean_squared_error(y_true_t, y_blend_t))
    r2_t = r2_score(y_true_t, y_blend_t)
    rows.append({
        "alpha": float(a),
        "MAE_VALID": float(mae_v),
        "MAE_TEST": float(mae_t),
        "RMSE_TEST": float(rmse_t),
        "R2_TEST": float(r2_t),
    })

# Stable sort: rows tied on MAE_VALID keep their ascending-α order, so the
# reported best α (first row) is deterministic.
blend_df = pd.DataFrame(rows).sort_values("MAE_VALID", ascending=True, kind="mergesort")
blend_csv  = ART_DIR / "blend_search_AB.csv"
blend_parq = ART_DIR / "blend_search_AB.parquet"
blend_df.to_csv(blend_csv, index=False)
blend_df.to_parquet(blend_parq, index=False)

best_row = blend_df.iloc[0]
print("Saved:", blend_csv, ",", blend_parq)
print(
    f"Blend best α on VALID = {best_row['alpha']:.2f}  "
    f"|  MAE_VALID={best_row['MAE_VALID']:.3f}  "
    f"|  MAE_TEST={best_row['MAE_TEST']:.3f}  "
    f"|  R2_TEST={best_row['R2_TEST']:.4f}"
)

### XGBoost + Optuna

In [None]:
# ---- TARGET SCALE HANDLER (EUR vs kEUR) ----
import numpy as np

# Raw training target and its median (NaNs ignored)
TARGET_RAW_TRAIN = df_train[VALUATION_K].astype("float64").to_numpy()
p50 = float(np.nanmedian(TARGET_RAW_TRAIN))

# Heuristic: a median above 2_000 is taken to mean the column is in EUR;
# otherwise it is assumed to be already in k€.
UNIT_SCALE = 1000.0 if p50 > 2000.0 else 1.0       # divide-by for training
UNIT_LABEL = "EUR" if UNIT_SCALE == 1000.0 else "kEUR"

def to_log(y_nat):
    """Divide the target by the global ``UNIT_SCALE`` and return its ``log1p``."""
    scaled = np.asarray(y_nat, float) / UNIT_SCALE
    return np.log1p(scaled)

def from_log(y_log):
    """Invert ``to_log``: apply ``expm1`` and multiply back by the global ``UNIT_SCALE``."""
    natural = np.expm1(np.asarray(y_log, float))
    return natural * UNIT_SCALE

# Upper bound for log-space predictions, computed in the unit used for training:
# log1p of 10× the 99.9th percentile of the scaled training target
p999 = float(np.nanpercentile(TARGET_RAW_TRAIN / UNIT_SCALE, 99.9))
MAX_LOG = float(np.log1p(p999 * 10.0))

def safe_expm1_scaled(y_log):
    """
    Overflow-safe inverse of ``to_log``.

    Log-space values are clipped to ``[-20, MAX_LOG]`` (``MAX_LOG`` is read from
    the module globals at call time), then mapped back with ``expm1`` and
    multiplied by ``UNIT_SCALE``.
    """
    clipped = np.clip(np.asarray(y_log, float), -20.0, MAX_LOG)
    return np.expm1(clipped) * UNIT_SCALE

# log1p targets for training, built with to_log (i.e. after dividing by UNIT_SCALE)
y_train = to_log(df_train[VALUATION_K].values)
y_valid = to_log(df_valid[VALUATION_K].values)
y_test  = to_log(df_test [VALUATION_K].values)

# ground truth for the metrics stays in the dataset's own unit (no scaling applied)
y_val_true = df_valid[VALUATION_K].astype("float64").to_numpy()
y_tst_true = df_test [VALUATION_K].astype("float64").to_numpy()

print(f"Detected target unit: {UNIT_LABEL}  (median={p50:,.1f})  MAX_LOG={MAX_LOG:.3f}")

In [None]:
# XGBoost + Optuna — import & setup (coerente con notebooks/outputs/modeling/property)
from __future__ import annotations

from pathlib import Path
import json

# Fail early with an actionable message when xgboost / optuna cannot be imported.
try:
    import xgboost as xgb
    import optuna
    from optuna.pruners import MedianPruner
    from optuna.samplers import TPESampler
except Exception as e:
    raise RuntimeError("Servono xgboost e optuna (pip install xgboost optuna)") from e

# Output directories. NOTE(review): these are relative paths, so they resolve
# against the current working directory — presumably notebooks/; confirm.
# BASE_OUT and PROP_DIR are rebound here.
BASE_OUT = Path("outputs")
PROP_DIR = BASE_OUT / "modeling" / "property"
XGB_DIR  = PROP_DIR / "xgb"
for d in (PROP_DIR, XGB_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Settings read from TRAIN_CFG, with defaults of 25 trials and 100 early-stopping rounds
N_TRIALS   = int(TRAIN_CFG.get("xgb_optuna_trials", 25))
EARLY_STOP = int(TRAIN_CFG.get("xgb_early_stopping_rounds", 100))

In [None]:
# 07ter) Optuna: tuning XGBoost (setup B: confidence come peso) — safe & robust

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.base import clone
import inspect
import numpy as np
import pandas as pd

# Dynamic ceiling: log1p of 10× the 99.9th percentile of the target, expressed in
# the TRAINING unit (target / UNIT_SCALE). This assignment rebinds the module-level
# MAX_LOG that safe_expm1_scaled() reads at call time, so it must stay in that
# unit: without the division the clip would be 1000× too loose whenever the
# dataset is stored in EUR. UNIT_SCALE falls back to 1.0 when not defined.
_unit_scale = float(globals().get("UNIT_SCALE", 1.0))
MAX_LOG = float(np.log1p(
    np.nanpercentile(np.asarray(df_train[VALUATION_K].values, dtype=np.float64) / _unit_scale, 99.9) * 10.0
))

def safe_expm1(z, max_log=MAX_LOG):
    """
    Overflow-safe ``expm1``: clip ``z`` to ``[-20, max_log]`` before exponentiating.

    ``max_log`` defaults to the value of ``MAX_LOG`` at definition time. No unit
    rescaling is applied to the result.
    """
    z = np.asarray(z, dtype=np.float64)
    z = np.clip(z, -20.0, max_log)  # expm1(-20) is ~-1; max_log prevents overflow
    return np.expm1(z)


# Preprocessing for XGBoost on the setup-B feature set (no confidence among the features):
# categoricals are ordinal-encoded (unseen categories → -1), numerics pass through untouched.
# NOTE: this rebinds the global name preproc_B.
_transformers = []
if "cat_cols" in globals() and len(cat_cols) > 0:
    _transformers.append(("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_cols))
if "num_cols_B" in globals() and len(num_cols_B) > 0:
    _transformers.append(("num", "passthrough", num_cols_B))
preproc_B = ColumnTransformer(transformers=_transformers, remainder="drop")

# Feature list (de-duplicated, order preserved)
features_B = list(dict.fromkeys([*(cat_cols if "cat_cols" in globals() else []),
                                 *(num_cols_B if "num_cols_B" in globals() else [])]))

# Sample weights aligned on the X_train index; unit weights when df_train has no "sample_weight"
if "sample_weight" in df_train.columns:
    weights_B = df_train.loc[X_train.index, "sample_weight"].to_numpy(dtype=float)
else:
    weights_B = np.ones(len(X_train), dtype=float)

def _suggest_params(trial: optuna.Trial) -> dict:
    """
    Draw one XGBoost configuration from ``trial``.

    Nine hyper-parameters are suggested by Optuna; ``random_state``,
    ``tree_method`` and ``n_jobs`` are fixed and appended after them.
    """
    tuned = {
        "n_estimators":     trial.suggest_int("n_estimators", 400, 1800),
        "max_depth":        trial.suggest_int("max_depth", 4, 12),
        "learning_rate":    trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "subsample":        trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1.0, 10.0),
        "reg_alpha":        trial.suggest_float("reg_alpha", 1e-8, 1e-1, log=True),
        "reg_lambda":       trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "gamma":            trial.suggest_float("gamma", 0.0, 5.0),
    }
    fixed = {
        "random_state": SEED,
        "tree_method":  "hist",
        "n_jobs":       -1,
    }
    return {**tuned, **fixed}

def _fit_xgb_with_preproc(params: dict,
                          Xtr_df: pd.DataFrame, ytr_log: np.ndarray,
                          Xva_df: pd.DataFrame, yva_log: np.ndarray,
                          w: np.ndarray, early_rounds: int = EARLY_STOP):
    """
    Fit an XGBRegressor on log1p targets, with MAE-based early stopping on the validation set.

    A clone of ``preproc_B`` is fitted on the training frame and applied to the
    validation frame. The early-stopping / eval-metric settings are passed through
    ``fit()`` when its signature accepts them, and through the estimator's
    constructor parameters otherwise. Without that fallback they would be
    silently dropped on XGBoost versions whose ``fit()`` no longer takes them.

    Parameters
    ----------
    params : keyword arguments for ``xgb.XGBRegressor``.
    Xtr_df, Xva_df : training / validation feature frames.
    ytr_log, yva_log : training / validation targets, already in log space.
    w : per-row sample weights for the training rows.
    early_rounds : early-stopping patience, in boosting rounds.

    Returns
    -------
    (pipe, model, Xt, Xv) : Pipeline(prep, xgb) built from the fitted parts, the
    fitted model, and the transformed training / validation matrices.
    """
    prep = clone(preproc_B)
    Xt = prep.fit_transform(Xtr_df)
    Xv = prep.transform(Xva_df)

    model = xgb.XGBRegressor(**params)
    fit_sig = inspect.signature(model.fit).parameters
    ctor_params = model.get_params()

    fit_kwargs = {}
    if "eval_set" in fit_sig:
        fit_kwargs["eval_set"] = [(Xv, yva_log)]
    if "sample_weight" in fit_sig:
        fit_kwargs["sample_weight"] = w
    if "verbose" in fit_sig:
        fit_kwargs["verbose"] = False

    # Evaluation metric: fit() keyword when available, constructor param otherwise.
    if "eval_metric" in fit_sig:
        fit_kwargs["eval_metric"] = "mae"
    elif "eval_metric" in ctor_params:
        model.set_params(eval_metric="mae")

    # Early stopping: same preference order, with the callback API in between.
    if "early_stopping_rounds" in fit_sig:
        fit_kwargs["early_stopping_rounds"] = early_rounds
    elif "callbacks" in fit_sig:
        from xgboost.callback import EarlyStopping
        fit_kwargs["callbacks"] = [EarlyStopping(rounds=early_rounds, save_best=True)]
    elif "early_stopping_rounds" in ctor_params:
        model.set_params(early_stopping_rounds=early_rounds)
    else:
        logger.warning("XGBoost exposes no early-stopping hook: training all boosting rounds.")

    model.fit(Xt, ytr_log, **fit_kwargs)
    pipe = Pipeline([("prep", prep), ("xgb", model)])
    return pipe, model, Xt, Xv

def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective: validation MAE of one XGBoost configuration, measured in
    the dataset's own unit.

    The model is trained on the global log1p targets ``y_train`` / ``y_valid``;
    predictions are mapped back with ``safe_expm1_scaled`` and compared with
    ``y_val_true``. Returns ``1e9`` as a penalty when any prediction is
    non-finite. No intermediate value is reported to the trial, so the study's
    pruner has nothing to act on.
    """
    params = _suggest_params(trial)
    Xtr_df, Xva_df = X_train[features_B], X_valid[features_B]
    _pipe, model, _Xt, Xv = _fit_xgb_with_preproc(params, Xtr_df, y_train, Xva_df, y_valid, weights_B)

    # Predict once in log space, then go back to the dataset's unit.
    pred_nat = safe_expm1_scaled(model.predict(Xv))
    if not np.all(np.isfinite(pred_nat)):  # unstable trial → penalise
        return 1e9
    return float(mean_absolute_error(y_val_true, pred_nat))

# Seeded TPE sampler; the objective (validation MAE) is minimised.
study = optuna.create_study(
    direction="minimize",
    pruner=MedianPruner(n_warmup_steps=5),
    sampler=TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

print("Best trial:", study.best_trial.number, "MAE (valid, natural):", study.best_value)
# Persist the best trial's parameters and the full trial table under XGB_DIR
best_params = study.best_trial.params
(XGB_DIR / "optuna_best_params_setupB.json").write_text(json.dumps(best_params, indent=2), encoding="utf-8")
study.trials_dataframe().to_parquet(XGB_DIR / "optuna_trials_setupB.parquet", index=False)

In [None]:
# 07quater-bis) Build XGB finale (setup B) + metriche VALID/TEST

import inspect, json
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def _rmse(y_true, y_pred):
    """
    Root mean squared error as a plain float.

    Computed as ``sqrt(mean_squared_error(...))`` instead of probing the
    ``squared=False`` keyword, which newer scikit-learn releases deprecate and
    then drop. This gives one code path and one return type on every version.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# Best params from Optuna, or a sensible fallback.
# The JSON written after the study holds only the parameters Optuna *suggested*
# (study.best_trial.params). The fixed settings that every trial also used are
# re-applied here so the final fit matches the tuned configuration; values
# already present in the JSON take precedence.
_fixed_xgb_params = {
    "random_state": SEED,
    "tree_method": "hist",
    "n_jobs": -1,
}

bp_path = XGB_DIR / "optuna_best_params_setupB.json"
if bp_path.exists():
    best_params = {**_fixed_xgb_params, **json.loads(bp_path.read_text(encoding="utf-8"))}
else:
    best_params = {
        "n_estimators": 900,
        "max_depth": 8,
        "learning_rate": 0.06,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "min_child_weight": 3.0,
        "reg_alpha": 1e-6,
        "reg_lambda": 1.0,
        "gamma": 0.0,
        **_fixed_xgb_params,
    }

# Final fit with early stopping on VALID
pipe_xgb_B, model_xgb_B, Xt, Xv = _fit_xgb_with_preproc(
    best_params,
    X_train[features_B], y_train,
    X_valid[features_B], y_valid,
    weights_B,
    early_rounds=EARLY_STOP,
)

# Predictions: transform with the fitted "prep" step, predict in log space with the
# "xgb" step, then map back via safe_expm1_scaled (unit reported by UNIT_LABEL)
ylog_val = pipe_xgb_B.named_steps["xgb"].predict(
    pipe_xgb_B.named_steps["prep"].transform(X_valid[features_B])
)
ylog_tst = pipe_xgb_B.named_steps["xgb"].predict(
    pipe_xgb_B.named_steps["prep"].transform(X_test[features_B])
)
pred_val_XGB = safe_expm1_scaled(ylog_val)
pred_tst_XGB = safe_expm1_scaled(ylog_tst)

# MAE / RMSE / R2 against the untransformed ground truth, as plain floats
m_val = {
    "MAE": float(mean_absolute_error(y_val_true, pred_val_XGB)),
    "RMSE": float(np.sqrt(mean_squared_error(y_val_true, pred_val_XGB))),
    "R2":  float(r2_score(y_val_true, pred_val_XGB)),
}
m_tst = {
    "MAE": float(mean_absolute_error(y_tst_true, pred_tst_XGB)),
    "RMSE": float(np.sqrt(mean_squared_error(y_tst_true, pred_tst_XGB))),
    "R2":  float(r2_score(y_tst_true, pred_tst_XGB)),
}
print("XGB_B VALID:", m_val, "| unit:", UNIT_LABEL)
print("XGB_B TEST :", m_tst,  "| unit:", UNIT_LABEL)

In [None]:
# Persistenza XGB + gain importance + update manifest (coerente con PROP_DIR)

from pathlib import Path
import joblib, json, hashlib

xgb_path = XGB_DIR / "xgb_setupB_champion.joblib"
xgb_gain_path = XGB_DIR / "xgb_gain_importance.json"
xgb_best_params_path = XGB_DIR / "optuna_best_params_setupB.json"
xgb_trials_path = XGB_DIR / "optuna_trials_setupB.parquet"

# 1) Save the complete pipeline (prep + xgb)
joblib.dump(pipe_xgb_B, xgb_path)
print("Saved XGB model to:", xgb_path)

# 2) Gain importance (best effort). Track whether the file was really written,
#    so the manifest never advertises an artifact that this run did not produce.
xgb_gain_saved = False
try:
    booster = pipe_xgb_B.named_steps["xgb"].get_booster()
    f_gain = booster.get_score(importance_type="gain")
    xgb_gain_path.write_text(json.dumps(f_gain, indent=2), encoding="utf-8")
    xgb_gain_saved = True
    print("Saved:", xgb_gain_path)
except Exception as e:
    print("Gain importance not available:", e)

# 3) SHA-256 of the saved model file, for traceability (read in 1 MiB chunks)
h = hashlib.sha256()
with open(xgb_path, "rb") as _f:
    for chunk in iter(lambda: _f.read(1024 * 1024), b""):
        h.update(chunk)
xgb_sha = h.hexdigest()

# 4) Update the training manifest in the PROPERTY folder (created if missing)
manifest_path = PROP_DIR / "training_manifest.json"
manifest_path.parent.mkdir(parents=True, exist_ok=True)
manifest = json.loads(manifest_path.read_text(encoding="utf-8")) if manifest_path.exists() else {}

# paths: optional artifacts keep their previous manifest value when not available now
paths = manifest.setdefault("paths", {})
paths.update({
    "xgb_model": str(xgb_path),
    "xgb_model_sha256": xgb_sha,
    "xgb_gain_importance": str(xgb_gain_path) if xgb_gain_saved else paths.get("xgb_gain_importance"),
    "xgb_optuna_best_params": str(xgb_best_params_path) if xgb_best_params_path.exists() else paths.get("xgb_optuna_best_params"),
    "xgb_optuna_trials": str(xgb_trials_path) if xgb_trials_path.exists() else paths.get("xgb_optuna_trials"),
    # optional: point the default "pipeline_path" at the XGB champion
    "pipeline_path": str(xgb_path),
})

# metrics
metrics = manifest.setdefault("metrics", {})
metrics.update({
    "xgb_valid": m_val,
    "xgb_test":  m_tst,
})

# meta (useful for the UI/serving)
manifest["model_meta"] = {
    "value_model_name": "XGBRegressor",
    "value_model_version": "v2",          # "v2" label to tell it apart from the RF v1
    "setup": "B_conf_as_weight",
    "early_stopping_rounds": int(EARLY_STOP),
    "optuna_trials": int(N_TRIALS),
    "seed": int(SEED),
}

manifest_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
print("Updated manifest:", manifest_path)

### Feature Importance (on champion)

In [None]:
# 07) Feature importances (RF champion) + salvataggio figure — ROBUST
from __future__ import annotations
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.compose import TransformedTargetRegressor
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline

TOPN = 20  # maximum number of bars shown in the importance plots

# -- helper: prendi per prima un'istanza esistente tra più nomi
def _pick_first(*names):
    """Return the first module-level global among ``names`` that exists and is not None.

    Names are tried in the order given; ``None`` is returned when none qualifies.
    """
    scope = globals()
    return next((scope[name] for name in names if scope.get(name) is not None), None)

# -- helper: estrai inner estimator, preproc e step finale da Pipeline o TTR
def _extract_parts(model_like):
    """
    Split a fitted model object into its building blocks.

    Returns (inner_pipe, preproc, final_est, final_step_name, is_ttr):
    - inner_pipe: the Pipeline exposing ``.named_steps``
    - preproc   : the "prep" step of that pipeline, or None when absent
    - final_est : the final estimator step
    - final_step_name: name of the final step ("rf", "model" or "xgb" when
      present, otherwise the last step of the pipeline)
    - is_ttr    : True when ``model_like`` is a TransformedTargetRegressor

    Raises RuntimeError when no Pipeline can be found.
    """
    is_ttr = isinstance(model_like, TransformedTargetRegressor)

    # For a TTR prefer the fitted inner regressor, then the unfitted one.
    inner = model_like
    if is_ttr:
        if hasattr(model_like, "regressor_"):
            inner = model_like.regressor_
        elif hasattr(model_like, "regressor"):
            inner = model_like.regressor

    if not isinstance(inner, Pipeline):
        raise RuntimeError("Pipeline interna non trovata: atteso Pipeline con step 'prep' e modello finale.")

    steps = inner.named_steps
    preproc = steps.get("prep", None)

    # Prefer well-known step names; otherwise fall back to the last step.
    final_step_name = next((key for key in ("rf", "model", "xgb") if key in steps), None)
    if final_step_name is None:
        final_step_name = list(steps.keys())[-1]

    return inner, preproc, steps[final_step_name], final_step_name, is_ttr

# -- helper: ricava lista feature nella GIUSTA sequenza del CT (cat poi num)
def _feature_names_from_ct(preproc, fallback_cat, fallback_num):
    """
    Return the feature names in column-transformer order: "cat" columns first,
    then "num" columns, without duplicates.

    When ``preproc`` is None or has no ``transformers`` attribute, the fallback
    lists are used instead. A transformer whose column selector is not
    list-like also falls back to the corresponding fallback list.
    (The original note states that categoricals use an OrdinalEncoder, i.e.
    one output column per feature.)
    """
    if preproc is None or not hasattr(preproc, "transformers"):
        cat_seen = set(fallback_cat)
        return [*fallback_cat, *(c for c in fallback_num if c not in cat_seen)]

    listlike = (list, tuple, np.ndarray, pd.Index)
    fallbacks = {"cat": fallback_cat, "num": fallback_num}
    picked = {"cat": [], "num": []}
    for step_name, _est, selector in preproc.transformers:
        if step_name in picked:
            picked[step_name] = list(selector if isinstance(selector, listlike) else fallbacks[step_name])

    # dict.fromkeys keeps first occurrences in order, dropping later duplicates
    return list(dict.fromkeys([*picked["cat"], *picked["num"]]))

# -- 1) Pick the CHAMPION and its pipeline
champ = (champion if "champion" in globals() else "A")  # default
if champ == "A":
    chosen = _pick_first("ttr_A", "pipe_A")
    # column lists expected for A (empty when the globals are missing)
    chosen_cat = [c for c in (cat_cols if "cat_cols" in globals() else [])]
    chosen_num = [c for c in (num_cols  if "num_cols"  in globals() else [])]
else:
    chosen = _pick_first("ttr_B", "pipe_B")
    # column lists expected for B (no confidence as a feature)
    chosen_cat = [c for c in (cat_cols    if "cat_cols"    in globals() else [])]
    chosen_num = [c for c in (num_cols_B  if "num_cols_B"  in globals() else [])]

if chosen is None:
    raise RuntimeError("Nessuna pipeline/TransformedTargetRegressor disponibile per il champion selezionato.")

# -- 2) Extract the inner parts and derive the features actually in use
inner_pipe, preproc, final_est, final_step, is_ttr = _extract_parts(chosen)
feat_in_use = _feature_names_from_ct(preproc, chosen_cat, chosen_num)

# -- 3) Prepara X_test allineato (crea colonne mancanti → imputazione)
def _ensure_cols(df_part: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``df_part`` restricted to ``cols``, in that order.

    Columns listed in ``cols`` but absent from the frame are added filled
    with NaN. The input frame is left untouched.
    """
    frame = df_part.copy()
    for name in cols:
        if name not in frame.columns:
            frame[name] = np.nan
    return frame[cols]

if "df_test" not in globals():
    raise RuntimeError("df_test mancante.")
X_tst_use = _ensure_cols(df_test, feat_in_use)

# Figures are saved under FIG_DIR below; create it here so this cell does not
# depend on another cell having made the folder.
FIG_DIR.mkdir(parents=True, exist_ok=True)

# -- 4) BUILT-IN importance (when the final estimator exposes it), with consistent feature names
builtin_imp = None
fi_raw = getattr(final_est, "feature_importances_", None)
if fi_raw is not None:
    try:
        imp = np.asarray(fi_raw)
        if imp.ndim == 1 and imp.size > 0:
            # align lengths; if they do not match, fall back to f0..f{n-1}
            if len(feat_in_use) != imp.shape[0]:
                feat_names = [f"f{i}" for i in range(int(imp.shape[0]))]
            else:
                feat_names = list(feat_in_use)
            builtin_imp = (
                pd.DataFrame({"feature": feat_names, "importance": imp.astype(float, copy=False)})
                .sort_values("importance", ascending=False)
                .reset_index(drop=True)
            )
            builtin_imp.to_csv(MODEL_DIR / "feature_importance_builtin.csv", index=False)
            builtin_imp.to_parquet(MODEL_DIR / "feature_importance_builtin.parquet", index=False)

            _top = min(TOPN, len(builtin_imp))
            # Draw on an explicit Axes: DataFrame.plot() without ax= opens its own
            # figure, which would ignore the figsize and leave an empty figure open.
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                builtin_imp.head(_top).plot(
                    kind="bar", x="feature", y="importance", legend=False, rot=45, ax=ax
                )
                ax.set_title(f"RF Built-in Feature Importance (top {_top})")
                ax.set_ylabel("Importance")
                fig.tight_layout()
                fig.savefig(FIG_DIR / "rf_feature_importance_builtin.png", dpi=150)
            finally:
                plt.close(fig)
            print("Saved:", FIG_DIR / "rf_feature_importance_builtin.png")
        else:
            print("⚠️ feature_importances_ presente ma vuoto/0-D → salto built-in.")
    except Exception as e:
        # best effort: the built-in importance is optional, permutation importance follows
        print(f"⚠️ Impossibile calcolare built-in importance: {e}")
else:
    print("ℹ️ feature_importances_ non disponibile sul modello finale → salto built-in.")

# -- 5) PERMUTATION importance
#    Keep y on the scale the estimator predicts:
#      - TTR: predict is on the NATURAL scale → pass the natural y
#      - plain Pipeline (trained on log1p): pass y in log
y_perm = None
if is_ttr:
    y_perm = df_test[VALUATION_K].to_numpy(dtype="float64", copy=False)
else:
    y_perm = np.log1p(df_test[VALUATION_K].to_numpy(dtype="float64", copy=False))

perm = permutation_importance(
    estimator=chosen,              # TTR or Pipeline
    X=X_tst_use,
    y=y_perm,
    n_repeats=8,
    random_state=SEED if "SEED" in globals() else 42,
    n_jobs=-1,
    scoring="r2",
)
# names: use feat_in_use (or fall back to f0..)
feat_names_pi = list(feat_in_use) if len(feat_in_use) == perm.importances_mean.shape[0] \
                else [f"f{i}" for i in range(perm.importances_mean.shape[0])]

perm_imp = (
    pd.DataFrame({
        "feature": feat_names_pi,
        "importance": perm.importances_mean.astype(float, copy=False),
        "std": perm.importances_std.astype(float, copy=False),
    })
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)
perm_imp.to_csv(MODEL_DIR / "feature_importance_permutation.csv", index=False)
perm_imp.to_parquet(MODEL_DIR / "feature_importance_permutation.parquet", index=False)

_top = min(TOPN, len(perm_imp))
fig, ax = plt.subplots(figsize=(10, 6))  # explicit Axes, same reason as above
try:
    perm_imp.head(_top).plot(
        kind="bar", x="feature", y="importance", yerr="std", legend=False, rot=45, ax=ax
    )
    ax.set_title(f"Permutation Importance (top {_top})")
    ax.set_ylabel("Importance (mean ΔR²)")
    fig.tight_layout()
    fig.savefig(FIG_DIR / "rf_feature_importance_permutation.png", dpi=150)
finally:
    plt.close(fig)
print("Saved:", FIG_DIR / "rf_feature_importance_permutation.png")

# -- 6) Preview (when available)
if builtin_imp is not None:
    display(builtin_imp.head(12))
display(perm_imp.head(12))

### Saving Importance & Residual Figures

In [None]:
# 07bis (v2) — Importances + Residuali TEST con ricalcolo robusto delle predizioni

from __future__ import annotations
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.compose import TransformedTargetRegressor

# Make sure the figure folder exists before any plot is saved in this cell.
FIG_DIR.mkdir(parents=True, exist_ok=True)

# ---------------- helpers ----------------
def _expm1_safe(z, cap: float = 12.0):
    """Return ``expm1(z)`` with ``z`` clipped to ``[-20, cap]`` and negatives floored at 0.

    ``z`` may be a scalar or any array-like; it is converted to float64.
    ``cap`` bounds the log-scale values before exponentiating, to avoid overflow.
    NaN inputs stay NaN.

    ``np.maximum`` replaces the boolean-mask assignment ``out[out < 0] = 0.0``
    because that assignment fails on scalar / 0-d input, where NumPy returns
    a scalar that does not support item assignment.
    """
    z = np.clip(np.asarray(z, dtype=np.float64), -20.0, cap)
    return np.maximum(np.expm1(z), 0.0)

def _ensure_cols(df_part: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``df_part`` restricted to ``cols``, in that order.

    Columns missing from the frame are added filled with NaN, and a short
    notice listing up to ten of them is printed. The input is not modified.
    """
    frame = df_part.copy()
    absent = [name for name in cols if name not in frame.columns]
    for name in absent:
        frame[name] = np.nan
    if absent:
        print(f"ℹ️ Aggiunte {len(absent)} colonne mancanti allo split TEST (imputate): {absent[:10]}")
    return frame[cols]

def _pick_first(*names):
    """Return the first module-level global named in ``names`` that is defined and not None.

    Returns None when no candidate qualifies.
    """
    namespace = globals()
    for candidate in names:
        value = namespace.get(candidate)
        if value is not None:
            return value
    return None

def _predict_nat_safely(est, X_df: pd.DataFrame, cols: list[str], log_cap: float = 12.0):
    """Predict on ``X_df[cols]`` and return natural-scale values via a clipped expm1.

    For a TransformedTargetRegressor the log-scale predictions are taken from
    its fitted inner regressor (``regressor_``); otherwise ``est.predict`` is
    treated as returning log-scale predictions. The 50/90/99/99.9 percentiles
    of the log predictions are printed as a diagnostic before the inverse
    transform, which is capped at ``log_cap``.
    """
    X_use = _ensure_cols(X_df, cols)
    if isinstance(est, TransformedTargetRegressor):
        log_pred, origin = est.regressor_.predict(X_use), "TTR inner"
    else:
        # RF/XGB pipeline trained on log1p → predict returns log predictions
        log_pred, origin = est.predict(X_use), "pipe"

    q = np.nanpercentile(log_pred, [50, 90, 99, 99.9])
    print(f"log-pred percentiles ({origin}): p50={q[0]:.3f}, p90={q[1]:.3f}, p99={q[2]:.3f}, p99.9={q[3]:.3f}")
    return _expm1_safe(log_pred, cap=log_cap)

def _plot_topN(df_imp: pd.DataFrame, val_col: str, title: str, out_path, xlabel: str):
    """Save a horizontal bar chart of the first 20 rows of ``df_imp`` to ``out_path``.

    ``df_imp`` must have a "feature" column and the value column ``val_col``;
    rows are reversed so that the first row ends up at the top of the chart.
    When ``df_imp`` is None, not a DataFrame, or empty, a notice is printed and
    nothing is plotted.

    The chart is drawn on an explicit Axes: ``DataFrame.plot`` without ``ax=``
    opens its own figure, which would ignore the requested figsize and leave
    an empty figure open. The figure is always closed, even if saving fails.
    """
    if df_imp is None or not isinstance(df_imp, pd.DataFrame) or df_imp.empty:
        print(f"ℹ️ skip plot: {title} — dataframe vuoto")
        return
    top = df_imp.head(20).iloc[::-1]
    fig, ax = plt.subplots(figsize=(9, 6))
    try:
        top.plot(x="feature", y=val_col, kind="barh", legend=False, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        fig.tight_layout()
        fig.savefig(out_path, dpi=150, bbox_inches="tight")
    finally:
        plt.close(fig)
    print("Saved:", out_path)

# --------------- 1) Feature importances ---------------
# Each table is plotted only if a previous cell left it in the notebook globals;
# _plot_topN skips None / empty input.
_plot_topN(
    builtin_imp if "builtin_imp" in globals() else None,
    "importance",
    "RF Feature Importance (built-in, top 20)",
    FIG_DIR / "rf_feature_importance_builtin_top20.png",
    "Importance",
)
_plot_topN(
    perm_imp if "perm_imp" in globals() else None,
    "importance",
    "Permutation Importance (ΔR², top 20)",
    FIG_DIR / "rf_permutation_importance_top20.png",
    "Importance (mean ΔR²)",
)

# --------------- 2) TEST residuals ---------------
if "df_test" not in globals():
    raise RuntimeError("df_test non è definito — impossibile calcolare i residuali.")

y_true_t = df_test[VALUATION_K].to_numpy(dtype="float64", copy=False)

# Drop any stale y_pred_t left in memory by a previous run
if "y_pred_t" in globals():
    del y_pred_t

# Reuse predictions computed earlier for the champion when present, otherwise recompute
USE_CACHED = False
if "champion" in globals():
    if champion == "A" and "pred_tst_A" in globals() and isinstance(pred_tst_A, (np.ndarray, list)):
        y_pred_t = np.asarray(pred_tst_A, dtype=np.float64); USE_CACHED = True
    elif champion == "B" and "pred_tst_B" in globals() and isinstance(pred_tst_B, (np.ndarray, list)):
        y_pred_t = np.asarray(pred_tst_B, dtype=np.float64); USE_CACHED = True

if not USE_CACHED:
    champ = champion if "champion" in globals() else "A"
    # pick the estimator and the column list
    # NOTE(review): `_pick_first(...) or ...` evaluates truthiness of features_A/B;
    # this assumes they are plain lists (a NumPy array / pandas Index would raise) — confirm.
    if champ == "A":
        est  = _pick_first("ttr_A", "pipe_A", "chosen_pipe")
        cols = _pick_first("features_A") or ([*cat_cols, *num_cols] if "cat_cols" in globals() and "num_cols" in globals() else list(df_test.columns))
    else:
        est  = _pick_first("ttr_B", "pipe_B", "chosen_pipe")
        cols = _pick_first("features_B") or ([*cat_cols, *num_cols_B] if "cat_cols" in globals() and "num_cols_B" in globals() else list(df_test.columns))
    if est is None:
        raise RuntimeError("Nessuna pipeline (ttr_*/pipe_*) disponibile per ricalcolare le predizioni TEST.")

    print(f"Estimator scelto: {type(est).__name__} | n_cols={len(cols)} | esempi colonne: {cols[:10]}")
    missing = [c for c in cols if c not in df_test.columns]
    if missing:
        print(f"⚠️ Mancano {len(missing)} colonne nel TEST (verranno imputate): {missing[:10]}")

    y_pred_t = _predict_nat_safely(est, df_test, cols, log_cap=float(TRAIN_CFG.get("log_cap_clip", 12.0)))

# Make sure the predictions are 1-D with the expected length
y_pred_t = np.asarray(y_pred_t)
if y_pred_t.ndim == 0:
    # a scalar (e.g. NaN) is expanded to an all-NaN vector so that indexing works
    # and the finite-value check below fails with a clear message
    y_pred_t = np.full_like(y_true_t, fill_value=np.nan, dtype=np.float64)

if y_pred_t.shape[0] != y_true_t.shape[0]:
    raise RuntimeError(f"Dimension mismatch: y_pred_t={y_pred_t.shape}, y_true_t={y_true_t.shape}. "
                       "Controlla la lista colonne usata per il predict.")

# Report rows where either the target or the prediction is NaN/Inf
non_finite_mask = ~(np.isfinite(y_true_t) & np.isfinite(y_pred_t))
n_bad = int(non_finite_mask.sum())
if n_bad > 0:
    bad_idx = np.where(non_finite_mask)[0][:10].tolist()
    print(f"⚠️ Righe non finite (y_true/y_pred): {n_bad} / {len(y_true_t)}. Esempi idx: {bad_idx}")

# Keep only the rows where both values are finite
mask = np.isfinite(y_true_t) & np.isfinite(y_pred_t)
y_true_t = y_true_t[mask]
y_pred_t = y_pred_t[mask]

# Abort when fewer than max(10, 30% of TEST) usable rows remain
if len(y_true_t) < max(10, int(0.3 * len(df_test))):
    raise RuntimeError(
        "Predizioni/target TEST contengono troppi NaN/Inf anche dopo clipping.\n"
        "- Verifica che la pipeline scelta sia *quella allenata* (ttr_A/ttr_B o pipe_A/pipe_B).\n"
        "- Stai passando le *stesse colonne* usate in training (features_A/B)?\n"
        "- Guarda i percentili dei log-pred stampati sopra: se sono enormi, aumenta TRAIN_CFG.log_cap_clip o rivedi il modello."
    )

# Residuals (observed minus predicted) and their plots
residuals = y_true_t - y_pred_t

# Histogram of the residuals (density-normalised, 60 bins)
plt.figure(figsize=(8,5))
plt.hist(residuals, bins=60, density=True)
plt.title(f"Residuals (TEST, champion {champion if 'champion' in globals() else 'A'})")
plt.xlabel("y − ŷ (k€)"); plt.ylabel("Density")
plt.tight_layout()
out_res = FIG_DIR / "rf_residuals_test_hist.png"
plt.savefig(out_res, dpi=150, bbox_inches="tight"); plt.close()
print("Saved:", out_res)

# Residuals against predictions, with a dashed zero line as reference
plt.figure(figsize=(8,5))
plt.scatter(y_pred_t, residuals, s=10, alpha=0.6)
plt.axhline(0.0, linestyle="--")
plt.title(f"Residuals vs Pred (TEST, champion {champion if 'champion' in globals() else 'A'})")
plt.xlabel("ŷ (k€)"); plt.ylabel("y − ŷ (k€)")
plt.tight_layout()
out_sc = FIG_DIR / "rf_residuals_vs_pred_test.png"
plt.savefig(out_sc, dpi=150, bbox_inches="tight"); plt.close()
print("Saved:", out_sc)

### Valuations for Segments (deciles and location) & Predictions Save

In [None]:
# 08 (refactor) — Breakdown per decili/location + export predizioni TEST (robusto)

from __future__ import annotations
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import TransformedTargetRegressor

from notebooks.shared.common.constants import ASSET_ID, LOCATION, VALUATION_K

# Reports and predictions of this cell are written under MODEL_DIR.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# --- numeric helper ---
LOG_CAP = float(TRAIN_CFG.get("log_cap_clip", 12.0))  # upper bound for log predictions, to avoid overflow

def _expm1_safe(z, cap: float = LOG_CAP):
    """Return ``expm1(z)`` with ``z`` clipped to ``[-20, cap]`` and negatives floored at 0.

    ``z`` may be a scalar or any array-like; it is converted to float64.
    ``cap`` defaults to ``LOG_CAP`` and bounds the log-scale values before
    exponentiating, to avoid overflow. NaN inputs stay NaN.

    ``np.maximum`` replaces the boolean-mask assignment ``out[out < 0] = 0.0``
    because that assignment fails on scalar / 0-d input, where NumPy returns
    a scalar that does not support item assignment.
    """
    z = np.clip(np.asarray(z, dtype=np.float64), -20.0, cap)
    return np.maximum(np.expm1(z), 0.0)

def _ensure_cols(df_part: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Return a copy of ``df_part`` with exactly the columns ``cols``, in that order.

    Any requested column that the frame lacks is created filled with NaN, and
    a notice listing up to ten of the added columns is printed.
    """
    result = df_part.copy()
    to_add = [col for col in cols if col not in result.columns]
    if not to_add:
        return result[cols]

    for col in to_add:
        result[col] = np.nan
    print(f"ℹ️ Aggiunte {len(to_add)} colonne mancanti allo split TEST (imputate): {to_add[:10]}")
    return result[cols]

def _pick_first(*names):
    """Look up ``names`` in the module globals and return the first non-None value.

    The lookup follows the argument order; None is returned if nothing matches.
    """
    scope = globals()
    defined = [scope[key] for key in names if scope.get(key) is not None]
    return defined[0] if defined else None

def _predict_nat_test_from(est, X_df: pd.DataFrame, cols: list[str]) -> np.ndarray:
    """Return natural-scale (k€) predictions of ``est`` on ``X_df[cols]`` as float64.

    A TransformedTargetRegressor's output is taken as already on the natural
    scale; any other estimator's output is treated as log1p(y) and mapped back
    with ``_expm1_safe``.
    """
    X_use = _ensure_cols(X_df, cols)
    raw_pred = est.predict(X_use)
    if isinstance(est, TransformedTargetRegressor):
        natural = raw_pred                       # already in k€
    else:
        natural = _expm1_safe(raw_pred)          # raw_pred is log1p(y)
    return np.asarray(natural, dtype=np.float64)

# --- helper: groupby.apply compatibile con pandas nuovi/vecchi
def _group_apply_safe(df: pd.DataFrame, key: str, func):
    """Apply ``func`` to each group of ``df`` (grouped by ``key``) on old and new pandas.

    ``func`` receives the group's rows without the grouping column. The result
    of ``groupby.apply`` is returned with the index reset, so ``key`` becomes a
    regular column.

    On pandas ≥ 2.2 this is done with ``include_groups=False``. Older releases
    reject that keyword with a TypeError, so the non-key columns are selected
    explicitly. Selecting every non-key column (instead of a fixed pair of
    names) gives ``func`` the same input on both code paths and keeps the
    helper usable on frames with other column names.
    """
    gb = df.groupby(key, observed=True)
    try:
        out = gb.apply(func, include_groups=False)
    except TypeError:
        value_cols = [c for c in df.columns if c != key]
        out = gb[value_cols].apply(func)
    return out.reset_index()

# --- prendi/ricostruisci y_pred_t in scala naturale (k€) ---
def _get_champion_pred_test() -> np.ndarray:
    """Return the champion's TEST predictions on the natural scale (k€) as float64.

    Resolution order:
    1. a global ``y_pred_t`` (ndarray or list) with as many entries as ``df_test``;
    2. the stored ``pred_tst_A`` / ``pred_tst_B`` matching ``champion``;
    3. a fresh prediction from the first available estimator for the champion.

    Raises RuntimeError when ``champion`` is undefined or no estimator is found.
    """
    scope = globals()

    # 1) cached predictions, only when their length matches the TEST split
    cached = scope.get("y_pred_t")
    if isinstance(cached, (np.ndarray, list)) and len(cached) == len(df_test):
        return np.asarray(cached, dtype=np.float64)

    if "champion" not in scope:
        raise RuntimeError("Variabile 'champion' mancante.")

    # 2) predictions already computed in k€ for this champion
    for tag in ("A", "B"):
        stored_name = f"pred_tst_{tag}"
        if champion == tag and stored_name in scope:
            return np.asarray(scope[stored_name], dtype=np.float64)

    # 3) recompute; any champion other than "A" uses the B setup
    tag, num_name = ("A", "num_cols") if champion == "A" else ("B", "num_cols_B")
    est = _pick_first(f"ttr_{tag}", f"pipe_{tag}", "chosen_pipe")
    cols = _pick_first(f"features_{tag}") or (
        [*cat_cols, *scope[num_name]]
        if "cat_cols" in scope and num_name in scope
        else list(df_test.columns)
    )

    if est is None:
        raise RuntimeError("Nessuna pipeline disponibile per ricalcolare le predizioni TEST.")

    return _predict_nat_test_from(est, df_test, cols)

# --- build a clean frame for the metrics ---
y_true_t = df_test[VALUATION_K].to_numpy(dtype=np.float64, copy=False)
y_pred_t = _get_champion_pred_test()

# sanitise NaN/Inf: abort when fewer than max(30, 30% of the rows) are usable
mask = np.isfinite(y_true_t) & np.isfinite(y_pred_t)
valid_n = int(mask.sum())
if valid_n < max(30, int(0.3 * len(y_true_t))):
    bad_idx = np.where(~mask)[0][:10].tolist()
    raise RuntimeError(
        f"Predizioni TEST non finite: validi {valid_n}/{len(mask)}. "
        f"Esempi idx problematici: {bad_idx}. Verifica allineamento feature e scala predizioni."
    )

yt = y_true_t[mask]
yh = y_pred_t[mask]
idx_valid = df_test.index[mask]

# Keep the asset id and the location column when the TEST split has them
cols_keep = [ASSET_ID] if ASSET_ID in df_test.columns else []
if LOCATION in df_test.columns:
    cols_keep.append(LOCATION)

dfm = df_test.loc[idx_valid, cols_keep].copy()
dfm["y_true"] = yt
dfm["y_pred"] = yh

# --- deciles of the natural-scale target ---
# If qcut fails for any reason, every row is placed in a single bucket 0.
try:
    dfm["decile"] = pd.qcut(dfm["y_true"], q=10, labels=False, duplicates="drop")
except Exception:
    dfm["decile"] = 0

# --- funzioni metriche ---
def _rmse(y_true, y_pred):
    """Return the root mean squared error of ``y_pred`` against ``y_true`` as a float.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword is deprecated in scikit-learn 1.4
    and removed in 1.6, so this form emits no deprecation warning and returns
    a plain ``float`` on every release.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def _agg_metrics(g: pd.DataFrame) -> pd.Series:
    """Return n, MAE, RMSE and R2 for one group holding "y_true" and "y_pred" columns.

    R2 is NaN for groups with a single row.
    """
    observed, predicted = g["y_true"], g["y_pred"]
    n_rows = len(g)
    mae = float(mean_absolute_error(observed, predicted))
    rmse = float(_rmse(observed, predicted))
    if n_rows > 1:
        r2 = float(r2_score(observed, predicted))
    else:
        r2 = np.nan
    return pd.Series({"n": int(n_rows), "MAE": mae, "RMSE": rmse, "R2": r2})

# --- Breakdown by decile ---
dec_rep = _group_apply_safe(dfm, "decile", _agg_metrics)
dec_rep.to_csv(MODEL_DIR / "metrics_by_decile.csv", index=False)
dec_rep.to_parquet(MODEL_DIR / "metrics_by_decile.parquet", index=False)
print("Saved:", MODEL_DIR / "metrics_by_decile.csv")
print("Saved:", MODEL_DIR / "metrics_by_decile.parquet")

# --- Breakdown by location ---
# Without a location column, a single "NA" row with the overall metrics is produced.
if LOCATION in dfm.columns:
    loc_rep = _group_apply_safe(dfm, LOCATION, _agg_metrics)
else:
    loc_rep = pd.DataFrame([{
        LOCATION: "NA",
        "n": int(len(dfm)),
        "MAE": float(mean_absolute_error(dfm["y_true"], dfm["y_pred"])) if len(dfm) else np.nan,
        "RMSE": float(_rmse(dfm["y_true"], dfm["y_pred"])) if len(dfm) else np.nan,
        "R2": float(r2_score(dfm["y_true"], dfm["y_pred"])) if len(dfm) > 1 else np.nan,
    }])

loc_rep.to_csv(MODEL_DIR / "metrics_by_location.csv", index=False)
loc_rep.to_parquet(MODEL_DIR / "metrics_by_location.parquet", index=False)
print("Saved:", MODEL_DIR / "metrics_by_location.csv")
print("Saved:", MODEL_DIR / "metrics_by_location.parquet")

# --- export TEST predictions ---
# Columns: optional asset id / location, the true value (renamed to VALUATION_K) and y_pred.
pred_cols = []
if ASSET_ID in dfm.columns: pred_cols.append(ASSET_ID)
if LOCATION in dfm.columns: pred_cols.append(LOCATION)
pred_df = dfm[pred_cols + ["y_true", "y_pred"]].rename(columns={"y_true": VALUATION_K})

pred_df.to_parquet(MODEL_DIR / "predictions_test.parquet", index=False)
pred_df.to_csv(MODEL_DIR / "predictions_test.csv", index=False, encoding="utf-8")
print("Saved:", MODEL_DIR / "predictions_test.parquet")
print("Saved:", MODEL_DIR / "predictions_test.csv")

display(pred_df.head(10))

### Model Persistence & Manifest Training

In [None]:
# 08.9) Deriva chosen_pipe / chosen_cols dal champion (robusto)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def _cols_from_ct(prep: ColumnTransformer) -> list[str]:
    """Return the column names declared in a ColumnTransformer, de-duplicated in order.

    Only name-based selectors are considered: a single column name, or a
    list / tuple / NumPy array / pandas Index of names. Non-string entries
    (positions, boolean masks) and other selector kinds (slices, callables)
    are ignored, as is any transformer named "remainder".
    An object without a ``transformers`` attribute yields an empty list.
    """
    cols: list[str] = []
    for name, _trans, sel in getattr(prep, "transformers", None) or []:
        if name == "remainder":
            continue
        if isinstance(sel, str):
            # a bare string selects a single column
            cols.append(sel)
        elif isinstance(sel, (list, tuple, np.ndarray, pd.Index)):
            cols.extend(str(c) for c in sel if isinstance(c, str))
    # de-duplicate while preserving first-seen order
    return list(dict.fromkeys(cols))

def _pipe_cols(pipe: Pipeline) -> list[str]:
    """Return the column names declared by the pipeline's ColumnTransformer.

    The "prep" step is used when it is a ColumnTransformer; otherwise the first
    ColumnTransformer found among the steps. Returns [] when there is none.
    """
    named = pipe.named_steps
    if "prep" in named and isinstance(named["prep"], ColumnTransformer):
        return _cols_from_ct(named["prep"])

    # fallback: first ColumnTransformer anywhere in the pipeline
    first_ct = next((step for _, step in pipe.steps if isinstance(step, ColumnTransformer)), None)
    if first_ct is None:
        return []
    return _cols_from_ct(first_ct)

# 1) Champion: when undefined, choose on VALID MAE (ties go to A)
if "champion" not in globals():
    if "mA_val" in globals() and "mB_val" in globals():
        champion = "A" if mA_val["MAE"] <= mB_val["MAE"] else "B"
    else:
        champion = "A"  # conservative fallback

# 2) Pick the pipeline and the columns
chosen_pipe = None
chosen_cols = None

if champion == "A" and "pipe_A" in globals():
    chosen_pipe = pipe_A
    if "features_A" in globals():
        chosen_cols = list(dict.fromkeys(features_A))
    elif "cat_cols" in globals() and "num_cols" in globals():
        chosen_cols = list(dict.fromkeys([*cat_cols, *num_cols]))
elif champion == "B" and "pipe_B" in globals():
    chosen_pipe = pipe_B
    if "features_B" in globals():
        chosen_cols = list(dict.fromkeys(features_B))
    elif "cat_cols" in globals() and "num_cols_B" in globals():
        chosen_cols = list(dict.fromkeys([*cat_cols, *num_cols_B]))

# 3) Sanity on the pipeline first: the steps below need a real Pipeline, and a
#    missing one must surface as this error rather than as a failure further down.
if chosen_pipe is None or not isinstance(chosen_pipe, Pipeline):
    raise RuntimeError("Impossibile determinare chosen_pipe (pipe_A/pipe_B mancanti).")

# 4) Fallback: read the columns from the ColumnTransformer when no list was found
if not chosen_cols:
    chosen_cols = _pipe_cols(chosen_pipe)

# 5) Last safety net: keep only columns really present in train (when available)
if chosen_cols and "df_train" in globals() and isinstance(df_train, pd.DataFrame):
    chosen_cols = [c for c in chosen_cols if c in df_train.columns]

# 6) Sanity on the columns
if not chosen_cols:
    raise RuntimeError("Impossibile derivare chosen_cols: nessuna lista features trovata nel contesto/pipeline.")

print(f"Champion: {champion} | n_cols={len(chosen_cols)}")

In [None]:
# 09) Persistenza modello (RF champion) + Serving v2 — outputs/modeling/property
from __future__ import annotations

import json, os, hashlib
from datetime import datetime, timezone
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.base import clone

# === trasformatori SERVING importabili (niente FunctionTransformer locali) ===
from notebooks.shared.common.serving_transformers import GeoCanonizer, PriorsGuard
# Deriver: prova quello “ufficiale”, altrimenti un fallback classe-based dal modulo serving
try:
    from notebooks.shared.common.transformers import PropertyDerivedFeatures as _Deriver  # type: ignore
except Exception:
    try:
        from notebooks.shared.common.serving_transformers import BasicDeriver as _Deriver  # <- aggiungilo se non c’è
    except Exception as e:
        raise RuntimeError(
            "Nessun deriver importabile trovato. Installa/aggiungi "
            "`PropertyDerivedFeatures` o `BasicDeriver` nel modulo serving_transformers."
        ) from e

# ─────────────────────────────────────────────────────────────────────────────
# Cartelle
# ─────────────────────────────────────────────────────────────────────────────
# Output folders, created on the spot when missing.
# NOTE(review): BASE_OUT is relative to the current working directory here, whereas
# the setup cell builds it as NB_ROOT / "outputs"; the two coincide only while the
# working directory is still the notebooks folder — confirm this is intended.
BASE_OUT   = Path("outputs")
MODEL_DIR  = BASE_OUT / "modeling"
FIG_DIR    = MODEL_DIR / "figures"
ART_DIR    = MODEL_DIR / "artifacts"
PROP_DIR   = MODEL_DIR / "property"
for d in (MODEL_DIR, FIG_DIR, ART_DIR, PROP_DIR):
    d.mkdir(parents=True, exist_ok=True)

# ─────────────────────────────────────────────────────────────────────────────
# Sanity sul champion
# ─────────────────────────────────────────────────────────────────────────────
from sklearn.compose import ColumnTransformer as _CT  # solo per isinstance check tipizzato

if "chosen_pipe" not in globals() or not isinstance(chosen_pipe, Pipeline):
    raise RuntimeError("chosen_pipe non definito o non è una sklearn Pipeline.")

# Default cat_cols BEFORE it is used below: the chosen_cols fallback reads it,
# so defining it afterwards would fail with a NameError when it is missing.
if "cat_cols" not in globals():
    cat_cols = []

# Derive chosen_cols from the champion when missing or empty
if "chosen_cols" not in globals() or not chosen_cols:
    if "champion" in globals() and champion in ("A", "B"):
        # numeric columns of the champion's setup; a missing/None list counts as empty
        _num_for_champion = globals().get("num_cols" if champion == "A" else "num_cols_B")
        chosen_cols = list(dict.fromkeys([
            *(cat_cols if cat_cols is not None else []),
            *(_num_for_champion if _num_for_champion is not None else []),
        ]))
    else:
        raise RuntimeError("chosen_cols mancante e champion non determinabile.")

# NOTE(review): this rebinds VALUATION_K to a literal, shadowing the value imported
# from the shared constants module in earlier cells — confirm the two agree.
VALUATION_K = "valuation_k"
SEED = int(globals().get("SEED", 42))

# ─────────────────────────────────────────────────────────────────────────────
# Helper
# ─────────────────────────────────────────────────────────────────────────────
def _sha256_file(p: Path, chunk: int = 1 << 20) -> str:
    """Return the hex SHA-256 digest of the file at ``p``, read ``chunk`` bytes at a time."""
    digest = hashlib.sha256()
    with p.open("rb") as fh:
        while piece := fh.read(chunk):
            digest.update(piece)
    return digest.hexdigest()

def _find_preproc(pipe: Pipeline) -> ColumnTransformer:
    """Return the pipeline's ColumnTransformer.

    The "prep" step is checked first, then every step in order. Raises
    RuntimeError when the pipeline contains no ColumnTransformer.
    """
    candidates = [pipe.named_steps.get("prep"), *(step for _, step in pipe.steps)]
    for candidate in candidates:
        if isinstance(candidate, _CT):
            return candidate
    raise RuntimeError("Nessuno step ColumnTransformer trovato nella pipeline (prep).")

def _find_regressor(pipe: Pipeline):
    """Return the estimator object of the pipeline's last step."""
    last_step = pipe.steps[-1]
    return last_step[1]

def _rel_to_prop(p: Path | None) -> str | None:
    """Return ``p`` as a forward-slash path relative to ``PROP_DIR``.

    A falsy ``p`` yields None. If the relative path cannot be computed,
    ``p.as_posix()`` is returned instead.
    """
    if not p:
        return None
    try:
        relative = os.path.relpath(p, PROP_DIR)
        return relative.replace("\\", "/")
    except Exception:
        return p.as_posix()

# ─────────────────────────────────────────────────────────────────────────────
# Priors & derivate (coerenti con il training)
# ─────────────────────────────────────────────────────────────────────────────
# Names of the derived features (ratios, interactions and the two priors).
DERIVED_FEATURES = [
    "log_size_m2","sqm_per_room","baths_per_100sqm",
    "elev_x_floor","no_elev_high_floor","rooms_per_100sqm",
    "city_zone_prior","region_index_prior",
]
DERIVED_SET = set(DERIVED_FEATURES)

# Config for the priors
from notebooks.shared.common.config import ASSET_CONFIG
_PROP = ASSET_CONFIG["property"]
# city -> zone -> value, with lower-cased keys and float values (empty when not configured)
CITY_BASE = {str(c).lower(): {str(z).lower(): float(v) for z, v in zv.items()}
             for c, zv in (_PROP.get("city_base_prices") or {}).items()}
# region -> index, lower-cased keys; the literal below is used when the config has none
REGION_INDEX = {str(k).lower(): float(v) for k, v in (_PROP.get("region_index") or {
    "north": 1.05, "center": 1.00, "south": 0.92
}).items()}

# Robust fallbacks for city_zone_prior
# _ZONE_MED: per-zone median across cities (cities lacking the zone are skipped via NaN);
# _GLOBAL_CITYZONE_MED: median over every city/zone value, 0.0 when CITY_BASE is empty.
_ZONE_KEYS = set(z for d in CITY_BASE.values() for z in d.keys())
_ZONE_MED = {z: float(np.nanmedian([d.get(z, np.nan) for d in CITY_BASE.values()]))
             for z in _ZONE_KEYS} if CITY_BASE else {}
_GLOBAL_CITYZONE_MED = float(np.nanmedian([v for d in CITY_BASE.values() for v in d.values()])) if CITY_BASE else 0.0

# Deriver istanza (classe importabile → pickle-safe) con adattamento alla firma
def _build_deriver(Cls):
    """
    Instanzia il deriver passando solo gli argomenti supportati dalla sua __init__.
    Compatibile con:
      - PropertyDerivedFeatures(city_base=..., region_index=..., [flag...])
      - BasicDeriver() o BasicDeriver(city_base=..., region_index=...)
    """
    candidate_kwargs = dict(
        # flag possibili (se non esistono verranno filtrati)
        make_log_size=True,
        make_sqm_per_room=True,
        make_baths_per_100sqm=True,
        make_elev_x_floor=True,
        make_no_elev_penalty=True,
        make_city_zone_prior=True,
        make_region_macro_prior=True,
        # mapping prior
        city_base=CITY_BASE,
        region_index=REGION_INDEX,
        # qualcuno potrebbe accettare anche queste:
        zone_medians=_ZONE_MED,
        global_cityzone_median=_GLOBAL_CITYZONE_MED,
    )

    try:
        sig = inspect.signature(Cls.__init__)
        allowed = {k for k in sig.parameters.keys() if k != "self"}
        kwargs = {k: v for k, v in candidate_kwargs.items() if k in allowed}
        return Cls(**kwargs)  # 1° tentativo: con tutti i parametri ammessi
    except TypeError:
        # fallback ultra-conservativo
        base_kwargs = {}
        for k in ("city_base", "region_index"):
            if k in allowed:
                base_kwargs[k] = candidate_kwargs[k]
        return Cls(**base_kwargs) if base_kwargs else Cls()

# Module-level deriver instance built from `_Deriver`; it is plugged into the
# serving pipeline below as the "derive" step.
feature_deriver = _build_deriver(_Deriver)

# ─────────────────────────────────────────────────────────────────────────────
# Loader X/y: usa solo RAW + basi minime per derivate
# ─────────────────────────────────────────────────────────────────────────────
def _load_training_Xy(cols_raw: list[str]) -> tuple[pd.DataFrame, np.ndarray, np.ndarray | None]:
    """Rebuild features, target and optional weights for the serving fit.

    Returns ``(X, y, w)`` where ``X`` is a DataFrame holding ``cols_raw`` plus
    the base columns the derived features rely on, ``y`` is the
    ``VALUATION_K`` column coerced to float64, and ``w`` is the coerced
    ``sample_weight`` column or ``None`` when that column is absent.

    Data source, in order: the ``df_train`` global (when it is a DataFrame
    containing the target), then the ``data_path`` global, then the generated
    parquet/CSV datasets under ``BASE_OUT``.

    Raises:
        RuntimeError: when no source containing the target column is found.
    """
    required_bases = ("size_m2", "rooms", "bathrooms", "floor", "city", "zone", "region", "location")
    # De-duplicate while keeping first-seen order.
    wanted_cols = list({name: None for name in (*cols_raw, *required_bases)})

    def _split(frame: pd.DataFrame):
        features = frame.reindex(columns=wanted_cols)
        # Fill sensible defaults so the base columns are not entirely NaN.
        if "city" in features.columns and "location" in frame.columns:
            features["city"] = features["city"].fillna(frame["location"])
        for column, default in (("zone", "semi_center"), ("region", "center")):
            if column in features.columns:
                features[column] = features[column].fillna(default)

        target = pd.to_numeric(frame[VALUATION_K], errors="coerce").to_numpy(dtype="float64")
        if "sample_weight" in frame.columns:
            weights = pd.to_numeric(frame["sample_weight"], errors="coerce").to_numpy(dtype="float64")
        else:
            weights = None
        return features, target, weights

    # 1) In-memory training frame, when the notebook still has it.
    in_memory = globals().get("df_train")
    if isinstance(in_memory, pd.DataFrame) and VALUATION_K in in_memory.columns:
        return _split(in_memory)

    # 2) Files on disk: explicit data_path first, then the default outputs.
    search_paths = []
    if "data_path" in globals():
        try:
            search_paths.append(Path(str(globals()["data_path"])))
        except Exception:
            pass
    search_paths.extend(
        (BASE_OUT / "dataset_generated.parquet", BASE_OUT / "dataset_generated.csv")
    )

    for path in search_paths:
        if not (path and path.exists()):
            continue
        is_parquet = path.suffix.lower() in {".parquet", ".pq"}
        frame = pd.read_parquet(path) if is_parquet else pd.read_csv(path)
        if VALUATION_K in frame.columns:
            return _split(frame)

    raise RuntimeError("Impossibile ricostruire X/y (e pesi) per il fit della serving pipeline.")

# ─────────────────────────────────────────────────────────────────────────────
# Legacy save for backward compatibility
# ─────────────────────────────────────────────────────────────────────────────
# The file name embeds the `champion` global when defined, falling back to "A".
legacy_model_path = ART_DIR / f"rf_champion_{globals().get('champion','A')}.joblib"
joblib.dump(chosen_pipe, legacy_model_path)
# SHA-256 of the file just written; stored later in the manifest as "rf_model_sha256".
legacy_sha = _sha256_file(legacy_model_path)

# ─────────────────────────────────────────────────────────────────────────────
# Serving pipeline v2 (GeoCanonizer → Deriver → PriorsGuard → prep → TTR)
# ─────────────────────────────────────────────────────────────────────────────
# Take a clone of the champion's preprocessor, plus the class and parameters of
# its last step so a fresh regressor of the same kind can be instantiated below.
# NOTE(review): if the champion's last step is already a
# TransformedTargetRegressor, RegCls(**reg_params) would nest one TTR inside
# another (log1p applied twice) — confirm what chosen_pipe ends with.
preproc  = clone(_find_preproc(chosen_pipe))
base_reg = _find_regressor(chosen_pipe)
RegCls   = base_reg.__class__
reg_params = base_reg.get_params()

serving_pipe_v2 = Pipeline(steps=[
    ("canon_geo",   GeoCanonizer()),
    ("derive",      feature_deriver),  # importable class
    ("priors_guard",PriorsGuard(
        city_base=CITY_BASE,
        region_index=REGION_INDEX,
        zone_medians=_ZONE_MED,
        global_cityzone_median=_GLOBAL_CITYZONE_MED,
    )),
    ("prep",        preproc),
    # Target is transformed with log1p for fitting and mapped back with expm1.
    ("ttr",         TransformedTargetRegressor(
        regressor=RegCls(**reg_params),
        func=np.log1p, inverse_func=np.expm1, check_inverse=False
    )),
])

# Fit: ONLY raw columns (the pipeline creates the derived ones itself)
chosen_cols_raw = [c for c in chosen_cols if c not in DERIVED_SET]
X_fit, y_fit, w_fit = _load_training_Xy(chosen_cols_raw)

# Forward sample weights to the "ttr" step only when at least one is finite.
# NOTE(review): weights may still contain NaN entries here (they come from a
# coercing to_numeric) — confirm the estimator tolerates that.
fit_kwargs = {}
if w_fit is not None and np.isfinite(w_fit).any():
    fit_kwargs = {"ttr__sample_weight": w_fit}

# Soft diagnostics before the fit: push the first 128 rows through the first
# three steps. transform() is called without fitting, on the stated assumption
# that these steps are stateless.
X_probe = X_fit.iloc[:128].copy()
X_probe = serving_pipe_v2.named_steps["canon_geo"].transform(X_probe)
X_probe = serving_pipe_v2.named_steps["derive"].transform(X_probe)
X_probe = serving_pipe_v2.named_steps["priors_guard"].transform(X_probe)

# Count how many probe rows still lack a numeric city_zone_prior.
print("city_zone_prior NaN (sample 128):",
      int(pd.to_numeric(X_probe.get("city_zone_prior"), errors="coerce").isna().sum()),
      "/", len(X_probe))

serving_pipe_v2.fit(X_fit, y_fit, **fit_kwargs)

# ─────────────────────────────────────────────────────────────────────────────
# Save pipeline + meta
# ─────────────────────────────────────────────────────────────────────────────
pipe_path = PROP_DIR / "value_regressor_v2.joblib"
meta_path = PROP_DIR / "value_regressor_v2_meta.json"
joblib.dump(serving_pipe_v2, pipe_path)

# Metadata written next to the pipeline file. The hash is computed from the
# file just dumped, so it must be taken after joblib.dump above.
model_meta = {
    "asset_type": "property",
    "task": "value_regressor",
    "model_version": "v2",
    "model_class": RegCls.__name__,
    # UTC timestamp in ISO-8601 with a trailing "Z" instead of "+00:00"
    "trained_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    # counts the raw (non-derived) chosen columns only
    "n_features": int(len(chosen_cols_raw)),
    "features_categorical": list(cat_cols),
    "features_numeric": [c for c in chosen_cols_raw if c not in set(cat_cols)],
    "target_name": VALUATION_K,
    "unit": "k_eur",
    "pipeline_sha256": _sha256_file(pipe_path),
}
meta_path.write_text(json.dumps(model_meta, indent=2, ensure_ascii=False), encoding="utf-8")

# ─────────────────────────────────────────────────────────────────────────────
# Manifest v2
# ─────────────────────────────────────────────────────────────────────────────
# Load the existing manifest (if any) so its content can be merged instead of
# overwritten; an unreadable or invalid file is treated as empty.
manifest_path = PROP_DIR / "training_manifest.json"
try:
    existing = json.loads(manifest_path.read_text(encoding="utf-8")) if manifest_path.exists() else {}
except Exception:
    existing = {}

# Start from the previously recorded paths and overlay the current ones.
# Pipeline/meta are stored by file name; the other entries go through
# _rel_to_prop (i.e. relative to PROP_DIR).
paths = dict(existing.get("paths", {}))
paths.update({
    "pipeline_path": pipe_path.name,
    "meta_path": meta_path.name,
    # keep the previously recorded dataset when data_path is not defined
    "dataset": _rel_to_prop(Path(str(data_path))) if "data_path" in globals() else paths.get("dataset"),
    "rf_model": _rel_to_prop(legacy_model_path),
    "rf_model_sha256": legacy_sha,
    "feature_importance_builtin_csv": _rel_to_prop(MODEL_DIR / "feature_importance_builtin.csv"),
    "feature_importance_builtin_parquet": _rel_to_prop(MODEL_DIR / "feature_importance_builtin.parquet"),
    "feature_importance_permutation_csv": _rel_to_prop(MODEL_DIR / "feature_importance_permutation.csv"),
    "feature_importance_permutation_parquet": _rel_to_prop(MODEL_DIR / "feature_importance_permutation.parquet"),
    "metrics_by_decile_csv": _rel_to_prop(MODEL_DIR / "metrics_by_decile.csv"),
    "metrics_by_decile_parquet": _rel_to_prop(MODEL_DIR / "metrics_by_decile.parquet"),
    "metrics_by_location_csv": _rel_to_prop(MODEL_DIR / "metrics_by_location.csv"),
    "metrics_by_location_parquet": _rel_to_prop(MODEL_DIR / "metrics_by_location.parquet"),
    "predictions_test_parquet": _rel_to_prop(MODEL_DIR / "predictions_test.parquet"),
    "predictions_test_csv": _rel_to_prop(MODEL_DIR / "predictions_test.csv"),
})

# Record validation/test metrics of the champion ("A" or "B") only when all
# four metric globals are available in the notebook session.
metrics = dict(existing.get("metrics", {}))
if {"mA_val","mA_tst","mB_val","mB_tst"}.issubset(globals()):
    champ = globals().get("champion", "A")
    metrics.update({
        "rf_valid": globals()["mA_val"] if champ == "A" else globals()["mB_val"],
        "rf_test":  globals()["mA_tst"] if champ == "A" else globals()["mB_tst"],
    })

# Numeric features are the raw chosen columns that are not categorical.
feature_config = {
    "categorical": list(cat_cols),
    "numeric": [c for c in chosen_cols_raw if c not in set(cat_cols)],
    "excluded": sorted(list(globals().get("exclude", []))) if "exclude" in globals() else [],
}

manifest_new = {
    "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "schema_version": "v2",
    "asset_type": "property",
    "task": "value_regressor",
    "seed": SEED,
    "paths": paths,
    "model_meta": {"model_version": "v2", "model_class": RegCls.__name__},
    "metrics": metrics,
    "feature_config": feature_config,
    "expected_features": {
        "categorical": feature_config["categorical"],
        "numeric": feature_config["numeric"],
        "derived": DERIVED_FEATURES,
    },
}

# Shallow merge: new top-level keys replace existing ones, None values are
# skipped, and keys present only in the old manifest are kept.
merged = dict(existing)
for k, v in manifest_new.items():
    if v is not None:
        merged[k] = v

manifest_path.write_text(json.dumps(merged, indent=2, ensure_ascii=False), encoding="utf-8")

print("✅ Saved legacy (artifacts):", ART_DIR / legacy_model_path.name)
print("✅ Saved serving v2 pipeline:", pipe_path)
print("✅ Saved meta:", meta_path)
print("✅ Saved manifest:", manifest_path)
print("Derivate in serving: ", ", ".join(DERIVED_FEATURES))

In [None]:
# 11) Post-training drift check (location) — firma: compute_location_drift(df, target_weights, tolerance)
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
import pandas as pd

# 1) Robust import of the function (signature: df, target_weights, tolerance)
# Any failure while importing simply switches the cell to the local fallback.
try:
    from notebooks.shared.n03_train_model.metrics import compute_location_drift  # <- expects (df, target_weights, tolerance)
    _HAS_CLD = True
except Exception:
    _HAS_CLD = False

# 2) Context: paths & config (already defined at the top of the notebook)
manifest_path = PROP_DIR / "training_manifest.json"  # e.g.: notebooks/outputs/modeling/property/training_manifest.json
# An unreadable or invalid manifest is treated as empty.
try:
    manifest = json.loads(manifest_path.read_text(encoding="utf-8")) if manifest_path.exists() else {}
except Exception:
    manifest = {}

# Reuse notebook-level settings when present, otherwise use defaults.
TRAIN_CFG = globals().get("TRAIN_CFG", {}) or {}
LOCATION  = globals().get("LOCATION", "location")
TOL       = float(TRAIN_CFG.get("drift_tolerance", 0.05))  # default 5%

# 3) Helpers -----------------------------------------------------------------
def _norm_weights(d: dict) -> dict[str, float]:
    """Normalizza pesi (>=0) per sommare a 1.0; ignora chiavi con pesi negativi/non numerici."""
    clean = {str(k): float(v) for k, v in d.items() if pd.api.types.is_number(v) and float(v) >= 0.0}
    s = float(sum(clean.values()))
    if s <= 0:
        return {k: 0.0 for k in clean}
    return {k: v / s for k, v in clean.items()}

def _empirical_weights(df_like: pd.DataFrame, col: str) -> dict[str, float]:
    if not isinstance(df_like, pd.DataFrame) or col not in df_like.columns:
        return {}
    vc = df_like[col].dropna().astype(str).value_counts(normalize=True)
    return {k: float(v) for k, v in vc.items()}

def _fallback_drift(df_like: pd.DataFrame, target_w: dict[str, float]) -> dict:
    """Simple drift report: absolute differences, ratios and TVD/JSD.

    The empirical distribution of the ``LOCATION`` column is compared with
    ``target_w`` over the union of their keys. JSD (natural log) and TVD are
    computed on copies clipped to [1e-12, 1] and renormalized; the
    per-location figures use the unclipped weights, and a location counts as
    drifted when its absolute difference exceeds ``TOL``.

    NOTE(review): ``ratio`` is ``inf`` whenever the target weight is not
    positive; ``json.dumps`` writes that as ``Infinity`` by default.
    """
    observed = _empirical_weights(df_like, LOCATION)
    labels = sorted(set(observed) | set(target_w))

    floor = 1e-12
    p = np.clip(np.array([observed.get(lbl, 0.0) for lbl in labels], dtype=float), floor, 1.0)
    q = np.clip(np.array([target_w.get(lbl, 0.0) for lbl in labels], dtype=float), floor, 1.0)
    p /= p.sum()
    q /= q.sum()

    mix = 0.5 * (p + q)
    jsd = float(0.5 * (np.sum(p * (np.log(p) - np.log(mix))) + np.sum(q * (np.log(q) - np.log(mix)))))
    tvd = float(0.5 * np.abs(p - q).sum())

    per_location = {}
    for lbl in labels:
        share_emp = observed.get(lbl, 0.0)
        share_tgt = target_w.get(lbl, 0.0)
        gap = share_emp - share_tgt
        if share_tgt > 0:
            ratio = share_emp / share_tgt
        else:
            ratio = float("inf")
        per_location[lbl] = {
            "target_weight": share_tgt,
            "empirical_weight": share_emp,
            "difference": gap,
            "drifted": bool(abs(gap) > TOL),
            "ratio": ratio,
        }

    return {
        "method": "fallback_jsd_tvd",
        "JSD": jsd,
        "TVD": tvd,
        "per_location": per_location,
    }

# 4) Pick the scenario: baseline from config OR train vs test ---------------
# An empty or missing location_distribution becomes None, which selects scenario B.
baseline_cfg = (TRAIN_CFG.get("expected_profile", {}) or {}).get("location_distribution", {}) or None

try:
    if baseline_cfg:
        # Scenario A: compare the whole dataset against the expected baseline
        if "df" not in globals():
            raise RuntimeError("df non disponibile per il drift vs baseline.")
        target_w = _norm_weights(baseline_cfg)
        if _HAS_CLD:
            drift_result = compute_location_drift(df, target_w, TOL)
        else:
            drift_result = _fallback_drift(df, target_w)
        out_path = MODEL_DIR / "location_drift_vs_expected.json"
        out_key  = "location_drift_vs_expected"
    else:
        # Scenario B: train vs test — the TEST distribution is used as target_weights
        if "df_train" not in globals() or "df_test" not in globals():
            raise RuntimeError("df_train/df_test non disponibili per il drift train vs test.")
        tgt = _empirical_weights(df_test, LOCATION)
        target_w = _norm_weights(tgt)
        if _HAS_CLD:
            drift_result = compute_location_drift(df_train, target_w, TOL)
        else:
            drift_result = _fallback_drift(df_train, target_w)
        out_path = MODEL_DIR / "location_drift_train_vs_test.json"
        out_key  = "location_drift_train_vs_test"

except Exception as e:
    # Total fallback (if anything goes wrong in the branches above): log the
    # error, then redo the comparison with the local _fallback_drift only.
    try:
        logger.info("compute_location_drift non disponibile/errore (%s). Uso fallback semplice.", e)
    except Exception:
        # logger unavailable or failing: fall back to a plain print
        print(f"compute_location_drift non disponibile/errore ({e}). Uso fallback semplice.")
    if baseline_cfg and "df" in globals():
        drift_result = _fallback_drift(df, _norm_weights(baseline_cfg))
        out_path = MODEL_DIR / "location_drift_vs_expected.json"
        out_key  = "location_drift_vs_expected"
    elif "df_train" in globals() and "df_test" in globals():
        drift_result = _fallback_drift(df_train, _norm_weights(_empirical_weights(df_test, LOCATION)))
        out_path = MODEL_DIR / "location_drift_train_vs_test.json"
        out_key  = "location_drift_train_vs_test"
    else:
        # no data available for either scenario: re-raise the original error
        raise

# 5) Persist the output + update the manifest --------------------------------
out_path.write_text(json.dumps(drift_result, indent=2, ensure_ascii=False), encoding="utf-8")

# Store the drift report under manifest["metrics"][out_key], keeping the other
# metrics already present, then rewrite the manifest file.
m = dict(manifest.get("metrics", {}))
m[out_key] = drift_result
manifest["metrics"] = m
manifest_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")

print(f"Saved drift metrics → {out_path.as_posix()}  (manifest aggiornato: {manifest_path.as_posix()})")

### ModelReportRunner

In [None]:
# === Model Report Runner (coerente con il training: OrdinalEncoder + TTR, paths outputs/…) ===
from __future__ import annotations
import json
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.compose import TransformedTargetRegressor

# --- RMSE helper that works on every scikit-learn version
def _rmse(y_true, y_pred):
    """Return the root mean squared error as a plain Python float.

    The value is computed as ``sqrt(mean_squared_error(...))`` instead of
    passing ``squared=False``: that keyword is missing in old scikit-learn
    releases, deprecated in 1.4 and removed in 1.6. Taking the square root
    explicitly avoids both the deprecation warning and the previous
    try/except, and always returns the same type.

    NOTE(review): for multi-output targets this is the root of the averaged
    MSE, matching the original fallback branch; the targets passed in this
    notebook are 1-D, where both definitions coincide.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# ── 0) Load the v2 manifest to get feature_config (consistent paths: outputs/…)
# NOTE: this path is relative to the current working directory.
PROP_DIR = Path("outputs/modeling/property")
MF_PATH = PROP_DIR / "training_manifest.json"

cat_cols, num_cols = [], []
if MF_PATH.exists():
    mf = json.loads(MF_PATH.read_text(encoding="utf-8"))
    fc = (mf.get("feature_config") or {})
    cat_cols = list(fc.get("categorical") or [])
    num_cols = list(fc.get("numeric") or [])
else:
    # fallback when there is no manifest: fixed categorical list, and every
    # numeric column of df except the target
    cat_cols = ["location", "region", "zone", "energy_class", "urban_type", "orientation", "view", "condition", "heating"]
    num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c != "valuation_k"]

# Sanity: keep only the columns that actually exist in df
cat_cols = [c for c in cat_cols if c in df.columns]
num_cols = [c for c in num_cols if c in df.columns]
ALL = cat_cols + num_cols

assert "valuation_k" in df.columns, "Manca il target valuation_k"
assert len(ALL) > 0, "Nessuna feature trovata (cat+num)"

# ── 1) Pipeline consistent with training (Ordinal + imputation) + TTR
# Categorical branch: most-frequent imputation, then ordinal encoding with
# unknown categories mapped to -1. Numeric branch: median imputation.
# An empty column list turns the corresponding branch into a "drop".
pre_all = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("enc", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
        ]), cat_cols) if cat_cols else ("cat", "drop", []),
        ("num", Pipeline([
            ("imp", SimpleImputer(strategy="median")),
        ]), num_cols) if num_cols else ("num", "drop", []),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

rf_all = RandomForestRegressor(
    n_estimators=300, random_state=42, n_jobs=-1, max_depth=None, min_samples_leaf=2
)

# The forest is fitted on log1p(target); predictions are mapped back with expm1.
pipe_all = Pipeline([
    ("prep", pre_all),
    ("ttr", TransformedTargetRegressor(
        regressor=rf_all,
        func=np.log1p, inverse_func=np.expm1, check_inverse=False
    )),
])

# ── 2) Simple (random) split evaluation using all the features
X_all = df[ALL].copy()
y_nat = df["valuation_k"].astype(float).to_numpy()

# 80/20 random split with a fixed seed
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_nat, test_size=0.2, random_state=42)
pipe_all.fit(X_tr, y_tr)
y_hat = pipe_all.predict(X_te)

r2_all  = r2_score(y_te, y_hat)
mae_all = mean_absolute_error(y_te, y_hat)
rmse_all = _rmse(y_te, y_hat)
print(f"Random split → R²(all)={r2_all:.4f}  MAE={mae_all:.2f}  RMSE={rmse_all:.2f}")

# ── 3) Numeric features only (same imputation scheme) + TTR
if num_cols:
    pre_num = ColumnTransformer(
        [("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols)],
        remainder="drop",
        verbose_feature_names_out=False,
    )
    rf_num = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1, min_samples_leaf=2)
    pipe_num = Pipeline([
        ("prep", pre_num),
        ("ttr", TransformedTargetRegressor(
            regressor=rf_num, func=np.log1p, inverse_func=np.expm1, check_inverse=False
        )),
    ])
    Xn = df[num_cols].copy()
    # Same test_size and random_state as the all-features split above.
    Xn_tr, Xn_te, yn_tr, yn_te = train_test_split(Xn, y_nat, test_size=0.2, random_state=42)
    pipe_num.fit(Xn_tr, yn_tr)
    y_num = pipe_num.predict(Xn_te)

    r2_num  = r2_score(yn_te, y_num)
    mae_num = mean_absolute_error(yn_te, y_num)
    rmse_num = _rmse(yn_te, y_num)
    print(f"Random split → R²(num)={r2_num:.4f}  MAE={mae_num:.2f}  RMSE={rmse_num:.2f}")
    # R² difference between the all-features model and the numeric-only model
    print(f"ΔR² (all - num): {r2_all - r2_num:+.4f}")

# ── 4) More robust estimate: GroupShuffleSplit (group=location when present)
if "location" in df.columns:
    gss = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    r2s, maes, rmses = [], [], []
    groups = df["location"].astype(str).to_numpy()
    # pipe_all is refitted on every split, so after this loop it holds the
    # fit of the last split.
    for tr_idx, te_idx in gss.split(df[ALL], y_nat, groups=groups):
        pipe_all.fit(df.iloc[tr_idx][ALL], y_nat[tr_idx])
        y_g = pipe_all.predict(df.iloc[te_idx][ALL])
        r2s.append(r2_score(y_nat[te_idx], y_g))
        maes.append(mean_absolute_error(y_nat[te_idx], y_g))
        rmses.append(_rmse(y_nat[te_idx], y_g))
    # Report mean ± standard deviation over the 5 splits.
    print(f"GSS 5× (group=location) → R²={np.mean(r2s):.4f}±{np.std(r2s):.4f}  "
          f"MAE={np.mean(maes):.2f}±{np.std(maes):.2f}  RMSE={np.mean(rmses):.2f}±{np.std(rmses):.2f}")

# ── 5) Feature importance (from the RF inside the TTR) + names consistent with the CT
# Best effort: any failure is reported with a message instead of stopping the cell.
try:
    # Prefer the names reported by the fitted preprocessor; otherwise fall
    # back to the configured column order (categorical first, then numeric).
    try:
        feat_names = list(pipe_all.named_steps["prep"].get_feature_names_out())
    except Exception:
        feat_names = [*cat_cols, *num_cols]

    rf_fitted = pipe_all.named_steps["ttr"].regressor_
    importances = getattr(rf_fitted, "feature_importances_", None)
    if importances is None:
        raise RuntimeError("feature_importances_ non disponibile sul regressore.")

    imp = np.asarray(importances, dtype=float)
    # If names and importances do not line up, use generic names f0, f1, ...
    if len(feat_names) != len(imp):
        feat_names = [f"f{i}" for i in range(len(imp))]

    fi = (pd.DataFrame({"feature": feat_names, "importance": imp})
            .sort_values("importance", ascending=False)
            .reset_index(drop=True))

    print("\nTop 10 feature importance (Ordinal+CT):")
    print(fi.head(10).to_string(index=False))
except Exception as e:
    print("Feature importance non disponibile:", e)

In [None]:
# --- Introspection of the OHE categories (safe) ---
# 1) take the "prep" step from the pipeline
prep = pipe_A.named_steps["prep"]

def _ensure_prep_fitted(prep):
    """Se prep non è fit, fa un fit temporaneo SOLO sul preproc (senza toccare il modello)."""
    if not hasattr(prep, "transformers_"):
        # se esiste uno step di derivazione, applicalo prima del fit del prep
        X_for_fit = X_train
        if "derive" in pipe_A.named_steps:
            X_for_fit = pipe_A.named_steps["derive"].transform(X_train)
        # fit solo il preproc (NO model)
        prep.fit(X_for_fit, y_train)
    return prep

prep = _ensure_prep_fitted(prep)

# 2) fetch the encoder from the 'cat' branch (looked up as the step named "encode")
cat_branch = prep.named_transformers_.get("cat")
if hasattr(cat_branch, "named_steps"):
    ohe = cat_branch.named_steps.get("encode")
else:
    ohe = None

# Stop when the branch has no fitted "encode" step (categories_ is missing).
if ohe is None or not hasattr(ohe, "categories_"):
    raise RuntimeError("Il ramo categorico non contiene uno step 'encode' con OneHotEncoder già fit.")

# Pair each categorical column with the categories learned for it.
# NOTE(review): assumes cat_cols is in the same order as the columns the
# 'cat' branch was fitted on — confirm.
cats_map = {col: list(cats) for col, cats in zip(cat_cols, ohe.categories_)}

print("OHE categories — region:", cats_map.get("region", [])[:10])
print("OHE categories — zone  :", cats_map.get("zone", [])[:10])

# extra: check the handle_unknown setting
try:
    print("handle_unknown:", getattr(ohe, "handle_unknown", None))
except Exception:
    pass