### Imports & Paths

In [1]:
from __future__ import annotations

import os
import re
import json
import logging
import warnings
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple
from datetime import datetime, timezone

import joblib                     # type: ignore
import numpy as np                # type: ignore
import pandas as pd               # type: ignore

# sklearn (per verificare che il modello sia fitted)
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError

# Shared modules (config, utils, constants)
from shared.common.config import configure_logger
from shared.common.utils import canonical_json_dumps, sha256_hex
from shared.common.sanity_checks import leakage_gate, scale_gate
from shared.common.constants import SCHEMA_VERSION, NOTE_MAX_BYTES

# -----------------------------------------------------------------------------
# Setup & Config
# -----------------------------------------------------------------------------
ASSET_TYPE = "property"

# Prefer a specific version via env or fallback to "v2"
PREFERRED_MODEL_VERSION = os.getenv("MODEL_VERSION", "v2")

# --- Model roots (prefer /shared/outputs/models) ---
_candidates: List[Path] = []
env_root = os.getenv("MODELS_ROOT")
if env_root and env_root.strip():
    _c = Path(env_root)
    _candidates.append(_c)

_candidates += [Path("./outputs/modeling")]

# default se non esiste ancora nulla
MODELS_ROOT: Path = next((c for c in _candidates if c.exists()), Path("../shared/outputs/models"))
MODEL_DIR = MODELS_ROOT / ASSET_TYPE

# Predictions log path (JSONL append)
LOG_PATH = Path("./outputs/logs/predictions_log.jsonl")
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

# API comparison toggle (can be overridden via env)
API_BASE = os.getenv("API_BASE", "http://127.0.0.1:8000")
COMPARE_WITH_API = os.getenv("COMPARE_WITH_API", "true").lower() in {"1", "true", "yes", "y"}

# Configure logger (firma: configure_logger(level, name=None, json_format=None))
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
LOG_JSON = os.getenv("LOG_JSON", "false").lower() in {"1", "true", "yes", "y"}
logger = configure_logger(level=LOG_LEVEL, name="nb04_infer", json_format=LOG_JSON)
warnings.filterwarnings("ignore", category=UserWarning)

logger.info("Model root resolved", extra={"MODELS_ROOT": str(MODELS_ROOT), "MODEL_DIR": str(MODEL_DIR)})

# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------
_version_re = re.compile(r"value_regressor_(v\d+)\.joblib$")

def _list_versions(dirpath: Path) -> List[str]:
    items: List[Tuple[int, str]] = []
    for p in dirpath.glob("value_regressor_v*.joblib"):
        m = _version_re.search(p.name)
        if m:
            v = m.group(1)  # 'vN'
            try:
                n = int(v[1:])
            except Exception:
                n = -1
            items.append((n, v))
    items.sort(reverse=True)  # dalla più recente
    return [v for _, v in items]

def _is_fitted_pipeline(pl) -> bool:
    try:
        # sklearn Pipeline: ultimo step è il modello
        if hasattr(pl, "steps"):
            check_is_fitted(pl.steps[-1][1])
        else:
            check_is_fitted(pl)
        return True
    except Exception:
        return False

def _read_json(path: Path) -> Dict[str, Any]:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def _sha256_file(path: Path) -> str:
    return sha256_hex(path.read_bytes())

def _dedup_preserve(seq: List[str]) -> List[str]:
    seen, out = set(), []
    for s in seq:
        if s not in seen:
            seen.add(s); out.append(s)
    return out

# -----------------------------------------------------------------------------
# Resolve a FITTED model (prefer requested version, else newest fitted)
# -----------------------------------------------------------------------------
def resolve_fitted_model(base_dir: Path, preferred: Optional[str]) -> Dict[str, Any]:
    # 1) prova la versione preferita
    if preferred:
        p = base_dir / f"value_regressor_{preferred}.joblib"
        m = base_dir / f"value_regressor_{preferred}_meta.json"
        if p.exists() and m.exists():
            pl = joblib.load(p)
            if _is_fitted_pipeline(pl):
                return {
                    "version": preferred,
                    "pipeline": p,
                    "meta": m,
                    "manifest": base_dir / "training_manifest.json",
                    "obj": pl,
                }
            logger.warning(f"Model {preferred} presente ma non fitted; cerco fallback…")

    # 2) cerca la prima versione fitted tra quelle disponibili (vN ↓)
    for ver in _list_versions(base_dir):
        p = base_dir / f"value_regressor_{ver}.joblib"
        m = base_dir / f"value_regressor_{ver}_meta.json"
        if not (p.exists() and m.exists()):
            continue
        pl = joblib.load(p)
        if _is_fitted_pipeline(pl):
            return {
                "version": ver,
                "pipeline": p,
                "meta": m,
                "manifest": base_dir / "training_manifest.json",
                "obj": pl,
            }

    raise FileNotFoundError(f"Nessun modello fitted trovato in {base_dir}")

# -----------------------------------------------------------------------------
# Load pipeline & metadata (fitted fallback) + expected features
# -----------------------------------------------------------------------------
resolved = resolve_fitted_model(MODEL_DIR, PREFERRED_MODEL_VERSION)
MODEL_VERSION: str = resolved["version"]
PIPELINE_PATH: Path = resolved["pipeline"]
META_PATH: Path = resolved["meta"]
MANIFEST_PATH: Path = resolved["manifest"]
pipeline = resolved["obj"]

logger.info(
    "Using model artifacts",
    extra={
        "asset_type": ASSET_TYPE,
        "model_version": MODEL_VERSION,
        "pipeline": str(PIPELINE_PATH),
        "meta": str(META_PATH),
        "manifest": str(MANIFEST_PATH) if MANIFEST_PATH.exists() else None,
    },
)

# carica meta + verifica integrità bundle (hash)
model_meta: Dict[str, Any] = _read_json(META_PATH)
expected_hash = model_meta.get("model_hash") or model_meta.get("pipeline_sha256")
actual_hash = _sha256_file(PIPELINE_PATH)
if expected_hash and expected_hash != actual_hash:
    raise ValueError(
        f"Bundle manomesso: meta={expected_hash[:8]}… != actual={actual_hash[:8]}…"
    )

# expected features: preferisci feature_order.json → poi manifest → infine meta.json
feature_order_candidates: List[Path] = [
    PIPELINE_PATH.parent / "feature_order.json",
]
if MANIFEST_PATH.exists():
    try:
        manifest = _read_json(MANIFEST_PATH)
        path_from_manifest = manifest.get("paths", {}).get("feature_order")
        if path_from_manifest:
            feature_order_candidates.append(Path(path_from_manifest))
    except Exception as e:
        logger.warning("Manifest presente ma non leggibile; fallback a meta.json", extra={"error": str(e)})
        manifest = {}

FEATURE_ORDER_PATH: Optional[Path] = next((p for p in feature_order_candidates if p and p.exists()), None)

categorical_expected: List[str] = model_meta.get("features_categorical", []) or []
numeric_expected: List[str] = model_meta.get("features_numeric", []) or []

if FEATURE_ORDER_PATH:
    try:
        feature_order: List[str] = _read_json(FEATURE_ORDER_PATH)
        ALL_EXPECTED: List[str] = list(feature_order)
    except Exception as e:
        logger.warning("feature_order.json non leggibile; uso meta/manifest", extra={"error": str(e)})
        ALL_EXPECTED = _dedup_preserve(list(categorical_expected) + [c for c in numeric_expected if c not in categorical_expected])
else:
    # fallback senza feature_order.json
    if MANIFEST_PATH.exists():
        try:
            # compat: in alcuni manifest i feature possono stare in model.feature_list oppure model.features
            feats_from_manifest = (
                manifest.get("model", {}).get("feature_list")
                or manifest.get("model", {}).get("features", {})
            )
            if isinstance(feats_from_manifest, dict):
                categorical_expected = feats_from_manifest.get("categorical", categorical_expected) or categorical_expected
                numeric_expected = feats_from_manifest.get("numeric", numeric_expected) or numeric_expected
        except Exception as e:
            logger.warning("Impossibile leggere feature da manifest; uso meta.json", extra={"error": str(e)})
    ALL_EXPECTED = _dedup_preserve(list(categorical_expected) + [c for c in numeric_expected if c not in categorical_expected])

print(f"✅ Loaded FITTED model {MODEL_VERSION} from {PIPELINE_PATH.parent}")
print(f"   Features: {len(ALL_EXPECTED)} (cat={len(categorical_expected)}, num={len(numeric_expected)})")
print(f"API compare: {COMPARE_WITH_API} → {API_BASE}")

[2025-09-23 02:51:58,373] INFO nb04_infer: Model root resolved
[2025-09-23 02:52:01,644] INFO nb04_infer: Using model artifacts


✅ Loaded FITTED model v2 from outputs\modeling\property
   Features: 26 (cat=8, num=18)
API compare: True → http://127.0.0.1:8000


### Validation Utilities

In [2]:
from __future__ import annotations
from typing import Dict, Any, Tuple
import numpy as np

from shared.common.pricing import explain_price
from shared.common.constants import LOCATION
from shared.common.utils import canonical_location, get_utc_now
from shared.common.sanity_checks import price_benchmark, validate_property

# Alias comuni → chiavi canoniche
_CANONICAL_ALIASES = {
    "sqm": "size_m2",
    "size": "size_m2",
    "m2": "size_m2",
    "year": "year_built",
    "built_year": "year_built",
    "balcony": "has_balcony",
    "garden": "has_garden",
    "garage": "has_garage",
    "air_quality": "air_quality_index",
    "noise": "noise_level",
    "valuation": "valuation_k",
    "price_k": "valuation_k",
}

# Derivate consentite (no leakage)
_SAFE_DERIVED = {"age_years", "luxury_score", "env_score"}


def _canonicalize_keys(record: Dict[str, Any]) -> Dict[str, Any]:
    """Mappa alias comuni verso i nomi campo canonici, con normalizzazione soft delle chiavi."""
    out: Dict[str, Any] = {}
    for k, v in record.items():
        k_norm = (k or "").strip()
        k_lc = k_norm.lower()
        out[_CANONICAL_ALIASES.get(k_lc, k_lc)] = v
    return out


def _autofill_safe(record: Dict[str, Any]) -> Dict[str, Any]:
    """
    Deriva SOLO campi non leaky e indipendenti dal target:
    - age_years da year_built
    - luxury_score da has_garden/has_balcony/has_garage
    - env_score da air_quality_index e noise_level (normalizzati in [0,1])
    """
    r = dict(record)

    # age_years
    if "age_years" not in r and "year_built" in r and r.get("year_built") not in (None, ""):
        try:
            r["age_years"] = max(0, get_utc_now().year - int(r["year_built"]))
        except Exception:
            # se non coerente, lasciamo mancante (lo segnalerà la validazione)
            pass

    # luxury_score (media semplice di tre boolean/0-1)
    if "luxury_score" not in r:
        g = 1.0 if bool(r.get("has_garden", 0)) else 0.0
        b = 1.0 if bool(r.get("has_balcony", 0)) else 0.0
        ga = 1.0 if bool(r.get("has_garage", 0)) else 0.0
        r["luxury_score"] = (g + b + ga) / 3.0

    # env_score in [0,1] (qualità aria ↑, rumore ↓)
    if "env_score" not in r:
        try:
            aq = float(r.get("air_quality_index", 0.0))
            nz = float(r.get("noise_level", 0.0))
            aq_n = float(np.clip(aq / 100.0, 0.0, 1.0))
            nz_n = float(np.clip(nz / 100.0, 0.0, 1.0))
            r["env_score"] = float(np.clip(aq_n * (1.0 - nz_n), 0.0, 1.0))
        except Exception:
            r["env_score"] = None

    # ⚠️ NON deriviamo:
    # - price_per_sqm (leaky dal target)
    # - efficienze basate su valuation_k (leakage)
    return r


def validate_input_record(
    record: Dict[str, Any],
    *,
    strict: bool = True,
    drop_extras: bool = True
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Canonicalizza → deriva campi sicuri → **VALIDA su versione pre-drop** (dominio) → poi filtra per il modello.
    """
    # A) snapshot "dominio" per il validator (NON rinomina chiavi)
    dom = dict(record)  # copia shallow
    # normalizza location per il validator (se presente)
    if LOCATION in dom and dom.get(LOCATION):
        try:
            dom[LOCATION] = canonical_location(dom[LOCATION])
        except Exception:
            pass
    # defaults utili al validator
    dom.setdefault("asset_type", "property")
    dom.setdefault("last_verified_ts", get_utc_now().replace(microsecond=0).isoformat().replace("+00:00","Z"))

    # B) versione "modello": alias + derivate sicure
    base = _canonicalize_keys(record)
    base = _autofill_safe(base)
    if LOCATION in base and base.get(LOCATION):
        try:
            base[LOCATION] = canonical_location(base[LOCATION])
        except Exception:
            pass

    # C) VALIDAZIONE su 'dom' (pre-drop, con chiavi originali)
    report = validate_property(dom)

    if strict and not report.get("ok", True):
        raise ValueError(f"❌ Property validation failed: {report.get('errors') or report}")

    # D) Filtra extra SOLO per l’input al modello
    allowed = set(globals().get("ALL_EXPECTED", [])) | _SAFE_DERIVED
    extras = [k for k in list(base.keys()) if k not in allowed]
    if drop_extras:
        for k in extras:
            base.pop(k, None)
    elif strict and extras:
        raise ValueError(f"❌ Unexpected extra features: {extras}")

    return base, report


def detect_anomalies(record: Dict[str, Any]) -> Tuple[bool, Dict[str, Any]]:
    """
    Wrapper sugli esiti di validate_property:
    - True se presenti violazioni/blocchi forti (es. campi fuori dominio, misure impossibili)
    - Ritorna anche il report per logging/telemetria.
    """
    _, report = validate_input_record(record, strict=False)
    has_blockers = not report.get("ok", True)
    return has_blockers, report


def _maybe_explain_price(rec: dict) -> dict | None:
    """Breakdown euristico del prezzo (no ML), utile per trasparenza UI."""
    try:
        return explain_price(rec)  # dict con componenti/moltipl. – già nel tuo /shared
    except Exception as e:
        # usa logger globale se disponibile nel notebook
        try:
            logger.info("explain_price not available", extra={"error": str(e)})
        except Exception:
            pass
        return None

def _price_benchmark_flag(rec: dict, yhat_k: float) -> dict | None:
    """
    Flag 'fuori banda' rispetto alle mediane/location (se disponibili nello shared).
    Ritorna un dict tipo: {"z":..., "out_of_band": bool, "band": [low, high], ...}
    """
    try:
        return price_benchmark(location=rec.get("location"), valuation_k=float(yhat_k))
    except Exception as e:
        try:
            logger.info("price_benchmark not available", extra={"error": str(e)})
        except Exception:
            pass
        return None

### Sample Single Property

In [3]:
from uuid import uuid4
from shared.common.constants import ASSET_ID

# Esempio di record "grezzo" da UI/utente
sample_property_raw = {
    "location": "Milan",
    "size_m2": 500,
    "rooms": 4,
    "bathrooms": 2,
    "year_built": 1999,
    "floor": 2,
    "building_floors": 6,
    "has_elevator": 1,
    "has_garden": 0,
    "has_balcony": 1,
    "has_garage": 1,
    "energy_class": "B",
    "humidity_level": 50.0,
    "temperature_avg": 20.5,
    "noise_level": 40,
    "air_quality_index": 70,
    "owner_occupied": 1,
    "public_transport_nearby": 1,
    "distance_to_center_km": 2.5,
}

# 1) Valida e derivi SOLO campi sicuri (age_years, luxury_score, env_score)
sample_property, validation_report = validate_input_record(sample_property_raw, strict=True)

# 2) Normalizza boolean-like a intero {0,1} (aiuta schema/serving)
_bool_like = [
    k for k in sample_property.keys()
    if k.startswith("has_")
] + ["owner_occupied", "public_transport_nearby"]
for k in _bool_like:
    if k in sample_property:
        sample_property[k] = int(bool(sample_property[k]))

# 3) Genera un asset_id se mancante (non influisce sulle feature del modello)
if ASSET_ID not in sample_property:
    sample_property[ASSET_ID] = f"asset_infer_{uuid4().hex[:8]}"

logger.info(
    "Sample property validated",
    extra={
        "asset_id": sample_property.get(ASSET_ID),
        "location": sample_property.get("location"),
        "ok": validation_report.get("ok", True),
        "warnings": validation_report.get("warnings"),
        "errors": validation_report.get("errors"),
    },
)
print(f"✅ Sample validated. asset_id={sample_property.get(ASSET_ID)} location={sample_property.get('location')}")

[VALIDATION] Asset unknown normalized. Errors=['condition_score_missing', 'risk_score_missing', 'luxury_score_missing', 'env_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['listing_month', 'view', 'age_years', 'orientation', 'urban_type', 'risk_score', 'parking_spot', 'condition_score', 'zone', 'asset_id', 'cellar', 'garage', 'concierge', 'condition', 'env_score', 'luxury_score', 'region', 'attic', 'heating']"] Flags=['condition_score_missing', 'risk_score_missing', 'luxury_score_missing', 'env_score_missing', 'valuation_override', 'price_per_sqm_recomputed', 'schema_incomplete'] Changes={'valuation_k': (None, 250.0), 'validation_errors': (None, ['condition_score_missing', 'risk_score_missing', 'luxury_score_missing', 'env_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['listing_month', 'view', 'age_years', 'orientation', 'urban_type', 'risk_score', 'parking_spot', 'cond

✅ Sample validated. asset_id=asset_infer_eaa37945 location=None


### Load Pipeline & Metadata

In [4]:
loaded_version = MODEL_VERSION

def _dedup_preserve(seq: List[str]) -> List[str]:
    seen: set[str] = set()
    out: List[str] = []
    for s in seq:
        if s not in seen:
            seen.add(s)
            out.append(s)
    return out

def _read_json(path: Path) -> dict:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def resolve_expected_features(
    model_meta: dict,
    manifest_path: Path,
    feature_order_path: Optional[Path] = None,
) -> Tuple[List[str], List[str], List[str]]:
    """
    Priorità:
    1) feature_order.json (ordine canone per serving → parità train→serve)
    2) manifest: model.feature_list / model.features.{categorical,numeric}
    3) meta.json: features_categorical / features_numeric
    Ritorna: (categorical, numeric, all_expected_in_order)
    """
    # 1) feature_order.json (se presente)
    if feature_order_path and feature_order_path.exists():
        try:
            order: List[str] = _read_json(feature_order_path)
            order = _dedup_preserve([str(c) for c in order])
            # Recupera cat/num di supporto da meta (non influiscono sull'ordine)
            cat = list(model_meta.get("features_categorical", []) or [])
            num = list(model_meta.get("features_numeric", []) or [])
            cat = _dedup_preserve([c for c in cat if c in order])
            num = _dedup_preserve([c for c in num if c in order and c not in set(cat)])
            return cat, num, order
        except Exception as e:
            logger.warning("feature_order.json non leggibile; fallback a manifest/meta", extra={"error": str(e)})

    # 2) manifest (cat/num)
    cat = list(model_meta.get("features_categorical", []) or [])
    num = list(model_meta.get("features_numeric", []) or [])
    if manifest_path.exists():
        try:
            mf = _read_json(manifest_path)
            feats = (mf.get("model", {}).get("feature_list") or mf.get("model", {}).get("features"))
            if isinstance(feats, dict):
                cat = list(feats.get("categorical", cat) or cat)
                num = list(feats.get("numeric", num) or num)
        except Exception as e:
            logger.warning("Manifest presente ma non leggibile; fallback a meta.json", extra={"error": str(e)})

    # 3) dedup & nessun overlap; ordine: cat poi num (stabile)
    cat = _dedup_preserve([str(c) for c in cat])
    num = _dedup_preserve([str(c) for c in num if c not in set(cat)])
    all_expected = cat + num
    return cat, num, all_expected

# ---- Risolvi il path di feature_order.json (se disponibile) ----
feature_order_candidates: List[Path] = []

# se definito in una cella precedente, usalo
if "FEATURE_ORDER_PATH" in globals() and isinstance(FEATURE_ORDER_PATH, (Path, str)) and FEATURE_ORDER_PATH:
    feature_order_candidates.append(Path(FEATURE_ORDER_PATH))

# manifest -> paths.feature_order
if MANIFEST_PATH.exists():
    try:
        _mf = _read_json(MANIFEST_PATH)
        p = _mf.get("paths", {}).get("feature_order")
        if p:
            feature_order_candidates.append(Path(p))
    except Exception:
        pass

# default nella stessa dir del modello
feature_order_candidates.append(PIPELINE_PATH.parent / "feature_order.json")

FEATURE_ORDER_PATH_RESOLVED: Optional[Path] = next((p for p in feature_order_candidates if p and p.exists()), None)

categorical_expected, numeric_expected, ALL_EXPECTED = resolve_expected_features(
    model_meta=model_meta,
    manifest_path=MANIFEST_PATH,
    feature_order_path=FEATURE_ORDER_PATH_RESOLVED,
)

assert len(ALL_EXPECTED) > 0, "Nessuna feature attesa risolta (controlla meta/manifest/feature_order.json)."

logger.info(
    "Expected features resolved",
    extra={
        "model_version": loaded_version,
        "n_categorical": len(categorical_expected),
        "n_numeric": len(numeric_expected),
        "n_total": len(ALL_EXPECTED),
        "feature_order_path": str(FEATURE_ORDER_PATH_RESOLVED) if FEATURE_ORDER_PATH_RESOLVED else None,
    },
)
print(
    f"✅ Artifacts ready — model {loaded_version} | "
    f"features: {len(ALL_EXPECTED)} (cat={len(categorical_expected)}, num={len(numeric_expected)})"
)

[2025-09-23 02:52:02,472] INFO nb04_infer: Expected features resolved


✅ Artifacts ready — model v2 | features: 26 (cat=8, num=18)


### Predict with confidence

In [41]:
from typing import Dict, Any, Tuple, Optional, List
from pathlib import Path
import json
import numpy as np
import pandas as pd

# sklearn (opzionale: in ambienti "slim" può non esserci)
try:
    from sklearn.pipeline import Pipeline  # type: ignore
except Exception:
    Pipeline = None  # type: ignore

# -------------------------------------------------------------------
# Conf levels → z-approx (no SciPy richiesto)
# -------------------------------------------------------------------
_Z_FOR_CONF = {0.80: 1.282, 0.90: 1.645, 0.95: 1.960, 0.98: 2.326, 0.99: 2.576}
def _z_for_conf(conf: float) -> float:
    return _Z_FOR_CONF.get(round(conf, 2), 1.960)

# -------------------------------------------------------------------
# Helpers (NO uso di variabili globali: pipeline/manifest/features sono parametri)
# -------------------------------------------------------------------
def _to_numpy_nan(x):
    try:
        import pandas as _pd
        if x is _pd.NA:
            return np.nan
    except Exception:
        pass
    return x

def _coerce_nullable_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Converte dtypes pandas 'nullable' in tipi compatibili sklearn."""
    df = df.copy()
    for col in df.columns:
        dt = df[col].dtype
        dtn = str(dt).lower()
        # string/boolean nullable -> object
        if "string" in dtn or "boolean" in dtn:
            df[col] = df[col].astype(object)
        # interi nullable -> float64
        if dtn in ("int64", "int32", "int16") and getattr(dt, "name", "").startswith("Int"):
            # Caso raro: se dt è 'Int64' di pandas, converti a float64
            df[col] = df[col].astype("float64")
        # fallback: rimpiazza eventuali <NA>
        try:
            import pandas as _pd
            df[col] = df[col].replace({_pd.NA: np.nan})
        except Exception:
            pass
    return df

def _clean_missing_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalizza i NaN:
      - coerce dei dtypes nullable (string/boolean/IntXX) verso object/float
      - sostituisce ogni pd.NA con np.nan (evita 'boolean value of NA is ambiguous')
    """
    df = df.copy()
    df = _coerce_nullable_dtypes(df)
    try:
        import pandas as _pd
        df = df.replace({_pd.NA: np.nan})
        df = df.applymap(_to_numpy_nan)
        df = df.where(_pd.notna(df), np.nan)
    except Exception:
        df = df.where(pd.notna(df), np.nan)
    return df

GEO_REQUIRED = ("city", "zone", "region")

def _prepare_df(record: Dict[str, Any], expected_features: List[str]) -> pd.DataFrame:
    """
    Prepara il DF con le feature attese.
    - Colonne mancanti riempite con np.nan.
    - city/zone/region sempre presenti come stringa vuota "" (evita pandas.NA e i crash in SimpleImputer).
    - Dtypes numerici safe (niente boolean/integer nullable).
    """
    import numpy as np
    import pandas as pd
    # importa helper dtype compatibili con varie versioni di pandas
    try:
        from pandas.api.types import is_integer_dtype, is_bool_dtype
    except Exception:
        # fallback ultra-conservativo
        def is_integer_dtype(s):  # type: ignore
            return str(s.dtype).startswith(("int", "Int"))
        def is_bool_dtype(s):     # type: ignore
            return str(s.dtype).startswith(("bool", "Boolean"))

    # 1) riga ordinata sulle sole expected_features
    row = {k: record.get(k, np.nan) for k in expected_features}
    df = pd.DataFrame([row], columns=expected_features)

    # 2) inietta sempre le geo columns come stringa vuota (evita <NA> dopo .str.* in PriorsGuard)
    for col in GEO_REQUIRED:
        if col not in df.columns:
            df[col] = ""          # importante: stringa vuota, NON np.nan
        else:
            if pd.isna(df.at[0, col]):
                df.at[0, col] = ""

    # 3) evita dtypes "nullable" che introducono pandas.NA in step sklearn:
    #    converti integer/bool in float64 (così i missing restano np.nan)
    for c in df.columns:
        s = df[c]
        if is_integer_dtype(s) or is_bool_dtype(s):
            df[c] = s.astype("float64")

    return df

def _extract_model_and_X(pipeline_obj, df: pd.DataFrame) -> Tuple[Any, Any]:
    """
    Se 'pipeline_obj' è una sklearn Pipeline:
      - separa preproc (tutti i passaggi tranne l'ultimo) e modello (ultimo step)
      - applica 'transform' al preproc e ritorna (model, X_transformed)
    Altrimenti ritorna (pipeline_obj, df) come passthrough.
    """
    if Pipeline is not None and hasattr(pipeline_obj, "steps"):
        preproc = pipeline_obj[:-1]
        X = preproc.transform(_clean_missing_df(df))
        model = pipeline_obj.steps[-1][1]
        return model, X
    else:
        return pipeline_obj, _clean_missing_df(df)

def _sigma_from_manifest(manifest_path: Optional[Path]) -> Optional[float]:
    if not manifest_path or not manifest_path.exists():
        return None
    try:
        mf = json.loads(manifest_path.read_text(encoding="utf-8"))
        mroot = mf.get("metrics") or {}
        metrics = mroot.get("valid") or mroot.get("validation") or mroot
        rmse = metrics.get("rmse") or metrics.get("RMSE")
        mae = metrics.get("mae") or metrics.get("MAE")
        if isinstance(rmse, (int, float)) and rmse > 0:
            return float(rmse)
        if isinstance(mae, (int, float)) and mae > 0:
            return float(mae) * 1.253314
    except Exception as e:
        try:
            logger.warning("Cannot read sigma from manifest", extra={"error": str(e)})
        except Exception:
            pass
    return None

# -------------------------------------------------------------------
# API principale
# -------------------------------------------------------------------
def predict_with_confidence(
    record: Dict[str, Any],
    *,
    pipeline_obj: Any,
    expected_features: List[str],
    manifest_path: Optional[Path] = None,
    n_t_min: int = 3,
    confidence: float = 0.95,
    verbose: bool = False,
) -> Dict[str, Any]:
    """
    Stima puntuale + intervallo di confidenza:
    - Se il modello è una foresta (ha 'estimators_'): usa la varianza tra gli alberi (t≈z).
    - Altrimenti: usa σ ≈ RMSE (o MAE→σ) dal training_manifest → z-interval.
    """
    # 1) Prepara X (missing puliti)
    df = _prepare_df(record, expected_features)

    # 2) Estrai una sola volta preproc e modello, poi predici sullo stesso X
    model, X = _extract_model_and_X(pipeline_obj, df)

    # 2a) Predizione puntuale
    y_raw = model.predict(X) if hasattr(model, "predict") else pipeline_obj(df)
    y_hat = float(np.ravel(y_raw)[0])

    # 3) Varianza tra alberi (se disponibile)
    try:
        estimators = getattr(model, "estimators_", None)
        if isinstance(estimators, (list, tuple)) and len(estimators) >= n_t_min:
            per_tree = np.array([np.ravel(est.predict(X))[0] for est in estimators], dtype=float)
            m = float(per_tree.mean())
            s = float(per_tree.std(ddof=1)) if per_tree.size > 1 else 0.0
            z = _z_for_conf(confidence)
            ci_margin = float(z * s)
            lower, upper = float(m - ci_margin), float(m + ci_margin)
            method = "forest_variance"
            n_estimators = len(estimators)
        else:
            raise RuntimeError("No per-tree predictions available")
    except Exception:
        # 4) Fallback: σ globale dal manifest o 10% di y_hat
        sigma = _sigma_from_manifest(manifest_path)
        if sigma is None:
            sigma = max(1.0, abs(y_hat) * 0.10)
        z = _z_for_conf(confidence)
        ci_margin = float(z * float(sigma))
        lower, upper = float(y_hat - ci_margin), float(y_hat + ci_margin)
        m, s, method = y_hat, float(sigma), "global_sigma"
        n_estimators = None

    if verbose:
        try:
            logger.info("Prediction info",
                        extra={"y_hat": y_hat, "mean_used": m, "std_used": s,
                               "ci_margin": ci_margin, "confidence": confidence, "method": method})
        except Exception:
            pass

    return {
        "prediction": round(m, 2),
        "point_pred": round(y_hat, 2),
        "uncertainty": round(s, 2),
        "confidence": float(confidence),
        "confidence_interval": (round(lower, 2), round(upper, 2)),
        "ci_margin": round(ci_margin, 2),
        "method": method,
        "n_estimators": n_estimators,
    }

In [42]:
# --- ensure model is fitted & resolve artifacts (idempotente) ---
from pathlib import Path
import os, re, json
from typing import Optional, List, Dict, Any
import pandas as pd
import numpy as np
import joblib

from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError

# Helpers locali ----------------------------------------------------
def _list_versions(dirpath: Path) -> List[str]:
    vers = []
    for p in sorted(dirpath.glob("value_regressor_v*.joblib")):
        m = re.search(r"value_regressor_(v\d+)\.joblib$", p.name)
        if m:
            try:
                vers.append((int(m.group(1)[1:]), m.group(1)))  # (num, 'vN')
            except Exception:
                vers.append((-1, m.group(1)))
    # ordina discendente (più recente prima)
    return [v for _, v in sorted(vers, reverse=True)]

def _load_pipeline(path: Path):
    return joblib.load(path)

def _is_fitted_pipeline(pl) -> bool:
    try:
        if hasattr(pl, "steps"):  # sklearn Pipeline
            check_is_fitted(pl.steps[-1][1])
        else:
            check_is_fitted(pl)
        return True
    except Exception:
        return False

def _read_json(path: Path) -> dict:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def _sha256_file(path: Path) -> str:
    import hashlib
    return hashlib.sha256(path.read_bytes()).hexdigest()

def _dedup_preserve(seq: List[str]) -> List[str]:
    seen, out = set(), []
    for s in seq:
        if s not in seen:
            seen.add(s); out.append(s)
    return out

def _resolve_expected_features(
    *, model_meta: dict, manifest_path: Optional[Path], feature_order_path: Optional[Path]
) -> List[str]:
    # 1) feature_order.json → ordine canone
    if feature_order_path and feature_order_path.exists():
        try:
            order = _read_json(feature_order_path)
            return _dedup_preserve([str(c) for c in order])
        except Exception as e:
            try:
                logger.warning("feature_order.json non leggibile; fallback a manifest/meta", extra={"error": str(e)})
            except Exception:
                pass
    # 2) manifest: model.feature_list / model.features
    cat = list(model_meta.get("features_categorical", []) or [])
    num = list(model_meta.get("features_numeric", []) or [])
    if manifest_path and manifest_path.exists():
        try:
            mf = _read_json(manifest_path)
            feats = (mf.get("model", {}).get("feature_list") or mf.get("model", {}).get("features"))
            if isinstance(feats, dict):
                cat = list(feats.get("categorical", cat) or cat)
                num = list(feats.get("numeric", num) or num)
        except Exception as e:
            try:
                logger.warning("Manifest presente ma non leggibile; fallback a meta.json", extra={"error": str(e)})
            except Exception:
                pass
    # 3) cat poi num (stabile) con dedup e senza overlap
    cat = _dedup_preserve([str(c) for c in cat])
    num = _dedup_preserve([str(c) for c in num if c not in set(cat)])
    return cat + num

def resolve_fitted_model(base_dir: Path, preferred: Optional[str]) -> dict:
    # 1) prova preferito
    if preferred:
        p = base_dir / f"value_regressor_{preferred}.joblib"
        m = base_dir / f"value_regressor_{preferred}_meta.json"
        if p.exists() and m.exists():
            pl = _load_pipeline(p)
            if _is_fitted_pipeline(pl):
                return {"version": preferred, "pipeline": p, "meta": m, "manifest": base_dir / "training_manifest.json", "obj": pl}
            try:
                logger.warning(f"Model {preferred} presente ma non fitted, cerco fallback…")
            except Exception:
                pass

    # 2) cerca il primo fitted tra i presenti (vN decrescente)
    for ver in _list_versions(base_dir):
        p = base_dir / f"value_regressor_{ver}.joblib"
        m = base_dir / f"value_regressor_{ver}_meta.json"
        if not (p.exists() and m.exists()):
            continue
        pl = _load_pipeline(p)
        if _is_fitted_pipeline(pl):
            return {"version": ver, "pipeline": p, "meta": m, "manifest": base_dir / "training_manifest.json", "obj": pl}

    raise FileNotFoundError(f"Nessun modello fitted trovato in {base_dir}")

# Risoluzione idempotente ------------------------------------------
if "MODEL_DIR" not in globals():
    _candidates = []
    env_root = os.getenv("MODELS_ROOT")
    if env_root and env_root.strip():
        _candidates.append(Path(env_root))
    _candidates.append(Path("./outputs/modeling"))
    MODELS_ROOT = next((c for c in _candidates if c.exists()), Path("./outputs/modeling"))
    ASSET_TYPE = "property"
    MODEL_DIR = MODELS_ROOT / ASSET_TYPE

resolved = (
    resolved if "resolved" in globals()
    else resolve_fitted_model(MODEL_DIR, os.getenv("MODEL_VERSION", "v2"))
)

MODEL_VERSION = resolved["version"]
PIPELINE_PATH = resolved["pipeline"]
META_PATH = resolved["meta"]
MANIFEST_PATH = resolved["manifest"]
pipeline = resolved["obj"]

# carica meta + verifica integrità hash
model_meta = _read_json(META_PATH)
expected_hash = model_meta.get("model_hash") or model_meta.get("pipeline_sha256")
actual_hash = _sha256_file(PIPELINE_PATH)
if expected_hash and expected_hash != actual_hash:
    raise ValueError(f"Bundle manomesso: meta={expected_hash[:8]}… != actual={actual_hash[:8]}…")

# feature_order path dai candidati
feature_order_candidates: List[Path] = [
    PIPELINE_PATH.parent / "feature_order.json",
]
if MANIFEST_PATH and MANIFEST_PATH.exists():
    try:
        mf = _read_json(MANIFEST_PATH)
        pfo = mf.get("paths", {}).get("feature_order")
        if pfo:
            feature_order_candidates.insert(0, Path(pfo))
    except Exception:
        pass
FEATURE_ORDER_PATH = next((p for p in feature_order_candidates if p and p.exists()), None)

# expected features local alla cella (no dipendenze globali)
ALL_EXPECTED = _resolve_expected_features(
    model_meta=model_meta,
    manifest_path=MANIFEST_PATH if MANIFEST_PATH and MANIFEST_PATH.exists() else None,
    feature_order_path=FEATURE_ORDER_PATH,
)

try:
    logger.info("Loaded fitted model", extra={
        "version": MODEL_VERSION,
        "pipeline": str(PIPELINE_PATH),
        "feature_order": str(FEATURE_ORDER_PATH) if FEATURE_ORDER_PATH else None,
        "n_expected_features": len(ALL_EXPECTED),
    })
except Exception:
    pass

print(f"✅ Loaded FITTED model {MODEL_VERSION} | features={len(ALL_EXPECTED)}")

# === Anomaly / Validation report ===
has_anomaly, validation_report = detect_anomalies(sample_property)

if has_anomaly:
    print("⚠️ Anomaly detected in input property!")
    try:
        logger.info("Validation report", extra={"report": validation_report})
    except Exception:
        pass
else:
    print("✅ No anomalies detected.")
    if validation_report.get("warnings"):
        print(f"ℹ️ Warnings: {validation_report['warnings']}")

try:
    logger.info(f"Anomaly check: {has_anomaly}")
except Exception:
    pass

# ------------------------------------------------------------------
# Drift check helpers
# ------------------------------------------------------------------
def check_feature_drift(record: dict, baseline_stats: dict):
    """
    Verifica drift (|z| > 3) rispetto a 'baseline_stats'.
    baseline_stats atteso: {feature: {"mean":..., "std":...} oppure include "min"/"max"/"iqr"}.
    Usa solo feature numeriche presenti sia in 'record' sia in 'baseline_stats'.
    Ritorna: (drifted: bool, first_message: Optional[str])
    """
    def _mean_std(v: dict) -> tuple[float, float]:
        m = float(v.get("mean", np.nan))
        std = v.get("std")
        if std is None:
            iqr = v.get("iqr")
            if iqr is not None:
                s = float(iqr) / 1.349  # approx σ from IQR
            else:
                mn, mx = v.get("min"), v.get("max")
                s = float(mx - mn) / 4.0 if (mn is not None and mx is not None) else np.nan
        else:
            s = float(std)
        return m, s

    drifted_any = False
    first_msg = None

    for feat, stats in baseline_stats.items():
        if feat not in record:
            continue
        val = record.get(feat)
        if val is None or not isinstance(val, (int, float, np.number)):
            continue
        m, s = _mean_std(stats)
        if not np.isfinite(m) or not np.isfinite(s) or s == 0:
            continue
        z = abs((float(val) - m) / s)
        if z > 3.0:
            msg = f"⚠️ Feature drift on '{feat}': z={z:.2f} (val={val}, mean={m:.2f}, std≈{s:.2f})"
            try:
                logger.warning(msg)
            except Exception:
                pass
            drifted_any = True
            if first_msg is None:
                first_msg = msg
    return drifted_any, first_msg

# Costruisci baseline stats da meta (preferisci raw → engineered)
source = (
    model_meta.get("raw_feature_stats")
    or model_meta.get("engineered_feature_stats")
    or {}
)

# Normalizza in forma {feat: {"mean":..., "std":..., "min":..., "max":..., "iqr":...}}
baseline_stats: Dict[str, Dict[str, Any]] = {}
for k, v in source.items():
    if isinstance(v, dict):
        entry = {}
        for key in ("mean", "std", "min", "max", "iqr"):
            if key in v:
                entry[key] = v[key]
        if "mean" in entry:
            baseline_stats[k] = entry
    else:
        # legacy: tuple/list con (mean, std?)
        try:
            mean = float(v[0])
            std  = float(v[1]) if len(v) > 1 else None
            entry = {"mean": mean}
            if std is not None:
                entry["std"] = std
            baseline_stats[k] = entry
        except Exception:
            pass

# 1) Prova drift su feature raw (intersezione con record)
overlap = set(baseline_stats.keys()) & set(sample_property.keys())
drift_flag = False
drift_msg = None
if overlap:
    drift_flag, drift_msg = check_feature_drift(sample_property, baseline_stats)
else:
    # 2) Tentativo “engineered”: usa il preproc della pipeline per trasformare il record
    try:
        preproc = pipeline[:-1]  # tutto tranne l'ultimo step (modello)
        df_row = pd.DataFrame([{k: sample_property.get(k, np.nan) for k in ALL_EXPECTED}], columns=ALL_EXPECTED)
        df_row = df_row.where(pd.notna(df_row), np.nan)  # pd.NA -> np.nan
        if hasattr(preproc, "get_feature_names_out"):
            names = list(preproc.get_feature_names_out())
        else:
            names = [f"f{i}" for i in range(getattr(preproc, "n_features_in_", 0) or len(ALL_EXPECTED))]
        X = preproc.transform(df_row)
        vals = np.asarray(X).ravel().tolist()
        engineered_record = {n: v for n, v in zip(names, vals)}
        overlap2 = set(baseline_stats.keys()) & set(engineered_record.keys())
        if overlap2:
            drift_flag, drift_msg = check_feature_drift(engineered_record, baseline_stats)
        else:
            try:
                logger.info("No overlapping engineered features for drift check.")
            except Exception:
                pass
    except Exception as e:
        try:
            logger.debug("Engineered drift check not available", extra={"error": str(e)})
        except Exception:
            pass

print(drift_msg if drift_flag else "✅ No significant feature drift detected.")

[2025-09-23 03:26:11,731] INFO nb04_infer: Loaded fitted model
[VALIDATION] Asset asset_infer_eaa37945 normalized. Errors=['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_month', 'air_quality_index', 'view', 'distance_to_center_km', 'orientation', 'humidity_level', 'noise_level', 'owner_occupied', 'urban_type', 'risk_score', 'parking_spot', 'condition_score', 'zone', 'cellar', 'location', 'garage', 'concierge', 'condition', 'region', 'attic', 'heating']"] Flags=['condition_score_missing', 'risk_score_missing', 'valuation_override', 'price_per_sqm_recomputed', 'schema_incomplete'] Changes={'valuation_k': (None, 250.0), 'validation_errors': (None, ['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_month', 'air_quality_index', 'view', 'distance_to_center_km

✅ Loaded FITTED model v2 | features=26
✅ No anomalies detected.
✅ No significant feature drift detected.


### Output Schema Builder

In [43]:
# === Output schema builder (refactor, aligned) — SAFE GUARDS ===
from __future__ import annotations
from typing import Dict, Any, Optional
from datetime import datetime
import time
import numpy as np
import pandas as pd
import json

from shared.common.utils import get_utc_now
from shared.common.constants import SCHEMA_VERSION

# (nel dubbio, import del constant)
try:
    ASSET_ID
except NameError:
    from shared.common.constants import ASSET_ID

def _utc_now_z() -> str:
    """UTC ISO-8601 senza microsecondi, suffisso 'Z'."""
    return get_utc_now().replace(microsecond=0).isoformat().replace("+00:00", "Z")

def _model_health_dict() -> Dict[str, Any]:
    """Raccoglie info basilari sugli artefatti del modello."""
    if PIPELINE_PATH.exists():
        size_mb = round(PIPELINE_PATH.stat().st_size / (1024 * 1024), 2)
        last_mod = datetime.utcfromtimestamp(PIPELINE_PATH.stat().st_mtime).replace(microsecond=0).isoformat() + "Z"
        status = "ok"
    else:
        size_mb = 0.0
        last_mod = None
        status = "missing"

    # Preferisci metriche da manifest, poi da meta.json
    mf_metrics = {}
    try:
        if MANIFEST_PATH and MANIFEST_PATH.exists():
            mf = json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
            mroot = mf.get("metrics") or {}
            mf_metrics = mroot.get("validation") or mroot.get("valid") or {}
    except Exception as e:
        try:
            logger.info("Manifest not readable for metrics", extra={"error": str(e)})
        except Exception:
            pass

    meta_metrics = model_meta.get("metrics", {}) or {}
    metrics = mf_metrics or meta_metrics

    return {
        "status": status,
        "model_path": str(PIPELINE_PATH),
        "size_mb": size_mb,
        "last_modified": last_mod,
        "metadata_valid": META_PATH.exists(),
        "metrics": metrics,
    }

def build_output_record(
    record: Dict[str, Any],
    *,
    asset_type: str,
    confidence_output: Dict[str, Any],
    latency_ms: float,
    has_anomaly: bool,
    validation_report: Dict[str, Any],
    drift_flag: bool,
    drift_msg: Optional[str],
    extra_metrics: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Costruisce il payload finale, pronto per log/publish."""
    # Info modello
    try:
        model_name = type(getattr(pipeline, "steps", [[None, pipeline]])[-1][1]).__name__
    except Exception:
        model_name = model_meta.get("model_class")

    ci_low, ci_high = confidence_output["confidence_interval"]

    out: Dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,  # "2.0"
        "asset_id": record.get(ASSET_ID, f"asset_manual_0001"),
        "asset_type": asset_type,
        "timestamp": _utc_now_z(),
        "metrics": {
            "valuation_k": round(float(confidence_output["prediction"]), 3),
            "point_pred_k": round(float(confidence_output.get("point_pred", confidence_output["prediction"])), 3),
            "uncertainty_k": round(float(confidence_output["uncertainty"]), 3),
            "confidence": float(confidence_output.get("confidence", 0.95)),
            "confidence_low_k": round(float(ci_low), 3),
            "confidence_high_k": round(float(ci_high), 3),
            "ci_margin_k": round(float(confidence_output["ci_margin"]), 3),
            "latency_ms": float(latency_ms),
        },
        "flags": {
            "anomaly": bool(has_anomaly),
            "drift_detected": bool(drift_flag),
            "needs_review": bool(has_anomaly or drift_flag),
        },
        "model_meta": {
            "value_model_version": MODEL_VERSION,
            "value_model_name": model_name,
            "n_features_total": len(ALL_EXPECTED),
            "n_features_categorical": len(categorical_expected),
            "n_features_numeric": len(numeric_expected),
        },
        "model_health": _model_health_dict(),
        "validation": {
            "ok": bool(validation_report.get("ok", True)),
            "warnings": validation_report.get("warnings"),
            "errors": validation_report.get("errors"),
        },
        "drift": {
            "message": drift_msg,
        },
        # placeholder compatibili col resto dello stack
        "offchain_refs": {"detail_report_hash": None, "sensor_batch_hash": None},
        "cache_hit": False,
        "schema_validation_error": "",
        "blockchain_txid": "",
        "asa_id": "",
        "publish": {"status": "skipped"},
    }

    if "method" in confidence_output:
        out["metrics"]["ci_method"] = confidence_output["method"]
    if confidence_output.get("n_estimators") is not None:
        out["metrics"]["n_estimators"] = int(confidence_output["n_estimators"])

    if extra_metrics:
        for k, v in extra_metrics.items():
            try:
                out["metrics"][k] = round(float(v), 3)
            except Exception:
                try:
                    logger.warning("Skipping non-numeric extra metric", extra={"key": k, "value": v})
                except Exception:
                    pass

    return out

# ------------------------ GUARD RUNTIME VARIABLES ------------------------
# 1) Predizione + CI + latenza (adatta alla nuova firma di predict_with_confidence)
if "confidence_output" not in locals() or "latency_ms" not in locals():
    t0 = time.perf_counter()
    confidence_output = predict_with_confidence(
        sample_property,
        pipeline_obj=pipeline,
        expected_features=ALL_EXPECTED,
        manifest_path=MANIFEST_PATH,
        confidence=0.95,
        verbose=False,
    )
    latency_ms = round((time.perf_counter() - t0) * 1000, 2)

# 2) Validation/anomaly (se mancano)
if "validation_report" not in locals() or "has_anomaly" not in locals():
    # usa la nostra validate_input_record (strict=False per non bloccare)
    _, validation_report = validate_input_record(sample_property, strict=False)
    has_anomaly = not validation_report.get("ok", True)

# 3) Drift (se mancano): prova raw → engineered
if "drift_flag" not in locals() or "drift_msg" not in locals():
    # baseline da meta (raw → engineered)
    source = (
        model_meta.get("raw_feature_stats")
        or model_meta.get("engineered_feature_stats")
        or {}
    )
    baseline_stats = {}
    for k, v in source.items():
        if isinstance(v, dict) and "mean" in v:
            entry = {key: v[key] for key in ("mean", "std", "min", "max", "iqr") if key in v}
            baseline_stats[k] = entry
    drift_flag, drift_msg = check_feature_drift(sample_property, baseline_stats)
    if not drift_flag:
        # engineered fallback
        try:
            preproc = pipeline[:-1]
            df_tmp = pd.DataFrame([{k: sample_property.get(k, np.nan) for k in ALL_EXPECTED}], columns=ALL_EXPECTED)
            df_tmp = df_tmp.where(pd.notna(df_tmp), np.nan)  # pd.NA -> np.nan
            if hasattr(preproc, "get_feature_names_out"):
                names = list(preproc.get_feature_names_out())
            else:
                names = [f"f{i}" for i in range(getattr(preproc, "n_features_in_", 0) or len(ALL_EXPECTED))]
            X = preproc.transform(df_tmp)
            vals = np.asarray(X).ravel().tolist()
            engineered = {n: v for n, v in zip(names, vals)}
            drift_flag, drift_msg = check_feature_drift(engineered, baseline_stats)
        except Exception as e:
            try:
                logger.debug("Engineered drift check not available", extra={"error": str(e)})
            except Exception:
                pass

# ------------------------ BUILD OUTPUT ------------------------
single_output = build_output_record(
    sample_property,
    asset_type=ASSET_TYPE,
    confidence_output=confidence_output,
    latency_ms=latency_ms,
    has_anomaly=has_anomaly,
    validation_report=validation_report,
    drift_flag=drift_flag,
    drift_msg=drift_msg,
)
print("✅ Output record built.")

✅ Output record built.


### Pre-chain compaction

In [44]:
from shared.common.constants import NOTE_MAX_BYTES
from shared.common.utils import canonical_json_dumps, sha256_hex

def build_compact_note(out: dict) -> dict:
    # tieni SOLO quanto necessario on-chain (hash-first approach)
    note = {
        "schema_version": "v2",
        "asset_id": out["asset_id"],
        "asset_type": out["asset_type"],
        "timestamp": out["timestamp"],
        "model": {
            "version": out["model_meta"]["value_model_version"],
            "hash": model_meta.get("pipeline_sha256") or model_meta.get("model_hash"),
        },
        "metrics": {
            "valuation_k": out["metrics"]["valuation_k"],
            "confidence": out["metrics"]["confidence"],
            "ci": [out["metrics"]["confidence_low_k"], out["metrics"]["confidence_high_k"]],
        },
        # opzionale: input_hash, trace_id se disponibili
    }
    return note

note = build_compact_note(single_output)
note_bytes = canonical_json_dumps(note).encode("utf-8")
note_size = len(note_bytes)
note_sha256 = sha256_hex(note_bytes)

single_output.setdefault("publish", {}).update({
    "status": "skipped",
    "note_size": note_size,
    "note_sha256": note_sha256,
    "is_compacted": True,
    "fallback_url_used": False,
})

assert note_size <= NOTE_MAX_BYTES, f"Nota troppo grande: {note_size} > {NOTE_MAX_BYTES}"
print(f"Note size={note_size} bytes | sha256={note_sha256[:16]}…")

Note size=281 bytes | sha256=bd083220182189d3…


### Batch Inference

In [45]:
# === Batch Inference (refactor, aligned) ===
import time
from copy import deepcopy
import numpy as np
import pandas as pd

# Definisci alcuni sample variati a partire dal validated sample
batch_samples = [
    deepcopy(sample_property),
    {**sample_property, "asset_id": None, "location": "Rome",     "size_m2": 120, "energy_class": "C"},
    {**sample_property, "asset_id": None, "location": "Florence", "size_m2":  70, "has_garden": 1, "energy_class": "A"},
    {**sample_property, "asset_id": None, "location": "Turin",    "size_m2": 150, "energy_class": "D"},
]

def _drift_for_record(rec: dict) -> tuple[bool, str | None]:
    """Prova drift su raw features; se non c'è overlap, tenta engineered tramite preproc."""
    # 1) Raw overlap
    overlap = set(baseline_stats.keys()) & set(rec.keys())
    if overlap:
        return check_feature_drift(rec, baseline_stats)
    # 2) Engineered fallback
    try:
        preproc = pipeline[:-1]  # tutto tranne il modello
        if hasattr(preproc, "get_feature_names_out"):
            names = list(preproc.get_feature_names_out())
        else:
            names = None
        X = preproc.transform(pd.DataFrame([{k: rec.get(k, np.nan) for k in ALL_EXPECTED}], columns=ALL_EXPECTED))
        vals = np.asarray(X).ravel().tolist()
        engineered = {n: v for n, v in zip(names or range(len(vals)), vals)}
        overlap2 = set(baseline_stats.keys()) & set(engineered.keys())
        if overlap2:
            return check_feature_drift(engineered, baseline_stats)
    except Exception as e:
        try:
            logger.info("Engineered drift check failed", extra={"error": str(e)})
        except Exception:
            pass
    return False, None

batch_outputs = []
for i, raw in enumerate(batch_samples, start=1):
    # 1) Validate + canonicalize (riusa report per anomaly)
    rec, vreport = validate_input_record(raw, strict=True)
    if ASSET_ID not in rec or not rec.get(ASSET_ID):
        rec[ASSET_ID] = f"asset_batch_{i:03}"

    # 2) Predict + CI + timing reale (nuova firma)
    t0 = time.perf_counter()
    conf = predict_with_confidence(
        rec,
        pipeline_obj=pipeline,
        expected_features=ALL_EXPECTED,
        manifest_path=MANIFEST_PATH,
        confidence=0.95,
        verbose=False,
    )
    latency_ms = round((time.perf_counter() - t0) * 1000, 2)

    # 3) Drift & anomaly flags
    drift_flag, drift_msg = _drift_for_record(rec)
    has_anomaly = not vreport.get("ok", True)

    # 4) Build output record
    out = build_output_record(
        rec,
        asset_type=ASSET_TYPE,
        confidence_output=conf,
        latency_ms=latency_ms,
        has_anomaly=has_anomaly,
        validation_report=vreport,
        drift_flag=drift_flag,
        drift_msg=drift_msg,
    )
    batch_outputs.append(out)

warnings.filterwarnings("ignore", message="X does not have valid feature names")

# Riepilogo compatto
pd.DataFrame(
    [{"asset_id": o["asset_id"], "valuation_k": o["metrics"]["valuation_k"]} for o in batch_outputs]
)

[VALIDATION] Asset asset_infer_eaa37945 normalized. Errors=['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_month', 'air_quality_index', 'view', 'distance_to_center_km', 'orientation', 'humidity_level', 'noise_level', 'owner_occupied', 'urban_type', 'risk_score', 'parking_spot', 'condition_score', 'zone', 'cellar', 'location', 'garage', 'concierge', 'condition', 'region', 'attic', 'heating']"] Flags=['condition_score_missing', 'risk_score_missing', 'valuation_override', 'price_per_sqm_recomputed', 'schema_incomplete'] Changes={'valuation_k': (None, 250.0), 'validation_errors': (None, ['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_month', 'air_quality_index', 'view', 'distance_to_center_km', 'orientation', 'humidity_level', 'noise_level', 'owner_occup

  df.at[0, col] = ""
  df.at[0, col] = ""
  df = df.applymap(_to_numpy_nan)
[2025-09-23 03:26:15,801] INFO nb04_infer: Engineered drift check failed
[VALIDATION] Asset None normalized. Errors=['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_month', 'air_quality_index', 'view', 'distance_to_center_km', 'orientation', 'humidity_level', 'noise_level', 'owner_occupied', 'urban_type', 'risk_score', 'parking_spot', 'condition_score', 'zone', 'cellar', 'garage', 'concierge', 'condition', 'region', 'attic', 'heating']"] Flags=['condition_score_missing', 'risk_score_missing', 'valuation_override', 'price_per_sqm_recomputed', 'schema_incomplete'] Changes={'valuation_k': (None, 60.0), 'validation_errors': (None, ['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_mo

Unnamed: 0,asset_id,valuation_k
0,asset_batch_001,6.38
1,asset_batch_002,6.02
2,asset_batch_003,5.94
3,asset_batch_004,6.24


### Logging JSON

In [46]:
# === JSONL Logging (atomic append) ===
from datetime import datetime
import os
from pathlib import Path
import json

from shared.common.utils import canonical_json_dumps, get_utc_now

MONITOR_LOG_PATH = Path("./outputs/logs/monitoring_log.jsonl")
MONITOR_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

def append_jsonl(record: dict, path: Path) -> None:
    """Append atomico in JSONL, con timestamp UTC 'Z' e JSON canonico/compatto."""
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = {**record, "_logged_at": get_utc_now().replace(microsecond=0).isoformat().replace("+00:00", "Z")}
    line = canonical_json_dumps(payload)
    fd = os.open(str(path), os.O_WRONLY | os.O_CREAT | os.O_APPEND)
    try:
        with os.fdopen(fd, "a", encoding="utf-8") as f:
            f.write(line + "\n")
            f.flush()
            os.fsync(f.fileno())
    except Exception:
        # in caso di errore, chiudi comunque il descrittore
        try:
            os.close(fd)
        except Exception:
            pass
        raise

# Predictions log
append_jsonl(single_output, LOG_PATH)
for o in batch_outputs:
    append_jsonl(o, LOG_PATH)
print(f"Appended {1 + len(batch_outputs)} predictions to {LOG_PATH}")

# Monitoring log (deriva dai record già costruiti: nessuna stima duplicata)
def _to_monitoring(entry: dict) -> dict:
    m = entry.get("metrics", {}) or {}
    mm = entry.get("model_meta", {}) or {}
    return {
        "asset_id": entry.get("asset_id"),
        "model_version": mm.get("value_model_version", MODEL_VERSION),
        "model_class": mm.get("value_model_name"),
        "latency_ms": m.get("latency_ms"),
        "valuation_k": m.get("valuation_k") or m.get("valuation_base_k"),
        "uncertainty_k": m.get("uncertainty_k") or m.get("uncertainty"),
        "confidence_low_k": m.get("confidence_low_k"),
        "confidence_high_k": m.get("confidence_high_k"),
        "ci_method": m.get("ci_method"),
        "n_estimators": m.get("n_estimators"),
        "anomaly": entry.get("flags", {}).get("anomaly"),
        "drift_detected": entry.get("flags", {}).get("drift_detected"),
    }

append_jsonl(_to_monitoring(single_output), MONITOR_LOG_PATH)
for o in batch_outputs:
    append_jsonl(_to_monitoring(o), MONITOR_LOG_PATH)

print(f"Appended monitoring for {1 + len(batch_outputs)} records to {MONITOR_LOG_PATH}")

Appended 5 predictions to outputs\logs\predictions_log.jsonl
Appended monitoring for 5 records to outputs\logs\monitoring_log.jsonl


### Single Prediction Function

In [47]:
# === Utility: Single Prediction Function (refactor, aligned) ===
from typing import Dict, Any, Optional
import time
import numpy as np
import pandas as pd

def _ensure_baseline_stats() -> Dict[str, Dict[str, Any]]:
    """Normalizza le baseline stats da model_meta (raw → engineered)."""
    source = (
        model_meta.get("raw_feature_stats")
        or model_meta.get("engineered_feature_stats")
        or {}
    )
    baseline: Dict[str, Dict[str, Any]] = {}
    for k, v in source.items():
        if isinstance(v, dict) and "mean" in v:
            entry = {key: v[key] for key in ("mean", "std", "min", "max", "iqr") if key in v}
            baseline[k] = entry
        elif isinstance(v, (list, tuple)) and len(v) >= 1:
            try:
                entry = {"mean": float(v[0])}
                if len(v) > 1:
                    entry["std"] = float(v[1])
                baseline[k] = entry
            except Exception:
                pass
    return baseline

def _drift_for_record(rec: dict, baseline_stats: dict) -> tuple[bool, Optional[str]]:
    # 1) Raw overlap
    overlap = set(baseline_stats.keys()) & set(rec.keys())
    if overlap:
        return check_feature_drift(rec, baseline_stats)
    # 2) Engineered fallback
    try:
        preproc = pipeline[:-1]  # tutto tranne il modello
        if hasattr(preproc, "get_feature_names_out"):
            names = list(preproc.get_feature_names_out())
        else:
            names = None
        X = preproc.transform(pd.DataFrame([{k: rec.get(k, np.nan) for k in ALL_EXPECTED}], columns=ALL_EXPECTED))
        vals = np.asarray(X).ravel().tolist()
        engineered = {n: v for n, v in zip(names or range(len(vals)), vals)}
        overlap2 = set(baseline_stats.keys()) & set(engineered.keys())
        if overlap2:
            return check_feature_drift(engineered, baseline_stats)
    except Exception as e:
        try:
            logger.info("Engineered drift check failed", extra={"error": str(e)})
        except Exception:
            pass
    return False, None

def predict_asset(record: Dict[str, Any], asset_id: Optional[str] = None, asset_type: str = ASSET_TYPE) -> Dict[str, Any]:
    # 1) Validazione + canonicalizzazione + derivate sicure
    rec, vreport = validate_input_record(record, strict=True)
    if not rec.get(ASSET_ID):
        rec[ASSET_ID] = asset_id or f"asset_single_{np.random.randint(1_000_000):06d}"

    # 2) Predizione + CI + latenza reale (nuova firma)
    t0 = time.perf_counter()
    conf = predict_with_confidence(
        rec,
        pipeline_obj=pipeline,
        expected_features=ALL_EXPECTED,
        manifest_path=MANIFEST_PATH,
        confidence=0.95,
        verbose=False,
    )
    latency_ms = round((time.perf_counter() - t0) * 1000, 2)

    # 3) Drift & anomaly
    baseline = _ensure_baseline_stats()
    drift_flag, drift_msg = _drift_for_record(rec, baseline)
    has_anomaly = not vreport.get("ok", True)

    # 4) Output record coerente
    out = build_output_record(
        rec,
        asset_type=asset_type,
        confidence_output=conf,
        latency_ms=latency_ms,
        has_anomaly=has_anomaly,
        validation_report=vreport,
        drift_flag=drift_flag,
        drift_msg=drift_msg,
    )

    # --- Pricing breakdown (euristico) ---
    price_expl = _maybe_explain_price(record)
    if price_expl:
        out.setdefault("explanations", {})["pricing_breakdown"] = price_expl

    # --- Sanity check di prezzo (benchmark per location) ---
    yhat_k = float(conf["prediction"])
    pb = _price_benchmark_flag(record, yhat_k)
    if pb:
        # salva il report nel payload
        out.setdefault("sanity", {})["price_benchmark"] = pb
        # se esiste un flag 'out_of_band', riflettilo nei flags globali
        out["flags"]["price_out_of_band"] = bool(pb.get("out_of_band", False))
        # opzionale: includi anche questo nel needs_review
        out["flags"]["needs_review"] = bool(out["flags"]["needs_review"] or pb.get("out_of_band", False))

    return out

# Test rapido
warnings.filterwarnings("ignore", message="X does not have valid feature names")
test_output = predict_asset(sample_property, asset_id="asset_function_test")
test_output

[VALIDATION] Asset asset_infer_eaa37945 normalized. Errors=['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_month', 'air_quality_index', 'view', 'distance_to_center_km', 'orientation', 'humidity_level', 'noise_level', 'owner_occupied', 'urban_type', 'risk_score', 'parking_spot', 'condition_score', 'zone', 'cellar', 'location', 'garage', 'concierge', 'condition', 'region', 'attic', 'heating']"] Flags=['condition_score_missing', 'risk_score_missing', 'valuation_override', 'price_per_sqm_recomputed', 'schema_incomplete'] Changes={'valuation_k': (None, 250.0), 'validation_errors': (None, ['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_month', 'air_quality_index', 'view', 'distance_to_center_km', 'orientation', 'humidity_level', 'noise_level', 'owner_occup

  df.at[0, col] = ""
  df.at[0, col] = ""
  df = df.applymap(_to_numpy_nan)
[2025-09-23 03:26:23,590] INFO nb04_infer: Engineered drift check failed
[2025-09-23 03:26:23,597] INFO nb04_infer: explain_price not available
[2025-09-23 03:26:23,617] INFO nb04_infer: price_benchmark not available


{'schema_version': 'v2',
 'asset_id': 'asset_function_test',
 'asset_type': 'property',
 'timestamp': '2025-09-23T01:26:23Z',
 'metrics': {'valuation_k': 6.38,
  'point_pred_k': 6.38,
  'uncertainty_k': 1.0,
  'confidence': 0.95,
  'confidence_low_k': 4.42,
  'confidence_high_k': 8.34,
  'ci_margin_k': 1.96,
  'latency_ms': 245.04,
  'ci_method': 'global_sigma'},
 'flags': {'anomaly': False, 'drift_detected': False, 'needs_review': False},
 'model_meta': {'value_model_version': 'v2',
  'value_model_name': 'Pipeline',
  'n_features_total': 26,
  'n_features_categorical': 8,
  'n_features_numeric': 18},
 'model_health': {'status': 'ok',
  'model_path': 'outputs\\modeling\\property\\value_regressor_v2.joblib',
  'size_mb': 158.25,
  'last_modified': '2025-09-23T00:05:48Z',
  'metadata_valid': True,
  'metrics': {}},
 'drift': {'message': None},
 'offchain_refs': {'detail_report_hash': None, 'sensor_batch_hash': None},
 'cache_hit': False,
 'schema_validation_error': '',
 'blockchain_txid'

### Sensitivity Check (vary size_m2)

In [30]:
# === Sensitivity check: 'size_m2' (refactor, aligned) ===
sizes = [60, 90, 130, 170, 210]
rows = []

# baseline (size attuale del sample)
base_conf = predict_with_confidence(
    sample_property,
    pipeline_obj=pipeline,
    expected_features=ALL_EXPECTED,
    manifest_path=MANIFEST_PATH,
    confidence=0.95,
    verbose=False,
)
base_pred = float(base_conf["prediction"])

for s in sizes:
    rec_raw = {**sample_property, "size_m2": s}
    try:
        rec, _vreport = validate_input_record(rec_raw, strict=True)
        conf = predict_with_confidence(
            rec,
            pipeline_obj=pipeline,
            expected_features=ALL_EXPECTED,
            manifest_path=MANIFEST_PATH,
            confidence=0.95,
            verbose=False,
        )
        ci_low, ci_high = conf["confidence_interval"]
        rows.append({
            "size_m2": s,
            "prediction_k": round(float(conf["prediction"]), 3),
            "ci_low_k": round(float(ci_low), 3),
            "ci_high_k": round(float(ci_high), 3),
            "ci_margin_k": round(float(conf["ci_margin"]), 3),
            "uncertainty_k": round(float(conf["uncertainty"]), 3),
            "delta_vs_base_k": round(float(conf["prediction"]) - base_pred, 3),
        })
    except Exception as e:
        rows.append({"size_m2": s, "prediction_k": None, "error": str(e)})

warnings.filterwarnings("ignore", message="X does not have valid feature names")
pd.DataFrame(rows)

  df.at[0, col] = ""
  df.at[0, col] = ""
  df = df.applymap(_to_numpy_nan)
[VALIDATION] Asset asset_infer_eaa37945 normalized. Errors=['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_month', 'air_quality_index', 'view', 'distance_to_center_km', 'orientation', 'humidity_level', 'noise_level', 'owner_occupied', 'urban_type', 'risk_score', 'parking_spot', 'condition_score', 'zone', 'cellar', 'location', 'garage', 'concierge', 'condition', 'region', 'attic', 'heating']"] Flags=['condition_score_missing', 'risk_score_missing', 'valuation_override', 'price_per_sqm_recomputed', 'schema_incomplete'] Changes={'valuation_k': (None, 30.0), 'validation_errors': (None, ['condition_score_missing', 'risk_score_missing', 'valuation_k_too_low_or_missing', 'price_per_sqm_non_positive_or_missing', "missing_keys:['temperature_avg', 'listing_month', 'air_quality_index', 'view', 'distance_

Unnamed: 0,size_m2,prediction_k,ci_low_k,ci_high_k,ci_margin_k,uncertainty_k,delta_vs_base_k
0,60,5.82,3.86,7.78,1.96,1.0,-0.56
1,90,5.93,3.97,7.89,1.96,1.0,-0.45
2,130,6.15,4.19,8.11,1.96,1.0,-0.23
3,170,6.35,4.39,8.31,1.96,1.0,-0.03
4,210,6.38,4.42,8.34,1.96,1.0,0.0


### Compare With API Prediction Consistency

In [31]:
# === Compare with API prediction consistency (refactor, robust) ===
import os
import requests

if COMPARE_WITH_API:
    def _pick_pred_metrics(payload: dict):
        """Estrae predizione e CI dall'output API (tollerante a v1/v2/flat)."""
        if not isinstance(payload, dict):
            return None, None, None, None
        m = payload.get("metrics", {}) if isinstance(payload.get("metrics"), dict) else {}
        # fallback flat
        pred = (
            m.get("valuation_k")
            or m.get("valuation_base_k")
            or m.get("valuation")
            or payload.get("valuation_k")
            or payload.get("prediction")
        )
        ci_low = m.get("confidence_low_k") or payload.get("confidence_low_k")
        ci_high = m.get("confidence_high_k") or payload.get("confidence_high_k")
        unc = m.get("uncertainty_k") or m.get("uncertainty") or payload.get("uncertainty_k")
        return pred, ci_low, ci_high, unc

    def _model_version(payload: dict):
        mm = (payload or {}).get("model_meta", {}) if isinstance(payload, dict) else {}
        return mm.get("value_model_version") or mm.get("model_version")

    try:
        # Usa il record già validato per coerenza
        headers = {"Content-Type": "application/json"}
        token = os.getenv("AXM_TOKEN")
        if token:
            headers["Authorization"] = f"Bearer {token}"
        url = f"{API_BASE}/predict/{ASSET_TYPE}"

        resp = requests.post(url, json=sample_property, headers=headers, timeout=8)
        if resp.status_code == 200:
            api_json = resp.json()
            api_pred, api_low, api_high, api_unc = _pick_pred_metrics(api_json)

            if api_pred is None:
                print(f"[API] ❌ Response OK ma 'metrics.valuation_*' mancante: {api_json}")
            else:
                local_pred = float(single_output["metrics"]["valuation_k"])
                local_low  = float(single_output["metrics"]["confidence_low_k"])
                local_high = float(single_output["metrics"]["confidence_high_k"])

                delta = float(abs(float(api_pred) - local_pred))
                pct = (delta / max(1e-9, abs(local_pred))) * 100.0

                # Check overlap CI (quando disponibile)
                ci_overlap = None
                if api_low is not None and api_high is not None:
                    try:
                        api_low_f, api_high_f = float(api_low), float(api_high)
                        ci_overlap = not (api_high_f < local_low or api_low_f > local_high)
                    except Exception:
                        ci_overlap = None

                av = _model_version(api_json)
                lv = MODEL_VERSION
                ver_note = "" if av is None or av == lv else f" | ⚠️ model_version API={av} vs LOCAL={lv}"

                msg = (
                    f"[API] Pred={float(api_pred):.3f} k€ | Local={local_pred:.3f} k€ | Δ={delta:.4f} ({pct:.2f}%)"
                    f" | CI overlap: {ci_overlap if ci_overlap is not None else 'n/a'}{ver_note}"
                )
                print(msg)
        else:
            print(f"[API] ❌ {resp.status_code} | {resp.text[:200]}")
    except Exception as e:
        print(f"[API] ⚠️ Compare skipped due to exception: {e}")

[API] ❌ 500 | {"detail":"Inference error: boolean value of NA is ambiguous"}


### Hash Pipeline File (Audit)

In [32]:
# === Audit: hash artifacts & compare with manifest/meta ===
import hashlib
import json

def file_sha256(path: Path, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

def manifest_expected_hash(manifest_path: Path) -> str | None:
    """
    Tenta più posizioni comuni per l'hash del pipeline:
    - paths.pipeline_sha256 (nostra convenzione)
    - model_meta.model_hash (nostra convenzione)
    - artifacts.pipeline_sha256 / artifacts.model_sha256 (legacy)
    - model.sha256 (legacy)
    - pipeline_sha256 (flat)
    """
    try:
        if not manifest_path.exists():
            return None
        mf = json.loads(manifest_path.read_text(encoding="utf-8"))
        return (
            (mf.get("paths") or {}).get("pipeline_sha256")
            or (mf.get("model_meta") or {}).get("model_hash")
            or (mf.get("artifacts") or {}).get("pipeline_sha256")
            or (mf.get("artifacts") or {}).get("model_sha256")
            or (mf.get("model") or {}).get("sha256")
            or mf.get("pipeline_sha256")
        )
    except Exception as e:
        try:
            logger.info("Manifest present but unreadable for hash", extra={"error": str(e)})
        except Exception:
            pass
    return None

# Calcoli
model_sha = file_sha256(PIPELINE_PATH)
expected_sha_manifest = manifest_expected_hash(MANIFEST_PATH)

# Hash atteso dal meta (valore, non hash del file meta!)
expected_sha_meta = (model_meta.get("model_hash") or model_meta.get("pipeline_sha256"))

print(f"Model SHA256: {model_sha} (first16={model_sha[:16]})")
if expected_sha_meta:
    print(f"Meta   expects: {expected_sha_meta} (match: {expected_sha_meta == model_sha})")

if expected_sha_manifest:
    ok = (expected_sha_manifest == model_sha)
    print(f"Manifest expects: {expected_sha_manifest} (match: {ok})")
    if not ok:
        try:
            logger.warning(
                "Pipeline hash mismatch with manifest",
                extra={"expected": expected_sha_manifest, "actual": model_sha},
            )
        except Exception:
            pass

Model SHA256: 9bd5abe6c69ebbffed532e21e70b9b1181da20ec89e2d7be9d46f1a35d830a52 (first16=9bd5abe6c69ebbff)
Meta   expects: 9bd5abe6c69ebbffed532e21e70b9b1181da20ec89e2d7be9d46f1a35d830a52 (match: True)
Manifest expects: 9bd5abe6c69ebbffed532e21e70b9b1181da20ec89e2d7be9d46f1a35d830a52 (match: True)


### Schema Validation

In [33]:
# === Strict schema validation (version-aware) ===
import json
from pathlib import Path
from jsonschema import validate, ValidationError
from jsonschema import Draft7Validator
from shared.common.utils import NumpyJSONEncoder
from shared.common.constants import SCHEMA_VERSION as _DEF_SCHEMA_VER

try:
    # se disponibile, usa un validator più recente
    from jsonschema import Draft202012Validator as BetterValidator
    Validator = BetterValidator
except Exception:
    Validator = Draft7Validator

SCHEMAS_DIR = Path("../schemas")
example_path = SCHEMAS_DIR / "output_example.json"

def _norm_schema_tag(ver: str) -> str:
    """
    Normalizza la versione di schema dagli output:
    accetta '2', '2.0', 'v2' -> 'v2'; altrimenti torna il tag in lower.
    """
    v = (ver or "").strip().lower()
    if v in {"2", "2.0", "02", "v2"}:
        return "v2"
    if v.startswith("v"):
        return v
    return v

# 1) Scegli lo schema in base all'output; fallback al default (v2) poi v1
schema_tag = _norm_schema_tag(str(single_output.get("schema_version", _DEF_SCHEMA_VER)))
schema_def_path = SCHEMAS_DIR / f"output_schema_{schema_tag}.json"
if not schema_def_path.exists():
    # fallback al default dichiarato nel progetto (es. "2.0" -> v2)
    schema_def_path = SCHEMAS_DIR / f"output_schema_{_norm_schema_tag(_DEF_SCHEMA_VER)}.json"
if not schema_def_path.exists():
    # fallback legacy a v1, se proprio
    schema_def_path = SCHEMAS_DIR / "output_schema_v1.json"

# 2) Carica schema
if not schema_def_path.exists():
    print(f"❌ Schema file not found: {schema_def_path}")
    schema_def = None
else:
    with schema_def_path.open("r", encoding="utf-8") as f:
        schema_def = json.load(f)
    print(f"🔎 Using schema: {schema_def_path.name}")

# 3) Helper per JSON-compat (np types) e errori leggibili
def _to_jsonable(obj: dict) -> dict:
    # serializza con NumpyJSONEncoder e ricarica come dict puro
    return json.loads(json.dumps(obj, cls=NumpyJSONEncoder, ensure_ascii=False))

def _format_error(e: ValidationError) -> str:
    path = ".".join(str(p) for p in e.path) or "<root>"
    spath = " → ".join(str(p) for p in e.schema_path)
    return f"at '{path}': {e.message}  [schema: {spath}]"

# 4) Valida il single_output
if schema_def:
    try:
        validate(instance=_to_jsonable(single_output), schema=schema_def)
        print("✅ Strict schema validation passed (single_output).")
    except ValidationError as e:
        print("❌ Strict schema validation failed (single_output):", _format_error(e))

# 5) Confronto struttura PROFONDO con l'esempio (se presente)
def _deep_keys(d, prefix=""):
    keys = set()
    if isinstance(d, dict):
        for k, v in d.items():
            newp = f"{prefix}.{k}" if prefix else k
            keys.add(newp)
            keys |= _deep_keys(v, newp)
    elif isinstance(d, list):
        if d:
            keys |= _deep_keys(d[0], prefix + "[]")
        else:
            keys.add(prefix + "[]")
    return keys

if example_path.exists():
    with example_path.open("r", encoding="utf-8") as f:
        example = json.load(f)
    ex_keys = _deep_keys(example)
    out_keys = _deep_keys(single_output)
    only_in_output = sorted(out_keys - ex_keys)
    only_in_example = sorted(ex_keys - out_keys)
    if not only_in_output and not only_in_example:
        print("✅ single_output matches example structure (deep).")
    else:
        if only_in_output:
            print("⚠️ Extra keys vs example (deep):", only_in_output[:10], ("…+" if len(only_in_output) > 10 else ""))
        if only_in_example:
            print("⚠️ Missing keys vs example (deep):", only_in_example[:10], ("…+" if len(only_in_example) > 10 else ""))
else:
    print(f"ℹ️ Example file not found: {example_path}")

# 6) Valida anche i batch outputs (se presenti)
if schema_def and isinstance(globals().get("batch_outputs"), list) and batch_outputs:
    validator = Validator(schema_def)
    errors = []
    for idx, rec in enumerate(batch_outputs, start=1):
        for err in validator.iter_errors(_to_jsonable(rec)):
            errors.append((idx, err))
    if not errors:
        print(f"✅ Batch outputs: all {len(batch_outputs)} records pass schema validation.")
    else:
        print(f"❌ Batch outputs: {len(errors)} schema errors found on {len(set(i for i, _ in errors))} records.")
        for i, err in errors[:5]:
            print(f"   • [#{i}] {_format_error(err)}")
        if len(errors) > 5:
            print(f"   … and {len(errors)-5} more")

🔎 Using schema: output_schema_v2.json
✅ Strict schema validation passed (single_output).
ℹ️ Example file not found: ..\schemas\output_example.json
✅ Batch outputs: all 4 records pass schema validation.


### Test API via curl

In [34]:
# === Test API via requests (configurable, robust) ===
import os
import time
import json
from uuid import uuid4
from pathlib import Path
import requests

from shared.common.utils import canonical_json_dumps, NumpyJSONEncoder

# Carica payload da file (fallback: sample_property validato)
sample_path = Path("../data/sample_property.json")
try:
    sample_payload = json.loads(sample_path.read_text(encoding="utf-8"))
except Exception as e:
    print(f"ℹ️ Using in-notebook sample; cannot load {sample_path.name}: {e}")
    sample_payload = None

payload = sample_payload or sample_property

# Parametri API
PUBLISH = os.getenv("PUBLISH", "false").lower() in {"1", "true", "yes", "y"}
url = f"{API_BASE}/predict/{ASSET_TYPE}"
params = {"publish": "true"} if PUBLISH else {}

headers = {"Content-Type": "application/json"}
token = os.getenv("AXM_TOKEN")
if token:
    headers["Authorization"] = f"Bearer {token}"
# opzionale: idempotenza lato server
headers["X-Idempotency-Key"] = uuid4().hex

# Prepara payload JSON-safe (senza NaN/np types)
payload_json = json.loads(json.dumps(payload, cls=NumpyJSONEncoder, ensure_ascii=False))

try:
    t0 = time.perf_counter()
    resp = requests.post(url, params=params, json=payload_json, headers=headers, timeout=12)
    latency_ms = round((time.perf_counter() - t0) * 1000, 2)

    if resp.ok:
        api_json = resp.json()

        # Helper: estrai metrica (v1/v2 compat)
        def _pick_pred_metrics(payload: dict):
            m = (payload or {}).get("metrics", {}) if isinstance(payload, dict) else {}
            pred = m.get("valuation_k") or m.get("valuation_base_k") or m.get("valuation") or payload.get("prediction")
            ci_low = m.get("confidence_low_k") or payload.get("confidence_low_k")
            ci_high = m.get("confidence_high_k") or payload.get("confidence_high_k")
            unc = m.get("uncertainty_k") or m.get("uncertainty")
            return pred, ci_low, ci_high, unc

        api_pred, api_low, api_high, api_unc = _pick_pred_metrics(api_json)

        if api_pred is None:
            print("❌ API OK but missing 'metrics.valuation_*' in response")
            print(json.dumps(api_json, indent=2)[:800])
        else:
            local_pred = float(single_output["metrics"]["valuation_k"])
            local_low  = float(single_output["metrics"]["confidence_low_k"])
            local_high = float(single_output["metrics"]["confidence_high_k"])

            delta = abs(float(api_pred) - local_pred)
            pct = (delta / max(1e-9, abs(local_pred))) * 100.0

            ci_overlap = None
            if api_low is not None and api_high is not None:
                try:
                    ci_overlap = not (float(api_high) < local_low or float(api_low) > local_high)
                except Exception:
                    ci_overlap = None

            print(
                f"✅ API Call Success in {latency_ms} ms | "
                f"API={float(api_pred):.3f} k€ (unc={api_unc}) | "
                f"LOCAL={local_pred:.3f} k€ | Δ={delta:.4f} ({pct:.2f}%) | "
                f"CI overlap: {ci_overlap if ci_overlap is not None else 'n/a'}"
            )
    else:
        print(f"❌ API Call Failed: {resp.status_code}")
        print(resp.text[:800])

except Exception as e:
    print(f"❌ Exception during API request: {e}")

ℹ️ Using in-notebook sample; cannot load sample_property.json: [Errno 2] No such file or directory: '..\\data\\sample_property.json'
❌ API Call Failed: 500
{"detail":"Inference error: boolean value of NA is ambiguous"}
