### Imports & Config

In [1]:
# 01) Imports & Config — hardened (path bootstrap, legacy shims, safe predict)
from __future__ import annotations

import os, re, json, logging, warnings, hashlib, sys, types, importlib
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple

# ---------------------------------------------------------------------
# Path bootstrap
# ---------------------------------------------------------------------
# Resolve the notebook working directory, then pick the project root:
#   1. cwd is named "notebooks" and its parent holds "shared" -> parent
#   2. cwd itself holds "shared"                               -> cwd
#   3. otherwise                                               -> parent of cwd
NB_DIR = Path.cwd().resolve()
if NB_DIR.name.lower() == "notebooks" and NB_DIR.parent.joinpath("shared").exists():
    PROJECT_ROOT = NB_DIR.parent.resolve()
elif NB_DIR.joinpath("shared").exists():
    PROJECT_ROOT = NB_DIR
else:
    PROJECT_ROOT = NB_DIR.parent

# Put the project root first on sys.path so `shared.*` imports resolve from it.
if sys.path.count(str(PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(PROJECT_ROOT))

# ---------------------------------------------------------------------
# Env: cap the native thread pools (avoids crashes with MKL/OpenBLAS)
# ---------------------------------------------------------------------
# setdefault leaves any value the user already exported untouched; only
# variables that are not set yet are forced to "1".
for _k in ("OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS", "NUMEXPR_NUM_THREADS"):
    os.environ.setdefault(_k, "1")

# ---------------------------------------------------------------------
# Legacy shims (installali SUBITO, prima di qualsiasi joblib.load)
# ---------------------------------------------------------------------
def _install_legacy_aliases():
    """Expose the modern ``shared.common`` modules under their legacy names.

    Registers ``notebooks.shared.common.constants`` and
    ``notebooks.shared.common.serving_transformers`` in ``sys.modules`` as
    aliases of the corresponding ``shared.common`` modules, creating empty
    placeholder parent packages where none is registered yet. Also makes
    sure the constants module carries ``EXPECTED_PRICE_PER_SQM_EUR_RANGE``.
    """
    # Placeholder parent packages (only when not already registered).
    for pkg_name in ("notebooks", "notebooks.shared", "notebooks.shared.common"):
        sys.modules.setdefault(pkg_name, types.ModuleType(pkg_name))

    # Real constants module, reachable through the legacy dotted path.
    constants_mod = importlib.import_module("shared.common.constants")
    sys.modules["notebooks.shared.common.constants"] = constants_mod

    # Legacy symbol that older artifacts may look up; falls back to
    # EXPECTED_PRED_RANGE, then to a hard-coded (20.0, 20000.0) range.
    if not hasattr(constants_mod, "EXPECTED_PRICE_PER_SQM_EUR_RANGE"):
        constants_mod.EXPECTED_PRICE_PER_SQM_EUR_RANGE = getattr(
            constants_mod, "EXPECTED_PRED_RANGE", (20.0, 20000.0)
        )

    # Same aliasing for serving_transformers.
    transformers_mod = importlib.import_module("shared.common.serving_transformers")
    sys.modules["notebooks.shared.common.serving_transformers"] = transformers_mod

    # Make "notebooks.shared.common" navigable by attribute access.
    setattr(sys.modules["notebooks"], "shared", sys.modules["notebooks.shared"])
    setattr(sys.modules["notebooks.shared"], "common", sys.modules["notebooks.shared.common"])

# Install the aliases IMMEDIATELY (before any import that might touch the pickles)
_install_legacy_aliases()

# ---------------------------------------------------------------------
# Third-party
# ---------------------------------------------------------------------
import joblib
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.utils.validation import check_is_fitted

# ---------------------------------------------------------------------
# Shared modern modules
# ---------------------------------------------------------------------
from shared.common.config import configure_logger
from shared.common.utils import canonical_json_dumps
from shared.common.sanity_checks import leakage_gate, scale_gate
from shared.common.constants import SCHEMA_VERSION, NOTE_MAX_BYTES

# ---------------------------------------------------------------------
# Logger & dirs
# ---------------------------------------------------------------------
# Asset family handled by this notebook; also the sub-folder name under MODELS_ROOT.
ASSET_TYPE = "property"
# Model version tried first; overridable through the MODEL_VERSION env var.
PREFERRED_MODEL_VERSION = os.getenv("MODEL_VERSION", "v2")

# Candidate model roots, in priority order.
MODELS_ROOT_CANDIDATES: List[Path] = [
    PROJECT_ROOT / "notebooks" / "outputs" / "modeling",
    PROJECT_ROOT / "outputs" / "modeling",
]
# A non-blank MODELS_ROOT env var takes precedence over the built-in candidates.
# NOTE(review): the raw (unstripped) value is what gets turned into a Path.
env_root = os.getenv("MODELS_ROOT")
if env_root and env_root.strip():
    MODELS_ROOT_CANDIDATES.insert(0, Path(env_root))

# First candidate that exists on disk; otherwise PROJECT_ROOT/outputs/modeling.
MODELS_ROOT = next((c for c in MODELS_ROOT_CANDIDATES if c.exists()), PROJECT_ROOT / "outputs" / "modeling")
MODEL_DIR = MODELS_ROOT / ASSET_TYPE
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Output locations (created eagerly so later cells can write without checks).
INFER_DIR = PROJECT_ROOT / "outputs" / "inference"
INFER_DIR.mkdir(parents=True, exist_ok=True)
LOG_PATH = PROJECT_ROOT / "outputs" / "logs" / "predictions_log.jsonl"
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

# API endpoint and on/off switch read from the environment.
API_BASE = os.getenv("API_BASE", "http://127.0.0.1:8000")
COMPARE_WITH_API = os.getenv("COMPARE_WITH_API", "true").lower() in {"1", "true", "yes", "y"}

# Logger configuration, also environment-driven.
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
LOG_JSON = os.getenv("LOG_JSON", "false").lower() in {"1", "true", "yes", "y"}
logger = configure_logger(level=LOG_LEVEL, name="nb04_infer", json_format=LOG_JSON)
# Silences every UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# ---------------------------------------------------------------------
# Utils
# ---------------------------------------------------------------------
_version_re = re.compile(r"value_regressor_(v\d+)\.joblib$")

def _list_versions(dirpath: Path) -> List[str]:
    out: List[Tuple[int, str]] = []
    for p in dirpath.glob("value_regressor_v*.joblib"):
        m = _version_re.search(p.name)
        if not m:
            continue
        v = m.group(1)
        try:
            n = int(v[1:])
        except Exception:
            n = -1
        out.append((n, v))
    out.sort(reverse=True)
    return [v for _, v in out]

def _is_fitted(obj) -> bool:
    """Best-effort check that *obj* is a fitted estimator.

    Pipelines are judged by their last step; a TransformedTargetRegressor by
    its ``regressor_`` (or, failing that, ``regressor``) attribute. Anything
    else goes straight to ``check_is_fitted``. Any exception means "not
    fitted", so this function never raises.
    """
    try:
        if isinstance(obj, Pipeline):
            final_step = obj.steps[-1][1]
            return _is_fitted(final_step)

        if isinstance(obj, TransformedTargetRegressor):
            inner = getattr(obj, "regressor_", None) or getattr(obj, "regressor", None)
            if inner is not None:
                return _is_fitted(inner)

        # Plain estimator, or a wrapper without an inner regressor.
        check_is_fitted(obj)
    except Exception:
        return False
    return True

def _read_json(path: Path) -> Dict[str, Any]:
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def _sha256_file(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1<<20), b""):
            h.update(chunk)
    return h.hexdigest()

def _dedup_preserve(seq: List[str]) -> List[str]:
    seen, out = set(), []
    for s in seq:
        if s not in seen:
            seen.add(s); out.append(s)
    return out

def _safe_joblib_load(path: Path):
    """Load a joblib artifact, retrying once after re-installing the legacy aliases.

    Only a ModuleNotFoundError triggers the retry; a failure on the second
    attempt, or any other exception, propagates to the caller.
    """
    try:
        artifact = joblib.load(path)
    except ModuleNotFoundError:
        # The artifact references a module path that is not importable yet:
        # (re)register the legacy aliases and try exactly once more.
        _install_legacy_aliases()
        artifact = joblib.load(path)
    return artifact

# ---------------------------------------------------------------------
# Carica modello FITTED (preferito → fallback più recente)
# ---------------------------------------------------------------------
def resolve_fitted_model(base_dir: Path, preferred: Optional[str]) -> Dict[str, Any]:
    """Locate and load a fitted model bundle from *base_dir*.

    The *preferred* version is tried first; if it is missing, cannot be
    loaded, or is not fitted, the remaining versions found on disk are tried
    from newest to oldest.

    Args:
        base_dir: Directory holding ``value_regressor_<ver>.joblib`` files and
            their ``value_regressor_<ver>_meta.json`` companions.
        preferred: Version tag to try first (e.g. ``"v2"``), or a falsy value
            to go straight to the newest available version.

    Returns:
        A dict with keys ``version``, ``pipeline`` (artifact path), ``meta``
        (metadata path), ``manifest`` (training manifest path, which may not
        exist) and ``obj`` (the loaded estimator).

    Raises:
        FileNotFoundError: No loadable, fitted model exists in *base_dir*.
    """
    def _try(ver: str) -> Optional[Dict[str, Any]]:
        artifact_path = base_dir / f"value_regressor_{ver}.joblib"
        meta_path = base_dir / f"value_regressor_{ver}_meta.json"
        if not (artifact_path.exists() and meta_path.exists()):
            return None
        try:
            loaded = _safe_joblib_load(artifact_path)
        except Exception as exc:
            # An unreadable artifact must not abort the search: the whole
            # point of this function is to fall back to another version.
            logger.warning("Model %s non caricabile (%s); cerco fallback…", ver, exc)
            return None
        if not _is_fitted(loaded):
            return None
        return {
            "version": ver,
            "pipeline": artifact_path,
            "meta": meta_path,
            "manifest": base_dir / "training_manifest.json",
            "obj": loaded,
        }

    if preferred:
        got = _try(preferred)
        if got:
            return got
        logger.warning("Model %s assente o non fitted/leggibile; cerco fallback…", preferred)

    for ver in _list_versions(base_dir):
        if ver == preferred:
            # Already tried above; avoid unpickling the same artifact twice.
            continue
        got = _try(ver)
        if got:
            return got

    raise FileNotFoundError(f"Nessun modello fitted trovato in {base_dir}")

# ---- GO ----
# Resolve the model bundle and expose its parts as module-level names.
resolved = resolve_fitted_model(MODEL_DIR, PREFERRED_MODEL_VERSION)
MODEL_VERSION: str = resolved["version"]
PIPELINE_PATH: Path = resolved["pipeline"]
META_PATH: Path = resolved["meta"]
MANIFEST_PATH: Path = resolved["manifest"]
pipeline = resolved["obj"]

# Bundle integrity: compare the hash recorded in the metadata (either key)
# with the SHA-256 of the artifact on disk. Skipped when no hash is recorded.
model_meta: Dict[str, Any] = _read_json(META_PATH)
expected_hash = model_meta.get("model_hash") or model_meta.get("pipeline_sha256")
actual_hash = _sha256_file(PIPELINE_PATH)
if expected_hash and expected_hash != actual_hash:
    raise ValueError(f"Bundle manomesso: meta={expected_hash[:8]}… != actual={actual_hash[:8]}…")

# Expected features: collect candidate locations of the feature-order file.
feature_order_candidates: List[Path] = []
manifest: Dict[str, Any] = {}
if MANIFEST_PATH.exists():
    try:
        manifest = _read_json(MANIFEST_PATH)
        # The manifest may point at the file via paths.feature_order or feature_order_path.
        p_from_manifest = (manifest.get("paths", {}) or {}).get("feature_order") or manifest.get("feature_order_path")
        if p_from_manifest:
            feature_order_candidates.append(Path(p_from_manifest))
    except Exception as e:
        logger.warning("Manifest presente ma non leggibile; fallback a meta.json", extra={"error": str(e)})

# Last resort: a feature_order.json sitting next to the artifact.
feature_order_candidates.append(PIPELINE_PATH.parent / "feature_order.json")
FEATURE_ORDER_PATH: Optional[Path] = next((p for p in feature_order_candidates if p and p.exists()), None)

# Per-type feature lists as recorded in the model metadata.
categorical_expected: List[str] = list(model_meta.get("features_categorical", []) or [])
numeric_expected: List[str] = list(model_meta.get("features_numeric", []) or [])

if FEATURE_ORDER_PATH:
    try:
        # NOTE(review): the file content is assumed to be a JSON list of names — confirm.
        feature_order: List[str] = _read_json(FEATURE_ORDER_PATH)
        ALL_EXPECTED: List[str] = list(map(str, feature_order))
    except Exception as e:
        logger.warning("feature_order.json non leggibile; uso meta/manifest", extra={"error": str(e)})
        ALL_EXPECTED = _dedup_preserve(categorical_expected + [c for c in numeric_expected if c not in categorical_expected])
else:
    try:
        feats_from_manifest = (manifest.get("feature_order")
                               or manifest.get("expected_features")
                               or manifest.get("model", {}).get("feature_list")
                               or manifest.get("model", {}).get("features"))
        # Only a dict with "categorical"/"numeric" keys overrides the metadata
        # lists; any other shape found in the manifest is ignored here.
        if isinstance(feats_from_manifest, dict):
            categorical_expected = feats_from_manifest.get("categorical", categorical_expected) or categorical_expected
            numeric_expected = feats_from_manifest.get("numeric", numeric_expected) or numeric_expected
    except Exception:
        pass
    # Categorical features first, then the numeric ones not already listed.
    ALL_EXPECTED = _dedup_preserve(categorical_expected + [c for c in numeric_expected if c not in categorical_expected])

print(f"✅ Loaded FITTED model {MODEL_VERSION} from {PIPELINE_PATH.parent}")
print(f"   Features: {len(ALL_EXPECTED)} (cat={len(categorical_expected)}, num={len(numeric_expected)})")
print(f"   Inference dir: {INFER_DIR.as_posix()}")
print(f"   API compare: {COMPARE_WITH_API} → {API_BASE}")

# ---------------------------------------------------------------------
# MONKEY PATCH: PriorsGuard → pulizia missing/nullable (come API)
# ---------------------------------------------------------------------
def _to_numpy_nan(x):
    try:
        import pandas as _pd
        if x is _pd.NA:
            return np.nan
    except Exception:
        pass
    try:
        if isinstance(x, float) and (x != x):
            return np.nan
    except Exception:
        pass
    return x

def _is_nullable_int_or_bool_dtype(dtype) -> bool:
    s = str(dtype)
    return s.startswith("Int") or s == "boolean"

def _df_map(df: pd.DataFrame, func):
    if hasattr(pd.DataFrame, "map"):
        return df.map(func)
    return df.applymap(func)

def _to_numpy_nan(x):
    try:
        import pandas as _pd
        if x is _pd.NA:
            return np.nan
    except Exception:
        pass
    try:
        if isinstance(x, float) and (x != x):  # NaN
            return np.nan
    except Exception:
        pass
    return x

def _is_nullable_int_or_bool_dtype(dtype) -> bool:
    s = str(dtype)
    return s.startswith("Int") or s == "boolean"

def _clean_missing_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with missing markers normalised to ``np.nan``.

    Non-DataFrame input is wrapped in a DataFrame first. Every cell goes
    through ``_to_numpy_nan``; columns that still carry a nullable Int or
    boolean dtype afterwards are cast to ``float64``.
    """
    frame = df if isinstance(df, pd.DataFrame) else pd.DataFrame(df)
    # Element-wise mapping through the compat wrapper (map vs. applymap).
    frame = _df_map(frame, _to_numpy_nan)
    nullable_columns = [
        col for col in frame.columns
        if _is_nullable_int_or_bool_dtype(frame[col].dtype)
    ]
    for col in nullable_columns:
        frame[col] = frame[col].astype("float64")
    return frame

def _patch_priorsguard_cleaner(p):
    """Wrap ``transform`` of every PriorsGuard step so its DataFrame output is cleaned.

    Only acts when *p* is a scikit-learn Pipeline. Steps are recognised by
    class name; the wrapper calls the original ``transform`` and passes a
    DataFrame result through ``_clean_missing_df``. Every failure, in the
    patching or in the cleaning, is swallowed on purpose (best effort).
    """
    try:
        from sklearn.pipeline import Pipeline as _SkPipeline
        if not isinstance(p, _SkPipeline):
            return

        def _cleaning_wrapper(original_transform):
            def _wrapped_transform(X):
                result = original_transform(X)
                try:
                    if isinstance(result, pd.DataFrame):
                        result = _clean_missing_df(result)
                except Exception:
                    pass
                return result
            return _wrapped_transform

        for _, step in p.steps:
            step_class = getattr(step, "__class__", type(None)).__name__
            if step_class == "PriorsGuard" and hasattr(step, "transform"):
                # Instance-level override: only this object is affected.
                step.transform = _cleaning_wrapper(step.transform)

        # Re-assign a fresh list holding the same (name, step) pairs.
        p.steps = [(name, step) for name, step in p.steps]
    except Exception:
        pass

# Apply the PriorsGuard output cleaner to the loaded pipeline.
_patch_priorsguard_cleaner(pipeline)

# Minimal diagnostics: show which constants module is in use and verify that
# the legacy dotted path resolves to the very same module object.
try:
    import shared.common.constants as _cmod
    print("constants module:", getattr(_cmod, "__file__", "<in-memory>"))
    print("SCHEMA_VERSION =", getattr(_cmod, "SCHEMA_VERSION", None))
    import notebooks.shared.common.constants as _lcmod  # legacy alias must be the same object
    assert _lcmod is _cmod
    print("Legacy alias OK → notebooks.shared.common.constants → shared.common.constants")
except Exception as e:
    # Diagnostics only: report the problem and carry on.
    print("⚠️ Legacy alias check:", e)

# Compact pipeline layout: one line per step, or just the type name when the
# loaded object is not a Pipeline.
try:
    print("=== PIPELINE LAYOUT ===")
    if isinstance(pipeline, Pipeline):
        print(f"Pipeline(steps={len(pipeline.steps)})")
        for i, (nm, st) in enumerate(pipeline.steps):
            print(f"  [{i}] {nm}: {st.__class__.__name__}")
    else:
        print(type(pipeline).__name__)
except Exception:
    pass

✅ Loaded FITTED model v2 from C:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\modeling\property
   Features: 53 (cat=23, num=41)
   Inference dir: C:/Users/anven/OneDrive/Documenti/GitHub/axiomatic_oracle/notebooks/outputs/inference
   API compare: True → http://127.0.0.1:8000
constants module: C:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\shared\common\constants.py
SCHEMA_VERSION = v2
Legacy alias OK → notebooks.shared.common.constants → shared.common.constants
=== PIPELINE LAYOUT ===
Pipeline(steps=4)
  [0] canon_geo: GeoCanonizer
  [1] priors_guard: PriorsGuard
  [2] derive: EnsureDerivedFeatures
  [3] core: TransformedTargetRegressor


### Input Schema & Validation

In [2]:
# 02) Normalizzazione/validazione LEGGERA — no full validator, no side-effects
from __future__ import annotations
from typing import Dict, Any, Tuple, Optional, List
from datetime import datetime

import numpy as np
import pandas as pd

from shared.common.constants import ASSET_ID, LOCATION
from shared.common.utils import canonical_location, get_utc_now

# Marker set when cell 02 starts executing.
_CELLA02_START = True

# --- Aliases: 'garage' stays the CANONICAL key (not 'has_garage') ---
# Maps accepted input spellings (left) to the canonical feature name (right).
_KEY_ALIASES = {
    "sqm":"size_m2","size":"size_m2","m2":"size_m2",
    "year":"year_built","built_year":"year_built",
    "balcony":"has_balcony","garden":"has_garden",
    "has_garage":"garage",               # alias → canonical
    "air_quality":"air_quality_index","noise":"noise_level",
    "valuation":"valuation_k","price_k":"valuation_k",
    "n_rooms":"rooms","room_count":"rooms",
    "n_bathrooms":"bathrooms","bathroom_count":"bathrooms",
    "elevator":"has_elevator","city_name":"city",
}

# Derived fields that are allowed through the feature filter (no leakage)
_SAFE_DERIVED = {
    "age_years","luxury_score","env_score","location","city",
    "is_top_floor","listing_month", ASSET_ID,
}

def _canonicalize_keys(rec: Dict[str, Any]) -> Dict[str, Any]:
    """Return a new dict with every key of *rec* replaced by its canonical name.

    Keys without an entry in ``_KEY_ALIASES`` are kept as they are. When two
    keys map to the same canonical name, the value seen last wins.
    """
    canonical: Dict[str, Any] = {}
    for key, value in rec.items():
        canonical[_KEY_ALIASES.get(key, key)] = value
    return canonical

def _autofill_safe(rec: Dict[str, Any]) -> Dict[str, Any]:
    """Return a canonicalised copy of *rec* with the safe derived fields filled in.

    The input is not modified. After key canonicalisation the following are
    added when absent: ``age_years`` (from ``year_built``), ``luxury_score``,
    ``env_score``, ``city`` (from the location), ``listing_month`` (current
    UTC month), ``is_top_floor``, plus defaults for
    ``public_transport_nearby`` (1) and ``garage`` (0). The location string is
    passed through ``canonical_location`` when that yields a truthy value.
    """
    # Local import: the cell only imports `datetime` from the datetime module.
    from datetime import timezone

    r = _canonicalize_keys(dict(rec))
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    now_utc = datetime.now(timezone.utc)

    # age_years
    if "age_years" not in r and r.get("year_built") not in (None, ""):
        try:
            r["age_years"] = max(0, now_utc.year - int(r["year_built"]))
        except Exception:
            pass

    # luxury_score: mean of the has_garden / has_balcony / garage flags
    if "luxury_score" not in r:
        g  = 1.0 if bool(r.get("has_garden", 0)) else 0.0
        b  = 1.0 if bool(r.get("has_balcony", 0)) else 0.0
        ga = 1.0 if bool(r.get("garage", 0)) else 0.0
        r["luxury_score"] = (g + b + ga) / 3.0

    # env_score: clipped to [0, 1]; None when the inputs are not numeric
    if "env_score" not in r:
        try:
            aq = float(r.get("air_quality_index", 0.0))
            nz = float(r.get("noise_level", 0.0))
            r["env_score"] = float(np.clip((aq/100.0) * (1.0 - nz/100.0), 0.0, 1.0))
        except Exception:
            r["env_score"] = None

    # location: normalise non-blank strings, keeping the input on failure
    if LOCATION in r and isinstance(r.get(LOCATION), str) and r[LOCATION].strip():
        try:
            _val = canonical_location(r[LOCATION])
            if _val:  # never replace a usable value with an empty one
                r[LOCATION] = _val
        except Exception:
            pass

    # city: when missing, derive it from the location
    if not r.get("city") and r.get(LOCATION):
        try:
            r["city"] = str(r[LOCATION]).strip().title()
        except Exception:
            r["city"] = None

    # listing_month
    if "listing_month" not in r or r.get("listing_month") in (None, "", 0):
        r["listing_month"] = int(now_utc.month)

    # is_top_floor
    try:
        if "is_top_floor" not in r and r.get("floor") is not None and r.get("building_floors") is not None:
            r["is_top_floor"] = int(r.get("floor") == r.get("building_floors"))
    except Exception:
        pass

    # Minimal, harmless defaults. setdefault only fires when the key is
    # absent, and 'has_garage' has already been renamed to 'garage' by the
    # canonicalisation above, so the defaults are plain constants.
    r.setdefault("public_transport_nearby", 1)
    r.setdefault("garage", 0)

    return r

def validate_input_record(
    record: Dict[str, Any],
    all_expected: Optional[List[str]] = None,
    *,
    strict: bool = False,                 # kept for compatibility, ignored in LIGHT mode
    drop_extras: bool = True,
    allowed_features: Optional[List[str]] = None,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Light-weight validation of a single input record.

    Applies key aliasing and the safe derived fields (``_autofill_safe``),
    then optionally filters the record down to the permitted feature names.
    The feature list is taken from *allowed_features*, else *all_expected*,
    else the module-level ``ALL_EXPECTED`` when it exists.

    Returns:
        ``(normalized_record, report)``. The report always has ``ok=True``
        and an empty ``errors`` list.
    """
    normalized_record = _autofill_safe(record)

    feature_names = allowed_features or all_expected or list(globals().get("ALL_EXPECTED", []))
    permitted = set(feature_names) | _SAFE_DERIVED | {"city"}

    if drop_extras:
        normalized_record = {
            key: value for key, value in normalized_record.items() if key in permitted
        }

    verified_at = get_utc_now().replace(microsecond=0).isoformat().replace("+00:00","Z")
    report = {
        "ok": True,
        "errors": [],
        "flags": ["light_validation"],
        "normalized": {
            "asset_type": "property",
            "last_verified_ts": verified_at,
        },
    }
    return normalized_record, report

def detect_anomalies(record: Dict[str, Any]) -> Tuple[bool, Dict[str, Any]]:
    """Run the light validation and report "no anomaly" (never blocking in LIGHT mode)."""
    report = validate_input_record(record, strict=False)[1]
    return False, report

# Completion marker: reached only after every definition of cell 02 has run.
_CELLA02_DONE = True
print("CELLA 02 eseguita. (_CELLA02_DONE =", _CELLA02_DONE, ")")

CELLA 02 eseguita. (_CELLA02_DONE = True )


### Sample + Predict (+CI) + Drift + Batch

In [4]:
# R3) Sample → Single Predict → Batch (LIGHT validation + SAFE predict)
from __future__ import annotations
from uuid import uuid4
from copy import deepcopy
from typing import List, Optional, Dict, Any
import time
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline  # usato in _predict_safe

from shared.common.constants import ASSET_ID, LOCATION, SCHEMA_VERSION, NOTE_MAX_BYTES
from shared.common.sanity_checks import leakage_gate
from shared.common.utils import get_utc_now, sha256_hex, canonical_json_dumps

# Reuses the cleaners defined in cell 01 (_clean_missing_df is available in globals)

def _ensure_feature_frame(rec: Dict[str, Any], expected: List[str]) -> pd.DataFrame:
    """Build a one-row DataFrame with exactly the *expected* columns, in that order.

    Features missing from *rec* become ``np.nan``; the frame is then passed
    through ``_clean_missing_df``.
    """
    row: Dict[str, Any] = {}
    for column in expected:
        row[column] = rec.get(column, np.nan)
    return _clean_missing_df(pd.DataFrame([row], columns=expected))

def _boolify_inplace(r: Dict[str, Any]) -> None:
    candidates = [k for k in r.keys() if k.startswith("has_")] + [
        "owner_occupied", "public_transport_nearby",
        "garage", "parking_spot", "cellar", "attic", "concierge",
        "is_top_floor", "is_ground_floor", "has_elevator", "has_balcony", "has_garden",
    ]
    for k in candidates:
        if k in r:
            try:
                r[k] = int(bool(r[k]))
            except Exception:
                pass

def _simple_ci(y_hat_k: float) -> Dict[str, float]:
    conf = 0.80
    margin = float(max(5.0, 0.15 * float(y_hat_k)))
    low = max(0.0, float(y_hat_k) - margin)
    high = float(y_hat_k) + margin
    return {"confidence": conf, "ci_margin_k": margin, "confidence_low_k": low, "confidence_high_k": high}

def _predict_safe(pipeline_obj, X_df: pd.DataFrame) -> float:
    """Predict one value, degrading gracefully when the pipeline hits a RecursionError.

    Order of attempts:
      1. ``pipeline_obj.predict`` on the cleaned frame;
      2. on RecursionError, the last step of the pipeline (Pipeline only);
      3. a minimal heuristic: ``max(1.0, 0.8 * size_m2)``, or ``0.0`` when
         ``size_m2`` is unavailable.

    Every degradation is logged, so a heuristic value is never returned
    silently. Exceptions other than RecursionError raised by step 1
    propagate to the caller.
    """
    import logging  # local import keeps this block self-contained

    # Same logger name the notebook passes to configure_logger.
    # NOTE(review): assumes configure_logger configures the stdlib logger of
    # that name — confirm against shared.common.config.
    log = logging.getLogger("nb04_infer")

    X_df = _clean_missing_df(X_df)

    # 1) full pipeline
    try:
        y = pipeline_obj.predict(X_df)
        return float(np.ravel(y)[0])
    except RecursionError:
        log.warning("pipeline.predict hit RecursionError; trying the last step only")

    # 2) last-step estimator
    if isinstance(pipeline_obj, Pipeline):
        try:
            last = pipeline_obj.steps[-1][1]
            y = last.predict(X_df)
            return float(np.ravel(y)[0])
        except Exception as exc:
            log.warning("last-step predict failed (%s); using heuristic fallback", exc)

    # 3) minimal fallback (avoids a crash)
    s = X_df.get("size_m2")
    if s is not None and len(s) > 0 and pd.notna(s.iloc[0]):
        value = float(max(1.0, 0.8 * float(s.iloc[0])))
        log.warning("returning heuristic valuation %.2f derived from size_m2", value)
        return value
    log.warning("no usable size_m2; returning 0.0 as valuation")
    return 0.0

def predict_asset(record: Dict[str, Any], *, asset_id: Optional[str] = None) -> Dict[str, Any]:
    """Validate *record*, run the loaded pipeline on it and return the result payload.

    Args:
        record: Raw input features for one asset.
        asset_id: Identifier to use; defaults to the one found in the record,
            or to a freshly generated ``asset_infer_<hex>`` id.

    Returns:
        A dict with schema/asset identification, a timestamp, model metadata,
        the normalised input, the metrics (valuation plus interval) and the
        validation report.

    Raises:
        RuntimeError: ``ALL_EXPECTED`` or ``pipeline`` is not defined, i.e.
            cell 01 has not been run.
    """
    for required_name, message in (
        ("ALL_EXPECTED", "ALL_EXPECTED non definito (manca la cella 01)."),
        ("pipeline", "pipeline non caricata (manca la cella 01)."),
    ):
        if required_name not in globals():
            raise RuntimeError(message)

    # Light validation / normalisation, restricted to the model features.
    clean_input, validation = validate_input_record(
        record,
        allowed_features=ALL_EXPECTED,
        drop_extras=True,
        strict=False,
    )
    _boolify_inplace(clean_input)

    resolved_id = asset_id or clean_input.get(ASSET_ID) or f"asset_infer_{uuid4().hex[:8]}"
    clean_input[ASSET_ID] = resolved_id

    features = _ensure_feature_frame(clean_input, ALL_EXPECTED)

    # Drop whatever the leakage gate reports as offending.
    gate_ok, offending = leakage_gate(list(features.columns))
    if not gate_ok and offending:
        features = features.drop(columns=list(offending), errors="ignore")

    # Safe predict; non-finite or negative results are reset to 0.0.
    valuation = float(_predict_safe(pipeline, features))
    if not np.isfinite(valuation) or valuation < 0:
        valuation = 0.0

    interval = _simple_ci(valuation)
    created_at = get_utc_now().replace(microsecond=0).isoformat().replace("+00:00", "Z")
    model_block = {
        "value_model_version": MODEL_VERSION,
        "unit": model_meta.get("unit", "k_eur"),
        "feature_order_sha256": model_meta.get("feature_order_sha256"),
        "pipeline_sha256": model_meta.get("pipeline_sha256") or model_meta.get("model_hash"),
    }
    return {
        "schema_version": SCHEMA_VERSION,
        "asset_type": ASSET_TYPE,
        "asset_id": resolved_id,
        "timestamp": created_at,
        "model_meta": model_block,
        "input": clean_input,
        "metrics": {"valuation_k": valuation, **interval},
        "validation": validation,
    }

# --- Sample singolo ----------------------------------------------------------
# Hand-written example record used for the single-prediction demo.
sample_property_raw = {
    "location": "Milan",
    "size_m2": 120,
    "rooms": 4,
    "bathrooms": 2,
    "year_built": 1999,
    "floor": 2,
    "building_floors": 6,
    "has_elevator": 1,
    "has_garden": 0,
    "has_balcony": 1,
    "garage": 1,               # canonical key ('has_garage' is the alias that maps to it)
    "energy_class": "B",
    "humidity_level": 50.0,
    "temperature_avg": 20.5,
    "noise_level": 40,
    "air_quality_index": 70,
    "owner_occupied": 1,
    "public_transport_nearby": 1,
    "distance_to_center_km": 2.5,
}

# Normalise the sample and keep only the features the model expects.
sample_property, validation_report = validate_input_record(
    sample_property_raw,
    allowed_features=ALL_EXPECTED,
    drop_extras=True,
)

# Coerce the boolean-like flags to 0/1 integers.
for k in [k for k in sample_property if k.startswith("has_")] + ["owner_occupied","public_transport_nearby","garage"]:
    if k in sample_property:
        sample_property[k] = int(bool(sample_property[k]))

# Give the sample a generated id when it has none.
if not sample_property.get(ASSET_ID):
    sample_property[ASSET_ID] = f"asset_infer_{uuid4().hex[:8]}"

print(f"✅ Sample validated. asset_id={sample_property.get(ASSET_ID)}  location={sample_property.get(LOCATION)}")

# Single prediction, timed with a monotonic high-resolution clock.
t0 = time.perf_counter()
single_output = predict_asset(sample_property, asset_id=sample_property.get(ASSET_ID))
latency_ms_single = round((time.perf_counter() - t0) * 1000, 2)

print(
    f"ŷ_single = {single_output['metrics']['valuation_k']:.2f} k€  "
    f"(±{single_output['metrics']['ci_margin_k']:.2f} @ {int(single_output['metrics']['confidence']*100)}%)  "
    f"[{latency_ms_single} ms]"
)

# --- Batch -------------------------------------------------------------------
# Batch input: the validated sample plus three variations of it. The
# variations clear the asset id so that each one gets a batch id below.
batch_samples: List[dict] = [
    deepcopy(sample_property),
    {**sample_property, ASSET_ID: None, LOCATION: "Rome",     "size_m2":  90, "energy_class": "C"},
    {**sample_property, ASSET_ID: None, LOCATION: "Florence", "size_m2":  70, "has_garden": 1, "energy_class": "A"},
    {**sample_property, ASSET_ID: None, LOCATION: "Turin",    "size_m2": 150, "energy_class": "D"},
]

batch_outputs: List[dict] = []
for i, raw in enumerate(batch_samples, start=1):
    # Keep an existing id; otherwise number the record by its batch position.
    out = predict_asset(raw, asset_id=raw.get(ASSET_ID) or f"asset_batch_{i:03}")
    batch_outputs.append(out)

# Summary table, one row per prediction, sorted by location. (A bare
# DataFrame expression that was built and immediately discarded before this
# statement has been removed: it had no effect.)
df_batch = pd.DataFrame(
    [{
        "asset_id": o["asset_id"],
        "location": o["input"].get("location"),
        "valuation_k": o["metrics"]["valuation_k"],
        "ci_low_k": o["metrics"]["confidence_low_k"],
        "ci_high_k": o["metrics"]["confidence_high_k"],
    } for o in batch_outputs]
).sort_values("location")

print("\nBatch summary:")
print(df_batch.to_string(index=False))

# --- Compact Note ------------------------------------------------------------
def build_compact_note(out: dict) -> dict:
    """Build the compact note for a prediction payload.

    Everything is read from *out* itself, so the note describes the payload
    it is built from rather than whatever model happens to be loaded:

    * ``schema_version`` comes from the payload, falling back to the
      module-level ``SCHEMA_VERSION`` and finally to ``"v2"``;
    * the model hash comes from ``out["model_meta"]["pipeline_sha256"]``,
      falling back to the module-level ``model_meta`` (``pipeline_sha256`` or
      ``model_hash``) for payloads that do not carry it.

    Args:
        out: Payload with ``asset_id``, ``asset_type``, ``timestamp``,
            ``model_meta`` and ``metrics`` entries.

    Returns:
        A dict with ``schema_version``, ``asset_id``, ``asset_type``,
        ``timestamp``, ``model`` and ``metrics``.
    """
    model_info = out["model_meta"]
    metrics = out["metrics"]

    schema_version = out.get("schema_version") or globals().get("SCHEMA_VERSION", "v2")

    pipeline_hash = model_info.get("pipeline_sha256")
    if pipeline_hash is None:
        loaded_meta = globals().get("model_meta") or {}
        pipeline_hash = loaded_meta.get("pipeline_sha256") or loaded_meta.get("model_hash")

    return {
        "schema_version": schema_version,
        "asset_id": out["asset_id"],
        "asset_type": out["asset_type"],
        "timestamp": out["timestamp"],
        "model": {
            "version": model_info["value_model_version"],
            "hash": pipeline_hash,
        },
        "metrics": {
            "valuation_k": metrics["valuation_k"],
            "confidence": metrics["confidence"],
            "ci": [metrics["confidence_low_k"], metrics["confidence_high_k"]],
        },
    }

# Serialise the compact note and record its size and digest.
note = build_compact_note(single_output)
note_bytes = canonical_json_dumps(note).encode("utf-8")
note_size = len(note_bytes)
note_sha256 = sha256_hex(note_bytes)

# Attach the publish bookkeeping to the single prediction; nothing is
# actually published here (status is "skipped").
single_output.setdefault("publish", {}).update({
    "status": "skipped",
    "note_size": note_size,
    "note_sha256": note_sha256,
    "is_compacted": True,
    "fallback_url_used": False,
})

# Size guard. NOTE(review): assert statements are stripped under `python -O`,
# and this one runs after the publish block above has already been written.
assert note_size <= NOTE_MAX_BYTES, f"Nota troppo grande: {note_size} > {NOTE_MAX_BYTES}"
print(f"Note size={note_size} bytes | sha256={note_sha256[:16]}…")

✅ Sample validated. asset_id=asset_infer_84534f2b  location=Milan
ŷ_single = 635.51 k€  (±95.33 @ 80%)  [95.53 ms]

Batch summary:
            asset_id location  valuation_k   ci_low_k  ci_high_k
     asset_batch_003 Florence   322.610753 274.219140 371.002366
asset_infer_84534f2b    Milan   635.507347 540.181245 730.833449
     asset_batch_002     Rome   361.777914 307.511227 416.044601
     asset_batch_004    Turin   595.984870 506.587139 685.382600
Note size=319 bytes | sha256=d0f2c6227180be57…


### Logging JSONL

In [5]:
# L1) JSONL Logging (atomic append) — predictions & monitoring
from __future__ import annotations
from pathlib import Path
import os, json

from shared.common.utils import canonical_json_dumps, get_utc_now

# Keep the monitoring log next to the predictions log. Anchoring it to
# LOG_PATH (rather than to the current working directory) guarantees both
# files land in the same folder regardless of where the kernel was started.
MONITOR_LOG_PATH = LOG_PATH.parent / "monitoring_log.jsonl"
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
MONITOR_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

def append_jsonl(record: dict, path: Path) -> None:
    """Append *record* as one JSON line to *path*, then flush and fsync.

    A ``_logged_at`` UTC timestamp (second precision, ``Z`` suffix) is added
    to the written payload; *record* itself is not modified. The parent
    directory is created when missing.

    The file is opened in append mode and the whole line is written with a
    single ``write`` call. NOTE(review): whether concurrent appends can
    interleave depends on the platform and on the line size — do not rely on
    strict atomicity without checking.

    Raises:
        OSError: The file cannot be opened, written or synced.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = {**record, "_logged_at": get_utc_now().replace(microsecond=0).isoformat().replace("+00:00", "Z")}
    line = canonical_json_dumps(payload)
    # The context manager is the sole owner of the descriptor, so it is closed
    # exactly once on every path (no manual os.close on an already-closed fd),
    # and the file is created with the default permissions instead of 0o777.
    with path.open("a", encoding="utf-8") as f:
        f.write(line + "\n")
        f.flush()
        os.fsync(f.fileno())

def _to_monitoring(entry: dict) -> dict:
    m  = entry.get("metrics", {}) or {}
    mm = entry.get("model_meta", {}) or {}
    return {
        "asset_id": entry.get("asset_id"),
        "model_version": mm.get("value_model_version", MODEL_VERSION),
        "model_class": mm.get("value_model_name"),
        "latency_ms": m.get("latency_ms"),
        "valuation_k": m.get("valuation_k") or m.get("valuation_base_k"),
        "uncertainty_k": m.get("uncertainty_k") or m.get("uncertainty"),
        "confidence_low_k": m.get("confidence_low_k"),
        "confidence_high_k": m.get("confidence_high_k"),
        "ci_method": m.get("ci_method"),
        "n_estimators": m.get("n_estimators"),
        "anomaly": (entry.get("flags") or {}).get("anomaly"),
        "drift_detected": (entry.get("flags") or {}).get("drift_detected"),
    }

# --- write predictions log ---
# Number of batch predictions available (0 when the batch cell was not run).
n_batch = len(globals().get("batch_outputs", []) or [])
# Predictions log: the single prediction first, then every batch prediction.
if "single_output" in globals():
    append_jsonl(single_output, LOG_PATH)
for o in (globals().get("batch_outputs", []) or []):
    append_jsonl(o, LOG_PATH)
print(f"Appended {int('single_output' in globals()) + n_batch} predictions → {LOG_PATH}")

# --- write monitoring log (derived) ---
# Same records, in the same order, reduced to the flat monitoring schema.
if "single_output" in globals():
    append_jsonl(_to_monitoring(single_output), MONITOR_LOG_PATH)
for o in (globals().get("batch_outputs", []) or []):
    append_jsonl(_to_monitoring(o), MONITOR_LOG_PATH)
print(f"Appended monitoring for {int('single_output' in globals()) + n_batch} records → {MONITOR_LOG_PATH}")

Appended 5 predictions → C:\Users\anven\OneDrive\Documenti\GitHub\axiomatic_oracle\notebooks\outputs\logs\predictions_log.jsonl
Appended monitoring for 5 records → outputs\logs\monitoring_log.jsonl


### Sensitivity Check (vary size_m2)

In [9]:
# L2) Sensitivity ("what-if") on size_m2 — reuses predict_asset (no duplicated logic)
import warnings
import pandas as pd

# Install the filter BEFORE any prediction runs: a filter registered after the
# loop cannot silence warnings that were already emitted inside it.
warnings.filterwarnings("ignore", message="X does not have valid feature names")

sizes = [60, 90, 130, 170, 210]
rows  = []


def _metric_or(metrics, key, fallback):
    """Return ``float(metrics[key])``, or ``float(fallback)`` when the key is missing or None."""
    value = metrics.get(key)
    return float(fallback if value is None else value)


# Baseline from the already-validated sample (goes through predict_asset for consistency).
_base = predict_asset(sample_property, asset_id="asset_sensitivity_base")
base_pred = float(_base["metrics"]["valuation_k"])

for s in sizes:
    rec_raw = {**sample_property, "size_m2": s}
    try:
        out = predict_asset(rec_raw, asset_id=f"asset_size_{s}")
        m = out["metrics"]
        pred_k = float(m["valuation_k"])
        # Robust fallbacks: a CI key that is absent OR explicitly None is
        # replaced by a value derived from the prediction itself.
        ci_margin = _metric_or(m, "ci_margin_k", max(5.0, 0.15 * pred_k))
        ci_low    = _metric_or(m, "confidence_low_k",  max(0.0, pred_k - ci_margin))
        ci_high   = _metric_or(m, "confidence_high_k", pred_k + ci_margin)
        rows.append({
            "size_m2": s,
            "prediction_k": pred_k,
            "ci_low_k": ci_low,
            "ci_high_k": ci_high,
            "ci_margin_k": ci_margin,
            # 'uncertainty_k' may not exist with the "light" CI: expose it if present, else None.
            "uncertainty_k": m.get("uncertainty_k"),
            "delta_vs_base_k": round(pred_k - base_pred, 3),
        })
    except Exception as e:
        # Keep the size in the table so a failed scenario stays visible.
        rows.append({"size_m2": s, "prediction_k": None, "error": str(e)})

df_sens = pd.DataFrame(rows)
print("Sensitivity vs size_m2")
df_sens

Sensitivity vs size_m2


Unnamed: 0,size_m2,prediction_k,ci_low_k,ci_high_k,ci_margin_k,uncertainty_k,delta_vs_base_k
0,60,293.834095,249.758981,337.909209,44.075114,,-341.673
1,90,430.028203,365.523972,494.532433,64.50423,,-205.479
2,130,667.613635,567.471589,767.75568,100.142045,,32.106
3,170,897.896049,763.211642,1032.580457,134.684407,,262.389
4,210,972.688827,826.785503,1118.592151,145.903324,,337.181


### API Checks

In [7]:
# L3) API checks (consistency + optional publish) — unificata
import os, json, time
import requests
from uuid import uuid4
from shared.common.utils import NumpyJSONEncoder

if COMPARE_WITH_API:
    def _first_present(*values):
        """Return the first argument that is not None, or None when all are."""
        for value in values:
            if value is not None:
                return value
        return None

    def _float_or_none(value):
        """Convert *value* to float; return None when it is missing or not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def _pick_pred_metrics(payload: dict):
        """Extract ``(prediction, ci_low, ci_high, uncertainty)`` from a v1/v2/flat response.

        Keys are looked up first under ``payload["metrics"]`` and then at the
        top level. Only ``None`` counts as missing, so a numeric ``0`` is kept.
        Returns four ``None`` values when *payload* is not a dict.
        """
        if not isinstance(payload, dict):
            return None, None, None, None
        raw_metrics = payload.get("metrics")
        m = raw_metrics if isinstance(raw_metrics, dict) else {}
        pred = _first_present(
            m.get("valuation_k"),
            m.get("valuation_base_k"),
            m.get("valuation"),
            payload.get("valuation_k"),
            payload.get("prediction"),
        )
        ci_low  = _first_present(m.get("confidence_low_k"), payload.get("confidence_low_k"))
        ci_high = _first_present(m.get("confidence_high_k"), payload.get("confidence_high_k"))
        unc     = _first_present(m.get("uncertainty_k"), m.get("uncertainty"), payload.get("uncertainty_k"))
        return pred, ci_low, ci_high, unc

    def _model_version(payload: dict):
        """Return the model version reported in ``payload["model_meta"]``, or None."""
        mm = payload.get("model_meta") if isinstance(payload, dict) else None
        if not isinstance(mm, dict):
            # Covers a missing key as well as an explicit `"model_meta": null`.
            return None
        return mm.get("value_model_version") or mm.get("model_version")

    headers = {"Content-Type": "application/json", "X-Idempotency-Key": uuid4().hex}
    token = os.getenv("AXM_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    url = f"{API_BASE}/predict/{ASSET_TYPE}"

    # --- A) Consistency: compare the local single prediction against the API ---
    try:
        # Round-trip through NumpyJSONEncoder so the payload holds only plain JSON types.
        payload_json = json.loads(json.dumps(sample_property, cls=NumpyJSONEncoder, ensure_ascii=False))
        t0 = time.perf_counter()
        resp = requests.post(url, json=payload_json, headers=headers, timeout=10)
        lat_ms = round((time.perf_counter() - t0) * 1000, 2)
        if resp.ok:
            api_json = resp.json()
            api_pred, api_low, api_high, api_unc = _pick_pred_metrics(api_json)
            if api_pred is None:
                print("[API] ❌ OK ma 'metrics.valuation_*' mancante.")
            else:
                local_metrics = single_output["metrics"]
                local_pred = float(local_metrics["valuation_k"])
                # Local CI bounds are optional: without them only the overlap
                # check is skipped, not the whole comparison.
                local_low  = _float_or_none(local_metrics.get("confidence_low_k"))
                local_high = _float_or_none(local_metrics.get("confidence_high_k"))
                delta = abs(float(api_pred) - local_pred)
                pct   = (delta / max(1e-9, abs(local_pred))) * 100.0
                bounds = (_float_or_none(api_low), _float_or_none(api_high), local_low, local_high)
                if any(b is None for b in bounds):
                    ci_overlap = None
                else:
                    api_lo, api_hi, loc_lo, loc_hi = bounds
                    # Intervals overlap unless one lies entirely on one side of the other.
                    ci_overlap = not (api_hi < loc_lo or api_lo > loc_hi)
                ver_note = ""
                av, lv = _model_version(api_json), MODEL_VERSION
                if av and av != lv:
                    ver_note = f" | ⚠️ model_version API={av} vs LOCAL={lv}"
                print(f"[API] {lat_ms} ms | API={float(api_pred):.3f} k€ | LOCAL={local_pred:.3f} k€ | "
                      f"Δ={delta:.4f} ({pct:.2f}%) | CI overlap: {ci_overlap if ci_overlap is not None else 'n/a'}{ver_note}")
        else:
            print(f"[API] ❌ {resp.status_code} | {resp.text[:200]}")
    except Exception as e:
        # Best-effort check: an unreachable API must not break the notebook.
        print(f"[API] ⚠️ Consistency check skipped: {e}")

    # --- B) Optional publish path (toggle via env PUBLISH=1) ---
    PUBLISH = os.getenv("PUBLISH", "false").lower() in {"1", "true", "yes", "y"}
    if PUBLISH:
        try:
            payload_json = json.loads(json.dumps(sample_property, cls=NumpyJSONEncoder, ensure_ascii=False))
            # The publish call is a different operation from the consistency
            # call above, so it needs its own idempotency key; reusing the key
            # risks the server treating it as a replay of the first request.
            publish_headers = {**headers, "X-Idempotency-Key": uuid4().hex}
            t0 = time.perf_counter()
            resp = requests.post(url, params={"publish": "true"}, json=payload_json,
                                 headers=publish_headers, timeout=15)
            lat_ms = round((time.perf_counter() - t0) * 1000, 2)
            if resp.ok:
                api_json = resp.json()
                api_pred, api_low, api_high, api_unc = _pick_pred_metrics(api_json)
                print(f"✅ API publish ok in {lat_ms} ms | pred={api_pred} k€ | unc={api_unc}")
            else:
                print(f"❌ API publish failed: {resp.status_code} | {resp.text[:200]}")
        except Exception as e:
            print(f"❌ API publish exception: {e}")
else:
    print("ℹ️ COMPARE_WITH_API disabled — skip API checks.")

[API] ⚠️ Consistency check skipped: HTTPConnectionPool(host='127.0.0.1', port=8000): Max retries exceeded with url: /predict/property (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001BCF1D0FA90>: Failed to establish a new connection: [WinError 10061] Impossibile stabilire la connessione. Rifiuto persistente del computer di destinazione'))


### Artifact Audit

In [8]:
# L4) Artifacts audit — file hash vs meta/manifest
import hashlib, json
from pathlib import Path

def file_sha256(path: Path, chunk_size: int = 1 << 20) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so the
    whole content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def manifest_expected_hash(manifest_path: Path) -> str | None:
    """Return the pipeline hash recorded in the manifest, if any.

    Candidate locations are tried in order and the first truthy value wins:
    ``paths.pipeline_sha256``, ``model_meta.model_hash``,
    ``artifacts.pipeline_sha256``, ``artifacts.model_sha256``,
    ``model.sha256`` and finally the top-level ``pipeline_sha256``.

    Returns ``None`` when *manifest_path* is falsy, the file does not exist,
    or the manifest cannot be read or parsed.
    """
    if not manifest_path or not manifest_path.exists():
        return None

    # (section, key) pairs, in precedence order.
    nested_lookups = (
        ("paths", "pipeline_sha256"),
        ("model_meta", "model_hash"),
        ("artifacts", "pipeline_sha256"),
        ("artifacts", "model_sha256"),
        ("model", "sha256"),
    )
    try:
        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
        for section, key in nested_lookups:
            candidate = (manifest.get(section) or {}).get(key)
            if candidate:
                return candidate
        # Last resort: top-level key, returned as-is even when falsy.
        return manifest.get("pipeline_sha256")
    except Exception as e:
        # Logging is best-effort: `logger` may not be defined in this session.
        try:
            logger.info("Manifest unreadable for hash", extra={"error": str(e)})
        except Exception:
            pass
        return None

def _same_digest(expected, actual: str) -> bool:
    """Compare two hex digests, ignoring letter case and surrounding whitespace.

    Hex digests are case-insensitive by nature; some tools emit them in upper
    case, which a plain ``==`` would report as a mismatch.
    """
    return str(expected).strip().lower() == actual.strip().lower()


def _warn_hash_mismatch(message: str, expected, actual: str) -> None:
    """Log a hash mismatch; best-effort because `logger` may not be defined here."""
    try:
        logger.warning(message, extra={"expected": expected, "actual": actual})
    except Exception:
        pass


# Hash of the pipeline file on disk, compared with what meta and manifest record.
model_sha = file_sha256(PIPELINE_PATH)
expected_sha_meta = (model_meta.get("model_hash") or model_meta.get("pipeline_sha256"))
expected_sha_manifest = manifest_expected_hash(MANIFEST_PATH)

print(f"Model SHA256: {model_sha} (first16={model_sha[:16]})")
if expected_sha_meta:
    meta_ok = _same_digest(expected_sha_meta, model_sha)
    print(f"Meta expects   : {expected_sha_meta} (match: {meta_ok})")
    if not meta_ok:
        # Same treatment as the manifest check: a mismatch is logged, not only printed.
        _warn_hash_mismatch("Pipeline hash mismatch with model meta", expected_sha_meta, model_sha)
if expected_sha_manifest:
    ok = _same_digest(expected_sha_manifest, model_sha)
    print(f"Manifest expects: {expected_sha_manifest} (match: {ok})")
    if not ok:
        _warn_hash_mismatch("Pipeline hash mismatch with manifest", expected_sha_manifest, model_sha)

Model SHA256: c0f7d4dbd41cd80282af4d4e4001ccd9b831db304e68d12cd9576ef98dcb5c65 (first16=c0f7d4dbd41cd802)
Meta expects   : c0f7d4dbd41cd80282af4d4e4001ccd9b831db304e68d12cd9576ef98dcb5c65 (match: True)
