### Imports & Config

In [None]:
# 01) Imports & Config
from __future__ import annotations

import os, re, json, logging, warnings, hashlib
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple

import joblib                     # type: ignore
import numpy as np                # type: ignore
import pandas as pd               # type: ignore

# sklearn (per verificare che il modello sia fitted)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.utils.validation import check_is_fitted

# Shared modules (config, utils, constants)
from shared.common.config import configure_logger
from shared.common.utils import canonical_json_dumps
from shared.common.sanity_checks import leakage_gate, scale_gate
from shared.common.constants import SCHEMA_VERSION, NOTE_MAX_BYTES

# -----------------------------------------------------------------------------
# Setup & Config
# -----------------------------------------------------------------------------
# Everything below runs at cell-execution time: it reads environment variables,
# creates output directories and configures the logger.
ASSET_TYPE = "property"
PREFERRED_MODEL_VERSION = os.getenv("MODEL_VERSION", "v2")

# Models root: MODELS_ROOT env var (if set and non-blank) -> ./outputs/modeling.
# The first candidate that exists on disk wins; otherwise ./outputs/modeling is used.
_candidates: List[Path] = []
env_root = os.getenv("MODELS_ROOT")
if env_root and env_root.strip():
    _candidates.append(Path(env_root))
_candidates += [Path("./outputs/modeling")]
MODELS_ROOT: Path = next((c for c in _candidates if c.exists()), Path("./outputs/modeling"))
MODEL_DIR = MODELS_ROOT / ASSET_TYPE
# Side effect: the per-asset model directory is created if it is missing.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Inference & log folders (created eagerly)
INFER_DIR = Path("./outputs/inference")
INFER_DIR.mkdir(parents=True, exist_ok=True)
LOG_PATH = Path("./outputs/logs/predictions_log.jsonl")
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

# API comparison toggle (optional); enabled unless COMPARE_WITH_API is set to
# something other than 1/true/yes/y (case-insensitive).
API_BASE = os.getenv("API_BASE", "http://127.0.0.1:8000")
COMPARE_WITH_API = os.getenv("COMPARE_WITH_API", "true").lower() in {"1", "true", "yes", "y"}

# Logger
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
LOG_JSON = os.getenv("LOG_JSON", "false").lower() in {"1", "true", "yes", "y"}
logger = configure_logger(level=LOG_LEVEL, name="nb04_infer", json_format=LOG_JSON)
# NOTE: this silences every UserWarning process-wide, not only the ones raised here.
warnings.filterwarnings("ignore", category=UserWarning)
logger.info("Model roots resolved", extra={"MODELS_ROOT": str(MODELS_ROOT), "MODEL_DIR": str(MODEL_DIR)})

# -----------------------------------------------------------------------------
# Helpers
# -----------------------------------------------------------------------------
# Matches artifact file names such as "value_regressor_v3.joblib" and captures "v3".
_version_re = re.compile(r"value_regressor_(v\d+)\.joblib$")

def _list_versions(dirpath: Path) -> List[str]:
    """Return the version tags ('vN') of the model artifacts found in ``dirpath``.

    Only files matching ``value_regressor_v<digits>.joblib`` are considered.
    Tags are ordered by their numeric part, highest first.
    """
    ranked: List[Tuple[int, str]] = []
    for artifact in dirpath.glob("value_regressor_v*.joblib"):
        match = _version_re.search(artifact.name)
        if match is None:
            # The glob is looser than the regex (e.g. "value_regressor_vX.joblib").
            continue
        tag = match.group(1)  # 'vN'
        try:
            number = int(tag[1:])
        except Exception:
            number = -1
        ranked.append((number, tag))
    # Newest (largest N) first.
    return [tag for _, tag in sorted(ranked, reverse=True)]

def _is_fitted(obj) -> bool:
    """Report whether ``obj`` (Pipeline / TransformedTargetRegressor / estimator) is fitted.

    Wrappers are unwrapped recursively: a TransformedTargetRegressor is judged by
    its inner regressor (``regressor_`` if present, else ``regressor``) and a
    Pipeline by its last step. Anything else goes through ``check_is_fitted``.
    Any exception raised along the way is interpreted as "not fitted".
    """
    try:
        if isinstance(obj, TransformedTargetRegressor):
            # Prefer the inner estimator when there is one.
            inner = getattr(obj, "regressor_", None) or getattr(obj, "regressor", None)
            if inner is None:
                check_is_fitted(obj)  # fallback: check the wrapper itself
                return True
            return _is_fitted(inner)
        if isinstance(obj, Pipeline):
            # Walk down to the final step of the pipeline.
            return _is_fitted(obj.steps[-1][1])
        # Plain sklearn estimator.
        check_is_fitted(obj)
    except Exception:
        return False
    return True

def _read_json(path: Path) -> Dict[str, Any]:
    """Read the UTF-8 file at ``path`` and return its decoded JSON content."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)

def _sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    block_size = 1 << 20
    with path.open("rb") as stream:
        while True:
            piece = stream.read(block_size)
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()

def _dedup_preserve(seq: List[str]) -> List[str]:
    """Return the items of ``seq`` without duplicates, keeping first-seen order."""
    # dict keys are unique and keep insertion order, which is exactly what we need.
    return list(dict.fromkeys(seq))

# -----------------------------------------------------------------------------
# Resolve a FITTED model (prefer requested version, else newest fitted)
# -----------------------------------------------------------------------------
def _load_fitted_bundle(base_dir: Path, version: str) -> Tuple[Optional[Dict[str, Any]], bool]:
    """Try to load one model version from ``base_dir``.

    Returns ``(bundle, present)``: ``present`` tells whether both the joblib
    artifact and its ``_meta.json`` exist; ``bundle`` is the descriptor dict when
    the loaded object is fitted, otherwise ``None``.
    """
    artifact = base_dir / f"value_regressor_{version}.joblib"
    meta = base_dir / f"value_regressor_{version}_meta.json"
    if not (artifact.exists() and meta.exists()):
        return None, False
    # NOTE(security): joblib.load unpickles arbitrary objects; only point this at
    # model directories you trust.
    loaded = joblib.load(artifact)
    if not _is_fitted(loaded):
        return None, True
    bundle = {
        "version": version,
        "pipeline": artifact,
        "meta": meta,
        "manifest": base_dir / "training_manifest.json",
        "obj": loaded,
    }
    return bundle, True


def resolve_fitted_model(base_dir: Path, preferred: Optional[str]) -> Dict[str, Any]:
    """Pick a fitted model bundle: the preferred version if usable, else the newest fitted one.

    Args:
        base_dir: directory holding ``value_regressor_<ver>.joblib`` and
            ``value_regressor_<ver>_meta.json`` files.
        preferred: version tag to try first (e.g. ``"v2"``); falsy to skip.

    Returns:
        Dict with keys ``version``, ``pipeline`` (artifact path), ``meta`` (meta
        path), ``manifest`` (``training_manifest.json`` path, which may not
        exist) and ``obj`` (the loaded estimator).

    Raises:
        FileNotFoundError: no version in ``base_dir`` has both files and a fitted model.
    """
    # 1) Try the preferred version.
    if preferred:
        bundle, present = _load_fitted_bundle(base_dir, preferred)
        if bundle is not None:
            return bundle
        if present:
            logger.warning("Model %s presente ma non fitted; cerco fallback…", preferred)
        else:
            logger.warning("Preferred model %s not found in %s; looking for a fallback", preferred, base_dir)

    # 2) Otherwise take the most recent fitted version among the available ones.
    for ver in _list_versions(base_dir):
        if ver == preferred:
            # Already evaluated above; do not unpickle the same artifact twice.
            continue
        bundle, _ = _load_fitted_bundle(base_dir, ver)
        if bundle is not None:
            return bundle

    raise FileNotFoundError(f"Nessun modello fitted trovato in {base_dir}")

# -----------------------------------------------------------------------------
# Load pipeline & metadata (fitted fallback) + expected features
# -----------------------------------------------------------------------------
# Resolve the model bundle once and expose its parts as notebook-level names
# that the later cells rely on.
resolved = resolve_fitted_model(MODEL_DIR, PREFERRED_MODEL_VERSION)
MODEL_VERSION: str = resolved["version"]
PIPELINE_PATH: Path = resolved["pipeline"]
META_PATH: Path = resolved["meta"]
MANIFEST_PATH: Path = resolved["manifest"]
pipeline = resolved["obj"]

logger.info(
    "Using model artifacts",
    extra={
        "asset_type": ASSET_TYPE,
        "model_version": MODEL_VERSION,
        "pipeline": str(PIPELINE_PATH),
        "meta": str(META_PATH),
        # The manifest is optional: log None when the file is absent.
        "manifest": str(MANIFEST_PATH) if MANIFEST_PATH.exists() else None,
    },
)

# Bundle integrity: compare the hash recorded in meta.json with the joblib's
# actual SHA-256. The check is skipped when meta.json records no hash.
model_meta: Dict[str, Any] = _read_json(META_PATH)
expected_hash = model_meta.get("model_hash") or model_meta.get("pipeline_sha256")
actual_hash = _sha256_file(PIPELINE_PATH)
if expected_hash and expected_hash != actual_hash:
    raise ValueError(
        f"Bundle manomesso: meta={expected_hash[:8]}… != actual={actual_hash[:8]}…"
    )

# Expected features: prefer feature_order.json -> then the manifest -> finally meta.json
feature_order_candidates: List[Path] = [PIPELINE_PATH.parent / "feature_order.json"]
manifest: Dict[str, Any] = {}
if MANIFEST_PATH.exists():
    try:
        manifest = _read_json(MANIFEST_PATH)
        p_from_manifest = manifest.get("paths", {}).get("feature_order")
        if p_from_manifest:
            # A path declared by the manifest takes precedence over the sibling file.
            feature_order_candidates.insert(0, Path(p_from_manifest))
    except Exception as e:
        logger.warning("Manifest presente ma non leggibile; fallback a meta.json", extra={"error": str(e)})

FEATURE_ORDER_PATH: Optional[Path] = next((p for p in feature_order_candidates if p and p.exists()), None)

categorical_expected: List[str] = list(model_meta.get("features_categorical", []) or [])
numeric_expected: List[str] = list(model_meta.get("features_numeric", []) or [])

# Resolved in a fresh local on every run. The previous `'ALL_EXPECTED' not in globals()`
# test silently reused the list left behind by an earlier execution of this cell
# (e.g. after switching MODEL_VERSION), so the feature list could go stale.
_resolved_features: Optional[List[str]] = None

if FEATURE_ORDER_PATH:
    try:
        feature_order: List[str] = _read_json(FEATURE_ORDER_PATH)
        _resolved_features = list(feature_order)
    except Exception as e:
        logger.warning("feature_order.json non leggibile; uso meta/manifest", extra={"error": str(e)})
else:
    # No feature_order.json: try the feature lists embedded in the manifest.
    try:
        feats_from_manifest = (manifest.get("feature_order")
                               or manifest.get("expected_features")
                               or manifest.get("model", {}).get("feature_list")
                               or manifest.get("model", {}).get("features"))
        if isinstance(feats_from_manifest, dict):
            # {"categorical": [...], "numeric": [...]} overrides the meta.json lists.
            categorical_expected = feats_from_manifest.get("categorical", categorical_expected) or categorical_expected
            numeric_expected = feats_from_manifest.get("numeric", numeric_expected) or numeric_expected
        elif isinstance(feats_from_manifest, list):
            # A flat list fixes the order; the cat/num split from meta.json is kept.
            _resolved_features = list(map(str, feats_from_manifest))
    except Exception as e:
        # Best effort: fall through to the meta.json lists, but leave a trace.
        logger.warning("Manifest feature list unusable; falling back to meta.json", extra={"error": str(e)})

if _resolved_features is None:
    # Last resort: categorical features first, then numeric ones not already listed.
    _resolved_features = _dedup_preserve(
        categorical_expected + [c for c in numeric_expected if c not in categorical_expected]
    )
ALL_EXPECTED: List[str] = _resolved_features

print(f"✅ Loaded FITTED model {MODEL_VERSION} from {PIPELINE_PATH.parent}")
print(f"   Features: {len(ALL_EXPECTED)} (cat={len(categorical_expected)}, num={len(numeric_expected)})")
print(f"   Inference dir: {INFER_DIR.as_posix()}")
print(f"   API compare: {COMPARE_WITH_API} → {API_BASE}")

### Input Schema & Validation

In [None]:
# 02) Input Schema & Validation (aliases, safe-derives, domain checks, JSON-schema)

from __future__ import annotations
from typing import Dict, Any, Tuple, Optional, List
from pathlib import Path
import json
import numpy as np

from shared.common.pricing import explain_price
from shared.common.constants import LOCATION, SCHEMA_VERSION as _DEF_SCHEMA_VER, NOTE_MAX_BYTES
from shared.common.utils import canonical_location, get_utc_now, NumpyJSONEncoder
from shared.common.sanity_checks import price_benchmark, validate_property

# ---------------------------------------------------------------------
# A) INPUT utilities: alias → canonico, derivate **non-leaky**, validator
# ---------------------------------------------------------------------

# Common aliases -> canonical field names (looked up on the stripped, lower-cased key)
_CANONICAL_ALIASES = {
    "sqm": "size_m2",
    "size": "size_m2",
    "m2": "size_m2",
    "year": "year_built",
    "built_year": "year_built",
    "balcony": "has_balcony",
    "garden": "has_garden",
    "garage": "has_garage",
    "air_quality": "air_quality_index",
    "noise": "noise_level",
    "valuation": "valuation_k",
    "price_k": "valuation_k",
}

# Derived fields that are allowed through (not derived from the target)
_SAFE_DERIVED = {"age_years", "luxury_score", "env_score"}

def _canonicalize_keys(record: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``record`` whose keys are stripped, lower-cased and de-aliased.

    A ``None`` key becomes the empty string. When two input keys map to the same
    canonical name, the value of the later one wins.
    """
    def _canonical(key: Any) -> str:
        lowered = ("" if key is None else str(key)).strip().lower()
        return _CANONICAL_ALIASES.get(lowered, lowered)

    return {_canonical(key): value for key, value in record.items()}

def _autofill_safe(record: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``record`` with the safe derived fields filled in when absent.

    Derived fields (existing values are never overwritten):
      - age_years: current UTC year minus year_built, floored at 0; skipped when
        year_built is missing/empty or cannot be converted to int.
      - luxury_score: mean of the truthiness of has_garden / has_balcony / has_garage.
      - env_score: (air_quality_index / 100) * (1 - noise_level / 100), each factor
        clipped to [0, 1]; None when either input cannot be converted to float.
    """
    out = dict(record)

    # age_years
    year_built = out.get("year_built")
    if "age_years" not in out and year_built not in (None, ""):
        try:
            elapsed = get_utc_now().year - int(year_built)
            out["age_years"] = max(0, elapsed)
        except Exception:
            pass

    # luxury_score (0..1)
    if "luxury_score" not in out:
        amenity_flags = ("has_garden", "has_balcony", "has_garage")
        present = [1.0 if bool(out.get(flag, 0)) else 0.0 for flag in amenity_flags]
        out["luxury_score"] = float(sum(present) / 3.0)

    # env_score (0..1): air quality * (1 - noise)
    if "env_score" not in out:
        try:
            air_raw = float(out.get("air_quality_index", 0.0))
            noise_raw = float(out.get("noise_level", 0.0))
            air = float(np.clip(air_raw / 100.0, 0.0, 1.0))
            noise = float(np.clip(noise_raw / 100.0, 0.0, 1.0))
            score = float(np.clip(air * (1.0 - noise), 0.0, 1.0))
        except Exception:
            score = None
        out["env_score"] = score

    return out

def validate_input_record(
    record: Dict[str, Any],
    *,
    strict: bool = True,
    drop_extras: bool = True,
    allowed_features: Optional[List[str]] = None,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Validate a raw record and build the version that is handed to the model.

    Two views of ``record`` are built:
      * a *domain* snapshot (keys not renamed) with the location canonicalized and
        ``asset_type`` / ``last_verified_ts`` defaulted, passed to ``validate_property``;
      * a *model* version: aliases resolved, safe derived fields filled in, location
        canonicalized, then filtered against the allowed feature names.

    Args:
        record: raw input mapping.
        strict: raise when domain validation fails, or when extras are present and
            ``drop_extras`` is False.
        drop_extras: remove keys that are not allowed instead of complaining.
        allowed_features: explicit allow-list; defaults to the notebook-level
            ``ALL_EXPECTED`` when that name exists.

    Returns:
        ``(record_for_model, validator_report)``.

    Raises:
        ValueError: see ``strict``.
    """
    # Domain snapshot (no renaming) for the validator.
    dom = dict(record)
    if LOCATION in dom and dom.get(LOCATION):
        try:
            dom[LOCATION] = canonical_location(dom[LOCATION])
        except Exception:
            pass  # best effort: keep the location as given
    dom.setdefault("asset_type", "property")
    dom.setdefault("last_verified_ts", get_utc_now().replace(microsecond=0).isoformat().replace("+00:00","Z"))

    # Model version: aliases + derived fields + location normalization.
    base = _canonicalize_keys(record)
    base = _autofill_safe(base)
    if LOCATION in base and base.get(LOCATION):
        try:
            base[LOCATION] = canonical_location(base[LOCATION])
        except Exception:
            pass  # best effort: keep the location as given

    # Domain validation.
    report = validate_property(dom)
    if strict and not report.get("ok", True):
        raise ValueError(f"❌ Property validation failed: {report.get('errors') or report}")

    # Extras filter: allowed = known model features ∪ safe derived fields.
    # The emptiness test must look at the feature list itself: the union with
    # _SAFE_DERIVED is never empty, so testing the union would treat every real
    # feature as an extra whenever no feature list is available.
    expected = allowed_features or globals().get("ALL_EXPECTED") or []
    if expected:
        allowed = set(expected).union(_SAFE_DERIVED)
        extras = [k for k in base if k not in allowed]
    else:
        extras = []  # nothing to filter against
    if drop_extras:
        for k in extras:
            base.pop(k, None)
    elif strict and extras:
        raise ValueError(f"❌ Unexpected extra features: {extras}")

    return base, report

def detect_anomalies(record: Dict[str, Any]) -> Tuple[bool, Dict[str, Any]]:
    """Run non-strict validation; return ``(is_anomalous, report)``.

    ``is_anomalous`` is True when the validator report's ``ok`` flag is falsy
    (a missing flag counts as ok).
    """
    report = validate_input_record(record, strict=False)[1]
    has_blockers = not report.get("ok", True)
    return has_blockers, report

def _maybe_explain_price(rec: dict) -> dict | None:
    """Best-effort call to ``explain_price(rec)``; returns None if it raises.

    The failure is logged at INFO level; a failure of the logging call itself is
    ignored as well.
    """
    breakdown = None
    try:
        breakdown = explain_price(rec)
    except Exception as exc:
        try:
            logger.info("explain_price unavailable", extra={"error": str(exc)})
        except Exception:
            pass
    return breakdown

def _price_benchmark_flag(rec: dict, yhat_k: float) -> dict | None:
    """Best-effort call to ``price_benchmark`` for the record's location and prediction.

    Returns whatever ``price_benchmark`` returns, or None if anything raises
    (including the float conversion of ``yhat_k``). Failures are logged at INFO.
    """
    flag = None
    try:
        where = rec.get("location")
        amount_k = float(yhat_k)
        flag = price_benchmark(location=where, valuation_k=amount_k)
    except Exception as exc:
        try:
            logger.info("price_benchmark unavailable", extra={"error": str(exc)})
        except Exception:
            pass
    return flag

# ---------------------------------------------------------------------
# B) OUTPUT JSON-schema validation (riusabile per single + batch)
# ---------------------------------------------------------------------

# Bind _Validator to the newest draft validator this jsonschema install offers,
# falling back to Draft 7 when Draft 2020-12 cannot be imported.
try:
    # use the most recent validator if available
    from jsonschema import Draft202012Validator as _Validator
except Exception:
    from jsonschema import Draft7Validator as _Validator  # type: ignore

def _norm_schema_tag(ver: str) -> str:
    """Normalize a schema version into the tag used in schema file names.

    '2' / '2.0' / '02' / 'v2' (any case, surrounding whitespace ignored) -> 'v2'.
    Any other value is returned stripped and lower-cased, otherwise verbatim.
    Falsy input yields ''. Non-string versions (e.g. the int ``2``) are
    stringified first instead of raising.
    """
    tag = str(ver).strip().lower() if ver else ""
    return "v2" if tag in {"2", "2.0", "02", "v2"} else tag

def _to_jsonable(obj: dict) -> dict:
    """Round-trip ``obj`` through JSON (encoded with NumpyJSONEncoder) and return the result."""
    encoded = json.dumps(obj, cls=NumpyJSONEncoder, ensure_ascii=False)
    return json.loads(encoded)

def _format_error(e) -> str:
    """Render a validation error as ``at '<path>': <message>  [schema: <schema path>]``.

    Missing attributes degrade gracefully: an empty/absent ``path`` is shown as
    ``<root>`` and an absent ``message`` as ``error``.
    """
    location = ".".join(map(str, getattr(e, "path", [])))
    if not location:
        location = "<root>"
    schema_trail = " → ".join(map(str, getattr(e, "schema_path", [])))
    message = getattr(e, "message", "error")
    return f"at '{location}': {message}  [schema: {schema_trail}]"

def _deep_keys(d, prefix=""):
    """Collect the dotted key paths found in a nested dict/list structure.

    Dict keys are joined with '.'; a list contributes '<prefix>[]' and only its
    first element is inspected (an empty list yields just '<prefix>[]').
    Scalars contribute nothing. Returns a set of paths.
    """
    if isinstance(d, dict):
        found = set()
        for name, child in d.items():
            dotted = name if not prefix else f"{prefix}.{name}"
            found.add(dotted)
            found.update(_deep_keys(child, dotted))
        return found
    if isinstance(d, list):
        item_prefix = prefix + "[]"
        # The first element stands in for the shape of the whole list.
        return _deep_keys(d[0], item_prefix) if d else {item_prefix}
    return set()

def validate_outputs(
    single_output: Optional[dict] = None,
    batch_outputs: Optional[List[dict]] = None,
    *,
    schema_version: Optional[str] = None,
    schemas_dir: Path = Path("../schemas"),
    example_filename: str = "output_example.json",
) -> dict:
    """Validate ``single_output`` and/or ``batch_outputs`` against the output JSON schema.

    The schema file is looked up in ``schemas_dir`` as ``output_schema_<tag>.json``
    for the requested version, then the default version, then ``v1``. Progress and
    findings are printed.

    Args:
        single_output: one prediction payload; also deep-diffed (by key paths)
            against ``example_filename`` when that file exists.
        batch_outputs: list of prediction payloads; an empty list is skipped.
        schema_version: version to validate against (defaults to the shared constant).
        schemas_dir: directory holding the schema and example files.
        example_filename: example payload used for the structural diff.

    Returns:
        Report dict with keys ``schema``, ``single_ok``, ``batch_ok``,
        ``batch_error_count``, ``extra_vs_example``, ``missing_vs_example``.
        ``*_ok`` stays None for inputs that were not checked; the untouched
        report is returned when no schema file can be found.
    """
    report = {
        "schema": None,
        "single_ok": None,
        "batch_ok": None,
        "batch_error_count": 0,
        "extra_vs_example": [],
        "missing_vs_example": [],
    }

    # Select the schema file: requested -> default -> v1.
    tag = _norm_schema_tag(schema_version or _DEF_SCHEMA_VER)
    schema_path = schemas_dir / f"output_schema_{tag}.json"
    if not schema_path.exists():
        schema_path = schemas_dir / f"output_schema_{_norm_schema_tag(_DEF_SCHEMA_VER)}.json"
    if not schema_path.exists():
        schema_path = schemas_dir / "output_schema_v1.json"

    if not schema_path.exists():
        msg = f"❌ Schema file not found: {schema_path}"
        print(msg)
        return report

    schema = json.loads(schema_path.read_text(encoding="utf-8"))
    report["schema"] = schema_path.name
    print(f"🔎 Using schema: {schema_path.name}")

    # Single
    if single_output is not None:
        from jsonschema import validate, ValidationError
        try:
            validate(instance=_to_jsonable(single_output), schema=schema)
            print("✅ Strict schema validation passed (single_output).")
            report["single_ok"] = True
        except ValidationError as e:
            print("❌ Strict schema validation failed (single_output):", _format_error(e))
            report["single_ok"] = False

        # Deep diff vs example (if available)
        ex_path = schemas_dir / example_filename
        if ex_path.exists():
            example = json.loads(ex_path.read_text(encoding="utf-8"))
            ex_keys = _deep_keys(example)
            out_keys = _deep_keys(single_output)
            extra = sorted(out_keys - ex_keys)
            missing = sorted(ex_keys - out_keys)
            report["extra_vs_example"] = extra
            report["missing_vs_example"] = missing
            if not extra and not missing:
                print("✅ single_output matches example structure (deep).")
            else:
                if extra:
                    head = extra[:10]
                    print("⚠️ Extra keys vs example (deep):", head, ("…+" if len(extra) > 10 else ""))
                if missing:
                    head = missing[:10]
                    print("⚠️ Missing keys vs example (deep):", head, ("…+" if len(missing) > 10 else ""))

    # Batch
    if batch_outputs:
        # Choose the validator class from the schema's own "$schema", exactly as
        # jsonschema.validate() does for the single path; _Validator is only the
        # default for schemas that declare no draft. Previously the batch path
        # always used _Validator, so one record could be judged under two
        # different drafts depending on the path it took.
        from jsonschema.validators import validator_for
        validator = validator_for(schema, default=_Validator)(schema)
        errors = []
        for idx, rec in enumerate(batch_outputs, start=1):
            for err in validator.iter_errors(_to_jsonable(rec)):
                errors.append((idx, err))
        if not errors:
            print(f"✅ Batch outputs: all {len(batch_outputs)} records pass schema validation.")
            report["batch_ok"] = True
        else:
            print(f"❌ Batch outputs: {len(errors)} schema errors found on {len(set(i for i, _ in errors))} records.")
            for i, err in errors[:5]:
                print(f"   • [#{i}] {_format_error(err)}")
            if len(errors) > 5:
                print(f"   … and {len(errors)-5} more")
            report["batch_ok"] = False
            report["batch_error_count"] = len(errors)

    return report

# ---------------------------------------------------------------------
# C) Auto-run (only when single_output / batch_outputs have already been computed)
# ---------------------------------------------------------------------
# On a first top-to-bottom run neither name exists yet, so this is a no-op; it
# only fires when this cell is re-executed after the prediction cell.
if "single_output" in globals() or "batch_outputs" in globals():
    _so = globals().get("single_output")
    _bo = globals().get("batch_outputs")
    _ = validate_outputs(single_output=_so, batch_outputs=_bo)

### Sample + Predict (+CI) + Drift + Batch

In [None]:
# R3) Sample → Single Predict → Batch (refactor unico, riusa helpers esistenti)

from __future__ import annotations
from uuid import uuid4
from copy import deepcopy
from typing import List
import time
import numpy as np
import pandas as pd

from shared.common.constants import ASSET_ID, LOCATION

# --- A) Raw sample (feel free to edit it here)
sample_property_raw = {
    "location": "Milan",
    "size_m2": 120,
    "rooms": 4,
    "bathrooms": 2,
    "year_built": 1999,
    "floor": 2,
    "building_floors": 6,
    "has_elevator": 1,
    "has_garden": 0,
    "has_balcony": 1,
    "has_garage": 1,
    "energy_class": "B",
    "humidity_level": 50.0,
    "temperature_avg": 20.5,
    "noise_level": 40,
    "air_quality_index": 70,
    "owner_occupied": 1,
    "public_transport_nearby": 1,
    "distance_to_center_km": 2.5,
}

# --- B) Validation + light normalization (reuses the validation cell)
# strict=True: a failed domain validation raises ValueError here.
sample_property, validation_report = validate_input_record(sample_property_raw, strict=True)

# boolean-like -> {0,1} (kept consistent with serving/schema)
_bool_like = [k for k in sample_property if k.startswith("has_")] + ["owner_occupied", "public_transport_nearby"]
for k in _bool_like:
    if k in sample_property:
        sample_property[k] = int(bool(sample_property[k]))

# Synthetic asset_id when missing (it is not a model feature)
if not sample_property.get(ASSET_ID):
    sample_property[ASSET_ID] = f"asset_infer_{uuid4().hex[:8]}"

print(f"✅ Sample validated. asset_id={sample_property.get(ASSET_ID)}  location={sample_property.get(LOCATION)}")

# --- C) Single prediction through the local predict_asset helper (no duplicated logic)
# NOTE(review): predict_asset is not defined in the cells visible here; it must
# already exist in the notebook namespace when this cell runs.
t0 = time.perf_counter()
single_output = predict_asset(sample_property, asset_id=sample_property.get(ASSET_ID))
latency_ms_single = round((time.perf_counter() - t0) * 1000, 2)

print(
    f"ŷ_single = {single_output['metrics']['valuation_k']:.2f} k€  "
    f"(±{single_output['metrics']['ci_margin_k']:.2f} @ {int(single_output['metrics']['confidence']*100)}%)"
)

# --- D) Batch inference (varied examples), again through predict_asset
# The first entry is a copy of the validated sample (keeps its asset_id); the
# others clear ASSET_ID so the loop below assigns "asset_batch_NNN".
batch_samples: List[dict] = [
    deepcopy(sample_property),
    {**sample_property, ASSET_ID: None, LOCATION: "Rome",     "size_m2":  90, "energy_class": "C"},
    {**sample_property, ASSET_ID: None, LOCATION: "Florence", "size_m2":  70, "has_garden": 1, "energy_class": "A"},
    {**sample_property, ASSET_ID: None, LOCATION: "Turin",    "size_m2": 150, "energy_class": "D"},
]

batch_outputs: List[dict] = []
for i, raw in enumerate(batch_samples, start=1):
    out = predict_asset(raw, asset_id=raw.get(ASSET_ID) or f"asset_batch_{i:03}")
    batch_outputs.append(out)

# Compact batch summary (last expression of the cell, rendered by the notebook)
pd.DataFrame([{"asset_id": o["asset_id"], "valuation_k": o["metrics"]["valuation_k"]} for o in batch_outputs])

In [None]:
# R4) Compact Note (idempotente) — riusa single_output
from shared.common.utils import canonical_json_dumps, sha256_hex
from shared.common.constants import NOTE_MAX_BYTES

def build_compact_note(out: dict) -> dict:
    """Build the compact note for one prediction output.

    The note carries the identifying fields of ``out`` (asset id/type, timestamp),
    the model version from ``out["model_meta"]`` and the core metrics (valuation,
    confidence, confidence interval as ``[low, high]``).

    The model hash is read from the notebook-level ``model_meta`` using the same
    key priority as the bundle integrity check (``model_hash`` first, then
    ``pipeline_sha256``), so the note always reports the hash that was verified
    against the artifact on disk.

    Raises:
        KeyError: when ``out`` lacks one of the required fields.
    """
    metrics = out["metrics"]
    return {
        "schema_version": "v2",
        "asset_id": out["asset_id"],
        "asset_type": out["asset_type"],
        "timestamp": out["timestamp"],
        "model": {
            "version": out["model_meta"]["value_model_version"],
            "hash": model_meta.get("model_hash") or model_meta.get("pipeline_sha256"),
        },
        "metrics": {
            "valuation_k": metrics["valuation_k"],
            "confidence": metrics["confidence"],
            "ci": [metrics["confidence_low_k"], metrics["confidence_high_k"]],
        },
    }

# Build the note for the single prediction and measure its canonical encoding.
note = build_compact_note(single_output)
note_bytes = canonical_json_dumps(note).encode("utf-8")
note_size = len(note_bytes)
note_sha256 = sha256_hex(note_bytes)

# Record the note's size/hash on the output under "publish"; nothing is published here.
single_output.setdefault("publish", {}).update({
    "status": "skipped",
    "note_size": note_size,
    "note_sha256": note_sha256,
    "is_compacted": True,
    "fallback_url_used": False,
})

# NOTE(review): this size check runs after single_output["publish"] has already
# been updated, and assert statements are skipped when Python runs with -O.
assert note_size <= NOTE_MAX_BYTES, f"Nota troppo grande: {note_size} > {NOTE_MAX_BYTES}"
print(f"Note size={note_size} bytes | sha256={note_sha256[:16]}…")

### Logging JSONL

In [None]:
# L1) JSONL Logging (atomic append) — predictions & monitoring
from __future__ import annotations
from datetime import datetime
from pathlib import Path
import os, json

from shared.common.utils import canonical_json_dumps, get_utc_now

# Monitoring log lives next to the predictions log (LOG_PATH comes from the config cell).
MONITOR_LOG_PATH = Path("./outputs/logs/monitoring_log.jsonl")
# Make sure both log directories exist before anything is appended.
LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
MONITOR_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)

def append_jsonl(record: dict, path: Path) -> None:
    """Append ``record`` to the JSONL file at ``path`` and fsync it.

    A ``_logged_at`` field (current UTC time, second precision, 'Z' suffix) is
    added to a copy of the record; the caller's dict is not modified. The parent
    directory is created if needed. The file is opened in append mode and, when
    newly created, gets mode 0o644 (subject to the umask).

    Raises:
        OSError: when the file cannot be opened, written or synced.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = {**record, "_logged_at": get_utc_now().replace(microsecond=0).isoformat().replace("+00:00", "Z")}
    line = canonical_json_dumps(payload)
    # Explicit mode: os.open defaults to 0o777, which would create an executable log file.
    fd = os.open(str(path), os.O_WRONLY | os.O_CREAT | os.O_APPEND, 0o644)
    try:
        f = os.fdopen(fd, "a", encoding="utf-8")
    except Exception:
        # The descriptor is only ours to close while no file object owns it.
        try:
            os.close(fd)
        except OSError:
            pass
        raise
    # From here on the file object owns the descriptor and closes it exactly once.
    with f:
        f.write(line + "\n")
        f.flush()
        os.fsync(f.fileno())

def _to_monitoring(entry: dict) -> dict:
    """Project a prediction output onto the flat record written to the monitoring log.

    Values come from ``entry["metrics"]``, ``entry["model_meta"]`` and
    ``entry["flags"]``; missing sections are treated as empty and missing values
    become None. ``model_version`` falls back to the notebook-level
    ``MODEL_VERSION`` when ``model_meta`` has no ``value_model_version`` key.

    For ``valuation_k`` and ``uncertainty_k`` the legacy keys
    (``valuation_base_k`` / ``uncertainty``) are used only when the primary key
    is absent or None, so a legitimate ``0.0`` is kept instead of being replaced.
    """
    def _first_present(mapping: dict, *keys: str):
        # First value that is not None, in key order; None when there is none.
        for key in keys:
            value = mapping.get(key)
            if value is not None:
                return value
        return None

    m = entry.get("metrics", {}) or {}
    mm = entry.get("model_meta", {}) or {}
    flags = entry.get("flags") or {}
    return {
        "asset_id": entry.get("asset_id"),
        "model_version": mm["value_model_version"] if "value_model_version" in mm else MODEL_VERSION,
        "model_class": mm.get("value_model_name"),
        "latency_ms": m.get("latency_ms"),
        "valuation_k": _first_present(m, "valuation_k", "valuation_base_k"),
        "uncertainty_k": _first_present(m, "uncertainty_k", "uncertainty"),
        "confidence_low_k": m.get("confidence_low_k"),
        "confidence_high_k": m.get("confidence_high_k"),
        "ci_method": m.get("ci_method"),
        "n_estimators": m.get("n_estimators"),
        "anomaly": flags.get("anomaly"),
        "drift_detected": flags.get("drift_detected"),
    }

# --- write predictions log ---
# Both names are looked up through globals() so the cell also runs when the
# prediction cell has not been executed (nothing is written in that case).
n_batch = len(globals().get("batch_outputs", []) or [])
if "single_output" in globals():
    append_jsonl(single_output, LOG_PATH)
for o in (globals().get("batch_outputs", []) or []):
    append_jsonl(o, LOG_PATH)
print(f"Appended {int('single_output' in globals()) + n_batch} predictions → {LOG_PATH}")

# --- write monitoring log (derived) ---
# Same records, reduced to the flat monitoring view by _to_monitoring.
if "single_output" in globals():
    append_jsonl(_to_monitoring(single_output), MONITOR_LOG_PATH)
for o in (globals().get("batch_outputs", []) or []):
    append_jsonl(_to_monitoring(o), MONITOR_LOG_PATH)
print(f"Appended monitoring for {int('single_output' in globals()) + n_batch} records → {MONITOR_LOG_PATH}")

### Sensitivity Check (vary size_m2)

In [None]:
# L2) Sensitivity (“what-if”) su size_m2 — riusa predict_asset (no duplicazioni)
import warnings
import pandas as pd

# What-if analysis: re-predict the validated sample with different size_m2 values.
sizes = [60, 90, 130, 170, 210]
rows  = []

# Baseline from the already-validated sample (through predict_asset for consistency)
_base = predict_asset(sample_property, asset_id="asset_sensitivity_base")
base_pred = float(_base["metrics"]["valuation_k"])

for s in sizes:
    rec_raw = {**sample_property, "size_m2": s}
    try:
        out = predict_asset(rec_raw, asset_id=f"asset_size_{s}")
        ci_low  = out["metrics"]["confidence_low_k"]
        ci_high = out["metrics"]["confidence_high_k"]
        rows.append({
            "size_m2": s,
            "prediction_k": out["metrics"]["valuation_k"],
            "ci_low_k": ci_low,
            "ci_high_k": ci_high,
            "ci_margin_k": out["metrics"]["ci_margin_k"],
            "uncertainty_k": out["metrics"]["uncertainty_k"],
            "delta_vs_base_k": round(out["metrics"]["valuation_k"] - base_pred, 3),
        })
    except Exception as e:
        # Keep the row so a failed size is visible in the table instead of aborting the sweep.
        rows.append({"size_m2": s, "prediction_k": None, "error": str(e)})

# NOTE(review): this filter is installed after the predictions above have run,
# so it cannot suppress warnings they emitted; it only affects later code.
warnings.filterwarnings("ignore", message="X does not have valid feature names")
# Last expression of the cell: rendered as a table by the notebook.
pd.DataFrame(rows)

### API Checks

In [None]:
# L3) API checks (consistency + optional publish) — unificata
import os, json, time
import requests
from uuid import uuid4
from shared.common.utils import NumpyJSONEncoder

# Everything below talks to the prediction API and only runs when the
# COMPARE_WITH_API toggle from the config cell is on.
if COMPARE_WITH_API:
    def _pick_pred_metrics(payload: dict):
        """Extract (prediction, ci_low, ci_high, uncertainty) from a v1/v2/flat response.

        Each value is looked up under ``metrics`` first and then at the top level;
        anything not found is None. A non-dict payload yields four Nones.
        NOTE(review): the lookups are chained with ``or``, so a value of 0 is
        treated as missing.
        """
        if not isinstance(payload, dict):
            return None, None, None, None
        m = payload.get("metrics", {}) if isinstance(payload.get("metrics"), dict) else {}
        pred = (m.get("valuation_k") or m.get("valuation_base_k")
                or m.get("valuation") or payload.get("valuation_k")
                or payload.get("prediction"))
        ci_low  = m.get("confidence_low_k")  or payload.get("confidence_low_k")
        ci_high = m.get("confidence_high_k") or payload.get("confidence_high_k")
        unc     = m.get("uncertainty_k")     or m.get("uncertainty") or payload.get("uncertainty_k")
        return pred, ci_low, ci_high, unc

    def _model_version(payload: dict):
        """Return the model version reported in ``payload["model_meta"]``, or None."""
        mm = (payload or {}).get("model_meta", {}) if isinstance(payload, dict) else {}
        return mm.get("value_model_version") or mm.get("model_version")

    # NOTE(review): the headers, including the X-Idempotency-Key generated here,
    # are reused for both the consistency request and the optional publish
    # request below; confirm the API does not treat the second call as a replay.
    headers = {"Content-Type": "application/json", "X-Idempotency-Key": uuid4().hex}
    token = os.getenv("AXM_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    url = f"{API_BASE}/predict/{ASSET_TYPE}"

    # --- A) Consistency: compare the local single prediction with the API ---
    try:
        # Round-trip through NumpyJSONEncoder so the payload is plain JSON.
        payload_json = json.loads(json.dumps(sample_property, cls=NumpyJSONEncoder, ensure_ascii=False))
        t0 = time.perf_counter()
        resp = requests.post(url, json=payload_json, headers=headers, timeout=10)
        lat_ms = round((time.perf_counter() - t0) * 1000, 2)
        if resp.ok:
            api_json = resp.json()
            api_pred, api_low, api_high, api_unc = _pick_pred_metrics(api_json)
            if api_pred is None:
                print(f"[API] ❌ OK ma 'metrics.valuation_*' mancante.")
            else:
                local_pred = float(single_output["metrics"]["valuation_k"])
                local_low  = float(single_output["metrics"]["confidence_low_k"])
                local_high = float(single_output["metrics"]["confidence_high_k"])
                delta = abs(float(api_pred) - local_pred)
                # Relative difference in percent; the max() guards against division by zero.
                pct   = (delta / max(1e-9, abs(local_pred))) * 100.0
                ci_overlap = None
                if api_low is not None and api_high is not None:
                    try:
                        # Intervals overlap unless one lies entirely beyond the other.
                        ci_overlap = not (float(api_high) < local_low or float(api_low) > local_high)
                    except Exception:
                        ci_overlap = None
                ver_note = ""
                av, lv = _model_version(api_json), MODEL_VERSION
                if av and av != lv:
                    ver_note = f" | ⚠️ model_version API={av} vs LOCAL={lv}"
                print(f"[API] {lat_ms} ms | API={float(api_pred):.3f} k€ | LOCAL={local_pred:.3f} k€ | "
                      f"Δ={delta:.4f} ({pct:.2f}%) | CI overlap: {ci_overlap if ci_overlap is not None else 'n/a'}{ver_note}")
        else:
            print(f"[API] ❌ {resp.status_code} | {resp.text[:200]}")
    except Exception as e:
        # Best effort: an unreachable API must not fail the notebook.
        print(f"[API] ⚠️ Consistency check skipped: {e}")

    # --- B) Optional publish path (toggle via env PUBLISH=1) ---
    PUBLISH = os.getenv("PUBLISH", "false").lower() in {"1", "true", "yes", "y"}
    if PUBLISH:
        try:
            payload_json = json.loads(json.dumps(sample_property, cls=NumpyJSONEncoder, ensure_ascii=False))
            t0 = time.perf_counter()
            resp = requests.post(url, params={"publish": "true"}, json=payload_json, headers=headers, timeout=15)
            lat_ms = round((time.perf_counter() - t0) * 1000, 2)
            if resp.ok:
                api_json = resp.json()
                api_pred, api_low, api_high, api_unc = _pick_pred_metrics(api_json)
                print(f"✅ API publish ok in {lat_ms} ms | pred={api_pred} k€ | unc={api_unc}")
            else:
                print(f"❌ API publish failed: {resp.status_code} | {resp.text[:200]}")
        except Exception as e:
            print(f"❌ API publish exception: {e}")
else:
    print("ℹ️ COMPARE_WITH_API disabled — skip API checks.")


### Artifact Audit

In [None]:
# L4) Artifacts audit — file hash vs meta/manifest
import hashlib, json
from pathlib import Path

def file_sha256(path: Path, chunk_size: int = 1 << 20) -> str:
    """Return the hex SHA-256 digest of the file at ``path``, read ``chunk_size`` bytes at a time."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(chunk_size)
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()

def manifest_expected_hash(manifest_path: Path) -> str | None:
    """Return the pipeline hash recorded in the manifest, or None.

    The first truthy value wins, searched in this order:
    ``paths.pipeline_sha256``, ``model_meta.model_hash``,
    ``artifacts.pipeline_sha256``, ``artifacts.model_sha256``, ``model.sha256``,
    then the top-level ``pipeline_sha256``. A missing path or an unreadable
    manifest yields None (the latter is logged at INFO level).
    """
    if not (manifest_path and manifest_path.exists()):
        return None
    try:
        mf = json.loads(manifest_path.read_text(encoding="utf-8"))
        nested_lookups = (
            ("paths", "pipeline_sha256"),
            ("model_meta", "model_hash"),
            ("artifacts", "pipeline_sha256"),
            ("artifacts", "model_sha256"),
            ("model", "sha256"),
        )
        for section, field in nested_lookups:
            candidate = (mf.get(section) or {}).get(field)
            if candidate:
                return candidate
        # Last candidate: returned as-is, even when it is missing or empty.
        return mf.get("pipeline_sha256")
    except Exception as exc:
        try:
            logger.info("Manifest unreadable for hash", extra={"error": str(exc)})
        except Exception:
            pass
        return None

# Recompute the artifact hash and compare it with what meta.json and the
# manifest claim. This cell only reports; a mismatch does not raise.
model_sha = file_sha256(PIPELINE_PATH)
expected_sha_meta = (model_meta.get("model_hash") or model_meta.get("pipeline_sha256"))
expected_sha_manifest = manifest_expected_hash(MANIFEST_PATH)

print(f"Model SHA256: {model_sha} (first16={model_sha[:16]})")
if expected_sha_meta:
    print(f"Meta expects   : {expected_sha_meta} (match: {expected_sha_meta == model_sha})")
if expected_sha_manifest:
    ok = (expected_sha_manifest == model_sha)
    print(f"Manifest expects: {expected_sha_manifest} (match: {ok})")
    if not ok:
        # Logging is best effort: a logger failure must not break the audit output.
        try:
            logger.warning("Pipeline hash mismatch with manifest",
                           extra={"expected": expected_sha_manifest, "actual": model_sha})
        except Exception:
            pass