## 01. Imports & Paths

In [36]:
import hashlib
import json
import os
from datetime import datetime, timezone
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import requests

# --- Configuration: asset type, model artifact locations, log file and API settings ---
ASSET_TYPE = "property"
MODEL_VERSION = "v1"               # update this when the model changes
MODEL_DIR = Path(f"../models/{ASSET_TYPE}")
PIPELINE_PATH = MODEL_DIR / f"value_regressor_{MODEL_VERSION}.joblib"
META_PATH     = MODEL_DIR / f"value_regressor_{MODEL_VERSION}_meta.json"
LOG_PATH      = Path("../data/predictions_log.jsonl")
API_BASE      = "http://127.0.0.1:8000"  # optional: FastAPI endpoint
COMPARE_WITH_API = True                   # set to False to skip the HTTP requests

# Fail fast if either artifact is missing, before anything is loaded.
assert PIPELINE_PATH.exists(), f"Missing pipeline file: {PIPELINE_PATH}"
assert META_PATH.exists(), f"Missing metadata file: {META_PATH}"
# Printed once both paths exist; the actual load only happens on the lines below.
print("Loaded model + metadata paths OK.")

# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
pipeline = joblib.load(PIPELINE_PATH)
with META_PATH.open("r", encoding="utf-8") as f:
    model_meta = json.load(f)

# Feature names are read from the metadata file; the combined list keeps
# the categorical names first, followed by the numeric ones.
categorical_expected = model_meta["features_categorical"]
numeric_expected = model_meta["features_numeric"]
ALL_EXPECTED = categorical_expected + numeric_expected

print("Expected features:", ALL_EXPECTED)

Loaded model + metadata paths OK.
Expected features: ['location', 'energy_class', 'has_elevator', 'has_garden', 'has_balcony', 'garage', 'size_m2', 'rooms', 'bathrooms', 'year_built', 'floor', 'building_floors', 'humidity_level', 'temperature_avg', 'noise_level', 'air_quality_index', 'age_years']


## 02. Load Pipeline & Metadata

In [37]:
# NOTE(review): section 01 already loads the same pipeline and metadata;
# this cell repeats the load and reports only the number of expected features.
pipeline = joblib.load(PIPELINE_PATH)
model_meta = json.loads(META_PATH.read_text(encoding="utf-8"))

# Categorical feature names first, numeric ones after.
categorical_expected = model_meta["features_categorical"]
numeric_expected = model_meta["features_numeric"]
ALL_EXPECTED = [*categorical_expected, *numeric_expected]

print("Expected features:", len(ALL_EXPECTED))

Expected features: 17


## 03. Validation Utilities

In [38]:
def autofill_derived(record: dict) -> dict:
    """Fill in features that can be derived from other fields of the record.

    Currently only ``age_years`` is derived: when it is absent and
    ``year_built`` is present, it is set to the current UTC year minus
    ``int(record["year_built"])``.

    Args:
        record: Raw feature mapping. It is never mutated.

    Returns:
        The same ``record`` object when nothing has to be derived, otherwise
        a shallow copy with ``age_years`` added.

    Raises:
        ValueError, TypeError: If ``year_built`` cannot be converted by ``int()``.
    """
    if "age_years" in record or "year_built" not in record:
        return record
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # datetime yields the same calendar year.
    current_year = datetime.now(timezone.utc).year
    return {**record, "age_years": current_year - int(record["year_built"])}

def validate_input_record(record: dict, strict=True):
    """
    Check a feature record against the expected feature list.

    Derived features are filled in first (see autofill_derived), then every
    name in ALL_EXPECTED must be present. With strict=True, keys that are
    not in ALL_EXPECTED are rejected as well.

    Returns the record, including any derived features.
    Raises ValueError when features are missing or, in strict mode,
    when unexpected keys are present.
    """
    completed = autofill_derived(record)

    absent = [name for name in ALL_EXPECTED if name not in completed]
    if absent:
        raise ValueError(f"Missing required features: {absent}")

    if strict:
        unknown = [key for key in completed if key not in ALL_EXPECTED]
        if unknown:
            raise ValueError(f"Unexpected extra features: {unknown}")

    return completed

## 04. Sample Single Property

In [39]:
# Hand-written example record reused by the single-prediction, batch,
# sensitivity and API-comparison cells below.
# "age_years" is not listed here; validate_input_record() derives it from "year_built".
sample_property = {
    "location": "Milan",
    "size_m2": 95,
    "rooms": 4,
    "bathrooms": 2,
    "year_built": 1999,
    "floor": 2,
    "building_floors": 6,
    "has_elevator": 1,
    "has_garden": 0,
    "has_balcony": 1,
    "garage": 1,
    "energy_class": "B",
    "humidity_level": 50.0,
    "temperature_avg": 20.5,
    "noise_level": 40,
    "air_quality_index": 70,
}

# Rebind the name to the validated record (now including age_years);
# strict=True rejects any key that is not in ALL_EXPECTED.
sample_property = validate_input_record(sample_property, strict=True)

## 05. Local Prediction

In [40]:
# One-row frame built from the validated sample record.
df_input = pd.DataFrame([sample_property])

# predict() returns one value per row; keep the single prediction as a plain float.
local_predictions = pipeline.predict(df_input)
pred_value = float(local_predictions[0])

print(f"[LOCAL] Predicted valuation_k: {pred_value:.3f} (k€)")

[LOCAL] Predicted valuation_k: 271.799 (k€)


## 06. Output Schema Builder

In [41]:
def build_output_schema(
    asset_id: str,
    asset_type: str,
    valuation_k: float,
    model_meta: dict,
    condition_score: float = None,
    risk_score: float = None,
    anomaly: bool = False,
    needs_review: bool = False,
    extra_metrics: dict = None
):
    """Assemble the output record for one prediction.

    Args:
        asset_id: Identifier stored under ``"asset_id"``.
        asset_type: Asset category stored under ``"asset_type"``.
        valuation_k: Predicted value; stored rounded to 3 decimals as
            ``metrics["valuation_base_k"]``.
        model_meta: Model metadata mapping; its ``"model_version"`` and
            ``"model_class"`` entries are copied (``None`` when absent).
        condition_score: Optional score, rounded to 3 decimals when given.
        risk_score: Optional score, rounded to 3 decimals when given.
        anomaly: Value of ``flags["anomaly"]``.
        needs_review: Value of ``flags["needs_review"]``.
        extra_metrics: Optional mapping merged into ``metrics`` as-is; its
            keys override the built-in metric keys on collision.

    Returns:
        A dict with the keys ``asset_id``, ``asset_type``, ``timestamp``
        (UTC, ``YYYY-MM-DDTHH:MM:SSZ``), ``metrics``, ``flags``,
        ``model_meta`` and ``offchain_refs`` (both hashes set to ``None``).
    """
    # datetime.utcnow() is deprecated since Python 3.12. An aware UTC datetime
    # renders its offset as "+00:00", which is swapped for the "Z" suffix the
    # record has always used.
    timestamp = (
        datetime.now(timezone.utc)
        .isoformat(timespec="seconds")
        .replace("+00:00", "Z")
    )

    # Metrics are assembled first so optional entries keep their original order:
    # base valuation, condition score, risk score, then any extras.
    metrics = {"valuation_base_k": round(float(valuation_k), 3)}
    if condition_score is not None:
        metrics["condition_score"] = round(float(condition_score), 3)
    if risk_score is not None:
        metrics["risk_score"] = round(float(risk_score), 3)
    if extra_metrics:
        metrics.update(extra_metrics)

    return {
        "asset_id": asset_id,
        "asset_type": asset_type,
        "timestamp": timestamp,
        "metrics": metrics,
        "flags": {
            "anomaly": anomaly,
            "needs_review": needs_review
        },
        "model_meta": {
            "value_model_version": model_meta.get("model_version"),
            "value_model_name": model_meta.get("model_class")
        },
        "offchain_refs": {
            "detail_report_hash": None,
            "sensor_batch_hash": None
        }
    }

# Wrap the local prediction (pred_value) in the output schema; the optional
# scores, flags and extra metrics are left at their defaults.
single_output = build_output_schema(
    asset_id="asset_manual_0001",
    asset_type=ASSET_TYPE,
    valuation_k=pred_value,
    model_meta=model_meta
)

# Bare last expression so the notebook displays the resulting record.
single_output

{'asset_id': 'asset_manual_0001',
 'asset_type': 'property',
 'timestamp': '2025-07-20T15:05:31Z',
 'metrics': {'valuation_base_k': 271.799},
 'flags': {'anomaly': False, 'needs_review': False},
 'model_meta': {'value_model_version': 'v1',
  'value_model_name': 'RandomForestRegressor'},
 'offchain_refs': {'detail_report_hash': None, 'sensor_batch_hash': None}}

## 07. Batch Inference

In [42]:
# The base sample plus three variants that override a few fields each.
batch_samples = [
    sample_property,
    {**sample_property, "location": "Rome", "size_m2": 120, "energy_class": "C"},
    {**sample_property, "location": "Florence", "size_m2": 70, "has_garden": 1, "energy_class": "A"},
    {**sample_property, "location": "Turin", "size_m2": 150, "energy_class": "D"},
]

# Validate every record strictly, then score the whole batch in one predict() call.
validated_batch = [
    validate_input_record(candidate, strict=True) for candidate in batch_samples
]
df_batch = pd.DataFrame(validated_batch)
batch_preds = pipeline.predict(df_batch)

# One output record per prediction, with asset ids numbered from 001.
batch_outputs = [
    build_output_schema(
        asset_id=f"asset_batch_{position:03}",
        asset_type=ASSET_TYPE,
        valuation_k=float(predicted),
        model_meta=model_meta,
    )
    for position, predicted in enumerate(batch_preds, start=1)
]

# Compact summary table: asset id next to its rounded valuation.
pd.DataFrame(
    [(entry["asset_id"], entry["metrics"]["valuation_base_k"]) for entry in batch_outputs],
    columns=["asset_id", "valuation_k"],
)

Unnamed: 0,asset_id,valuation_k
0,asset_batch_001,271.799
1,asset_batch_002,331.09
2,asset_batch_003,198.326
3,asset_batch_004,371.052


## 08. Logging JSON

In [43]:
def append_jsonl(record: dict, path: Path):
    """Append ``record`` as one JSON line to the file at ``path``.

    A ``"_logged_at"`` key holding the current UTC time (ISO 8601 with a
    ``Z`` suffix) is added to a copy of the record; the caller's dict is
    not modified.

    Args:
        record: JSON-serialisable mapping to log.
        path: Target ``.jsonl`` file. The file and any missing parent
            directories are created on first use.

    Raises:
        TypeError: If ``record`` contains values ``json.dumps`` cannot encode.
    """
    # datetime.utcnow() is deprecated since Python 3.12; keep the "Z" suffix
    # by swapping the "+00:00" offset of the aware datetime.
    logged_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # Serialise before touching the file so an encoding error cannot leave
    # a freshly created empty log behind.
    line = json.dumps({**record, "_logged_at": logged_at})

    # A fresh checkout may not have the log directory yet.
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as f:
        f.write(line + "\n")

# Log the manual prediction first, then every batch prediction, one JSON line each.
for logged_output in [single_output, *batch_outputs]:
    append_jsonl(logged_output, LOG_PATH)

print(f"Appended {1 + len(batch_outputs)} predictions to {LOG_PATH}")

Appended 5 predictions to ..\data\predictions_log.jsonl


## 09. Utility: Single Prediction Function For Reuse

In [44]:
def predict_asset(record: dict, asset_id: str, asset_type: str = ASSET_TYPE):
    """Validate one record, predict its value and return the output record.

    The record is validated strictly (unexpected keys raise ValueError),
    scored as a one-row DataFrame with the loaded pipeline, and the
    prediction is wrapped by build_output_schema using the loaded model_meta.
    """
    checked = validate_input_record(record, strict=True)
    one_row = pd.DataFrame([checked])
    prediction = pipeline.predict(one_row)
    return build_output_schema(
        asset_id=asset_id,
        asset_type=asset_type,
        valuation_k=float(prediction[0]),
        model_meta=model_meta,
    )

# Smoke call: run the validated sample record through predict_asset and display the result.
test_output = predict_asset(sample_property, asset_id="asset_function_test")
test_output

{'asset_id': 'asset_function_test',
 'asset_type': 'property',
 'timestamp': '2025-07-20T15:05:31Z',
 'metrics': {'valuation_base_k': 271.799},
 'flags': {'anomaly': False, 'needs_review': False},
 'model_meta': {'value_model_version': 'v1',
  'value_model_name': 'RandomForestRegressor'},
 'offchain_refs': {'detail_report_hash': None, 'sensor_batch_hash': None}}

## 10. Sensitivity Check (vary size_m2)

In [45]:
sizes = [60, 90, 130, 170, 210]


def _prediction_for_size(size_m2):
    """Predict the valuation of sample_property with only size_m2 replaced."""
    candidate = validate_input_record({**sample_property, "size_m2": size_m2}, strict=True)
    return float(pipeline.predict(pd.DataFrame([candidate]))[0])


# One row per size, with the prediction rounded to 3 decimals.
size_variations = [
    {"size_m2": size_m2, "prediction_k": round(_prediction_for_size(size_m2), 3)}
    for size_m2 in sizes
]

pd.DataFrame(size_variations)

Unnamed: 0,size_m2,prediction_k
0,60,106.793
1,90,230.003
2,130,339.309
3,170,379.731
4,210,498.415


## 11. Compare With API Prediction Consistency

In [46]:
# Optional consistency check: send the same sample to the API and compare its
# prediction with the local one. Any failure is reported, never raised.
if COMPARE_WITH_API:
    try:
        api_resp = requests.post(
            f"{API_BASE}/predict/{ASSET_TYPE}",
            json=sample_property,
            timeout=5,
        )
        if api_resp.status_code != 200:
            print(f"[API] Request failed status={api_resp.status_code} body={api_resp.text}")
        else:
            api_json = api_resp.json()
            api_pred = api_json["metrics"]["valuation_base_k"]
            delta = abs(api_pred - pred_value)
            print(f"[API] Pred: {api_pred:.3f} k€ | Local: {pred_value:.3f} k€ | Δ={delta:.4f}")
    except Exception as e:
        # Deliberately broad: an unreachable server or a malformed response
        # should not stop the notebook.
        print(f"[API] Compare skipped: {e}")

[API] Compare skipped: HTTPConnectionPool(host='127.0.0.1', port=8000): Max retries exceeded with url: /predict/property (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001FF0E779DD0>: Failed to establish a new connection: [WinError 10061] Impossibile stabilire la connessione. Rifiuto persistente del computer di destinazione'))


## 12. Hash Pipeline File (Audit)

In [47]:
def file_sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in binary mode in 8192-byte blocks, so it is never
    held in memory as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

# Show only a short prefix of the digest; enough to tell model files apart at a glance.
pipeline_digest = file_sha256(PIPELINE_PATH)
print("Model file hash (sha256, first 16 chars):", pipeline_digest[:16])

Model file hash (sha256, first 16 chars): cb0798a3a14e95b2


## 13. Schema Validation

In [48]:
from jsonschema import validate, ValidationError
import json
# Load the reference schema through a context manager so the file handle is
# closed deterministically, and read it as UTF-8 instead of the locale default.
SCHEMA_PATH = Path("../schemas/output_example.json")
with SCHEMA_PATH.open("r", encoding="utf-8") as f:
    schema = json.load(f)

# Report a mismatch instead of raising, so the notebook keeps running.
try:
    validate(single_output, schema)
    print("single_output matches schema")
except ValidationError as e:
    print("Schema mismatch:", e.message)

single_output matches schema


### TODO:
* Add dynamic condition/risk estimation (using regression or rule-based logic).
* Add batch scoring function from a CSV file.
* Export results to an inference_results.csv file for debugging purposes.