### Imports & Paths

In [1]:
import json
import os
from datetime import datetime, timezone
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

# Serialized model artifacts, addressed relative to the notebook's working directory.
MODEL_DIR = Path("../models/property")
PIPELINE_PATH = MODEL_DIR.joinpath("value_regressor_v1.joblib")
META_PATH = MODEL_DIR.joinpath("value_regressor_v1_meta.json")

# Fail early, with an explicit message, if either artifact is not on disk.
assert PIPELINE_PATH.exists(), (
    f"Missing pipeline file: {PIPELINE_PATH}"
)
assert META_PATH.exists(), (
    f"Missing metadata file: {META_PATH}"
)

print("Loaded paths OK.")

Loaded paths OK.


### Load pipeline & metadata

In [2]:
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts that come from a trusted source.
pipeline = joblib.load(PIPELINE_PATH)

# Read the metadata with an explicit encoding so the result does not depend on
# the platform's locale default.
with open(META_PATH, "r", encoding="utf-8") as f:
    model_meta = json.load(f)

model_meta

{'asset_type': 'property',
 'model_task': 'valuation_regression',
 'model_version': 'v1',
 'model_class': 'RandomForestRegressor',
 'n_rows': 150,
 'training_rows': 120,
 'test_rows': 30,
 'features_categorical': ['location',
  'energy_class',
  'has_elevator',
  'has_garden',
  'has_balcony',
  'garage'],
 'features_numeric': ['size_m2',
  'rooms',
  'bathrooms',
  'year_built',
  'floor',
  'building_floors',
  'humidity_level',
  'temperature_avg',
  'noise_level',
  'air_quality_index',
  'age_years'],
 'target': 'valuation_k',
 'metrics': {'mae_k': 82.7092, 'rmse_k': 101.8724, 'r2': 0.282},
 'generated_at': '2025-07-19T16:09:13.261646Z',
 'feature_importance_top5': [{'feature': 'size_m2',
   'importance': 0.7312063540803266},
  {'feature': 'year_built', 'importance': 0.029560496520703257},
  {'feature': 'humidity_level', 'importance': 0.027331626340557257},
  {'feature': 'air_quality_index', 'importance': 0.026413472362091128},
  {'feature': 'temperature_avg', 'importance': 0.0250

### Utility: feature expectations & validator

In [3]:
# The metadata lists the ORIGINAL feature names (i.e. before one-hot encoding).
categorical_expected = model_meta["features_categorical"]
numeric_expected = model_meta["features_numeric"]

# Full list of required input fields: categorical names first, then numeric ones.
ALL_EXPECTED = [*categorical_expected, *numeric_expected]

def validate_input_record(record: dict, strict=True):
    """
    Check that a raw input record carries every feature listed in ALL_EXPECTED.

    Args:
        record: Mapping of feature name -> value for a single asset.
        strict: When true, fields that are not in ALL_EXPECTED are rejected too.

    Returns:
        True when the record passes every check.

    Raises:
        ValueError: If a required feature is missing, or (with strict=True)
            if the record contains unexpected fields.
    """
    # Required names that the record lacks, in ALL_EXPECTED order.
    absent = []
    for name in ALL_EXPECTED:
        if name not in record:
            absent.append(name)

    # Record keys that are not part of the expected feature set, in record order.
    unexpected = []
    for key in record:
        if key not in ALL_EXPECTED:
            unexpected.append(key)

    # Missing features take precedence over unexpected ones.
    if absent:
        raise ValueError(f"Missing required features: {absent}")
    if strict and unexpected:
        raise ValueError(f"Unexpected extra features: {unexpected}")
    return True

# Show the full list of input fields a record must provide.
print("Expected features:", ALL_EXPECTED)

Expected features: ['location', 'energy_class', 'has_elevator', 'has_garden', 'has_balcony', 'garage', 'size_m2', 'rooms', 'bathrooms', 'year_built', 'floor', 'building_floors', 'humidity_level', 'temperature_avg', 'noise_level', 'air_quality_index', 'age_years']


### Define a single manual sample

In [4]:
# Hand-written example record; the keys are the raw feature names the model expects.
sample_property = {
    "location": "Milan",
    "size_m2": 95,
    "rooms": 4,
    "bathrooms": 2,
    "year_built": 1999,
    "floor": 2,
    "building_floors": 6,
    "has_elevator": 1,
    "has_garden": 0,
    "has_balcony": 1,
    "garage": 1,
    "energy_class": "B",
    "humidity_level": 50.0,
    "temperature_avg": 20.5,
    "noise_level": 40,
    "air_quality_index": 70,
}

# age_years must be present because it is listed among the model's numeric features.
# It is derived from year_built so the two values cannot drift apart, and it uses a
# timezone-aware "now" because datetime.utcnow() is deprecated since Python 3.12.
sample_property["age_years"] = (
    datetime.now(timezone.utc).year - sample_property["year_built"]
)

# Raises ValueError if the sample lacks a required feature or (strict mode is the
# default) carries a field the model does not expect.
validate_input_record(sample_property)
print("Sample validated.")


Sample validated.


### Single prediction

In [6]:
# Wrap the record in a one-row DataFrame and keep the first (only) prediction.
df_input = pd.DataFrame(data=[sample_property])
pred_value = pipeline.predict(df_input)[0]

print(
    f"Predicted valuation_k: {pred_value:.3f} (k€)"
)

Predicted valuation_k: 211.364 (k€)


### Wrap in unified output schema (multi-RWA ready)

In [7]:
def build_output_schema(asset_id: str,
                        asset_type: str,
                        valuation_k: float,
                        model_meta: dict,
                        condition_score: float = None,
                        risk_score: float = None,
                        anomaly: bool = False,
                        needs_review: bool = False,
                        extra_metrics: dict = None):
    """
    Build a prediction record that follows the planned multi-asset output schema.

    Args:
        asset_id: Identifier of the asset the prediction refers to.
        asset_type: Asset category label stored in the record (e.g. "property").
        valuation_k: Predicted valuation; stored as a float rounded to 3 decimals.
        model_meta: Model metadata; "model_version" and "model_class" are read
            with .get(), so missing keys yield None.
        condition_score: Optional score, added to the metrics (rounded to
            3 decimals) only when it is not None.
        risk_score: Optional score, handled like condition_score.
        anomaly: Value of the "anomaly" flag.
        needs_review: Value of the "needs_review" flag.
        extra_metrics: Optional mapping merged into "metrics" as-is; its keys
            overwrite metrics of the same name.

    Returns:
        dict with the keys "asset_id", "asset_type", "timestamp" (UTC,
        second resolution, "Z" suffix), "metrics", "flags", "model_meta"
        and "offchain_refs".
    """
    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC time
    # and render it in the same "YYYY-MM-DDTHH:MM:SSZ" form as before.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    out = {
        "asset_id": asset_id,
        "asset_type": asset_type,
        "timestamp": timestamp,
        "metrics": {
            "valuation_base_k": round(float(valuation_k), 3)
        },
        "flags": {
            "anomaly": anomaly,
            "needs_review": needs_review
        },
        "model_meta": {
            "value_model_version": model_meta.get("model_version"),
            "value_model_name": model_meta.get("model_class")
        },
        # Placeholders: no off-chain references are computed here.
        "offchain_refs": {
            "detail_report_hash": None,
            "sensor_batch_hash": None
        }
    }

    # Add condition/risk scores only when they were supplied.
    if condition_score is not None:
        out["metrics"]["condition_score"] = round(float(condition_score), 3)
    if risk_score is not None:
        out["metrics"]["risk_score"] = round(float(risk_score), 3)

    # Merge arbitrary extra metrics without rounding or validation.
    if extra_metrics:
        out["metrics"].update(extra_metrics)

    return out

# Condition and risk scores are not computed dynamically at inference time yet,
# so only the valuation is passed and the optional metrics keep their defaults.
output_record = build_output_schema(
    "asset_manual_0001",
    "property",
    pred_value,
    model_meta,
)

output_record

{'asset_id': 'asset_manual_0001',
 'asset_type': 'property',
 'timestamp': '2025-07-19T16:18:59Z',
 'metrics': {'valuation_base_k': 211.364},
 'flags': {'anomaly': False, 'needs_review': False},
 'model_meta': {'value_model_version': 'v1',
  'value_model_name': 'RandomForestRegressor'},
 'offchain_refs': {'detail_report_hash': None, 'sensor_batch_hash': None}}

### Batch predictions from a small DataFrame

In [8]:
# Three records: the manual sample plus two variants that override a few fields.
batch_samples = [
    sample_property,
    dict(sample_property, location="Rome", size_m2=120, energy_class="C"),
    dict(sample_property, location="Florence", size_m2=70, has_garden=1, energy_class="A"),
]

# Validate every record before building the frame; the first failure raises.
for rec in batch_samples:
    validate_input_record(rec)

# Predict all rows in a single call.
df_batch = pd.DataFrame(batch_samples)
batch_preds = pipeline.predict(df_batch)

# Pair each input record with its prediction and wrap it in the output schema.
batch_outputs = [
    build_output_schema(
        asset_id=f"asset_batch_{rec['location']}_{rec['size_m2']}",
        asset_type="property",
        valuation_k=val,
        model_meta=model_meta,
    )
    for rec, val in zip(batch_samples, batch_preds)
]

batch_outputs[:2]

[{'asset_id': 'asset_batch_Milan_95',
  'asset_type': 'property',
  'timestamp': '2025-07-19T16:19:23Z',
  'metrics': {'valuation_base_k': 211.364},
  'flags': {'anomaly': False, 'needs_review': False},
  'model_meta': {'value_model_version': 'v1',
   'value_model_name': 'RandomForestRegressor'},
  'offchain_refs': {'detail_report_hash': None, 'sensor_batch_hash': None}},
 {'asset_id': 'asset_batch_Rome_120',
  'asset_type': 'property',
  'timestamp': '2025-07-19T16:19:23Z',
  'metrics': {'valuation_base_k': 306.229},
  'flags': {'anomaly': False, 'needs_review': False},
  'model_meta': {'value_model_version': 'v1',
   'value_model_name': 'RandomForestRegressor'},
  'offchain_refs': {'detail_report_hash': None, 'sensor_batch_hash': None}}]

### Logging the results

In [9]:
LOG_PATH = Path("../data/predictions_log.jsonl")

def append_jsonl(record: dict, path: Path):
    """
    Append one record to a JSON Lines file (one JSON document per line).

    Args:
        record: JSON-serializable mapping to write.
        path: Destination file, as a Path or a string. The file and any
            missing parent directories are created on first use.

    Raises:
        TypeError: If the record is not JSON-serializable (from json.dumps).
    """
    path = Path(path)  # also accept plain string paths
    # Opening in append mode does not create missing directories, so do it here.
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as f:
        f.write(json.dumps(record) + "\n")

# Log the manual prediction first, then every batch prediction, one line each.
records_to_log = [output_record, *batch_outputs]
for rec in records_to_log:
    append_jsonl(rec, LOG_PATH)

print(
    f"Appended {1 + len(batch_outputs)} prediction records to {LOG_PATH}"
)


Appended 4 prediction records to ..\data\predictions_log.jsonl


### Reusable inference function

In [10]:
# Artifacts loaded from non-default paths, keyed by (pipeline_path, meta_path)
# strings, so repeated calls do not read them from disk again.
_ARTIFACT_CACHE = {}


def predict_asset(record: dict,
                  asset_id: str,
                  asset_type: str = "property",
                  pipeline_path=PIPELINE_PATH,
                  meta_path=META_PATH):
    """
    Validate one record, predict its valuation and return the standard schema.

    With the default paths the pipeline and metadata already loaded in the
    notebook are reused. When either path differs from the defaults, both
    artifacts are loaded from the given paths (and cached), instead of the
    arguments being silently ignored.

    Args:
        record: Mapping of feature name -> value for a single asset.
        asset_id: Identifier stored in the output record.
        asset_type: Asset category label stored in the output record.
        pipeline_path: Path (str or Path) of the serialized pipeline.
        meta_path: Path (str or Path) of the model metadata JSON file.

    Returns:
        dict produced by build_output_schema for this prediction.

    Raises:
        ValueError: If the record fails validate_input_record.

    Note:
        Validation always uses the notebook-level expected feature list, even
        when custom artifact paths are given.
    """
    validate_input_record(record)

    uses_defaults = (Path(pipeline_path) == PIPELINE_PATH
                     and Path(meta_path) == META_PATH)
    if uses_defaults:
        # Reuse the artifacts loaded earlier in the notebook.
        active_pipeline, active_meta = pipeline, model_meta
    else:
        cache_key = (str(pipeline_path), str(meta_path))
        if cache_key not in _ARTIFACT_CACHE:
            # NOTE(security): joblib.load unpickles the file; trusted files only.
            loaded_pipeline = joblib.load(pipeline_path)
            with open(meta_path, "r", encoding="utf-8") as f:
                loaded_meta = json.load(f)
            _ARTIFACT_CACHE[cache_key] = (loaded_pipeline, loaded_meta)
        active_pipeline, active_meta = _ARTIFACT_CACHE[cache_key]

    df_in = pd.DataFrame([record])
    val = active_pipeline.predict(df_in)[0]
    return build_output_schema(
        asset_id=asset_id,
        asset_type=asset_type,
        valuation_k=val,
        model_meta=active_meta
    )

# Smoke-test the helper on the manual sample, using the default artifact paths.
test_out = predict_asset(
    record=sample_property,
    asset_id="asset_function_test",
)
test_out


{'asset_id': 'asset_function_test',
 'asset_type': 'property',
 'timestamp': '2025-07-19T16:20:19Z',
 'metrics': {'valuation_base_k': 211.364},
 'flags': {'anomaly': False, 'needs_review': False},
 'model_meta': {'value_model_version': 'v1',
  'value_model_name': 'RandomForestRegressor'},
 'offchain_refs': {'detail_report_hash': None, 'sensor_batch_hash': None}}

### Quick sanity checks

In [11]:
# Compare predictions while varying a single key feature (here size_m2) and
# keeping every other field of the manual sample fixed.
sizes = [60, 90, 130, 170]
size_records = []
for s in sizes:
    rec = dict(sample_property, size_m2=s)
    one_row = pd.DataFrame([rec])
    val = pipeline.predict(one_row)[0]
    size_records.append(dict(size_m2=s, prediction_k=round(float(val), 3)))

pd.DataFrame(size_records)


Unnamed: 0,size_m2,prediction_k
0,60,118.846
1,90,190.49
2,130,310.285
3,170,391.535
