### Imports and configuration

In [1]:
import hashlib
import json
import os
from datetime import datetime, timezone

import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Seed reused by the train/test split and the random forest below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Which asset type to train for, where its data lives and where artifacts go.
ASSET_TYPE = "property"
DATA_PATH = "../data/property_dataset_v1.csv"
MODEL_BASE_DIR = "../models"

# Per-asset-type schema:
#   target      -> column to predict
#   categorical -> columns that get one-hot encoded
#   numeric     -> columns passed through unchanged
#   exclude     -> columns that must never be used as features
ASSET_CONFIG = {
    "property": {
        "target": "valuation_k",
        "categorical": [
            "location", "energy_class",
            "has_elevator", "has_garden", "has_balcony", "garage"
        ],
        "numeric": [
            "size_m2", "rooms", "bathrooms", "year_built",
            "floor", "building_floors",
            "humidity_level", "temperature_avg",
            "noise_level", "air_quality_index",  # base environment
            # "age_years" will be added if it exists / is derived
        ],
        "exclude": [
            "asset_id", "asset_type",
            "condition_score", "risk_score", "last_verified_ts"
        ]
    },
    # Placeholder for future assets
    "art": {
        "target": "valuation_k",
        "categorical": [],
        "numeric": [],
        "exclude": []
    }
}

assert ASSET_TYPE in ASSET_CONFIG, f"Unknown asset_type: {ASSET_TYPE}"

# Work on a copy with fresh lists: a later cell appends "age_years" to
# cfg["numeric"], and that must not leak back into the ASSET_CONFIG template.
cfg = {
    key: list(value) if isinstance(value, list) else value
    for key, value in ASSET_CONFIG[ASSET_TYPE].items()
}

### Load dataset

In [3]:
# Read the raw CSV for the configured asset type and report its dimensions.
df = pd.read_csv(DATA_PATH)
print(f"Loaded dataset: {DATA_PATH} | shape: {df.shape}")

Loaded dataset: ../data/property_dataset_v1.csv | shape: (150, 23)


### Normalization / Derivations

In [4]:
# Harmonise column names and derive the building age before feature selection.
# datetime.utcnow() is deprecated since Python 3.12, so use an aware UTC "now".
# NOTE: a derived age_years depends on the year the notebook is run in.
current_year = datetime.now(timezone.utc).year

# Accept the alternative spelling "year_build" when "year_built" is absent.
if "year_build" in df.columns and "year_built" not in df.columns:
    df = df.rename(columns={"year_build": "year_built"})

# Only derive age_years when the dataset does not already ship it.
if "age_years" not in df.columns and "year_built" in df.columns:
    df["age_years"] = current_year - df["year_built"]

# Ensure age_years is in the numeric feature list if present; the membership
# check keeps this cell idempotent when it is re-run.
if "age_years" in df.columns and "age_years" not in cfg["numeric"]:
    cfg["numeric"].append("age_years")

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (150, 23)


Unnamed: 0,asset_id,asset_type,location,size_m2,rooms,bathrooms,year_built,age_years,floor,building_floors,...,garage,energy_class,humidity_level,temperature_avg,noise_level,air_quality_index,valuation_k,condition_score,risk_score,last_verified_ts
0,asset_0000,property,Naples,142,5,1,1964,61,2,7,...,1,B,53.9,17.8,42,104,348.41,0.852,0.14,2025-06-03T09:40:42Z
1,asset_0001,property,Milan,170,6,2,1979,46,1,9,...,0,A,69.7,20.0,77,51,222.1,0.73,0.261,2025-07-15T07:09:42Z
2,asset_0002,property,Palermo,54,4,3,2013,12,0,3,...,1,F,64.4,20.8,28,68,78.45,0.742,0.271,2025-07-05T22:46:42Z
3,asset_0003,property,Palermo,48,3,1,1951,74,3,7,...,0,B,47.6,13.6,27,76,90.58,0.776,0.216,2025-06-29T05:14:42Z
4,asset_0004,property,Rome,171,3,2,1955,70,1,5,...,1,D,37.4,24.6,45,73,591.7,0.764,0.254,2025-06-28T01:45:42Z


### Sanity checks

In [5]:
# Every configured column (target plus all features) must exist in the frame.
required_base = [cfg["target"], *cfg["categorical"], *cfg["numeric"]]
missing = [col for col in required_base if col not in df.columns]
if missing:
    raise ValueError(f"Missing required columns in dataset: {missing}")

# Candidate features = all dataset columns minus the excluded ones and the target.
excluded = {*cfg["exclude"], cfg["target"]}
feature_candidates = [col for col in df.columns if col not in excluded]

# Summarise the configuration that will drive the rest of the notebook.
print(
    f"Target: {cfg['target']}",
    f"Categorical: {cfg['categorical']}",
    f"Numeric: {cfg['numeric']}",
    f"Excluded: {cfg['exclude']}",
    f"Feature candidates (pre-filter): {feature_candidates}",
    sep="\n",
)

Target: valuation_k
Categorical: ['location', 'energy_class', 'has_elevator', 'has_garden', 'has_balcony', 'garage']
Numeric: ['size_m2', 'rooms', 'bathrooms', 'year_built', 'floor', 'building_floors', 'humidity_level', 'temperature_avg', 'noise_level', 'air_quality_index', 'age_years']
Excluded: ['asset_id', 'asset_type', 'condition_score', 'risk_score', 'last_verified_ts']
Feature candidates (pre-filter): ['location', 'size_m2', 'rooms', 'bathrooms', 'year_built', 'age_years', 'floor', 'building_floors', 'has_elevator', 'has_garden', 'has_balcony', 'garage', 'energy_class', 'humidity_level', 'temperature_avg', 'noise_level', 'air_quality_index']


### Final feature list = categorical + numeric (explicit control)

In [6]:
# The model sees exactly the configured columns: categorical first, then numeric.
feature_list = [*cfg["categorical"], *cfg["numeric"]]
print(f"Final feature_list used: {feature_list}")

# Copies keep later processing from touching the source frame.
X = df.loc[:, feature_list].copy()
y = df.loc[:, cfg["target"]].copy()

Final feature_list used: ['location', 'energy_class', 'has_elevator', 'has_garden', 'has_balcony', 'garage', 'size_m2', 'rooms', 'bathrooms', 'year_built', 'floor', 'building_floors', 'humidity_level', 'temperature_avg', 'noise_level', 'air_quality_index', 'age_years']


### Train/test split

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=RANDOM_STATE,
)

### Preprocessor & model pipeline

In [8]:
# Column groups come straight from the asset configuration.
categorical_cols = cfg["categorical"]
numeric_cols = cfg["numeric"]

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at predict time), pass numeric columns through untouched, and drop
# anything that is not listed.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ],
    remainder="drop",
)

# Random forest using all available cores, seeded for repeatable fits.
regressor = RandomForestRegressor(n_estimators=220, random_state=RANDOM_STATE, n_jobs=-1)

# Preprocessing and model are bundled so they are fitted and saved together.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", regressor),
    ]
)

### Fit & Evaluation

In [17]:
# Fit on the training split, then score the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Regression metrics on the test split; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(
    f"MAE:  {mae:.3f} k€",
    f"RMSE: {rmse:.3f} k€",
    f"R²:   {r2:.3f}",
    sep="\n",
)

MAE:  81.713 k€
RMSE: 104.809 k€
R²:   0.335


### Feature importance (tree-based models only)

In [18]:
# Rebuild the post-encoding column names in the order the ColumnTransformer
# emits them: one-hot columns first ("cat"), then the passthrough numerics ("num").
ohe = pipeline.named_steps["preprocessor"].named_transformers_["cat"]
encoded_cat_features = list(ohe.get_feature_names_out(categorical_cols))
encoded_feature_names = [*encoded_cat_features, *numeric_cols]

# Pair each encoded column with the forest's importance and rank descending.
importances = pipeline.named_steps["regressor"].feature_importances_
feat_importance = pd.DataFrame(
    {"feature": encoded_feature_names, "importance": importances}
)
feat_importance = feat_importance.sort_values("importance", ascending=False)
feat_importance = feat_importance.reset_index(drop=True)

print("\nTop 10 feature importances:")
display(feat_importance.head(10))


Top 10 feature importances:


Unnamed: 0,feature,importance
0,size_m2,0.736971
1,temperature_avg,0.035404
2,humidity_level,0.034904
3,air_quality_index,0.023537
4,age_years,0.022832
5,year_built,0.02245
6,noise_level,0.017944
7,building_floors,0.017651
8,floor,0.010627
9,rooms,0.007303


### Save model & metadata

In [19]:
model_version = "v1"  # bump manually when retraining with significant changes

# Artifacts are grouped per asset type under the model base directory.
model_dir = f"{MODEL_BASE_DIR}/{ASSET_TYPE}"
os.makedirs(model_dir, exist_ok=True)

# Persist the whole fitted pipeline (preprocessing + regressor) as one file.
pipeline_filename = f"{model_dir}/value_regressor_{model_version}.joblib"
joblib.dump(pipeline, pipeline_filename)

['../models/property/value_regressor_v1.joblib']

### Dataset hash for provenance

In [20]:
# Fingerprint the dataset file so the saved model can be traced back to its data.
# The file is re-read here, so the hash reflects its contents at save time.
with open(DATA_PATH, "rb") as f:
    dataset_hash = hashlib.sha256(f.read()).hexdigest()

# Everything needed to identify, reproduce and sanity-check the saved model.
metadata = {
    "asset_type": ASSET_TYPE,
    "model_task": "valuation_regression",
    "model_version": model_version,
    "model_class": type(regressor).__name__,
    "random_state": RANDOM_STATE,
    "dataset_file": DATA_PATH,
    "dataset_hash_sha256": dataset_hash,
    "n_rows_total": int(len(df)),
    "n_rows_train": int(len(X_train)),
    "n_rows_test": int(len(X_test)),
    "features_categorical": categorical_cols,
    "features_numeric": numeric_cols,
    "feature_list_ordered": feature_list,
    "encoded_feature_count": len(encoded_feature_names),
    "metrics": {
        "mae_k": float(round(mae, 4)),
        "rmse_k": float(round(rmse, 4)),
        "r2": float(round(r2, 4))
    },
    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
    # timestamp and keep the same "...Z" suffix format as before.
    "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "feature_importance_top10": feat_importance.head(10).to_dict(orient="records")
}

# The metadata file sits next to the model artifact and shares its version tag.
meta_filename = f"{MODEL_BASE_DIR}/{ASSET_TYPE}/value_regressor_{model_version}_meta.json"
with open(meta_filename, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

print("\nSaved pipeline:", pipeline_filename)
print("Saved metadata:", meta_filename)


Saved pipeline: ../models/property/value_regressor_v1.joblib
Saved metadata: ../models/property/value_regressor_v1_meta.json
