### Imports and configuration

In [10]:
import json
import os
import time
from datetime import datetime, timezone

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Locations of the input dataset and of the directory that receives trained models.
DATA_PATH = "../data/property_dataset_mvp_multi_rwa.csv"
MODEL_BASE_DIR = "../models"

# Asset type handled by this run (future values: "art", "greenhouse", etc.).
ASSET_TYPE = "property"

# Per-asset-type configuration (extensible). Each entry declares:
#   target      - name of the column to predict
#   categorical - columns fed to the one-hot encoder
#   numeric     - columns passed through unchanged
#   exclude     - columns that must NOT be used as features (IDs, derived values, ...)
ASSET_CONFIG = {
    "property": dict(
        target="valuation_k",
        categorical=[
            "location",
            "energy_class",
            "has_elevator",
            "has_garden",
            "has_balcony",
            "garage",
        ],
        numeric=[
            "size_m2",
            "rooms",
            "bathrooms",
            "year_built",
            "floor",
            "building_floors",
            "humidity_level",
            "temperature_avg",
            "noise_level",
            "air_quality_index",
        ],
        exclude=[
            "asset_id",
            "asset_type",
            "condition_score",
            "risk_score",
            "last_verified_ts",
        ],
    ),
    # Placeholder entry for a future asset type; every field is expected to change.
    "art": dict(
        target="valuation_k",
        categorical=[],
        numeric=[],
        exclude=[],
    ),
}

### Load dataset & derive generic features

In [11]:
# Validate the configuration before touching the disk so a typo fails fast.
assert ASSET_TYPE in ASSET_CONFIG, f"Unknown asset_type: {ASSET_TYPE}"

# NOTE: cfg is the ASSET_CONFIG entry itself (not a copy), so the append below
# is visible to any later code that reads ASSET_CONFIG[ASSET_TYPE].
cfg = ASSET_CONFIG[ASSET_TYPE]

df = pd.read_csv(DATA_PATH)

# Generic derived feature (asset age): computed only when the dataset does not
# already provide it. It depends on the current year, so its values shift when
# the notebook is re-run in a later year.
if "age_years" not in df.columns and "year_built" in df.columns:
    # datetime.utcnow() is deprecated; use a timezone-aware "now" instead.
    current_year = datetime.now(timezone.utc).year
    df["age_years"] = current_year - df["year_built"]

# Register the feature whenever the column is present, whether it was derived
# above or shipped in the CSV. Previously a pre-existing age_years column was
# never added to the numeric list and was therefore dropped by the preprocessor.
if "age_years" in df.columns and "age_years" not in cfg["numeric"]:
    cfg["numeric"].append("age_years")

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (150, 23)


Unnamed: 0,asset_id,asset_type,location,size_m2,rooms,bathrooms,year_built,floor,building_floors,has_elevator,...,energy_class,humidity_level,temperature_avg,noise_level,air_quality_index,valuation_k,condition_score,risk_score,last_verified_ts,age_years
0,asset_0000,property,Palermo,170,6,2,1979,1,9,1,...,A,69.7,20.0,77,51,217.1,0.73,0.261,2025-07-17T15:32:04Z,46
1,asset_0001,property,Palermo,54,4,3,2013,0,3,0,...,B,64.4,20.8,28,68,91.77,0.822,0.191,2025-07-05T01:00:04Z,12
2,asset_0002,property,Palermo,48,3,1,1951,3,7,1,...,B,47.6,13.6,27,76,90.58,0.776,0.216,2025-06-28T07:28:04Z,74
3,asset_0003,property,Rome,171,3,2,1955,1,5,1,...,D,37.4,24.6,45,73,591.7,0.764,0.254,2025-06-27T03:59:04Z,70
4,asset_0004,property,Palermo,87,5,1,2011,4,9,1,...,A,40.9,22.8,45,118,190.65,0.857,0.146,2025-07-13T22:19:04Z,14


### Build feature matrix & target dynamically

In [12]:
# Columns that can never be features: everything under "exclude" plus the target.
all_excluded = {*cfg["exclude"], cfg["target"]}

# Keep the remaining columns in their original dataset order.
candidate_cols = df.columns[~df.columns.isin(all_excluded)].tolist()

# Cross-check (debug)
print("Candidate feature columns:", candidate_cols)

# Feature matrix and target vector for the configured asset type.
X = df.loc[:, candidate_cols]
y = df[cfg["target"]]

Candidate feature columns: ['location', 'size_m2', 'rooms', 'bathrooms', 'year_built', 'floor', 'building_floors', 'has_elevator', 'has_garden', 'has_balcony', 'garage', 'energy_class', 'humidity_level', 'temperature_avg', 'noise_level', 'air_quality_index', 'age_years']


### Select effective categorical/numeric subsets

In [13]:
# Restrict the configured column lists to the columns actually present in X,
# preserving the order in which the config declares them.
available_cols = set(X.columns)
categorical_cols = [col for col in cfg["categorical"] if col in available_cols]
numeric_cols = [col for col in cfg["numeric"] if col in available_cols]

print("Categorical used:", categorical_cols)
print("Numeric used:", numeric_cols)

Categorical used: ['location', 'energy_class', 'has_elevator', 'has_garden', 'has_balcony', 'garage']
Numeric used: ['size_m2', 'rooms', 'bathrooms', 'year_built', 'floor', 'building_floors', 'humidity_level', 'temperature_avg', 'noise_level', 'air_quality_index', 'age_years']


### Features preparation

In [4]:
# Target column, read from the asset config instead of being hard-coded.
target = cfg["target"]

# Hand-picked raw feature list (excludes asset_id and the target). It is used
# by the manual encoding cell further down.
features = [
    "location", "size_m2", "rooms", "bathrooms", "year_built", "floor",
    "building_floors", "has_elevator", "has_garden", "has_balcony", "garage",
    "energy_class", "humidity_level", "temperature_avg", "noise_level", "air_quality_index"
]

# X and y are intentionally NOT rebound here. The config-driven X built earlier
# also carries the derived age_years column, which the pipeline's numeric
# transformer selects by name; replacing X with df[features] would drop that
# column and make the pipeline fail on fit. Instead, just verify that the
# hand-picked list is consistent with the config-driven feature matrix.
missing_features = [c for c in features if c not in X.columns]
assert not missing_features, f"Features missing from X: {missing_features}"

### Preprocessing + Model Pipeline

In [14]:
# Column-wise preprocessing: one-hot encode the categorical columns (categories
# unseen during fit are ignored at predict time) and pass the numeric columns
# through untouched. Any column not listed in either group is dropped.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ],
    remainder="drop",
)

# Random forest regressor; the fixed random_state makes the fit repeatable and
# n_jobs=-1 asks scikit-learn to use all available cores.
model = RandomForestRegressor(n_estimators=180, n_jobs=-1, random_state=42)

# Single estimator chaining preprocessing and model, so the exact same
# transformation is applied at fit and at predict time.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

### Train/test split

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

### Manual categorical feature encoding (outside the pipeline)

In [6]:
# Columns one-hot encoded by this manual path. A dedicated name is used so the
# `categorical_cols` list that the pipeline, the feature-importance cell and
# the saved metadata rely on is not overwritten.
manual_categorical_cols = ["location", "energy_class"]

# Every other hand-picked feature is treated as numeric. An ordered
# comprehension (rather than a set difference) keeps the column order stable
# from one kernel session to the next.
numerical_cols = [c for c in features if c not in manual_categorical_cols]

# One-hot encoding (dense output so it can be wrapped in a DataFrame).
# NOTE(review): the encoder is fitted on all rows, i.e. before any train/test
# split - confirm this is acceptable for how X_encoded is used.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = encoder.fit_transform(X[manual_categorical_cols])
X_cat_df = pd.DataFrame(
    X_cat,
    columns=encoder.get_feature_names_out(manual_categorical_cols),
)

# Join numeric and encoded columns; the index is reset so both sides align on
# the same 0..n-1 positions.
X_encoded = pd.concat([X[numerical_cols].reset_index(drop=True), X_cat_df], axis=1)

### Train/test split of the manually encoded features

In [7]:
# Split the manually encoded matrix under its own names. Reusing
# X_train / X_test here would replace the raw-column split the pipeline is
# fitted on; the pipeline selects columns such as "location" by name, and those
# no longer exist once they have been one-hot encoded into X_encoded.
X_train_enc, X_test_enc, y_train_enc, y_test_enc = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

### Fit & Evaluate

In [23]:
# Fit the full pipeline on the training split and predict the held-out rows.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Regression metrics on the test split; RMSE is derived from the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = mse ** 0.5

print(f"MAE: {mae:.3f} (k€)")
print(f"RMSE: {rmse:.3f} (k€)")
print(f"R²: {r2:.3f}")

MAE: 82.709 (k€)
RMSE: 101.872 (k€)
R²: 0.282


### Feature Importance (approx.)

In [18]:
# Ottieni nomi delle feature dopo l'encoding
ohe = pipeline.named_steps["preprocessor"].named_transformers_["cat"]
cat_feature_names = list(ohe.get_feature_names_out(categorical_cols))
all_feature_names = cat_feature_names + numeric_cols

importances = pipeline.named_steps["model"].feature_importances_

feat_imp = (pd.DataFrame({
    "feature": all_feature_names,
    "importance": importances
}).sort_values("importance", ascending=False))

feat_imp.head(15)

Unnamed: 0,feature,importance
23,size_m2,0.731206
26,year_built,0.02956
29,humidity_level,0.027332
32,air_quality_index,0.026413
30,temperature_avg,0.025051
33,age_years,0.023781
31,noise_level,0.019749
28,building_floors,0.019179
13,energy_class_F,0.014372
25,bathrooms,0.012918


### Save pipeline & metadata (versioned)

In [24]:
# One sub-directory per asset type under the model base directory.
model_dir = f"{MODEL_BASE_DIR}/{ASSET_TYPE}"
os.makedirs(model_dir, exist_ok=True)

model_version = "v1"  # increment manually or build logic
pipeline_filename = f"{model_dir}/value_regressor_{model_version}.joblib"
metadata_filename = f"{model_dir}/value_regressor_{model_version}_meta.json"

# Persist the fitted pipeline (preprocessing + model) as a single artifact.
# An existing file with the same version is overwritten.
joblib.dump(pipeline, pipeline_filename)

# Timezone-aware UTC timestamp (datetime.utcnow() is deprecated), rendered with
# the same trailing "Z" the metadata used before.
generated_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

# Sidecar metadata describing how the artifact was produced.
metadata = {
    "asset_type": ASSET_TYPE,
    "model_task": "valuation_regression",
    "model_version": model_version,
    "model_class": "RandomForestRegressor",
    "n_rows": len(df),
    "training_rows": len(X_train),
    "test_rows": len(X_test),
    "features_categorical": categorical_cols,
    "features_numeric": numeric_cols,
    "target": cfg["target"],
    "metrics": {
        "mae_k": float(round(mae, 4)),
        "rmse_k": float(round(rmse, 4)),
        "r2": float(round(r2, 4))
    },
    "generated_at": generated_at,
    "feature_importance_top5": feat_imp.head(5).to_dict(orient="records")
}

# Explicit encoding so the file does not depend on the platform default.
with open(metadata_filename, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)

print("Saved pipeline and metadata:")
print(pipeline_filename)

Saved pipeline and metadata:
../models/property/value_regressor_v1.joblib


In [25]:
# Restituisce pipeline + meta
def train_asset_model(asset_type: str, df: pd.DataFrame, version="v1"):
    cfg = ASSET_CONFIG[asset_type]