In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
from sklearn.metrics import make_scorer, mean_absolute_error

# === PARAMETRI ===
INPUT_FILE = "dataset_hr.csv"     # CSV con colonne: subject_id, HR_mean, HR_std, HR_min, HR_max, arousal_label
OUTPUT_MODEL = "models/hr2arousal.pkl"

# === CARICAMENTO DATI ===
df = pd.read_csv(INPUT_FILE)
df["HR_zscore"] = (df["HR"] - df["HR"].mean()) / df["HR"].std()
df["HR_log"] = np.log(df["HR"])
df["HR_sq"] = df["HR"] ** 2 
df["HR_rolling_mean"] = df["HR"].rolling(20, center=False).mean() 
df["HR_rolling_std"] = df["HR"].rolling(20, center=False).std() 
df["HR_lag_5"] = df["HR"].shift(5)

features = ["HR_zscore", "HR_log", "HR_sq", "HR_rolling_mean", "HR_rolling_std", "HR_lag_5"]
X = df[features].values
y = df["arousal"].values

# === TRAIN / TEST ===
train_size = 0.8
split_index = int(len(X)*train_size)
X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

model = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring = {'MAE': make_scorer(mean_absolute_error, greater_is_better=False)},
    refit='MAE',  # si rif√† sul MAE migliore
    cv=5,
    n_jobs=-1,
    verbose=2
)


grid_search.fit(X_train, y_train)

print("Migliori parametri:", grid_search.best_params_)
print("Miglior MAE:", -grid_search.best_score_)


# === TEST ===
y_pred = grid_search.predict(X_test)
mask = y_test != 0
mape = np.mean(np.abs((y_test[mask] - y_pred[mask]) / y_test[mask])) * 100
print(f"MAPE = {mape:.3f}%")

# === SALVATAGGIO ===
#joblib.dump(model, OUTPUT_MODEL)
#print(f"Modello salvato in {OUTPUT_MODEL}")


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Migliori parametri: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
Miglior MAE: 0.12882498695631672
MAPE = 174.125%


In [12]:
print(y_test, y_pred)

[0.35 0.37 0.39 0.41 0.43 0.46 0.49 0.52 0.55 0.57 0.59 0.6  0.62 0.63
 0.64 0.64 0.63 0.65 0.67 0.67 0.68 0.7  0.7  0.66 0.63 0.57 0.5  0.44
 0.37 0.31 0.26 0.19 0.13 0.12 0.12 0.13 0.13 0.14 0.13 0.12 0.11 0.11
 0.1  0.09 0.08 0.05 0.03 0.02 0.01 0.01 0.01 0.01 0.01 0.02 0.02 0.04
 0.05 0.06 0.08 0.09 0.11 0.13 0.14 0.15 0.15 0.16 0.15 0.14 0.12 0.11
 0.09 0.06 0.05 0.04 0.02 0.02 0.04 0.04 0.05 0.05 0.05 0.05 0.05 0.04
 0.04 0.03 0.01 0.   0.   0.01 0.01 0.01 0.02 0.03 0.05 0.06 0.11 0.15
 0.19 0.24 0.29 0.31 0.33 0.33 0.33 0.33 0.29 0.25 0.2  0.16 0.11 0.09
 0.06 0.06 0.06 0.05 0.06 0.06 0.07 0.07 0.08 0.08 0.08 0.08 0.08 0.09
 0.1  0.12 0.13 0.15 0.16 0.18 0.19 0.2  0.21 0.2  0.19 0.2  0.21 0.22
 0.23 0.24 0.25 0.25 0.25 0.26 0.27 0.27 0.26 0.25 0.24 0.24 0.23 0.23
 0.23 0.22 0.21 0.2  0.19 0.19 0.18 0.18 0.18 0.17 0.16 0.15 0.14 0.14
 0.14 0.12 0.11 0.11 0.13 0.14 0.15 0.15 0.16 0.17 0.18 0.19 0.19 0.19
 0.17 0.17 0.17 0.18 0.18 0.17 0.17 0.16 0.17 0.17 0.17 0.17 0.16 0.16
 0.17 