In [19]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, confusion_matrix

# -----------------------
# 0) Caricamento dati
# -----------------------
df_downsampled = pd.read_pickle('../data/processed/train_data_dowsampled_walt.pkl')

# -----------------------
# 1) Pulizia base target
# -----------------------
features_df = df_downsampled.copy()
features_df['health_level'] = pd.to_numeric(features_df['health_level'], errors='coerce')
features_df = features_df.dropna(subset=['health_level']).reset_index(drop=True)

# -----------------------
# 2) Definisci colonne
# -----------------------
scalar_cols = ['velocita', 'torque']  # numeriche scalari già pronte
array_cols = ['horizontal_acceleration', 'axial_acceleration', 'vertical_acceleration', 'tachometer_signal']

# Se alcune colonne mancano, filtrale
scalar_cols = [c for c in scalar_cols if c in features_df.columns]
array_cols  = [c for c in array_cols  if c in features_df.columns]

# -----------------------
# 3) Funzione: array -> feature
# -----------------------
def _safe_array(x):
    """Converte in np.array 1D, rimuove NaN/inf, gestisce liste/serie."""
    if isinstance(x, (list, tuple, np.ndarray, pd.Series)):
        arr = np.asarray(x, dtype=float).ravel()
    else:
        # se è None/float/singolo valore, prova a castare
        try:
            arr = np.asarray([x], dtype=float)
        except Exception:
            arr = np.asarray([], dtype=float)
    # pulizia
    arr = arr[np.isfinite(arr)]
    return arr

def _signal_feats(arr):
    """Ritorna un dict di feature robuste da un array 1D."""
    if arr.size == 0:
        return {
            'mean': np.nan, 'std': np.nan, 'min': np.nan, 'max': np.nan,
            'median': np.nan, 'p10': np.nan, 'p90': np.nan, 'rms': np.nan,
            'skew': np.nan, 'kurt': np.nan, 'ptp': np.nan, 'iqr': np.nan,
            'zcr': np.nan
        }
    mean = float(np.mean(arr))
    std  = float(np.std(arr))
    feats = {
        'mean': mean,
        'std': std,
        'min': float(np.min(arr)),
        'max': float(np.max(arr)),
        'median': float(np.median(arr)),
        'p10': float(np.percentile(arr, 10)),
        'p90': float(np.percentile(arr, 90)),
        'rms': float(np.sqrt(np.mean(arr**2))),
        'skew': float(stats.skew(arr, bias=False)) if arr.size > 2 else 0.0,
        'kurt': float(stats.kurtosis(arr, fisher=True, bias=False)) if arr.size > 3 else 0.0,
        'ptp': float(np.ptp(arr)),
        'iqr': float(np.subtract(*np.percentile(arr, [75, 25]))),
        # zero crossing rate (approssimata)
        'zcr': float(np.mean(np.abs(np.diff(np.sign(arr))) > 0)) if arr.size > 1 else 0.0,
    }
    return feats

# --- Parametri del tachimetro ---
FS = None              # Hz, metti qui la frequenza di campionamento se la conosci (es. 1000)
PULSES_PER_REV = 1     # 1 impulso per rotazione come da tua descrizione
TACH_COL_NAME = 'tachometer_signal'

def _tach_feats(arr, fs=None, pulses_per_rev=1, threshold=0.0):
    """
    Estrae feature evento-centriche da un segnale a impulsi (tachimetro).
    - arr: array 1D
    - fs: frequenza di campionamento (Hz). Se None, gli intervalli sono in campioni.
    - pulses_per_rev: impulsi per rivoluzione (default 1)
    - threshold: soglia per binarizzare (valori > soglia = impulso)
    """
    if arr.size == 0:
        return {
            'pulse_count': 0, 'rpm': np.nan,
            'ipi_mean': np.nan, 'ipi_std': np.nan, 'ipi_cv': np.nan,
            'ipi_min': np.nan, 'ipi_max': np.nan,
            'duty_cycle': np.nan, 'max_gap': np.nan
        }

    # binarizzazione robusta
    bin_sig = (arr > threshold).astype(np.uint8)
    idx = np.flatnonzero(bin_sig)  # posizioni impulsi (campioni > soglia)

    pulse_count = int(idx.size)
    duty_cycle = float(bin_sig.mean())  # frazione di campioni "alti"

    # finestra in secondi o campioni
    if fs is not None:
        window_sec = arr.size / float(fs)
        max_gap = (np.diff(np.r_[[-1], idx, [arr.size]]) - 1).max() / float(fs) if idx.size > 0 else window_sec
    else:
        window_sec = None
        max_gap = (np.diff(np.r_[[-1], idx, [arr.size]]) - 1).max() if idx.size > 0 else arr.size

    # RPM
    if fs is not None and window_sec and window_sec > 0 and pulses_per_rev > 0:
        revs = pulse_count / float(pulses_per_rev)
        rpm = (revs / window_sec) * 60.0
    else:
        rpm = np.nan

    # Inter‑Pulse Intervals (IPI)
    if idx.size > 1:
        if fs is not None:
            ipi = np.diff(idx) / float(fs)  # in secondi
        else:
            ipi = np.diff(idx).astype(float)  # in campioni

        ipi_mean = float(np.mean(ipi))
        ipi_std  = float(np.std(ipi))
        ipi_cv   = float(ipi_std / ipi_mean) if ipi_mean > 0 else np.nan
        ipi_min  = float(np.min(ipi))
        ipi_max  = float(np.max(ipi))
    else:
        ipi_mean = ipi_std = ipi_cv = ipi_min = ipi_max = np.nan

    return {
        'pulse_count': pulse_count,
        'ipi_mean': ipi_mean, 'ipi_std': ipi_std, 'ipi_cv': ipi_cv,
        'ipi_min': ipi_min, 'ipi_max': ipi_max,
        'duty_cycle': float(duty_cycle),
        'max_gap': float(max_gap),
    }

def expand_array_features(df, cols):
    """Espande ciascuna colonna array in più colonne numeriche."""
    out = []
    for col in cols:
        feats_rows = []
        for v in df[col].values:
            arr = _safe_array(v)
            if col == TACH_COL_NAME:
                feats_rows.append(_tach_feats(arr, fs=FS, pulses_per_rev=PULSES_PER_REV, threshold=0.0))
            else:
                feats_rows.append(_signal_feats(arr))
        tmp = pd.DataFrame(feats_rows)
        tmp.columns = [f"{col}__{c}" for c in tmp.columns]
        out.append(tmp)
    return pd.concat(out, axis=1) if out else pd.DataFrame(index=df.index)


In [17]:
# -----------------------
# 4) Costruzione matrice feature X
# -----------------------
X_arrays = expand_array_features(features_df, array_cols)
X_scalars = features_df[scalar_cols].copy() if scalar_cols else pd.DataFrame(index=features_df.index)

X_full = pd.concat([X_scalars, X_arrays], axis=1)

# Imputazione semplice + cast
X_full = X_full.replace([np.inf, -np.inf], np.nan)
X_full = X_full.fillna(X_full.median(numeric_only=True))
X = X_full.astype(np.float32).values

# Target
y = features_df['health_level'].astype(np.float32).values


In [20]:
X_full = X_full.drop(columns=['horizontal_tachometer_signal__rpm'], errors='ignore')
X_full

Unnamed: 0,velocita,torque,horizontal_acceleration__mean,horizontal_acceleration__std,horizontal_acceleration__min,horizontal_acceleration__max,horizontal_acceleration__median,horizontal_acceleration__p10,horizontal_acceleration__p90,horizontal_acceleration__rms,...,vertical_acceleration__zcr,tachometer_signal__pulse_count,tachometer_signal__rpm,tachometer_signal__ipi_mean,tachometer_signal__ipi_std,tachometer_signal__ipi_cv,tachometer_signal__ipi_min,tachometer_signal__ipi_max,tachometer_signal__duty_cycle,tachometer_signal__max_gap
0,100,50,-0.169319,0.162043,-0.794698,0.118969,-0.136461,-0.413488,0.017492,0.234365,...,0.368610,3,,22159.500000,13.500000,0.000609,22146.0,22173.0,0.000049,22172.0
1,100,50,0.000265,0.071211,-0.282226,0.157426,0.003846,-0.097011,0.089568,0.071211,...,0.767753,3,,22162.500000,4.500000,0.000203,22158.0,22167.0,0.000049,22166.0
2,100,50,-0.000159,0.074463,-0.336066,0.162016,0.003598,-0.102221,0.093041,0.074463,...,0.768535,2,,22143.000000,0.000000,0.000000,22143.0,22143.0,0.000033,22142.0
3,100,50,-0.000540,0.075078,-0.290537,0.164125,0.003101,-0.103723,0.093662,0.075080,...,0.768974,2,,22173.000000,0.000000,0.000000,22173.0,22173.0,0.000033,22172.0
4,100,50,-0.000262,0.075467,-0.300958,0.184594,0.003722,-0.103090,0.094406,0.075467,...,0.769495,3,,22167.000000,4.000000,0.000180,22163.0,22171.0,0.000049,22170.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011,3600,50,0.003271,0.936696,-4.257450,3.650200,0.005645,-1.185720,1.187097,0.936702,...,0.392731,100,,614.414141,0.492573,0.000802,614.0,615.0,0.001628,614.0
2012,3600,50,-0.000045,0.932316,-4.302978,4.262537,0.001116,-1.186712,1.181998,0.932316,...,0.395335,100,,614.434343,0.495670,0.000807,614.0,615.0,0.001628,614.0
2013,3600,100,-0.001060,1.102834,-4.373690,4.765580,0.004094,-1.419564,1.389803,1.102835,...,0.335634,100,,614.434343,0.495670,0.000807,614.0,615.0,0.001628,614.0
2014,3600,100,-0.003609,1.074484,-4.859490,4.633213,0.003349,-1.377658,1.362250,1.074491,...,0.345139,100,,614.424242,0.494227,0.000804,614.0,615.0,0.001628,614.0


In [25]:
# -----------------------
# 5) Train/valid split
# -----------------------
y_int = y.astype(int)

X_tr, X_va, y_tr, y_va = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y_int
)


In [27]:
# -----------------------
# 6) Random Forest
# -----------------------
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=6,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_tr, y_tr)

def predict_with_conf(model, X):
    y_cont = model.predict(X)
    y_cont = np.clip(y_cont, 0, 10)
    y_cls = np.rint(y_cont).astype(int)
    all_preds = np.stack([t.predict(X) for t in model.estimators_], axis=1)
    std = all_preds.std(axis=1)
    conf = 1 - (std / (std.max() + 1e-9))
    return y_cls, y_cont, conf

yhat_cls, yhat_cont, conf = predict_with_conf(rf, X_va)

print("MAE (severità continua):", mean_absolute_error(y_va, yhat_cont))

cm = confusion_matrix(np.rint(y_va).astype(int), yhat_cls, labels=list(range(11)))
cm_df = pd.DataFrame(
    cm, index=[f"true_{i}" for i in range(11)],
    columns=[f"pred_{i}" for i in range(11)]
)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
print(cm_df)


# (opzionale) nomi colonna finali usati nel modello:
feature_names = list(X_full.columns)


MAE (severità continua): 1.086229459405871
         pred_0  pred_1  pred_2  pred_3  pred_4  pred_5  pred_6  pred_7  \
true_0       34      15       7       2       0       0       0       0   
true_1        0      10      25      16       6       2       0       0   
true_2        1       1      12      25      12       5       2       0   
true_3        0       0      10      30      13       1       0       0   
true_4        0       0       1      18      40       2       0       0   
true_5        0       0       0       0       0       0       0       0   
true_6        0       0       2      15       7       7      24       0   
true_7        0       0       0       0       0       0       0       0   
true_8        0       0       0       3       3       5      11      28   
true_9        0       0       0       0       0       0       0       0   
true_10       0       0       0       0       0       0       0       0   

         pred_8  pred_9  pred_10  
true_0        0      