In [None]:
# ================================
# Built-in
# ================================
import os
import re
import warnings
from pprint import pprint
import contextlib
import joblib

# ================================
# Scientific computing
# ================================
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy import signal
from scipy.signal import hilbert, welch
from scipy.fft import fft, fftfreq

# ================================
# Machine learning - core
# ================================
import lightgbm as lgb
from xgboost import XGBRegressor

# ================================
# Machine learning - scikit-learn
# ================================
from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.inspection import permutation_importance
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.metrics import (
    confusion_matrix,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# ================================
# Progress bar
# ================================
from tqdm.auto import tqdm
from tqdm_joblib import tqdm_joblib

In [28]:
# Converts relative paths to absolute ones
ROOT_TRAIN_DATA_FOLDER ='../data/raw/B - PHM America 2023 - Dataset/Data_Challenge_PHM2023_training_data'
ROOT_TEST_DATA_FOLDER = '../data/raw/B - PHM America 2023 - Dataset/Data_Challenge_PHM2023_test_data'

## Parsing

In [7]:
def parse_vibration_dataset(dataset_path):
    """
    Parsa il dataset di vibrazione e crea un DataFrame pandas con una riga per ogni file.
    Ogni riga contiene gli array completi delle serie temporali di vibrazione.

    Args:
        dataset_path (str): Percorso alla cartella principale del dataset

    Returns:
        pd.DataFrame: DataFrame con colonne etichetta, velocità, torque, rep e array di dati di vibrazione (una riga per file)
    """

    data_list = []
    
    # Prima passata: conta il numero totale di file .txt
    print("Conteggio file in corso...")
    total_files = 0
    for root, dirs, files in os.walk(dataset_path):
        total_files += sum(1 for file in files if file.endswith('.txt'))
    
    print(f"Trovati {total_files} file .txt da processare")

    # Seconda passada: processa i file con progress bar
    with tqdm(total=total_files, desc="Parsing dataset", unit="file") as pbar:
        for root, dirs, files in os.walk(dataset_path):
            for file in files:
                if file.endswith('.txt'):
                    folder_name = os.path.basename(root)
                    if 'Pitting_degradation_level_' in folder_name:
                        etichetta_full = folder_name.replace('Pitting_degradation_level_', '')
                        if '(' in etichetta_full:
                            etichetta = etichetta_full.split('(')[0].strip()
                            descrizione = etichetta_full.split('(')[1].replace(')', '').strip()
                        else:
                            etichetta = etichetta_full.strip()
                            descrizione = None
                    else:
                        etichetta = folder_name
                        descrizione = None

                    # ERRORE CORRETTO: regex pattern per V100_200N_2.txt
                    pattern = r'V(\d+)_(\d+)N_(\d+)\.txt'
                    match = re.search(pattern, file)

                    if match:
                        velocita = int(match.group(1))
                        torque = int(match.group(2))
                        rep = int(match.group(3))

                        file_path = os.path.join(root, file)
                        try:
                            # Carica tutti i dati del file una volta sola
                            data = np.loadtxt(file_path)
                            
                            # Calcola informazioni aggiuntive sui dati
                            sampling_rate = 20480  # Hz come specificato nella documentazione
                            duration = len(data) / sampling_rate
                            
                            # Crea un record per l'intero file
                            record = {
                                'file_name': file,
                                'etichetta': etichetta,
                                'health_level': int(etichetta) if etichetta.isdigit() else etichetta,
                                'velocita': velocita,
                                'torque': torque,
                                'rep': rep,
                                'horizontal_acceleration': data[:, 0],  # Array completo
                                'axial_acceleration': data[:, 1],       # Array completo
                                'vertical_acceleration': data[:, 2],    # Array completo
                                'tachometer_signal': data[:, 3],        # Array completo
                                'sampling_rate': sampling_rate,
                                'duration': duration,
                                'num_samples': len(data),
                                'descrizione': descrizione
                            }
                            # ERRORE CORRETTO: aggiungi record invece di data
                            data_list.append(record)

                        except Exception as e:
                            tqdm.write(f"Errore nel leggere il file {file_path}: {e}")
                            continue
                    else:
                        tqdm.write(f"Nome file non riconosciuto: {file}")
                    
                    # Aggiorna la progress bar
                    pbar.update(1)

    df = pd.DataFrame(data_list)

    if not df.empty:
        print("\nOrdinamento dataset...")
        df = df.sort_values(['health_level', 'velocita', 'torque', 'rep']).reset_index(drop=True)
        print(f"Dataset caricato: {len(df)} file processati")
        print(f"Health levels disponibili: {sorted(df['health_level'].unique())}")
        print(f"Condizioni operative (rpm): {sorted(df['velocita'].unique())}")
        print(f"Condizioni operative (torque): {sorted(df['torque'].unique())}")

    return df


def extract_signal_features(df, signal_column):
    """
    Estrae features statistiche e nel dominio della frequenza da una colonna di segnali.
    
    Args:
        df (pd.DataFrame): DataFrame con i dati
        signal_column (str): Nome della colonna contenente gli array dei segnali
    
    Returns:
        pd.DataFrame: DataFrame con features estratte
    """
    
    features_list = []
    
    # Progress bar per l'estrazione delle features
    with tqdm(total=len(df), desc=f"Estraendo features da {signal_column}", unit="segnale") as pbar:
        for idx, row in df.iterrows():
            signal = row[signal_column]
            
            # Features statistiche nel dominio del tempo
            features = {
                f'{signal_column}_mean': np.mean(signal),
                f'{signal_column}_std': np.std(signal),
                f'{signal_column}_rms': np.sqrt(np.mean(signal**2)),
                f'{signal_column}_peak': np.max(np.abs(signal)),
                f'{signal_column}_peak_to_peak': np.ptp(signal),
                f'{signal_column}_skewness': stats.skew(signal),
                f'{signal_column}_kurtosis': stats.kurtosis(signal),
                f'{signal_column}_crest_factor': np.max(np.abs(signal)) / np.sqrt(np.mean(signal**2)),
            }
            
            # Features nel dominio della frequenza
            sampling_rate = row['sampling_rate']
            fft_vals = np.abs(fft(signal))
            freqs = fftfreq(len(signal), 1/sampling_rate)
            
            # Considera solo le frequenze positive
            positive_freq_idx = freqs > 0
            fft_positive = fft_vals[positive_freq_idx]
            freqs_positive = freqs[positive_freq_idx]
            
            # Trova la frequenza dominante
            dominant_freq_idx = np.argmax(fft_positive)
            features[f'{signal_column}_dominant_freq'] = freqs_positive[dominant_freq_idx]
            features[f'{signal_column}_dominant_magnitude'] = fft_positive[dominant_freq_idx]
            
            # Features spettrali
            total_power = np.sum(fft_positive**2)
            features[f'{signal_column}_spectral_centroid'] = np.sum(freqs_positive * fft_positive**2) / total_power
            features[f'{signal_column}_spectral_rolloff'] = freqs_positive[np.where(np.cumsum(fft_positive**2) >= 0.85 * total_power)[0][0]]
            
            features_list.append(features)
            pbar.update(1)
    
    return pd.DataFrame(features_list)


def extract_all_features(df):
    """
    Estrae tutte le features da tutti i segnali di accelerazione con progress bar complessiva.
    
    Args:
        df (pd.DataFrame): DataFrame con i dati grezzi
    
    Returns:
        pd.DataFrame: DataFrame completo con tutte le features
    """
    print("Iniziando estrazione features da tutti i segnali...")
    
    # Estrai features da tutti i segnali di accelerazione
    horizontal_features = extract_signal_features(df, 'horizontal_acceleration')
    axial_features = extract_signal_features(df, 'axial_acceleration')
    vertical_features = extract_signal_features(df, 'vertical_acceleration')

    print("Combinando tutte le features...")
    # Combina tutte le features
    features_df = pd.concat([
        df[['file_name', 'health_level', 'velocita', 'torque', 'rep']],
        horizontal_features,
        axial_features, 
        vertical_features
    ], axis=1)

    print(f"Features dataset completato! Shape: {features_df.shape}")
    return features_df


# Esempio di utilizzo:
# Carica il dataset
print("=== CARICAMENTO DATASET ===")
df = parse_vibration_dataset(ROOT_TRAIN_DATA_FOLDER)

df.head()

=== CARICAMENTO DATASET ===
Conteggio file in corso...
Trovati 2016 file .txt da processare


Parsing dataset: 100%|██████████| 2016/2016 [01:27<00:00, 23.17file/s]



Ordinamento dataset...
Dataset caricato: 2016 file processati
Health levels disponibili: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(6), np.int64(8)]
Condizioni operative (rpm): [np.int64(100), np.int64(200), np.int64(300), np.int64(400), np.int64(500), np.int64(600), np.int64(700), np.int64(800), np.int64(900), np.int64(1000), np.int64(1200), np.int64(2100), np.int64(2700), np.int64(3000), np.int64(3600)]
Condizioni operative (torque): [np.int64(50), np.int64(100), np.int64(200), np.int64(300), np.int64(400), np.int64(500)]


Unnamed: 0,file_name,etichetta,health_level,velocita,torque,rep,horizontal_acceleration,axial_acceleration,vertical_acceleration,tachometer_signal,sampling_rate,duration,num_samples,descrizione
0,V100_50N_1.txt,0,0,100,50,1,"[-0.4626018245, -0.7124489885, -0.5509291238, ...","[-0.4991110174, -0.7377190438, -0.5699753646, ...","[-0.4254987892, -0.814484938, -0.5429899587, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,
1,V100_50N_2.txt,0,0,100,50,2,"[0.08373725705, -0.01240551956, -0.05421212049...","[0.04308363971, -0.05226539899, -0.09993991833...","[0.0949569657, -0.06374649093, -0.1336386738, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,
2,V100_50N_3.txt,0,0,100,50,3,"[-0.06376437055, 0.09676305259, -0.04503203601...","[-0.07804495389, 0.07745637958, -0.06497860415...","[-0.0985720786, 0.153401291, -0.08254507805, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,
3,V100_50N_4.txt,0,0,100,50,4,"[0.07418500699, -0.03634817232, -0.08572214018...","[0.06933405406, -0.03107672372, -0.07251235535...","[0.1519552459, -0.01518347421, -0.08507565708,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,
4,V100_50N_5.txt,0,0,100,50,5,"[-0.02816052941, 0.04416364964, 0.05731350038,...","[-0.06439002983, 0.02001152664, 0.03849276007,...","[-0.07965298772, 0.031089971, 0.06398749846, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,


In [8]:
df_downsampled.to_pickle('../data/processed/train_data_dowsampled_walt.pkl')

NameError: name 'df_downsampled' is not defined

In [None]:
df = pd.read_pickle('../data/processed/train_data_walt.pkl')

## Downsampling

In [9]:
def downsample(signal, fs=20480, sec=3):
    """
    Filtra e fa il resample di un segnale ad una lunghezza fissa.
    """
    if len(signal) == fs*sec:
        return signal
    return signal[:fs*sec]  # Truncate or pad to the desired length

def downsample_vibration_dataframe(df, sec=3):
    """
    Applica filtraggio e resample a tutti i segnali accelerometrici e tachimetro nel DataFrame.
    Ritorna un nuovo DataFrame con colonne preprocessate.
    """
    processed_records = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Preprocessing"):
        try:
            record = {
                'file_name': row['file_name'],
                'etichetta': row['etichetta'],
                'health_level': row['health_level'],
                'velocita': row['velocita'],
                'torque': row['torque'],
                'rep': row['rep'],
                'sampling_rate': row['sampling_rate'],
                'descrizione': row['descrizione'],
                'duration': row['duration'],
                'num_samples': row['num_samples']
            }

            # Applica filtro + resample ai 4 segnali
            record['horizontal_acceleration'] = downsample(
                row['horizontal_acceleration'], fs=row['sampling_rate'], sec=sec
            )
            record['axial_acceleration'] = downsample(
                row['axial_acceleration'], fs=row['sampling_rate'], sec=sec
            )
            record['vertical_acceleration'] = downsample(
                row['vertical_acceleration'], fs=row['sampling_rate'], sec=sec
            )
            record['tachometer_signal'] = downsample(
                row['tachometer_signal'], fs=row['sampling_rate'], sec=sec
            )

            processed_records.append(record)

        except Exception as e:
            tqdm.write(f"Errore durante il preprocessing del file {row['file_name']}: {e}")
            continue

    return pd.DataFrame(processed_records)


In [10]:
features_df = downsample_vibration_dataframe(df, sec=3)

Preprocessing: 100%|██████████| 2016/2016 [00:00<00:00, 28524.50it/s]


In [11]:
features_df = pd.read_pickle("../data/processed/train_data_dowsampled_walt.pkl")

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/train_data_dowsampled_walt.pkl'

## Feature extraction

In [12]:
# -----------------------
# 1) Pulizia base target
# -----------------------
features_df['health_level'] = pd.to_numeric(features_df['health_level'], errors='coerce')
features_df = features_df.dropna(subset=['health_level']).reset_index(drop=True)

# -----------------------
# 2) Definisci colonne
# -----------------------
scalar_cols = ['velocita', 'torque']  # numeriche scalari già pronte
array_cols = ['horizontal_acceleration', 'axial_acceleration', 'vertical_acceleration', 'tachometer_signal']

scalar_cols = [c for c in scalar_cols if c in features_df.columns]
array_cols  = [c for c in array_cols  if c in features_df.columns]

# Parametri segnale
FS = 20480  # Hz (dal task PHM)
PULSES_PER_REV = 1
TACH_COL_NAME = 'tachometer_signal'

# -----------------------
# 3) Utils
# -----------------------
def _safe_array(x):
    if isinstance(x, (list, tuple, np.ndarray, pd.Series)):
        arr = np.asarray(x, dtype=float).ravel()
    else:
        try:
            arr = np.asarray([x], dtype=float)
        except Exception:
            arr = np.asarray([], dtype=float)
    arr = arr[np.isfinite(arr)]
    return arr

def time_feats(arr):
    if arr.size == 0:
        return {
            'mean': np.nan, 'std': np.nan, 'min': np.nan, 'max': np.nan,
            'median': np.nan, 'p10': np.nan, 'p90': np.nan, 'rms': np.nan,
            'skew': np.nan, 'kurt': np.nan, 'ptp': np.nan, 'iqr': np.nan,
            'zcr': np.nan, 'envelope_rms': np.nan
        }
    feats = {
        'mean': float(np.mean(arr)),
        'std': float(np.std(arr)),
        'min': float(np.min(arr)),
        'max': float(np.max(arr)),
        'median': float(np.median(arr)),
        'p10': float(np.percentile(arr, 10)),
        'p90': float(np.percentile(arr, 90)),
        'rms': float(np.sqrt(np.mean(arr**2))),
        'skew': float(stats.skew(arr, bias=False)) if arr.size > 2 else 0.0,
        'kurt': float(stats.kurtosis(arr, fisher=True, bias=False)) if arr.size > 3 else 0.0,
        'ptp': float(np.ptp(arr)),
        'iqr': float(np.subtract(*np.percentile(arr, [75, 25]))),
        'zcr': float(np.mean(np.abs(np.diff(np.sign(arr))) > 0)) if arr.size > 1 else 0.0,
    }
    try:
        env = np.abs(hilbert(arr))
        feats['envelope_rms'] = float(np.sqrt(np.mean(env**2)))
    except Exception:
        feats['envelope_rms'] = np.nan
    return feats

def tach_feats(arr, fs=FS, pulses_per_rev=PULSES_PER_REV, threshold=0.0):
    if arr.size == 0:
        return {'rpm': np.nan, 'pulse_count': 0, 'ipi_mean': np.nan, 'ipi_std': np.nan, 'ipi_cv': np.nan}
    bin_sig = (arr > threshold).astype(np.uint8)
    idx = np.flatnonzero(bin_sig)
    pulse_count = int(idx.size)
    rpm = np.nan
    if fs is not None and arr.size > 0 and pulses_per_rev > 0:
        window_sec = arr.size / float(fs)
        if window_sec > 0:
            revs = pulse_count / float(pulses_per_rev)
            rpm = (revs / window_sec) * 60.0
    if idx.size > 1:
        ipi = np.diff(idx) / float(fs) if fs else np.diff(idx).astype(float)
        ipi_mean = float(np.mean(ipi))
        ipi_std  = float(np.std(ipi))
        ipi_cv   = float(ipi_std / ipi_mean) if ipi_mean > 0 else np.nan
    else:
        ipi_mean = ipi_std = ipi_cv = np.nan
    return {'rpm': float(rpm), 'pulse_count': pulse_count, 'ipi_mean': ipi_mean, 'ipi_std': ipi_std, 'ipi_cv': ipi_cv}

def psd_band_feats(arr, fs=FS, bands=None):
    if bands is None:
        # bande fino alla Nyquist (10240 Hz), regolabili
        bands = [(0,200),(200,500),(500,1000),(1000,2000),(2000,4000),(4000,8000)]
    if arr.size < 8:
        return {f'band_{lo}_{hi}': np.nan for (lo,hi) in bands}
    # Welch PSD
    try:
        f, Pxx = welch(arr, fs=fs, nperseg=min(2048, len(arr)))
    except Exception:
        return {f'band_{lo}_{hi}': np.nan for (lo,hi) in bands}
    feats = {}
    for (lo, hi) in bands:
        mask = (f >= lo) & (f < hi)
        feats[f'band_{lo}_{hi}'] = float(np.trapz(Pxx[mask], f[mask])) if np.any(mask) else 0.0
    # normalizza per energia totale per robustezza
    total = float(np.trapz(Pxx, f)) if f.size else 1.0
    for (lo, hi) in bands:
        key = f'band_{lo}_{hi}'
        feats[key] = feats[key] / total if total > 0 else 0.0
    return feats

def cross_axis_feats(ax, ay, az):
    feats = {}
    def safe_corr(a,b):
        if len(a) < 2 or len(b) < 2:
            return np.nan
        a = a - np.mean(a)
        b = b - np.mean(b)
        denom = (np.std(a)*np.std(b))
        if denom == 0:
            return 0.0
        return float(np.mean(a*b)/denom)
    feats['corr_xy'] = safe_corr(ax, ay)
    feats['corr_xz'] = safe_corr(ax, az)
    feats['corr_yz'] = safe_corr(ay, az)
    return feats

# -----------------------
# 4) Espansione feature
# -----------------------
def expand_features(df, array_cols, scalar_cols):
    rows = []
    for i, row in df.iterrows():
        feat_row = {}
        # scalari
        for c in scalar_cols:
            feat_row[c] = row[c]
        # array
        ax = _safe_array(row[array_cols[0]]) if 'horizontal_acceleration' in array_cols else np.array([])
        ay = _safe_array(row[array_cols[1]]) if 'axial_acceleration' in array_cols else np.array([])
        az = _safe_array(row[array_cols[2]]) if 'vertical_acceleration' in array_cols else np.array([])
        tach = _safe_array(row[TACH_COL_NAME]) if TACH_COL_NAME in array_cols else np.array([])

        # time feats per-asse
        for sig, name in [(ax,'ax'), (ay,'ay'), (az,'az')]:
            if sig.size > 0:
                tf = time_feats(sig)
                tf = {f'{name}_{k}': v for k,v in tf.items()}
                feat_row.update(tf)

                pf = psd_band_feats(sig, fs=FS)
                pf = {f'{name}_{k}': v for k,v in pf.items()}
                feat_row.update(pf)

        # cross-axis
        if ax.size > 0 and ay.size > 0 and az.size > 0:
            feat_row.update(cross_axis_feats(ax, ay, az))

        # tach
        if tach.size > 0:
            tfeat = tach_feats(tach, fs=FS, pulses_per_rev=PULSES_PER_REV, threshold=0.0)
            feat_row.update(tfeat)

        # interazioni operative
        if 'velocita' in feat_row and 'torque' in feat_row:
            v = feat_row['velocita']
            t = feat_row['torque']
            feat_row['v_times_t'] = v * t
            feat_row['v_sq'] = v**2
            feat_row['t_sq'] = t**2
            if 'rpm' in feat_row and np.isfinite(feat_row['rpm']):
                rpm = feat_row['rpm']
                # normalizzazioni semplici
                feat_row['v_over_rpm'] = v / (rpm + 1e-6)
                feat_row['t_over_rpm'] = t / (rpm + 1e-6)

        rows.append(feat_row)

    X_full = pd.DataFrame(rows)
    # imputazione semplice
    X_full = X_full.replace([np.inf, -np.inf], np.nan)
    X_full = X_full.fillna(X_full.median(numeric_only=True))
    return X_full


In [13]:
print("[*] Genero feature...")
X_full = expand_features(features_df, array_cols=array_cols, scalar_cols=scalar_cols)
y = features_df['health_level'].astype(float).values

print(f"[*] Shape X_full: {X_full.shape}, n_features = {X_full.shape[1]}")

[*] Genero feature...


  feats[f'band_{lo}_{hi}'] = float(np.trapz(Pxx[mask], f[mask])) if np.any(mask) else 0.0
  total = float(np.trapz(Pxx, f)) if f.size else 1.0


[*] Shape X_full: (2016, 75), n_features = 75


In [14]:
X_full

Unnamed: 0,velocita,torque,ax_mean,ax_std,ax_min,ax_max,ax_median,ax_p10,ax_p90,ax_rms,...,rpm,pulse_count,ipi_mean,ipi_std,ipi_cv,v_times_t,v_sq,t_sq,v_over_rpm,t_over_rpm
0,100,50,-0.169319,0.162043,-0.794698,0.118969,-0.136461,-0.413488,0.017492,0.234365,...,60.0,3,1.082007,0.000659,0.000609,5000,10000,2500,1.666667,0.833333
1,100,50,0.000265,0.071211,-0.282226,0.157426,0.003846,-0.097011,0.089568,0.071211,...,60.0,3,1.082153,0.000220,0.000203,5000,10000,2500,1.666667,0.833333
2,100,50,-0.000159,0.074463,-0.336066,0.162016,0.003598,-0.102221,0.093041,0.074463,...,40.0,2,1.081201,0.000000,0.000000,5000,10000,2500,2.500000,1.250000
3,100,50,-0.000540,0.075078,-0.290537,0.164125,0.003101,-0.103723,0.093662,0.075080,...,40.0,2,1.082666,0.000000,0.000000,5000,10000,2500,2.500000,1.250000
4,100,50,-0.000262,0.075467,-0.300958,0.184594,0.003722,-0.103090,0.094406,0.075467,...,60.0,3,1.082373,0.000195,0.000180,5000,10000,2500,1.666667,0.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011,3600,50,0.003271,0.936696,-4.257450,3.650200,0.005645,-1.185720,1.187097,0.936702,...,2000.0,100,0.030001,0.000024,0.000802,180000,12960000,2500,1.800000,0.025000
2012,3600,50,-0.000045,0.932316,-4.302978,4.262537,0.001116,-1.186712,1.181998,0.932316,...,2000.0,100,0.030002,0.000024,0.000807,180000,12960000,2500,1.800000,0.025000
2013,3600,100,-0.001060,1.102834,-4.373690,4.765580,0.004094,-1.419564,1.389803,1.102835,...,2000.0,100,0.030002,0.000024,0.000807,360000,12960000,10000,1.800000,0.050000
2014,3600,100,-0.003609,1.074484,-4.859490,4.633213,0.003349,-1.377658,1.362250,1.074491,...,2000.0,100,0.030001,0.000024,0.000804,360000,12960000,10000,1.800000,0.050000


## Training

In [15]:
# -----------------------
# 5) Split
# -----------------------
y_int = y.astype(int)
X = X_full.astype(np.float32).values

X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y_int
)

In [22]:
# Custom wrapper to fix LightGBM feature name issues
class LGBMRegressorFixed(lgb.LGBMRegressor):
    def fit(self, X, y, **kwargs):
        # Convert to DataFrame with proper column names if needed
        if hasattr(X, 'shape') and not hasattr(X, 'columns'):
            import pandas as pd
            X = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
        return super().fit(X, y, **kwargs)
    
    def predict(self, X, **kwargs):
        # Convert to DataFrame with same column names for prediction
        if hasattr(X, 'shape') and not hasattr(X, 'columns'):
            import pandas as pd
            X = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
        return super().predict(X, **kwargs)

# Progress bar context manager for joblib
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()

# Random Forest
rf = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("model", RandomForestRegressor(random_state=42, n_jobs=-1))
])

rf_param_dist = {
    "model__n_estimators": [200, 400, 600, 800],
    "model__max_depth": [6, 8, 10, 12, None],
    "model__min_samples_leaf": [1, 2, 5, 10],
    "model__min_samples_split": [2, 5, 10],
    "model__max_features": ["sqrt", "log2", 0.3, 0.5, 0.8],
}

# Histogram Gradient Boosting
hgb = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("model", HistGradientBoostingRegressor(random_state=42))
])

hgb_param_dist = {
    "model__max_depth": [None, 4, 6, 8],
    "model__learning_rate": [0.03, 0.05, 0.1],
    "model__max_iter": [200, 400, 800],
    "model__l2_regularization": [0.0, 0.1, 1.0],
    "model__min_samples_leaf": [10, 20, 30, 50],
}

# XGBoost
xgb = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("model", XGBRegressor(random_state=42, n_jobs=-1, verbosity=0))
])

xgb_param_dist = {
    "model__n_estimators": [200, 400, 600, 800],
    "model__max_depth": [3, 4, 6, 8],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.15],
    "model__subsample": [0.8, 0.9, 1.0],
    "model__colsample_bytree": [0.8, 0.9, 1.0],
    "model__reg_alpha": [0, 0.1, 1],
    "model__reg_lambda": [1, 1.5, 2],
}

# LightGBM - Fixed version with proper feature handling
lgbm = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("model", LGBMRegressorFixed(
        random_state=42, 
        n_jobs=-1, 
        verbosity=-1
    ))
])

lgbm_param_dist = {
    "model__n_estimators": [200, 400, 600, 800],
    "model__max_depth": [3, 4, 6, 8, -1],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.15],
    "model__subsample": [0.8, 0.9, 1.0],
    "model__colsample_bytree": [0.8, 0.9, 1.0],
    "model__reg_alpha": [0, 0.1, 1],
    "model__reg_lambda": [0, 0.1, 1],
    "model__num_leaves": [31, 50, 100, 200],
}

# Gradient Boosting
gb = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("model", GradientBoostingRegressor(random_state=42))
])

gb_param_dist = {
    "model__n_estimators": [200, 400, 600],
    "model__max_depth": [3, 4, 6, 8],
    "model__learning_rate": [0.01, 0.05, 0.1, 0.15],
    "model__subsample": [0.8, 0.9, 1.0],
    "model__max_features": ["sqrt", "log2", 0.3, 0.5],
}

# Extra Trees
et = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("model", ExtraTreesRegressor(random_state=42, n_jobs=-1))
])

et_param_dist = {
    "model__n_estimators": [200, 400, 600, 800],
    "model__max_depth": [6, 8, 10, 12, None],
    "model__min_samples_leaf": [1, 2, 5, 10],
    "model__min_samples_split": [2, 5, 10],
    "model__max_features": ["sqrt", "log2", 0.3, 0.5, 0.8],
}

# Ridge Regression 
ridge = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Ridge(random_state=42))
])

ridge_param_dist = {
    "model__alpha": [0.1, 1.0, 10.0, 100.0, 1000.0],
}

# Elastic Net
elastic = Pipeline([
    ("scaler", StandardScaler()),
    ("model", ElasticNet(random_state=42))
])

elastic_param_dist = {
    "model__alpha": [0.1, 1.0, 10.0, 100.0],
    "model__l1_ratio": [0.1, 0.5, 0.7, 0.9],
}

def fit_search(name, pipe, param_dist, *, n_iter=25, cv=3):
    print(f"\n[*] Tuning {name}...")
    
    # Calculate actual parameter space size
    param_space_size = 1
    for param_values in param_dist.values():
        param_space_size *= len(param_values)
    
    # Adjust n_iter if parameter space is smaller
    actual_n_iter = min(n_iter, param_space_size)
    
    # Decide between RandomizedSearchCV and GridSearchCV
    if param_space_size <= 20:  # Small space, use GridSearchCV
        from sklearn.model_selection import GridSearchCV
        search = GridSearchCV(
            pipe,
            param_grid=param_dist,
            scoring="neg_mean_absolute_error",
            cv=cv,
            verbose=0,
            n_jobs=-1
        )
        total_steps = param_space_size * cv
        search_type = "Grid"
    else:  # Large space, use RandomizedSearchCV
        search = RandomizedSearchCV(
            pipe,
            param_distributions=param_dist,
            n_iter=actual_n_iter,
            scoring="neg_mean_absolute_error",
            cv=cv,
            verbose=0,
            random_state=42,
            n_jobs=-1
        )
        total_steps = actual_n_iter * cv
        search_type = "Random"
    
    print(f"    Using {search_type}SearchCV with {total_steps//cv} parameter combinations")
    
    # Use proper progress bar with joblib integration
    with tqdm_joblib(tqdm(total=total_steps, desc=f"Tuning {name}", leave=True, 
                          bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]')):
        search.fit(X_tr, y_tr)
    
    print(f"[+] Best MAE (cv) {name}: {-search.best_score_:.4f}")
    print(f"[+] Best params {name}:")
    pprint(search.best_params_)
    return search

# Training all models with smart search strategy
print("=" * 60)
print("TRAINING MODELLI")
print("=" * 60)

# Train each model with appropriate search method
rf_search = fit_search("RandomForest", rf, rf_param_dist, n_iter=25, cv=3)
hgb_search = fit_search("HistGradientBoosting", hgb, hgb_param_dist, n_iter=25, cv=3)
xgb_search = fit_search("XGBoost", xgb, xgb_param_dist, n_iter=30, cv=3)
lgbm_search = fit_search("LightGBM", lgbm, lgbm_param_dist, n_iter=30, cv=3)
gb_search = fit_search("GradientBoosting", gb, gb_param_dist, n_iter=25, cv=3)
et_search = fit_search("ExtraTrees", et, et_param_dist, n_iter=25, cv=3)
ridge_search = fit_search("Ridge", ridge, ridge_param_dist, n_iter=10, cv=3)  # Will auto-switch to GridSearchCV
elastic_search = fit_search("ElasticNet", elastic, elastic_param_dist, n_iter=15, cv=3)  # Will auto-switch to GridSearchCV

# Collect results
searches = {
    "RandomForest": rf_search,
    "HistGradientBoosting": hgb_search,
    "XGBoost": xgb_search,
    "LightGBM": lgbm_search,
    "GradientBoosting": gb_search,
    "ExtraTrees": et_search,
    "Ridge": ridge_search,
    "ElasticNet": elastic_search,
}

# Final comparison
print("\n" + "=" * 60)
print("RISULTATI FINALI - CONFRONTO MODELLI")
print("=" * 60)
print(f"{'Modello':<20} {'Best CV MAE':<15} {'Std':<10}")
print("-" * 45)

results = []
for name, search in searches.items():
    cv_scores = cross_val_score(
        search.best_estimator_, X_tr, y_tr, 
        cv=3, scoring="neg_mean_absolute_error"
    )
    mae_mean = -cv_scores.mean()
    mae_std = cv_scores.std()
    results.append((name, mae_mean, mae_std, search.best_estimator_))
    print(f"{name:<20} {mae_mean:<15.4f} {mae_std:<10.4f}")

# Sort by performance
results.sort(key=lambda x: x[1])
print(f"\n🏆 Miglior modello: {results[0][0]} (MAE: {results[0][1]:.4f} ± {results[0][2]:.4f})")

# Save best model
best_model = results[0][3]
print(f"\nModello selezionato salvato come 'best_model'")

TRAINING MODELLI

[*] Tuning RandomForest...
    Using RandomSearchCV with 25 parameter combinations


Tuning RandomForest: |          | 475/? [00:45<00:00]    


[+] Best MAE (cv) RandomForest: 0.7675
[+] Best params RandomForest:
{'model__max_depth': 12,
 'model__max_features': 0.5,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__n_estimators': 400}

[*] Tuning HistGradientBoosting...
    Using RandomSearchCV with 25 parameter combinations


Tuning HistGradientBoosting: |          | 150/? [00:19<00:00]    


[+] Best MAE (cv) HistGradientBoosting: 0.6020
[+] Best params HistGradientBoosting:
{'model__l2_regularization': 1.0,
 'model__learning_rate': 0.1,
 'model__max_depth': None,
 'model__max_iter': 800,
 'model__min_samples_leaf': 10}

[*] Tuning XGBoost...
    Using RandomSearchCV with 30 parameter combinations


Tuning XGBoost: 100%|██████████| 90/90 [00:28<00:00]


[+] Best MAE (cv) XGBoost: 0.5918
[+] Best params XGBoost:
{'model__colsample_bytree': 0.9,
 'model__learning_rate': 0.05,
 'model__max_depth': 6,
 'model__n_estimators': 600,
 'model__reg_alpha': 0,
 'model__reg_lambda': 1.5,
 'model__subsample': 0.8}

[*] Tuning LightGBM...
    Using RandomSearchCV with 30 parameter combinations


Tuning LightGBM: 100%|██████████| 90/90 [01:53<00:00]


[+] Best MAE (cv) LightGBM: 0.6107
[+] Best params LightGBM:
{'model__colsample_bytree': 0.8,
 'model__learning_rate': 0.1,
 'model__max_depth': -1,
 'model__n_estimators': 600,
 'model__num_leaves': 31,
 'model__reg_alpha': 0,
 'model__reg_lambda': 0,
 'model__subsample': 0.9}

[*] Tuning GradientBoosting...
    Using RandomSearchCV with 25 parameter combinations


Tuning GradientBoosting: 100%|██████████| 75/75 [00:24<00:00]


[+] Best MAE (cv) GradientBoosting: 0.5222
[+] Best params GradientBoosting:
{'model__learning_rate': 0.05,
 'model__max_depth': 8,
 'model__max_features': 'log2',
 'model__n_estimators': 400,
 'model__subsample': 0.9}

[*] Tuning ExtraTrees...
    Using RandomSearchCV with 25 parameter combinations


Tuning ExtraTrees: |          | 275/? [00:08<00:00]    


[+] Best MAE (cv) ExtraTrees: 0.6121
[+] Best params ExtraTrees:
{'model__max_depth': None,
 'model__max_features': 0.5,
 'model__min_samples_leaf': 2,
 'model__min_samples_split': 5,
 'model__n_estimators': 200}

[*] Tuning Ridge...
    Using GridSearchCV with 5 parameter combinations


Tuning Ridge: 100%|██████████| 15/15 [00:00<00:00]


[+] Best MAE (cv) Ridge: 1.4735
[+] Best params Ridge:
{'model__alpha': 0.1}

[*] Tuning ElasticNet...
    Using GridSearchCV with 16 parameter combinations


Tuning ElasticNet: 100%|██████████| 48/48 [00:00<00:00]


[+] Best MAE (cv) ElasticNet: 1.6327
[+] Best params ElasticNet:
{'model__alpha': 0.1, 'model__l1_ratio': 0.1}

RISULTATI FINALI - CONFRONTO MODELLI
Modello              Best CV MAE     Std       
---------------------------------------------
RandomForest         0.7675          0.0361    
HistGradientBoosting 0.6020          0.0278    
XGBoost              0.5918          0.0337    
LightGBM             0.6107          0.0273    
GradientBoosting     0.5222          0.0085    
ExtraTrees           0.6121          0.0164    
Ridge                1.4735          0.0210    
ElasticNet           1.6327          0.0665    

🏆 Miglior modello: GradientBoosting (MAE: 0.5222 ± 0.0085)

Modello selezionato salvato come 'best_model'


In [23]:
# -----------------------
# 7) Valutazione su validation (aggiornata per tutti i modelli)
# -----------------------

def evaluate(model, X_va, y_va, model_name, print_cm=True):
    # Predizione continua
    y_cont = model.predict(X_va).astype(float)
    y_cont = np.clip(y_cont, 0, 10)

    # Versione classificata per classi 0..10
    y_cls = np.rint(y_cont).astype(int)
    y_true_cls = np.rint(y_va).astype(int)

    # Metriche
    mae = mean_absolute_error(y_va, y_cont)
    rmse = np.sqrt(mean_squared_error(y_va, y_cont))
    r2 = r2_score(y_va, y_cont)

    # Alcune metriche “di utilità”
    medae = np.median(np.abs(y_va - y_cont))
    within_05 = np.mean(np.abs(y_va - y_cont) <= 0.5)  # % entro mezzo punto
    within_10 = np.mean(np.abs(y_va - y_cont) <= 1.0)  # % entro un punto
    exact_cls = np.mean(y_true_cls == y_cls)           # % match esatto della classe arrotondata

    print(f"\n=== {model_name} on validation ===")
    print(f"MAE:   {mae:.4f}")
    print(f"RMSE:  {rmse:.4f}")
    print(f"R^2:   {r2:.4f}")
    print(f"MedAE: {medae:.4f}")
    print(f"% entro 0.5: {within_05*100:6.2f}%   % entro 1.0: {within_10*100:6.2f}%   % match classe: {exact_cls*100:6.2f}%")

    # Confusion matrix 0..10 (su target arrotondato)
    cm = confusion_matrix(y_true_cls, y_cls, labels=list(range(11)))
    cm_df = pd.DataFrame(
        cm,
        index=[f"true_{i}" for i in range(11)],
        columns=[f"pred_{i}" for i in range(11)]
    )
    if print_cm:
        print("\nConfusion matrix (rounded 0-10):")
        print(cm_df)

    return {
        "name": model_name,
        "mae": mae,
        "rmse": rmse,
        "r2": r2,
        "medae": medae,
        "within_05": within_05,
        "within_10": within_10,
        "exact_cls": exact_cls,
        "y_cont": y_cont,
        "y_cls": y_cls,
        "cm_df": cm_df
    }

# Valuta tutti i best_estimator_ ottenuti dallo step di tuning
val_results = []
print("\n" + "=" * 60)
print("VALUTAZIONE SU VALIDATION - TUTTI I MODELLI")
print("=" * 60)

for name, search in searches.items():
    res = evaluate(search.best_estimator_, X_va, y_va, name, print_cm=True)
    val_results.append(res)

# Tabella riassuntiva ordinata per MAE
summary_rows = []
for r in val_results:
    summary_rows.append([
        r["name"], r["mae"], r["rmse"], r["r2"],
        r["medae"], r["within_05"], r["within_10"], r["exact_cls"]
    ])

summary_df = pd.DataFrame(
    summary_rows,
    columns=["Model", "MAE", "RMSE", "R2", "MedAE", "Within 0.5", "Within 1.0", "Exact Class"]
).sort_values(by="MAE")

print("\n" + "-" * 60)
print("RIEPILOGO ORDINATO PER MAE (validation)")
print("-" * 60)
print(summary_df.to_string(index=False, float_format=lambda x: f"{x:.4f}"))

# Migliore su validation per MAE
best_val = min(val_results, key=lambda r: r["mae"])
best_name_val = best_val["name"]
best_model_val = searches[best_name_val].best_estimator_
print(f"\n[***] Best model on validation (MAE): {best_name_val} (MAE={best_val['mae']:.4f})")

# (Opzionale) se vuoi che 'best_model' sia anche quello migliore su validation:
best_model = best_model_val
print("Aggiornato 'best_model' con il migliore su validation.")



VALUTAZIONE SU VALIDATION - TUTTI I MODELLI

=== RandomForest on validation ===
MAE:   0.6363
RMSE:  0.9117
R^2:   0.8790
MedAE: 0.4333
% entro 0.5:  57.43%   % entro 1.0:  78.47%   % match classe:  57.43%

Confusion matrix (rounded 0-10):
         pred_0  pred_1  pred_2  pred_3  pred_4  pred_5  pred_6  pred_7  \
true_0       38      18       2       0       0       0       0       0   
true_1        0      26      26       6       1       0       0       0   
true_2        0       1      23      24      10       0       0       0   
true_3        0       0       7      39       8       0       0       0   
true_4        0       0       1       6      53       1       0       0   
true_5        0       0       0       0       0       0       0       0   
true_6        0       0       0       4      11      17      23       0   
true_7        0       0       0       0       0       0       0       0   
true_8        0       0       0       0       0       3       7      19   
true_9   

In [24]:
# -----------------------
# 8) Importanza delle feature (permutation importance)
# -----------------------
print("\n[*] Calcolo permutation importance sul best model (può essere lento)...")
perm = permutation_importance(best_model, X_va, y_va, scoring="neg_mean_absolute_error", n_repeats=5, random_state=42, n_jobs=-1)
importances = pd.Series(perm.importances_mean, index=X_full.columns).sort_values(ascending=False)
topk = 25
print(f"\nTop {topk} feature (permutation importance):")
print(importances.head(topk))


[*] Calcolo permutation importance sul best model (può essere lento)...

Top 25 feature (permutation importance):
ipi_cv               0.212644
ay_kurt              0.206717
az_zcr               0.198699
ipi_std              0.173474
az_band_4000_8000    0.170731
az_kurt              0.149803
ax_kurt              0.145088
ax_band_4000_8000    0.144269
ax_zcr               0.114741
az_band_1000_2000    0.114570
az_p10               0.099358
ay_band_4000_8000    0.096863
az_iqr               0.093848
az_p90               0.086114
corr_yz              0.085158
ay_band_500_1000     0.083366
ay_band_0_200        0.080006
az_band_0_200        0.079397
ax_band_0_200        0.079111
ay_iqr               0.069293
ax_min               0.067775
ax_band_500_1000     0.066621
az_max               0.064131
corr_xy              0.063648
ax_skew              0.062416
dtype: float64


In [None]:
# Salva risultati utili
OUT_DIR = "./phm_outputs"
os.makedirs(OUT_DIR, exist_ok=True)
importances.to_csv(os.path.join(OUT_DIR, "feature_importance_permutation.csv"))
rf_cm.to_csv(os.path.join(OUT_DIR, "rf_confusion_matrix.csv"))
hgb_cm.to_csv(os.path.join(OUT_DIR, "hgb_confusion_matrix.csv"))
pd.DataFrame({
    "rf_pred": rf_y_cont,
    "hgb_pred": hgb_y_cont,
    "y_true": y_va
}).to_csv(os.path.join(OUT_DIR, "validation_predictions.csv"), index=False)

print(f"\n[+] File salvati in: {OUT_DIR}")
print("[+] Done.")

NameError: name 'rf_cm' is not defined

## Testing

In [29]:
def parse_test_dataset(dataset_path):
    """
    Parsa il dataset di vibrazione e crea un DataFrame pandas con una riga per ogni file.
    Ogni riga contiene gli array completi delle serie temporali di vibrazione.

    Args:
        dataset_path (str): Percorso alla cartella principale del dataset

    Returns:
        pd.DataFrame: DataFrame con colonne etichetta, velocità, torque, rep e array di dati di vibrazione (una riga per file)
    """

    data_list = []
    
    # Prima passata: conta il numero totale di file .txt
    print("Conteggio file in corso...")
    total_files = 0
    for root, dirs, files in os.walk(dataset_path):
        total_files += sum(1 for file in files if file.endswith('.txt'))
    
    print(f"Trovati {total_files} file .txt da processare")

    # Seconda passada: processa i file con progress bar
    with tqdm(total=total_files, desc="Parsing dataset", unit="file") as pbar:
        for root, dirs, files in os.walk(dataset_path):
            for file in files:
                if file.endswith('.txt'):

                    # ERRORE CORRETTO: regex pattern per V100_200N_2.txt
                    pattern = re.compile(r'^(\d+)_V(\d+)_(\d+)N(?:_(\d+))?\.txt$', re.IGNORECASE)
                    match = re.search(pattern, file)

                    if match:
                        velocita = int(match.group(2))
                        torque = int(match.group(3))
                        id = int(match.group(1))

                        file_path = os.path.join(root, file)
                        try:
                            # Carica tutti i dati del file una volta sola
                            data = np.loadtxt(file_path)
                            
                            # Calcola informazioni aggiuntive sui dati
                            sampling_rate = 20480  # Hz come specificato nella documentazione
                            duration = len(data) / sampling_rate
                            
                            # Crea un record per l'intero file
                            record = {
                                'file_name': file,
                                'id': id,
                                'velocita': velocita,
                                'torque': torque,
                                'horizontal_acceleration': data[:, 0],  # Array completo
                                'axial_acceleration': data[:, 1],       # Array completo
                                'vertical_acceleration': data[:, 2],    # Array completo
                                'tachometer_signal': data[:, 3],        # Array completo
                                'sampling_rate': sampling_rate,
                                'duration': duration,
                                'num_samples': len(data),
                            }
                            # ERRORE CORRETTO: aggiungi record invece di data
                            data_list.append(record)

                        except Exception as e:
                            tqdm.write(f"Errore nel leggere il file {file_path}: {e}")
                            continue
                    else:
                        tqdm.write(f"Nome file non riconosciuto: {file}")
                    
                    # Aggiorna la progress bar
                    pbar.update(1)

    df = pd.DataFrame(data_list)

    if not df.empty:
        print("\nOrdinamento dataset...")
        df = df.sort_values(['velocita', 'torque']).reset_index(drop=True)
        print(f"Dataset caricato: {len(df)} file processati")
        print(f"Condizioni operative (rpm): {sorted(df['velocita'].unique())}")
        print(f"Condizioni operative (torque): {sorted(df['torque'].unique())}")

    return df


test_df = parse_test_dataset(ROOT_TEST_DATA_FOLDER)

Conteggio file in corso...
Trovati 800 file .txt da processare


Parsing dataset: 100%|██████████| 800/800 [00:31<00:00, 25.16file/s]


Ordinamento dataset...
Dataset caricato: 800 file processati
Condizioni operative (rpm): [np.int64(100), np.int64(200), np.int64(300), np.int64(400), np.int64(500), np.int64(600), np.int64(700), np.int64(800), np.int64(900), np.int64(1000), np.int64(1200), np.int64(1500), np.int64(1800), np.int64(2100), np.int64(2400), np.int64(2700), np.int64(3000), np.int64(3600)]
Condizioni operative (torque): [np.int64(50), np.int64(100), np.int64(200), np.int64(300), np.int64(400), np.int64(500)]





In [23]:
test_df.to_pickle('../data/processed/test_data.pkl')

In [None]:
test_df = pd.read_pickle('../data/processed/test_data.pkl')

In [30]:
test_full = expand_features(test_df, array_cols=array_cols, scalar_cols=scalar_cols)
test_full

  feats[f'band_{lo}_{hi}'] = float(np.trapz(Pxx[mask], f[mask])) if np.any(mask) else 0.0
  total = float(np.trapz(Pxx, f)) if f.size else 1.0


Unnamed: 0,velocita,torque,ax_mean,ax_std,ax_min,ax_max,ax_median,ax_p10,ax_p90,ax_rms,...,rpm,pulse_count,ipi_mean,ipi_std,ipi_cv,v_times_t,v_sq,t_sq,v_over_rpm,t_over_rpm
0,100,50,-0.000006,0.044517,-0.194519,0.133359,0.001613,-0.059671,0.057065,0.044517,...,54.771784,11,1.082231,0.000321,0.000297,5000,10000,2500,1.825758,0.912879
1,100,50,0.000107,0.038580,-0.141051,0.111029,0.001737,-0.052475,0.049498,0.038580,...,54.545455,11,1.082144,0.000185,0.000171,5000,10000,2500,1.833333,0.916667
2,100,50,-0.000111,0.188699,-0.864913,0.400202,0.017988,-0.262501,0.223547,0.188699,...,54.771784,11,1.082217,0.000132,0.000122,5000,10000,2500,1.825758,0.912879
3,100,50,0.000458,0.044034,-0.186951,0.120458,0.002729,-0.059671,0.056073,0.044037,...,54.545455,11,1.082197,0.000199,0.000184,5000,10000,2500,1.833333,0.916667
4,100,50,-0.000101,0.152046,-0.676597,0.352317,0.007319,-0.204195,0.188564,0.152046,...,54.771784,11,1.082236,0.000264,0.000244,5000,10000,2500,1.825758,0.912879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,3600,100,-0.003586,1.274652,-4.568457,5.180049,-0.024873,-1.646833,1.676321,1.274657,...,2006.557377,102,0.030001,0.000025,0.000837,360000,12960000,10000,1.794118,0.049837
796,3600,100,0.003537,1.069601,-5.260684,5.119634,0.013212,-1.353355,1.342935,1.069607,...,2012.903226,104,0.030001,0.000024,0.000805,360000,12960000,10000,1.788462,0.049679
797,3600,100,-0.000924,1.692009,-5.513137,6.033176,-0.045776,-2.176052,2.248959,1.692010,...,1986.885246,101,0.030002,0.000029,0.000958,360000,12960000,10000,1.811881,0.050330
798,3600,100,-0.001434,1.061359,-4.559400,5.011086,-0.003225,-1.355923,1.358069,1.061360,...,1993.548387,103,0.030001,0.000024,0.000804,360000,12960000,10000,1.805825,0.050162


In [31]:
# Definiamo le classi da 0 a 10
CLASSES = np.arange(11)

def reg_to_probs(y_cont, classes=CLASSES, sigma=0.75):
    """
    Converte una previsione continua (da un modello di regressione) 
    in una distribuzione di probabilità su un insieme discreto di classi.

    Funzionamento:
    1. Clippa i valori previsti (`y_cont`) ai limiti min/max delle classi ammesse.
    2. Calcola la distanza di ogni previsione da ciascuna classe.
    3. Applica una funzione gaussiana centrata sul valore previsto 
       (larghezza controllata da `sigma`) per ottenere un peso a ogni classe.
    4. Normalizza i pesi in modo che sommino a 1, ottenendo una distribuzione di probabilità.

    Parametri:
    - y_cont: array di previsioni continue.
    - classes: array di valori discreti delle classi.
    - sigma: deviazione standard della "campana" gaussiana usata per assegnare i pesi.

    Output:
    - Matrice (n_samples x n_classes) con la probabilità di appartenenza a ciascuna classe.
    """
    y_cont = np.clip(y_cont.astype(float), classes.min(), classes.max())
    d = classes[None, :] - y_cont[:, None]
    w = np.exp(-0.5 * (d / sigma)**2)
    return w / w.sum(axis=1, keepdims=True)

# Predizioni continue
y_cont = best_model.predict(test_full).astype(float)
y_cont = np.clip(y_cont, 0, 10)

# Pseudo-probabilità (NUMERICHE)
probs = reg_to_probs(y_cont)
probs_df_num = pd.DataFrame(probs, columns=[f"prob_{c}" for c in CLASSES])

# ---- Confidenza binaria ----
# 1) soglia sulla max-prob
max_prob = probs_df_num.max(axis=1)

# 2) entropia normalizzata in [0,1]
eps = 1e-12
entropy = -(probs_df_num * np.log(probs_df_num + eps)).sum(axis=1) / np.log(len(CLASSES))

# soglie (tunable)
THRESH_MAX = 0.50     # es. almeno 50% sulla classe più probabile
THRESH_ENT = 0.70     # entropia non troppo alta (<= 0.70)

high_conf = ((max_prob >= THRESH_MAX) & (entropy <= THRESH_ENT)).astype(int)

# (Opzionale) criterio alternativo o aggiuntivo: margine top1-top2
# top2 = np.partition(probs, -2, axis=1)[:, -2]
# margin = max_prob - top2
# high_conf = ((max_prob >= THRESH_MAX) & (entropy <= THRESH_ENT) & (margin >= 0.15)).astype(int)

# ---- Formattazione per output umano ----
probs_df_fmt = (probs_df_num).round(2).astype(str)

# Unisci con gli ID e la confidenza
df_with_probs = pd.concat(
    [test_df[["id"]], probs_df_fmt],
    axis=1
)
df_with_probs["high_confidence"] = high_conf  # 0 = bassa, 1 = alta
df_with_probs = df_with_probs.sort_values(['id']).reset_index(drop=True)
print(df_with_probs)


      id prob_0 prob_1 prob_2 prob_3 prob_4 prob_5 prob_6 prob_7 prob_8  \
0      1    0.0    0.0   0.01    0.2   0.53   0.24   0.02    0.0    0.0   
1      2    0.0   0.01   0.15   0.51    0.3   0.03    0.0    0.0    0.0   
2      3    0.0    0.0    0.0    0.0   0.01    0.2   0.53   0.24   0.02   
3      4    0.0   0.03    0.3   0.51   0.15   0.01    0.0    0.0    0.0   
4      5    0.0    0.0    0.0    0.0    0.0   0.05   0.38   0.47    0.1   
..   ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
795  796    0.0    0.0    0.0    0.0    0.0   0.04   0.34   0.49   0.12   
796  797    0.1   0.48   0.37   0.05    0.0    0.0    0.0    0.0    0.0   
797  798   0.02   0.23   0.53    0.2   0.01    0.0    0.0    0.0    0.0   
798  799    0.0    0.0    0.0   0.01   0.18   0.53   0.27   0.02    0.0   
799  800   0.14   0.51   0.32   0.03    0.0    0.0    0.0    0.0    0.0   

    prob_9 prob_10  high_confidence  
0      0.0     0.0                1  
1      0.0     0.0     

