In [16]:
# Converts relative paths to absolute ones
ROOT_TRAIN_DATA_FOLDER ='../data/raw/Data_Challenge_PHM2023_training_data'
ROOT_VAL_DATA_FOLDER = 'data/raw/B - PHM America 2023 - Dataset/Data_Challenge_PHM2023_test_data'

In [7]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns

## Parsing

In [18]:
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy.stats as stats
from scipy.fft import fft, fftfreq

def parse_vibration_dataset(dataset_path):
    """
    Parsa il dataset di vibrazione e crea un DataFrame pandas con una riga per ogni file.
    Ogni riga contiene gli array completi delle serie temporali di vibrazione.

    Args:
        dataset_path (str): Percorso alla cartella principale del dataset

    Returns:
        pd.DataFrame: DataFrame con colonne etichetta, velocità, torque, rep e array di dati di vibrazione (una riga per file)
    """

    data_list = []
    
    # Prima passata: conta il numero totale di file .txt
    print("Conteggio file in corso...")
    total_files = 0
    for root, dirs, files in os.walk(dataset_path):
        total_files += sum(1 for file in files if file.endswith('.txt'))
    
    print(f"Trovati {total_files} file .txt da processare")

    # Seconda passada: processa i file con progress bar
    with tqdm(total=total_files, desc="Parsing dataset", unit="file") as pbar:
        for root, dirs, files in os.walk(dataset_path):
            for file in files:
                if file.endswith('.txt'):
                    folder_name = os.path.basename(root)
                    if 'Pitting_degradation_level_' in folder_name:
                        etichetta_full = folder_name.replace('Pitting_degradation_level_', '')
                        if '(' in etichetta_full:
                            etichetta = etichetta_full.split('(')[0].strip()
                            descrizione = etichetta_full.split('(')[1].replace(')', '').strip()
                        else:
                            etichetta = etichetta_full.strip()
                            descrizione = None
                    else:
                        etichetta = folder_name
                        descrizione = None

                    # ERRORE CORRETTO: regex pattern per V100_200N_2.txt
                    pattern = r'V(\d+)_(\d+)N_(\d+)\.txt'
                    match = re.search(pattern, file)

                    if match:
                        velocita = int(match.group(1))
                        torque = int(match.group(2))
                        rep = int(match.group(3))

                        file_path = os.path.join(root, file)
                        try:
                            # Carica tutti i dati del file una volta sola
                            data = np.loadtxt(file_path)
                            
                            # Calcola informazioni aggiuntive sui dati
                            sampling_rate = 20480  # Hz come specificato nella documentazione
                            duration = len(data) / sampling_rate
                            
                            # Crea un record per l'intero file
                            record = {
                                'file_name': file,
                                'etichetta': etichetta,
                                'health_level': int(etichetta) if etichetta.isdigit() else etichetta,
                                'velocita': velocita,
                                'torque': torque,
                                'rep': rep,
                                'horizontal_acceleration': data[:, 0],  # Array completo
                                'axial_acceleration': data[:, 1],       # Array completo
                                'vertical_acceleration': data[:, 2],    # Array completo
                                'tachometer_signal': data[:, 3],        # Array completo
                                'sampling_rate': sampling_rate,
                                'duration': duration,
                                'num_samples': len(data),
                                'descrizione': descrizione
                            }
                            # ERRORE CORRETTO: aggiungi record invece di data
                            data_list.append(record)

                        except Exception as e:
                            tqdm.write(f"Errore nel leggere il file {file_path}: {e}")
                            continue
                    else:
                        tqdm.write(f"Nome file non riconosciuto: {file}")
                    
                    # Aggiorna la progress bar
                    pbar.update(1)

    df = pd.DataFrame(data_list)

    if not df.empty:
        print("\nOrdinamento dataset...")
        df = df.sort_values(['health_level', 'velocita', 'torque', 'rep']).reset_index(drop=True)
        print(f"Dataset caricato: {len(df)} file processati")
        print(f"Health levels disponibili: {sorted(df['health_level'].unique())}")
        print(f"Condizioni operative (rpm): {sorted(df['velocita'].unique())}")
        print(f"Condizioni operative (torque): {sorted(df['torque'].unique())}")

    return df


def extract_signal_features(df, signal_column):
    """
    Estrae features statistiche e nel dominio della frequenza da una colonna di segnali.
    
    Args:
        df (pd.DataFrame): DataFrame con i dati
        signal_column (str): Nome della colonna contenente gli array dei segnali
    
    Returns:
        pd.DataFrame: DataFrame con features estratte
    """
    
    features_list = []
    
    # Progress bar per l'estrazione delle features
    with tqdm(total=len(df), desc=f"Estraendo features da {signal_column}", unit="segnale") as pbar:
        for idx, row in df.iterrows():
            signal = row[signal_column]
            
            # Features statistiche nel dominio del tempo
            features = {
                f'{signal_column}_mean': np.mean(signal),
                f'{signal_column}_std': np.std(signal),
                f'{signal_column}_rms': np.sqrt(np.mean(signal**2)),
                f'{signal_column}_peak': np.max(np.abs(signal)),
                f'{signal_column}_peak_to_peak': np.ptp(signal),
                f'{signal_column}_skewness': stats.skew(signal),
                f'{signal_column}_kurtosis': stats.kurtosis(signal),
                f'{signal_column}_crest_factor': np.max(np.abs(signal)) / np.sqrt(np.mean(signal**2)),
            }
            
            # Features nel dominio della frequenza
            sampling_rate = row['sampling_rate']
            fft_vals = np.abs(fft(signal))
            freqs = fftfreq(len(signal), 1/sampling_rate)
            
            # Considera solo le frequenze positive
            positive_freq_idx = freqs > 0
            fft_positive = fft_vals[positive_freq_idx]
            freqs_positive = freqs[positive_freq_idx]
            
            # Trova la frequenza dominante
            dominant_freq_idx = np.argmax(fft_positive)
            features[f'{signal_column}_dominant_freq'] = freqs_positive[dominant_freq_idx]
            features[f'{signal_column}_dominant_magnitude'] = fft_positive[dominant_freq_idx]
            
            # Features spettrali
            total_power = np.sum(fft_positive**2)
            features[f'{signal_column}_spectral_centroid'] = np.sum(freqs_positive * fft_positive**2) / total_power
            features[f'{signal_column}_spectral_rolloff'] = freqs_positive[np.where(np.cumsum(fft_positive**2) >= 0.85 * total_power)[0][0]]
            
            features_list.append(features)
            pbar.update(1)
    
    return pd.DataFrame(features_list)


def extract_all_features(df):
    """
    Estrae tutte le features da tutti i segnali di accelerazione con progress bar complessiva.
    
    Args:
        df (pd.DataFrame): DataFrame con i dati grezzi
    
    Returns:
        pd.DataFrame: DataFrame completo con tutte le features
    """
    print("Iniziando estrazione features da tutti i segnali...")
    
    # Estrai features da tutti i segnali di accelerazione
    horizontal_features = extract_signal_features(df, 'horizontal_acceleration')
    axial_features = extract_signal_features(df, 'axial_acceleration')
    vertical_features = extract_signal_features(df, 'vertical_acceleration')

    print("Combinando tutte le features...")
    # Combina tutte le features
    features_df = pd.concat([
        df[['file_name', 'health_level', 'velocita', 'torque', 'rep']],
        horizontal_features,
        axial_features, 
        vertical_features
    ], axis=1)

    print(f"Features dataset completato! Shape: {features_df.shape}")
    return features_df


# Esempio di utilizzo:
# Carica il dataset
print("=== CARICAMENTO DATASET ===")
df = parse_vibration_dataset(ROOT_TRAIN_DATA_FOLDER)

df.head()

=== CARICAMENTO DATASET ===
Conteggio file in corso...
Trovati 2016 file .txt da processare


Parsing dataset: 100%|██████████| 2016/2016 [09:22<00:00,  3.58file/s]


Ordinamento dataset...
Dataset caricato: 2016 file processati
Health levels disponibili: [0, 1, 2, 3, 4, 6, 8]
Condizioni operative (rpm): [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200, 2100, 2700, 3000, 3600]
Condizioni operative (torque): [50, 100, 200, 300, 400, 500]





Unnamed: 0,file_name,etichetta,health_level,velocita,torque,rep,horizontal_acceleration,axial_acceleration,vertical_acceleration,tachometer_signal,sampling_rate,duration,num_samples,descrizione
0,V100_50N_1.txt,0,0,100,50,1,"[-0.4626018245, -0.7124489885, -0.5509291238, ...","[-0.4991110174, -0.7377190438, -0.5699753646, ...","[-0.4254987892, -0.814484938, -0.5429899587, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,Healthy
1,V100_50N_2.txt,0,0,100,50,2,"[0.08373725705, -0.01240551956, -0.05421212049...","[0.04308363971, -0.05226539899, -0.09993991833...","[0.0949569657, -0.06374649093, -0.1336386738, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,Healthy
2,V100_50N_3.txt,0,0,100,50,3,"[-0.06376437055, 0.09676305259, -0.04503203601...","[-0.07804495389, 0.07745637958, -0.06497860415...","[-0.0985720786, 0.153401291, -0.08254507805, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,Healthy
3,V100_50N_4.txt,0,0,100,50,4,"[0.07418500699, -0.03634817232, -0.08572214018...","[0.06933405406, -0.03107672372, -0.07251235535...","[0.1519552459, -0.01518347421, -0.08507565708,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,Healthy
4,V100_50N_5.txt,0,0,100,50,5,"[-0.02816052941, 0.04416364964, 0.05731350038,...","[-0.06439002983, 0.02001152664, 0.03849276007,...","[-0.07965298772, 0.031089971, 0.06398749846, -...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",20480,12.05,246784,Healthy


In [21]:
df.to_pickle('../data/processed/train_data_walt.pkl')

In [22]:
print("\n=== ESTRAZIONE FEATURES ===")
# Estrai tutte le features
features_df = extract_all_features(df)

print(f"\n=== COMPLETATO ===")
print(f"Dataset finale: {features_df.shape[0]} campioni, {features_df.shape[1]} features")
print(f"Colonne delle features: {[col for col in features_df.columns if col not in ['file_name', 'health_level', 'velocita', 'torque', 'rep']]}")


=== ESTRAZIONE FEATURES ===
Iniziando estrazione features da tutti i segnali...


Estraendo features da horizontal_acceleration: 100%|██████████| 2016/2016 [01:04<00:00, 31.24segnale/s]
Estraendo features da axial_acceleration: 100%|██████████| 2016/2016 [01:04<00:00, 31.06segnale/s]
Estraendo features da vertical_acceleration: 100%|██████████| 2016/2016 [01:01<00:00, 32.53segnale/s]


Combinando tutte le features...
Features dataset completato! Shape: (2016, 41)

=== COMPLETATO ===
Dataset finale: 2016 campioni, 41 features
Colonne delle features: ['horizontal_acceleration_mean', 'horizontal_acceleration_std', 'horizontal_acceleration_rms', 'horizontal_acceleration_peak', 'horizontal_acceleration_peak_to_peak', 'horizontal_acceleration_skewness', 'horizontal_acceleration_kurtosis', 'horizontal_acceleration_crest_factor', 'horizontal_acceleration_dominant_freq', 'horizontal_acceleration_dominant_magnitude', 'horizontal_acceleration_spectral_centroid', 'horizontal_acceleration_spectral_rolloff', 'axial_acceleration_mean', 'axial_acceleration_std', 'axial_acceleration_rms', 'axial_acceleration_peak', 'axial_acceleration_peak_to_peak', 'axial_acceleration_skewness', 'axial_acceleration_kurtosis', 'axial_acceleration_crest_factor', 'axial_acceleration_dominant_freq', 'axial_acceleration_dominant_magnitude', 'axial_acceleration_spectral_centroid', 'axial_acceleration_spec

## Preprocessing

In [26]:
from scipy.signal import butter, filtfilt, resample
import numpy as np

def butter_lowpass_filter(signal, cutoff, fs, order=5):
    """Applica un filtro passa basso Butterworth."""
    nyq = 0.5 * fs
    normal_cutoff = cutoff / nyq
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return filtfilt(b, a, signal)

def preprocess_and_resample(signal, fs=20480, target_length=61440, cutoff=5000, filter_order=5):
    """
    Filtra e fa il resample di un segnale ad una lunghezza fissa.
    """
    filtered = butter_lowpass_filter(signal, cutoff, fs, order=filter_order)
    return resample(filtered, target_length)


In [27]:
def preprocess_vibration_dataframe(df, target_length=61440, cutoff=5000):
    """
    Applica filtraggio e resample a tutti i segnali accelerometrici e tachimetro nel DataFrame.
    Ritorna un nuovo DataFrame con colonne preprocessate.
    """
    processed_records = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Preprocessing"):
        try:
            record = {
                'file_name': row['file_name'],
                'etichetta': row['etichetta'],
                'health_level': row['health_level'],
                'velocita': row['velocita'],
                'torque': row['torque'],
                'rep': row['rep'],
                'sampling_rate': row['sampling_rate'],
                'descrizione': row['descrizione'],
                'duration': row['duration'],
                'num_samples': row['num_samples']
            }

            # Applica filtro + resample ai 4 segnali
            record['horizontal_acceleration'] = preprocess_and_resample(
                row['horizontal_acceleration'], fs=row['sampling_rate'], target_length=target_length, cutoff=cutoff
            )
            record['axial_acceleration'] = preprocess_and_resample(
                row['axial_acceleration'], fs=row['sampling_rate'], target_length=target_length, cutoff=cutoff
            )
            record['vertical_acceleration'] = preprocess_and_resample(
                row['vertical_acceleration'], fs=row['sampling_rate'], target_length=target_length, cutoff=cutoff
            )
            record['tachometer_signal'] = preprocess_and_resample(
                row['tachometer_signal'], fs=row['sampling_rate'], target_length=target_length, cutoff=cutoff
            )

            processed_records.append(record)

        except Exception as e:
            tqdm.write(f"Errore durante il preprocessing del file {row['file_name']}: {e}")
            continue

    return pd.DataFrame(processed_records)


In [28]:
df = preprocess_vibration_dataframe(df)
df.head

Preprocessing:   1%|          | 16/2016 [00:20<42:49,  1.28s/it]


KeyboardInterrupt: 

## Train

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class GearboxFaultDetector(nn.Module):
    """
    Modello multi-input per la diagnosi di fault nei gearbox.
    Gestisce segnali di vibrazione multi-assiali e condizioni operative variabili.
    """
    
    def __init__(self, input_length=61440, num_classes=11, confidence_output=True):
        super(GearboxFaultDetector, self).__init__()
        
        self.input_length = input_length
        self.num_classes = num_classes
        self.confidence_output = confidence_output
        
        # Encoder CNN per ogni asse di vibrazione
        self.horizontal_encoder = self._make_cnn_encoder()
        self.axial_encoder = self._make_cnn_encoder()
        self.vertical_encoder = self._make_cnn_encoder()
        
        # Encoder per il segnale tachometro
        self.tacho_encoder = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=64, stride=8, padding=28),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(512)
        )
        
        # Encoder per condizioni operative (rpm, torque)
        self.operational_encoder = nn.Sequential(
            nn.Linear(2, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU()
        )
        
        # Fusion layer
        fusion_input_dim = 512 * 3 + 512 + 128  # 3 assi + tacho + operational
        self.fusion = nn.Sequential(
            nn.Linear(fusion_input_dim, 1024),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.Dropout(0.3),
            nn.ReLU()
        )
        
        # Output heads
        self.health_classifier = nn.Linear(512, num_classes)
        
        if confidence_output:
            self.confidence_head = nn.Sequential(
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Linear(256, 1),
                nn.Sigmoid()  # Output tra 0 e 1
            )
    
    def _make_cnn_encoder(self):
        """Crea un encoder CNN per i segnali di vibrazione"""
        return nn.Sequential(
            # Block 1
            nn.Conv1d(1, 64, kernel_size=64, stride=4, padding=30),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(4),
            
            # Block 2
            nn.Conv1d(64, 128, kernel_size=32, stride=2, padding=15),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(4),
            
            # Block 3
            nn.Conv1d(128, 256, kernel_size=16, stride=2, padding=7),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.MaxPool1d(4),
            
            # Block 4
            nn.Conv1d(256, 512, kernel_size=8, stride=2, padding=3),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(512)
        )
    
    def forward(self, horizontal, axial, vertical, tachometer, rpm, torque):
        """
        Forward pass del modello
        
        Args:
            horizontal, axial, vertical: tensori (batch_size, input_length)
            tachometer: tensore (batch_size, input_length)
            rpm, torque: tensori (batch_size, 1)
        """
        batch_size = horizontal.size(0)
        
        # Reshape per CNN (add channel dimension)
        horizontal = horizontal.unsqueeze(1)  # (batch_size, 1, input_length)
        axial = axial.unsqueeze(1)
        vertical = vertical.unsqueeze(1)
        tachometer = tachometer.unsqueeze(1)
        
        # Encode vibration signals
        h_feat = self.horizontal_encoder(horizontal)  # (batch_size, 512, 512)
        a_feat = self.axial_encoder(axial)
        v_feat = self.vertical_encoder(vertical)
        
        # Encode tachometer
        t_feat = self.tacho_encoder(tachometer)
        
        # Flatten spatial dimensions
        h_feat = h_feat.view(batch_size, -1)  # (batch_size, 512*512)
        a_feat = a_feat.view(batch_size, -1)
        v_feat = v_feat.view(batch_size, -1)
        t_feat = t_feat.view(batch_size, -1)
        
        # Encode operational conditions
        operational_conditions = torch.cat([rpm, torque], dim=1)  # (batch_size, 2)
        op_feat = self.operational_encoder(operational_conditions)
        
        # Fusion
        combined_features = torch.cat([h_feat, a_feat, v_feat, t_feat, op_feat], dim=1)
        fused_features = self.fusion(combined_features)
        
        # Output predictions
        health_logits = self.health_classifier(fused_features)
        health_probs = F.softmax(health_logits, dim=1)
        
        if self.confidence_output:
            confidence = self.confidence_head(fused_features)
            return health_probs, confidence
        else:
            return health_probs

class DataPreprocessor:
    """Classe per preprocessare i dati del dataset gearbox"""
    
    def __init__(self, target_length=61440, overlap=0.5):
        self.target_length = target_length
        self.overlap = overlap
    
    def extract_segments(self, signal):
        """Estrae segmenti di lunghezza fissa da segnale variabile"""
        if len(signal) < self.target_length:
            # Padding per segnali troppo corti
            padding = self.target_length - len(signal)
            signal = np.pad(signal, (0, padding), mode='constant')
            return [signal]
        
        segments = []
        step = int(self.target_length * (1 - self.overlap))
        
        for i in range(0, len(signal) - self.target_length + 1, step):
            segments.append(signal[i:i + self.target_length])
        
        return segments
    
    def normalize_signal(self, signal, method='standardize'):
        """Normalizza il segnale"""
        if method == 'standardize':
            return (signal - np.mean(signal)) / (np.std(signal) + 1e-8)
        elif method == 'minmax':
            min_val, max_val = np.min(signal), np.max(signal)
            return (signal - min_val) / (max_val - min_val + 1e-8)
        return signal
    
    def prepare_dataset(self, df):
        """Prepara il dataset per il training"""
        X_horizontal, X_axial, X_vertical, X_tacho = [], [], [], []
        X_rpm, X_torque = [], []
        y_labels = []
        
        for _, row in df.iterrows():
            # Estrai segmenti da ogni segnale
            h_segments = self.extract_segments(row['horizontal_acceleration'])
            a_segments = self.extract_segments(row['axial_acceleration'])
            v_segments = self.extract_segments(row['vertical_acceleration'])
            t_segments = self.extract_segments(row['tachometer_signal'])
            
            # Assicurati che tutti abbiano lo stesso numero di segmenti
            min_segments = min(len(h_segments), len(a_segments), 
                             len(v_segments), len(t_segments))
            
            for i in range(min_segments):
                X_horizontal.append(self.normalize_signal(h_segments[i]))
                X_axial.append(self.normalize_signal(a_segments[i]))
                X_vertical.append(self.normalize_signal(v_segments[i]))
                X_tacho.append(self.normalize_signal(t_segments[i]))
                
                X_rpm.append(row['velocita'])
                X_torque.append(row['torque'])
                y_labels.append(row['health_level'])
        
        return {
            'horizontal': np.array(X_horizontal),
            'axial': np.array(X_axial),
            'vertical': np.array(X_vertical),
            'tachometer': np.array(X_tacho),
            'rpm': np.array(X_rpm).reshape(-1, 1),
            'torque': np.array(X_torque).reshape(-1, 1),
            'labels': np.array(y_labels)
        }

# Esempio di utilizzo
def train_model(df_train):
    """Esempio di training del modello"""
    
    # Preprocessa i dati
    preprocessor = DataPreprocessor(target_length=61440, overlap=0.5)
    data = preprocessor.prepare_dataset(df_train)
    
    # Inizializza il modello
    model = GearboxFaultDetector(
        input_length=61440, 
        num_classes=11, 
        confidence_output=True
    )
    
    # Setup training
    criterion_class = nn.CrossEntropyLoss()
    criterion_conf = nn.MSELoss()  # Per la confidenza
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    
    # Convert to tensors
    h_tensor = torch.FloatTensor(data['horizontal'])
    a_tensor = torch.FloatTensor(data['axial'])
    v_tensor = torch.FloatTensor(data['vertical'])
    t_tensor = torch.FloatTensor(data['tachometer'])
    rpm_tensor = torch.FloatTensor(data['rpm'])
    torque_tensor = torch.FloatTensor(data['torque'])
    labels_tensor = torch.LongTensor(data['labels'])
    
    print(f"Dataset preparato: {len(data['labels'])} campioni")
    print(f"Forme tensori - H:{h_tensor.shape}, A:{a_tensor.shape}, V:{v_tensor.shape}")
    
    return model, (h_tensor, a_tensor, v_tensor, t_tensor, rpm_tensor, torque_tensor, labels_tensor)