# **Pirate Pain Challenge - Hyperparameters Tuning**

## üåê **Google Drive Connection or local mount**

In [55]:
import os

isColab = False
isKaggle = False

# Directory di default
current_dir = os.getcwd()

try:
    if not isColab:
        raise ImportError("We are not in google colab")
    from google.colab import drive

    drive.mount("/gdrive")
    current_dir = "/gdrive/My\\ Drive/[2025-2026]\\ AN2DL/AN2DL-challenge-1/"
    print("In esecuzione su Colab. Google Drive montato.")
    %cd $current_dir
    isColab = True

except ImportError:
    # Rilevamento ambiente Kaggle
    if os.environ.get("KAGGLE_KERNEL_RUN_TYPE") or os.path.exists("/kaggle/working") or isKaggle:
        isKaggle = True
        kaggle_work_dir = "/kaggle/working/AN2DL-challenge-1"
        os.makedirs(kaggle_work_dir, exist_ok=True)
        current_dir = kaggle_work_dir
        print("In esecuzione su Kaggle. Directory di lavoro impostata.")
    else:
        isColab = False
        isKaggle = False
        print("Esecuzione locale. Salto mount Google Drive.")
        local_pref = r"G:\Il mio Drive\Colab Notebooks\[2025-2026] AN2DL\AN2DL-challenge-1"
        current_dir = local_pref if os.path.isdir(local_pref) else os.getcwd()
        print(f"Directory corrente impostata a: {current_dir}")

# Cambio directory se non Colab (su Colab √® gi√† fatto con %cd)
if not isColab:
    os.chdir(current_dir)

print(f"Changed directory to: {current_dir}")

Esecuzione locale. Salto mount Google Drive.
Directory corrente impostata a: G:\Il mio Drive\Colab Notebooks\[2025-2026] AN2DL\AN2DL-challenge-1
Changed directory to: G:\Il mio Drive\Colab Notebooks\[2025-2026] AN2DL\AN2DL-challenge-1


## ‚öôÔ∏è **Libraries Import**

In [56]:
# Set seed for reproducibility
SEED = 42

# Import necessary libraries
import os

# Set environment variables before importing modules
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import random
import numpy as np

# Set seeds for random number generators in NumPy and Python
np.random.seed(SEED)
random.seed(SEED)

# Import PyTorch
import torch

torch.manual_seed(SEED)
# from torchsummary import summary

logs_dir = "tensorboard"
if isColab:
    !pkill -f tensorboard
else:
    # Arresta eventuali processi tensorboard in locale (Windows)
    import os

    if os.name == 'nt':
        try:
            import psutil

            for proc in psutil.process_iter(['name', 'cmdline']):
                name = (proc.info.get('name') or '').lower()
                cmd = ' '.join(proc.info.get('cmdline') or []).lower()
                if 'tensorboard' in name or 'tensorboard' in cmd:
                    try:
                        proc.kill()
                    except Exception:
                        pass
        except ImportError:
            import subprocess

            subprocess.run(['taskkill', '/F', '/IM', 'tensorboard.exe'],
                           stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

%load_ext tensorboard
if isColab:
    !mkdir -p models
else:
    os.makedirs("../models", exist_ok=True)

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Import other libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plot display settings
sns.set(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
PyTorch version: 2.9.0+cu130
Device: cuda


## ‚è≥ **Data Downloading**

In [57]:
import os
import zipfile

# --- 1. Impostazioni ---
competition_name = 'an2dl2526c1'
dataset_path = 'dataset'
if isKaggle:
    dataset_path = '/kaggle/input/pirate-pain/dataset'
train_file = 'pirate_pain_train.csv'
test_file = 'pirate_pain_test.csv'
labels_file = 'pirate_pain_train_labels.csv'
sample_submission_file = 'sample_submission.csv'

# Controlla se il dataset √® gi√† stato scaricato ed estratto
if not isKaggle and not isColab and not os.path.exists(os.path.join(dataset_path, train_file)):
    # --- 2. Autenticazione e Download ---
    from kaggle.api.kaggle_api_extended import KaggleApi

    # Inizializza l'API di Kaggle
    # L'autenticazione avviene automaticamente se 'kaggle.json' √® in C:\\Users\\Bert0ns\\.kaggle\\
    api = KaggleApi()
    api.authenticate()

    print(f"Download del dataset dalla competizione '{competition_name}'...")

    # Crea la directory di destinazione se non esiste
    os.makedirs(dataset_path, exist_ok=True)

    # Scarica i file della competizione nella cartella 'dataset'
    api.competition_download_files(competition_name, path=dataset_path)

    # Estrai i file dall'archivio zip
    zip_path = os.path.join(dataset_path, f'{competition_name}.zip')
    if os.path.exists(zip_path):
        print(f"Estrazione dei file da '{zip_path}'...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(dataset_path)
        # Rimuovi il file zip dopo l'estrazione
        os.remove(zip_path)
        print("Estrazione completata e file zip rimosso.")
    else:
        print("ATTENZIONE: File zip non trovato. Il download potrebbe non essere riuscito.")
else:
    print(f"Il dataset √® gi√† presente nella cartella {dataset_path}. Download saltato.")


Il dataset √® gi√† presente nella cartella dataset. Download saltato.


## üîé **Exploration and Data Analysis**

In [58]:
import pandas as pd

# Load the datasets
if not isColab:
    dataset_df = pd.read_csv(os.path.join(dataset_path, train_file))
    kaggle_test_df = pd.read_csv(os.path.join(dataset_path, test_file))
    labels_df = pd.read_csv(os.path.join(dataset_path, labels_file))
else:
    dataset_df = pd.read_csv(dataset_path + '/pirate_pain_train.csv')
    kaggle_test_df = pd.read_csv(dataset_path + '/pirate_pain_test.csv')
    labels_df = pd.read_csv(dataset_path + '/pirate_pain_train_labels.csv')

**Convert data to a memory efficient form**

In [59]:

text_map = {
    'two': 0,
    'one+peg_leg': 1, 'one+hook_hand': 2, 'one+eye_patch': 3,
}

# Pulisce, normalizza, mappa; fallback a numerico e a cifre estratte
columns_to_convert = ['n_legs', 'n_hands', 'n_eyes']
for col in columns_to_convert:
    dataset_df[col] = dataset_df[col].str.strip().str.lower().map(text_map).astype('int8')
    kaggle_test_df[col] = kaggle_test_df[col].str.strip().str.lower().map(text_map).astype('int8')

# train_df.head(105760)

In [60]:
# Convert data types from float64 to float32 to save memory
dataset_df[dataset_df.select_dtypes(include=['float64']).columns] = dataset_df.select_dtypes(
    include=['float64']).astype(
    'float32')
kaggle_test_df[kaggle_test_df.select_dtypes(include=['float64']).columns] = kaggle_test_df.select_dtypes(
    include=['float64']).astype(
    'float32')

# Convert int64 to int32
dataset_df[dataset_df.select_dtypes(include=['int64']).columns] = dataset_df.select_dtypes(include=['int64']).astype(
    'int32')
kaggle_test_df[kaggle_test_df.select_dtypes(include=['int64']).columns] = kaggle_test_df.select_dtypes(
    include=['int64']).astype('int32')
labels_df[labels_df.select_dtypes(include=['int64']).columns] = labels_df.select_dtypes(include=['int64']).astype(
    'int32')

# Convert pain surveys to int8
dataset_df['pain_survey_1'] = dataset_df['pain_survey_1'].astype('int8')
dataset_df['pain_survey_2'] = dataset_df['pain_survey_2'].astype('int8')
dataset_df['pain_survey_3'] = dataset_df['pain_survey_3'].astype('int8')
dataset_df['pain_survey_4'] = dataset_df['pain_survey_4'].astype('int8')

kaggle_test_df['pain_survey_1'] = kaggle_test_df['pain_survey_1'].astype('int8')
kaggle_test_df['pain_survey_2'] = kaggle_test_df['pain_survey_2'].astype('int8')
kaggle_test_df['pain_survey_3'] = kaggle_test_df['pain_survey_3'].astype('int8')
kaggle_test_df['pain_survey_4'] = kaggle_test_df['pain_survey_4'].astype('int8')

# Convert labels sample_index to int8
label_map = {'low_pain': 1, 'no_pain': 0, 'high_pain': 2}

labels_df['label'] = labels_df['label'].str.strip().str.lower().map(label_map).astype('int8')

### **Prepare the Time feature**

In [61]:
# --- Time features: normalized_time + sin/cos ---
TIME_FEATURES = ['time_norm', 'time_sin', 'time_cos']
# Vengono aggiunte tre nuove colonne continue: time_norm, time_sin, time_cos
# Se la sequenza per un sample ha lunghezza variabile, normalizziamo dividendo per il max time per sample.
for _df in [dataset_df, kaggle_test_df]:
    if 'time' in _df.columns and 'time_norm' not in _df.columns:
        max_time = _df.groupby('sample_index')['time'].transform('max').replace(0, 1)
        _df['time_norm'] = (_df['time'] / max_time).astype('float32')
        _df['time_sin'] = np.sin(2 * np.pi * _df['time_norm']).astype('float32')
        _df['time_cos'] = np.cos(2 * np.pi * _df['time_norm']).astype('float32')

## üîÑ **Data Preprocessing**

In [62]:
TEST_SET_PERCENTAGE = 0.2

JOINT_COLUMNS = [f'joint_{i:02d}' for i in range(31)]

CONTINUOUS_COLS = JOINT_COLUMNS + TIME_FEATURES
CATEGORICAL_COLS = ['n_legs', 'n_hands', 'n_eyes', 'pain_survey_1', 'pain_survey_2', 'pain_survey_3', 'pain_survey_4']

#COLUMNS_TO_REMOVE = [f'joint_{i:02d}' for i in range(13, 26)] + ['joint_30']
COLUMNS_TO_REMOVE = ['joint_30']

COLS_TO_EXCLUDE_FROM_NORMALIZATION = TIME_FEATURES

In [63]:
num_classes = len(labels_df['label'].unique())
unique_samples = dataset_df['sample_index'].unique()

#### Remove useless features

In [64]:
# @title Remove feature from joint_13 to joint_25 + joint_30
df_dataset_reduced = dataset_df.drop(columns=COLUMNS_TO_REMOVE, inplace=False)
kaggle_test_df_reduced = kaggle_test_df.drop(columns=COLUMNS_TO_REMOVE, inplace=False)

In [65]:
# Rimuoviamo le colonne eliminate anche dalle nostre liste di colonne
CONTINUOUS_COLS_REDUCED = [col for col in CONTINUOUS_COLS if col not in COLUMNS_TO_REMOVE]
CATEGORICAL_COLS_REDUCED = [col for col in CATEGORICAL_COLS if col not in COLUMNS_TO_REMOVE]

#### Build sequences with sliding window

üéØ **Adaptive Padding Strategy**

Invece di usare padding con zeri (che introduce rumore), usiamo **padding adattivo**:
- **Continuous features**: Usa la media dell'ultimo N timesteps della sequenza
- **Categorical features**: Usa la moda (valore pi√π frequente) dell'ultimo N timesteps
- **Fallback**: Se necessario, usa le statistiche globali del dataset

Questo approccio riduce il rumore e migliora la qualit√† delle predizioni.

In [66]:
def build_sequences(df, label_df, continuous_cols, categorical_cols, window=200, stride=200,
                    padding_strategy='adaptive', lookback_steps=10):
    """
    Build sequences from time series data with intelligent padding.

    Args:
        df: DataFrame with time series data
        label_df: DataFrame with labels
        continuous_cols: List of continuous feature columns
        categorical_cols: List of categorical feature columns
        window: Window size for sequences
        stride: Stride for sliding window
        padding_strategy: 'adaptive' (mean/mode), 'repeat' (repeat last), or 'zero' (zeros)
        lookback_steps: Number of timesteps to use for computing padding statistics

    Returns:
        dataset_continuous, dataset_categorical, labels
    """
    assert window % stride == 0, "Window must be divisible by stride"

    dataset_continuous = []
    dataset_categorical = []
    labels = []

    # Pre-compute global statistics for fallback (only if adaptive)
    if padding_strategy == 'adaptive':
        global_cont_mean = df[continuous_cols].mean().values.astype('float32')
        global_cat_mode = df[categorical_cols].mode().iloc[0].values.astype('int8')

    for sample_id in df['sample_index'].unique():
        # Extract data for current sample
        temp_continuous = df[df['sample_index'] == sample_id][continuous_cols].values
        temp_categorical = df[df['sample_index'] == sample_id][categorical_cols].values

        label = label_df[label_df['sample_index'] == sample_id]['label'].values[0]

        # Calculate padding length
        padding_len = (window - len(temp_continuous) % window) % window

        if padding_strategy == 'adaptive':
            # Adaptive padding: use statistics from last N timesteps
            lookback = min(lookback_steps, len(temp_continuous))

            if lookback > 0:
                # Use mean of last timesteps for continuous
                last_cont_values = temp_continuous[-lookback:]
                pad_cont_value = np.mean(last_cont_values, axis=0, keepdims=True)

                # Use mode of last timesteps for categorical
                last_cat_values = temp_categorical[-lookback:]
                pad_cat_value = np.array([
                    np.bincount(last_cat_values[:, i]).argmax()
                    for i in range(last_cat_values.shape[1])
                ]).reshape(1, -1)
            else:
                # Fallback to global statistics
                pad_cont_value = global_cont_mean.reshape(1, -1)
                pad_cat_value = global_cat_mode.reshape(1, -1)

            # Create padding by repeating the computed values
            padding_cont = np.repeat(pad_cont_value, padding_len, axis=0).astype('float32')
            padding_cat = np.repeat(pad_cat_value, padding_len, axis=0).astype('int8')
        elif padding_strategy == 'repeat':
            # Repeat last timestep
            if len(temp_continuous) > 0:
                padding_cont = np.repeat(temp_continuous[-1:], padding_len, axis=0)
                padding_cat = np.repeat(temp_categorical[-1:], padding_len, axis=0)
            else:
                # Fallback to zeros if no data
                padding_cont = np.zeros((padding_len, temp_continuous.shape[1]), dtype='float32')
                padding_cat = np.zeros((padding_len, temp_categorical.shape[1]), dtype='int8')
        else:  # 'zero' or default
            # Original zero padding
            padding_cont = np.zeros((padding_len, temp_continuous.shape[1]), dtype='float32')
            padding_cat = np.zeros((padding_len, temp_categorical.shape[1]), dtype='int8')

        temp_continuous = np.concatenate((temp_continuous, padding_cont))
        temp_categorical = np.concatenate((temp_categorical, padding_cat))

        # Build windows with sliding stride
        idx = 0
        while idx + window <= len(temp_continuous):
            dataset_continuous.append(temp_continuous[idx:idx + window])
            dataset_categorical.append(temp_categorical[idx:idx + window])
            labels.append(label)
            idx += stride

    # Convert to numpy arrays
    dataset_continuous = np.array(dataset_continuous, dtype='float32')
    dataset_categorical = np.array(dataset_categorical, dtype='int8')
    labels = np.array(labels, dtype='int64')

    return dataset_continuous, dataset_categorical, labels

In [67]:
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset


def make_loader(ds, batch_size, shuffle, drop_last):
    # Determine optimal number of worker processes for data loading
    cpu_cores = os.cpu_count() or 2
    num_workers = max(2, min(4, cpu_cores))

    # Create DataLoader with performance optimizations
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        pin_memory=True,  # Faster GPU transfer
        pin_memory_device="cuda" if torch.cuda.is_available() else "",
        prefetch_factor=4,  # Load 4 batches ahead
        persistent_workers=True if num_workers > 0 else False,  # Mantiene i worker attivi
    )

## üõ†Ô∏è **Model Building**

In [68]:
from torch import nn


def recurrent_summary(model, input_specs):
    """
    Custom summary function that emulates torchinfo's output while correctly
    counting parameters for RNN/GRU/LSTM layers. It supports models with multiple inputs.

    This function is designed for models whose direct children are
    nn.Linear, nn.RNN, nn.GRU, or nn.LSTM layers.

    Args:
        model (nn.Module): The model to analyze.
        input_specs (list of tuples): A list where each tuple contains the shape
                                     and dtype of an input tensor.
                                     Example: [((seq_len, features_cont), torch.float32),
                                               ((seq_len, features_cat), torch.long)]
    """

    # Dictionary to store output shapes captured by forward hooks
    output_shapes = {}
    # List to track hook handles for later removal
    hooks = []

    def get_hook(name):
        """Factory function to create a forward hook for a specific module."""

        def hook(module, input, output):
            # Handle RNN layer outputs (returns a tuple)
            if isinstance(output, tuple):
                # output[0]: all hidden states with shape (batch, seq_len, hidden*directions)
                shape1 = list(output[0].shape)
                shape1[0] = -1  # Replace batch dimension with -1

                # output[1]: final hidden state h_n (or tuple (h_n, c_n) for LSTM)
                if isinstance(output[1], tuple):  # LSTM case: (h_n, c_n)
                    shape2 = list(output[1][0].shape)  # Extract h_n only
                else:  # RNN/GRU case: h_n only
                    shape2 = list(output[1].shape)

                # Replace batch dimension (middle position) with -1
                shape2[1] = -1

                output_shapes[name] = f"[{shape1}, {shape2}]"

            # Handle standard layer outputs (e.g., Linear)
            else:
                shape = list(output.shape)
                shape[0] = -1  # Replace batch dimension with -1
                output_shapes[name] = f"{shape}"

        return hook

    # 1. Determine the device where model parameters reside
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")  # Fallback for models without parameters

    # 2. Create dummy input tensors with batch_size=1
    dummy_inputs = []
    for shape, dtype in input_specs:
        if dtype in [torch.long, torch.int, torch.int8, torch.int16, torch.int32, torch.int64]:
            dummy_inputs.append(torch.zeros(1, *shape, dtype=dtype).to(device))
        else:
            dummy_inputs.append(torch.randn(1, *shape, dtype=dtype).to(device))

    # 3. Register forward hooks on target layers
    # Iterate through direct children of the model (e.g., self.rnn, self.classifier)
    for name, module in model.named_children():
        if isinstance(module, (nn.Linear, nn.RNN, nn.GRU, nn.LSTM, nn.ModuleList)):
            # Register the hook and store its handle for cleanup
            hook_handle = module.register_forward_hook(get_hook(name))
            hooks.append(hook_handle)

    # 4. Execute a dummy forward pass in evaluation mode
    model.eval()
    with torch.no_grad():
        try:
            model(*dummy_inputs)
        except Exception as e:
            print(f"Error during dummy forward pass: {e}")
            # Clean up hooks even if an error occurs
            for h in hooks:
                h.remove()
            return

    # 5. Remove all registered hooks
    for h in hooks:
        h.remove()

    # --- 6. Print the summary table ---

    print("-" * 79)
    # Column headers
    print(f"{'Layer (type)':<25} {'Output Shape':<28} {'Param #':<18}")
    print("=" * 79)

    total_params = 0
    total_trainable_params = 0

    # Iterate through modules again to collect and display parameter information
    for name, module in model.named_children():
        if name in output_shapes:
            # Count total and trainable parameters for this module
            module_params = sum(p.numel() for p in module.parameters())
            trainable_params = sum(p.numel() for p in module.parameters() if p.requires_grad)

            total_params += module_params
            total_trainable_params += trainable_params

            # Format strings for display
            layer_name = f"{name} ({type(module).__name__})"
            output_shape_str = str(output_shapes[name])
            params_str = f"{trainable_params:,}"

            print(f"{layer_name:<25} {output_shape_str:<28} {params_str:<15}")

    print("=" * 79)
    print(f"Total params: {total_params:,}")
    print(f"Trainable params: {total_trainable_params:,}")
    print(f"Non-trainable params: {total_params - total_trainable_params:,}")
    print("-" * 79)

In [69]:
from typing import Optional


class RecurrentClassifier(nn.Module):
    """
    Generic RNN classifier with Embedding layer for categorical features.
    """

    def __init__(
            self,
            continuous_input_size,
            categorical_cardinalities,
            embedding_dims,
            hidden_size,
            num_layers,
            num_classes,
            rnn_type='GRU',
            bidirectional=False,
            dropout_rate=0.2,
            use_conv: bool = False,
            conv_num_filters: int = 64,
            conv_kernel_size: int = 5,
            conv_num_layers: int = 1,
            conv_stride: int = 1,
            conv_pool: Optional[int] = None,
            conv_batch_norm: bool = True
    ):
        super().__init__()

        self.rnn_type = rnn_type
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional

        self.use_conv = use_conv
        self.conv_num_layers = conv_num_layers
        self.conv_kernel_size = conv_kernel_size
        self.conv_num_filters = conv_num_filters

        # 1. Embedding Layers per le feature categoriche
        self.embedding_layers = nn.ModuleList([
            nn.Embedding(num_embeddings, emb_dim)
            for num_embeddings, emb_dim in zip(categorical_cardinalities, embedding_dims)
        ])
        total_embedding_dim = sum(embedding_dims)

        # 2. Calcola la dimensione dell'input per la RNN
        rnn_input_size = continuous_input_size + total_embedding_dim

        rnn_map = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}
        if rnn_type not in rnn_map:
            raise ValueError("rnn_type must be 'RNN', 'LSTM', or 'GRU'")
        rnn_module = rnn_map[rnn_type]

        dropout_val = dropout_rate if num_layers > 1 else 0

        if use_conv:
            conv_layers = []
            in_channels = rnn_input_size
            for layer_idx in range(conv_num_layers):
                out_channels = conv_num_filters
                conv_layers.append(
                    nn.Conv1d(
                        in_channels=in_channels,
                        out_channels=out_channels,
                        kernel_size=conv_kernel_size,
                        stride=conv_stride,
                        padding=(conv_kernel_size - 1) // 2,
                        padding_mode='zeros',
                    )
                )
                if conv_batch_norm:
                    conv_layers.append(nn.BatchNorm1d(out_channels))
                conv_layers.append(nn.ReLU())
                if conv_pool:
                    conv_layers.append(nn.MaxPool1d(kernel_size=conv_pool, stride=conv_pool))
                in_channels = out_channels
            self.conv_block = nn.Sequential(*conv_layers)
            rnn_input_size = in_channels

        # 3. Crea il layer ricorrente
        self.rnn = rnn_module(
            input_size=rnn_input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout_val
        )

        classifier_input_size = hidden_size * 2 if self.bidirectional else hidden_size
        self.classifier = nn.Linear(classifier_input_size, num_classes)

    def forward(self, x_continuous, x_categorical):
        """
        x_continuous shape: (batch_size, seq_length, num_continuous_features)
        x_categorical shape: (batch_size, seq_length, num_categorical_features)
        """
        # 1. Applica gli embedding
        embedded_features = []
        for i, emb_layer in enumerate(self.embedding_layers):
            # Prendi la i-esima feature categorica per tutti i timestep
            cat_feature = x_categorical[:, :, i]
            embedded_features.append(emb_layer(cat_feature))

        # 2. Concatena gli embedding
        # embedded_features √® una lista di tensori (batch, seq, emb_dim)
        # li concateniamo lungo l'ultima dimensione
        x_embedded = torch.cat(embedded_features, dim=-1)

        # 3. Concatena le feature continue con quelle embedded
        x_combined = torch.cat([x_continuous, x_embedded], dim=-1)

        # Convolutional layer
        if self.use_conv:
            # x_combined: (batch, seq, features) -> permute
            x_conv = x_combined.permute(0, 2, 1)  # (batch, features, seq)
            x_conv = self.conv_block(x_conv)  # (batch, conv_filters, seq')
            # Riporta a (batch, seq', features_conv)
            x_processed = x_conv.permute(0, 2, 1)
        else:
            x_processed = x_combined

        # 4. Passa il tensore combinato alla RNN
        # rnn_out contiene gli output per ogni timestep
        rnn_out, hidden = self.rnn(x_processed)

        if self.rnn_type == 'LSTM':
            hidden = hidden[0]

        if self.bidirectional:
            hidden = hidden.view(self.num_layers, 2, -1, self.hidden_size)
            hidden_to_classify = torch.cat([hidden[-1, 0, :, :], hidden[-1, 1, :, :]], dim=1)
        else:
            hidden_to_classify = hidden[-1]

        # Originale del prof
        logits = self.classifier(hidden_to_classify)

        # 5. Proposto da gemini. Usa l'output dell'ultimo timestep per la classificazione
        # rnn_out ha shape (batch_size, seq_length, hidden_size * num_directions)
        # Prendiamo l'output dell'ultimo timestep: rnn_out[:, -1, :]
        # last_timestep_output = rnn_out[:, -1, :]

        # 6. Classifica
        # logits = self.classifier(last_timestep_output)
        return logits

## üßÆ **Network and Training Hyperparameters**

In [70]:
# Cross-validation
K = 4  # Number of splits (5 and 10 are considered good values)
N_TEST_SAMPLE_INDEXES = int(TEST_SET_PERCENTAGE * len(unique_samples))

# Training
EPOCHS = 500  # Maximum epochs (increase to improve performance)
PATIENCE = 35  # Early stopping patience (increase to improve performance)
VERBOSE = 20  # Print frequency

# Optimisation
LEARNING_RATE = 9e-5  # Learning rate
BATCH_SIZE = 512  # Batch size
WINDOW_SIZE = 20  # Input window size
STRIDE = 10  # Input stride

# Architecture
HIDDEN_LAYERS = 2  # Hidden layers
HIDDEN_SIZE = 64  # Neurons per layer
RNN_TYPE = 'LSTM'  # Type of RNN architecture
BIDIRECTIONAL = True  # Bidirectional RNN

# Regularisation
DROPOUT_RATE = 0.7  # Dropout probability
L1_LAMBDA = 1e-4  # L1 penalty
L2_LAMBDA = 1e-2  # L2 penalty

# Label smoothing
LABEL_SMOOTHING = 0.1

# Gradient Clipping
MAX_GRADIENT_NORM = 1.0

# Padding Strategy
PADDING_STRATEGY = 'adaptive'  # Options: 'adaptive', 'repeat', 'zero'
PADDING_LOOKBACK_STEPS = (STRIDE * 2) % WINDOW_SIZE  # Number of timesteps for adaptive padding statistics

# Embedding dims for categorical dimensions
MIN_N_EMBEDDING_DIMS = 50

In [71]:
# Convolution 1d

USE_CONV = True
CONV_NUM_FILTERS = 64
CONV_KERNEL_SIZE = 5
CONV_NUM_LAYERS = 1
CONV_STRIDE = 1
CONV_POOL = None
CONV_BATCH_NORM = True

### üìâ **Learning Rate Scheduler Configuration**

Il training ora supporta diversi tipi di learning rate schedulers:

**Scheduler disponibili:**
- `'reduce_on_plateau'`: Riduce LR quando il metric si stabilizza
- `'cosine'`: Cosine Annealing - Riduzione smooth del LR
- `'step'`: StepLR - Riduce LR ogni N epochs

**Come usare:**
1. Imposta `USE_SCHEDULER = True` nella sezione hyperparameters
2. Scegli `SCHEDULER_TYPE` (default: 'reduce_on_plateau')
3. Configura i parametri specifici dello scheduler se necessario

Lo scheduler viene automaticamente integrato nel training loop e il learning rate viene tracciato in TensorBoard.

In [72]:
# Learning Rate Scheduler
USE_SCHEDULER = True  # Enable/disable scheduler
SCHEDULER_TYPE = 'cosine'  # Options: 'reduce_on_plateau', 'cosine', 'step'
SCHEDULER_PATIENCE = 10  # For ReduceLROnPlateau
SCHEDULER_FACTOR = 0.8  # For ReduceLROnPlateau and StepLR
SCHEDULER_STEP_SIZE = 30  # For StepLR

In [73]:
# Definiamo le cardinalit√† (numero di valori unici) per ogni feature categorica.
# La cardinalit√† deve essere il valore massimo della categoria.
# Questo assicura che tutti gli indici siano validi per il layer di embedding.
categorical_cardinalities = [
    int(df_dataset_reduced['n_legs'].max() + 1),
    int(df_dataset_reduced['n_hands'].max() + 1),
    int(df_dataset_reduced['n_eyes'].max() + 1),
    int(df_dataset_reduced['pain_survey_1'].max() + 1),
    int(df_dataset_reduced['pain_survey_2'].max() + 1),
    int(df_dataset_reduced['pain_survey_3'].max() + 1),
    int(df_dataset_reduced['pain_survey_4'].max() + 1)
]

# Definiamo la dimensione dell'embedding per ogni feature.
embedding_dims = [max(MIN_N_EMBEDDING_DIMS, (c + 1) // 2) for c in categorical_cardinalities]  #= [2,2,2,2,2,2,2]

In [74]:
from torch.utils.tensorboard import SummaryWriter

EXPERIMENT_NAME = "lstm_conv1d_simpleNetwork_onlyTraining"  #spostato qui che mi ero rotto di scorrere #Legittimo bisogno

# Set up TensorBoard logging and save model architecture
writer = SummaryWriter("./" + logs_dir + "/" + EXPERIMENT_NAME)

In [75]:
# Fixed hyperparameters (not being tuned)
fixed_params = {
    'learning_rate': LEARNING_RATE,
    'window_size': WINDOW_SIZE,
    'stride': STRIDE,
    'batch_size': BATCH_SIZE,
    'l1_lambda': L1_LAMBDA,
    'l2_lambda': L2_LAMBDA,
    'rnn_type': RNN_TYPE,
    'bidirectional': BIDIRECTIONAL,
    'embedding_dims': embedding_dims,
    'label_smoothing': LABEL_SMOOTHING,
    'continuous_cols': CONTINUOUS_COLS_REDUCED,
    'categorical_cols': CATEGORICAL_COLS_REDUCED,
    'labels_df': labels_df,
    'hidden_layers': HIDDEN_LAYERS,
    'hidden_size': HIDDEN_SIZE,
    'dropout_rate': DROPOUT_RATE,

    'padding_strategy': PADDING_STRATEGY,
    'padding_lookback_steps': PADDING_LOOKBACK_STEPS,

    'use_scheduler': USE_SCHEDULER,
    'scheduler_patience': SCHEDULER_PATIENCE,
    'scheduler_factor': SCHEDULER_FACTOR,
    'scheduler_step_size': SCHEDULER_STEP_SIZE,

    "use_conv": USE_CONV,
    "conv_num_filters": CONV_NUM_FILTERS,
    "conv_kernel_size": CONV_KERNEL_SIZE,
    "conv_num_layers": CONV_NUM_LAYERS,
    "conv_pool": CONV_POOL,
    "conv_stride": CONV_STRIDE,
    "conv_batch_norm": CONV_BATCH_NORM,
}

# Cross-validation settings
cv_params = {
    'epochs': EPOCHS,
    'device': device,
    'k': K,
    'n_test_sample_indexes': N_TEST_SAMPLE_INDEXES,
    'patience': PATIENCE,
    'verbose': VERBOSE,
    'seed': SEED,
    'evaluation_metric': "val_f1",
    'mode': 'max',
    'restore_best_weights': True,
    'writer': writer,
}

## üß† **Model Training**

### **Utility Functions**

In [76]:
from sklearn.metrics import f1_score


def train_one_epoch(model, train_loader, criterion, optimizer, scaler, device, l1_lambda=0, l2_lambda=0):
    """
    Perform one complete training epoch through the entire training dataset.

    Args:
        model (nn.Module): The neural network model to train
        train_loader (DataLoader): PyTorch DataLoader containing training data batches
        criterion (nn.Module): Loss function (e.g., CrossEntropyLoss, MSELoss)
        optimizer (torch.optim): Optimization algorithm (e.g., Adam, SGD)
        scaler (GradScaler): PyTorch's gradient scaler for mixed precision training
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)
        l1_lambda (float): Lambda for L1 regularization
        l2_lambda (float): Lambda for L2 regularization

    Returns:
        tuple: (average_loss, f1 score) - Training loss and f1 score for this epoch
    """
    model.train()  # Set model to training mode

    running_loss = 0.0
    all_predictions = []
    all_targets = []

    # Iterate through training batches
    for batch_idx, (inputs_cont, inputs_cat, targets) in enumerate(train_loader):
        # Move data to device (GPU/CPU)
        inputs_cont, inputs_cat, targets = inputs_cont.to(device), inputs_cat.to(device), targets.to(device)

        # Clear gradients from previous step
        optimizer.zero_grad(set_to_none=True)

        # Forward pass with mixed precision (if CUDA available)
        with torch.amp.autocast(device_type=device.type, enabled=(
                device.type == 'cuda')):  # consider to add dtype=torch.float16 to improve speed
            logits = model(inputs_cont, inputs_cat)
            loss = criterion(logits, targets)

            # Add L1 and L2 regularization
            l1_norm = sum(p.abs().sum() for p in model.parameters())
            l2_norm = sum(p.pow(2).sum() for p in model.parameters())
            loss = loss + l1_lambda * l1_norm + l2_lambda * l2_norm

        # Backward pass with gradient scaling
        scaler.scale(loss).backward()

        # --- Gradient Clipping ---
        # Unscale gradients before clipping to avoid clipping scaled gradients
        scaler.unscale_(optimizer)
        # Clip the gradients to a maximum norm (e.g., 1.0)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_GRADIENT_NORM)
        # --- End of Clipping ---

        scaler.step(optimizer)
        scaler.update()

        # Accumulate metrics
        running_loss += loss.item() * inputs_cont.size(0)
        predictions = logits.argmax(dim=1)
        all_predictions.append(predictions.cpu().numpy())
        all_targets.append(targets.cpu().numpy())

    # Calculate epoch metrics
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_f1 = f1_score(
        np.concatenate(all_targets),
        np.concatenate(all_predictions),
        average='weighted'
    )

    return epoch_loss, epoch_f1

In [77]:
def create_scheduler(optimizer, scheduler_type, train_loader, epochs, **kwargs):
    """
    Create learning rate scheduler based on configuration.

    Args:
        optimizer: PyTorch optimizer
        scheduler_type: Type of scheduler ('reduce_on_plateau', 'cosine', 'step')
        train_loader: DataLoader for calculating steps_per_epoch
        epochs: Total number of epochs
        **kwargs: Additional scheduler-specific parameters

    Returns:
        scheduler or None
    """
    if scheduler_type == 'reduce_on_plateau':
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            factor=kwargs.get('scheduler_factor', 0.5),
            patience=kwargs.get('scheduler_patience', 10),
        )
    elif scheduler_type == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=epochs,
            eta_min=kwargs.get('learning_rate', 1e-3) * 0.05
        )
    elif scheduler_type == 'step':
        scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer,
            step_size=kwargs.get('scheduler_step_size', 30),
            gamma=kwargs.get('scheduler_factor', 0.5)
        )
    else:
        scheduler = None

    return scheduler

In [78]:
def log_metrics_to_tensorboard(writer: SummaryWriter, epoch, train_loss, train_f1, model):
    """
    Log training metrics and model parameters to TensorBoard for visualization.

    Args:
        writer (SummaryWriter): TensorBoard SummaryWriter object for logging
        epoch (int): Current epoch number (used as x-axis in TensorBoard plots)
        train_loss (float): Training loss for this epoch
        train_f1 (float): Training f1 score for this epoch
        model (nn.Module): The neural network model (for logging weights/gradients)

    Note:
        This function logs scalar metrics (loss/f1 score) and histograms of model
        parameters and gradients, which helps monitor training progress and detect
        issues like vanishing/exploding gradients.
    """
    # Log scalar metrics
    writer.add_scalar('Loss/Training', train_loss, epoch)
    writer.add_scalar('F1/Training', train_f1, epoch)

    # Log model parameters and gradients
    for name, param in model.named_parameters():
        if param.requires_grad:
            # Check if the tensor is not empty before adding a histogram
            if param.numel() > 0:
                writer.add_histogram(f'{name}/weights', param.data, epoch)
            if param.grad is not None:
                # Check if the gradient tensor is not empty before adding a histogram
                if param.grad.numel() > 0:
                    if param.grad is not None and torch.isfinite(param.grad).all():
                        writer.add_histogram(f'{name}/gradients', param.grad.data, epoch)

In [79]:
def fit(model, train_loader, epochs, criterion, optimizer, scaler, device,
        scheduler=None, l1_lambda=0, l2_lambda=0, patience=0, evaluation_metric="val_f1", mode='max',
        restore_best_weights=True, writer=None, verbose=10, experiment_name=""):
    """
    Train the neural network model on the training data and validate on the validation data.

    Args:
        model (nn.Module): The neural network model to train
        train_loader (DataLoader): PyTorch DataLoader containing training data batches
        epochs (int): Number of training epochs
        criterion (nn.Module): Loss function (e.g., CrossEntropyLoss, MSELoss)
        optimizer (torch.optim): Optimization algorithm (e.g., Adam, SGD)
        scaler (GradScaler): PyTorch's gradient scaler for mixed precision training
        device (torch.device): Computing device ('cuda' for GPU, 'cpu' for CPU)
        scheduler (optional): Learning rate scheduler (default: None)
        l1_lambda (float): L1 regularization coefficient (default: 0)
        l2_lambda (float): L2 regularization coefficient (default: 0)
        patience (int): Number of epochs to wait for improvement before early stopping (default: 0)
        evaluation_metric (str): Metric to monitor for early stopping (default: "val_f1")
        mode (str): 'max' for maximizing the metric, 'min' for minimizing (default: 'max')
        restore_best_weights (bool): Whether to restore model weights from best epoch (default: True)
        writer (SummaryWriter, optional): TensorBoard SummaryWriter object for logging (default: None)
        verbose (int, optional): Frequency of printing training progress (default: 10)
        experiment_name (str, optional): Experiment name for saving models (default: "")

    Returns:
        tuple: (model, training_history, best_val_preds_np, best_val_targets_np) -
               Trained model, metrics history
    """

    # Initialize metrics tracking
    training_history = {
        'train_loss': [],
        'train_f1': [],
        'learning_rate': []
    }

    # Configure early stopping if patience is set
    if patience > 0:
        patience_counter = 0
        best_metric = float('-inf') if mode == 'max' else float('inf')
        best_epoch = 0

    print(f"Training {epochs} epochs...")

    # Main training loop: iterate through epochs
    for epoch in range(1, epochs + 1):

        # Forward pass through training data, compute gradients, update weights
        train_loss, train_f1 = train_one_epoch(
            model, train_loader, criterion, optimizer, scaler, device, l1_lambda, l2_lambda
        )

        # Store metrics for plotting and analysis
        training_history['train_loss'].append(train_loss)
        training_history['train_f1'].append(train_f1)

        # Track current learning rate
        current_lr = optimizer.param_groups[0]['lr']
        training_history['learning_rate'].append(current_lr)

        # Write metrics to TensorBoard for visualization
        if writer is not None:
            log_metrics_to_tensorboard(
                writer, epoch, train_loss, train_f1, model
            )
            writer.add_scalar('Learning_Rate', current_lr, epoch)

        # Print progress every N epochs or on first epoch
        if verbose > 0:
            if epoch % verbose == 0 or epoch == 1:
                print(f"Epoch {epoch:3d}/{epochs} | "
                      f"Train: Loss={train_loss:.4f}, F1 Score={train_f1:.4f} | "
                      f"LR={current_lr:.2e}")

        # Step the learning rate scheduler
        if scheduler is not None:
            if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                # ReduceLROnPlateau needs the metric
                #scheduler.step(val_f1 if mode == 'max' else val_loss)
                scheduler.step(train_f1 if mode == 'max' else train_loss)
                print("WARNING - using ReduceOnPlateau requires a validation set")
            else:
                # Other schedulers just need epoch
                scheduler.step()

        # Early stopping logic: monitor metric and save best model
        if patience > 0:
            current_metric = training_history[evaluation_metric][-1]
            is_improvement = (current_metric > best_metric) if mode == 'max' else (current_metric < best_metric)

            if is_improvement:
                best_metric = current_metric
                best_epoch = epoch
                torch.save(model.state_dict(), "models/" + experiment_name + '_model.pt')
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping triggered after {epoch} epochs.")
                    break

    # Restore best model weights if early stopping was used
    if restore_best_weights and patience > 0:
        model.load_state_dict(torch.load("models/" + experiment_name + '_model.pt'))
        print(f"Best model restored from epoch {best_epoch} with {evaluation_metric} {best_metric:.4f}")

    # Save final model if no early stopping
    if patience == 0:
        torch.save(model.state_dict(), "models/" + experiment_name + '_model.pt')

    # Close TensorBoard writer
    if writer is not None:
        writer.close()

    return model, training_history

In [80]:
def get_max_score(scores):
    """
    Extract the maximum score from a dictionary of scores.

    Args:
        scores: Dict with keys like 'split_0', 'split_1', ..., 'mean', 'std'

    Returns:
        max_score: Maximum score across splits
    """
    split_scores = [v for k, v in scores.items() if k.startswith('split_')]
    return max(split_scores) if split_scores else None

## **Training - no validation**

In [81]:
%%time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

final_params = {**fixed_params, **cv_params}

# --- 2. Prepara il dataset di training completo ---


# Stratified split su sample_index per mantenere la distribuzione delle label
labels_map = labels_df.set_index('sample_index')['label']
y_all = np.array([labels_map[sid] for sid in unique_samples], dtype=np.int64)

train_ids, test_ids, y_train, y_test = train_test_split(
    unique_samples,
    y_all,
    test_size=N_TEST_SAMPLE_INDEXES,
    stratify=y_all,
    random_state=SEED + 1,
    shuffle=True
)

n_train_samples = len(train_ids)
assert n_train_samples > 0, "Train set vuoto, riduci val/test"

df_train = df_dataset_reduced[df_dataset_reduced['sample_index'].isin(train_ids)].copy()
df_test = df_dataset_reduced[df_dataset_reduced['sample_index'].isin(test_ids)].copy()


# Normalizza l'intero dataset di training e salva lo scaler
final_scaler = StandardScaler()
features_to_normalize = list(set(CONTINUOUS_COLS_REDUCED) - set(COLS_TO_EXCLUDE_FROM_NORMALIZATION))
df_train[features_to_normalize] = final_scaler.fit_transform(df_train[features_to_normalize])
df_test[features_to_normalize] = final_scaler.transform(df_test[features_to_normalize])
joblib.dump(final_scaler, f"models/{EXPERIMENT_NAME}_final_scaler.pkl")
print(f"Scaler salvato in: models/{EXPERIMENT_NAME}_final_scaler.pkl")

# Costruisci le sequenze
X_train_cont, X_train_cat, y_train = build_sequences(
    df_train,
    labels_df,
    continuous_cols=final_params['continuous_cols'],
    categorical_cols=final_params['categorical_cols'],
    window=final_params['window_size'],
    stride=final_params['stride'],
    padding_strategy=final_params['padding_strategy'],
    lookback_steps=final_params['padding_lookback_steps']
)
X_test_cont, X_test_cat, y_test = build_sequences(
    df_test,
    labels_df,
    continuous_cols=final_params['continuous_cols'],
    categorical_cols=final_params['categorical_cols'],
    window=final_params['window_size'],
    stride=final_params['stride'],
    padding_strategy=final_params['padding_strategy'],
    lookback_steps=final_params['padding_lookback_steps']
)

# Crea DataLoader
train_ds = TensorDataset(torch.from_numpy(X_train_cont).float(), torch.from_numpy(X_train_cat).long(), torch.from_numpy(y_train).long())
test_ds = TensorDataset(torch.from_numpy(X_test_cont).float(), torch.from_numpy(X_test_cat).long(), torch.from_numpy(y_test).long())

train_loader = make_loader(train_ds, batch_size=final_params['batch_size'], shuffle=False, drop_last=False)
test_loader = make_loader(test_ds, batch_size=final_params['batch_size'], shuffle=False, drop_last=False)

# --- 3. Inizializza e addestra il modello finale ---
final_model = RecurrentClassifier(
    continuous_input_size=len(final_params['continuous_cols']),
    categorical_cardinalities=categorical_cardinalities,
    embedding_dims=final_params['embedding_dims'],
    hidden_size=final_params['hidden_size'],
    num_layers=final_params['hidden_layers'],
    num_classes=num_classes,
    rnn_type=final_params['rnn_type'],
    bidirectional=final_params['bidirectional'],
    dropout_rate=final_params['dropout_rate'],
    use_conv=final_params.get('use_conv', False),
    conv_num_filters=final_params.get('conv_num_filters', 64),
    conv_kernel_size=final_params.get('conv_kernel_size', 5),
    conv_num_layers=final_params.get('conv_num_layers', 1),
    conv_stride=final_params.get('conv_stride', 1),
    conv_pool=final_params.get('conv_pool', 2),
    conv_batch_norm=final_params.get('conv_batch_norm', True)
).to(device)

# Ottimizzatore e Criterio
optimizer = torch.optim.AdamW(final_model.parameters(), lr=final_params['learning_rate'], weight_decay=final_params['l2_lambda'])

# Calcolo pesi per class imbalance
class_counts = np.bincount(y_train)
total_samples = len(y_train)
class_weights = total_samples / (len(np.unique(y_train)) * class_counts)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=final_params['label_smoothing'])
scaler = torch.amp.GradScaler(enabled=(device.type == 'cuda'))

# Scegli un numero di epoche basato sui risultati del CV (es. 170)
FINAL_EPOCHS = 300

# --- INTEGRAZIONE DELLO SCHEDULER ---
# Avviso: 'reduce_on_plateau' non √® adatto per il training finale senza validation set.
# Se il tuo scheduler migliore √® quello, considera di usare 'cosine' o 'step' per il training finale.
if USE_SCHEDULER and SCHEDULER_TYPE == 'reduce_on_plateau':
    print("Attenzione: 'reduce_on_plateau' non √® adatto al training finale. Lo scheduler verr√† disabilitato.")
    final_scheduler = None
elif USE_SCHEDULER:
    final_scheduler = create_scheduler(optimizer, SCHEDULER_TYPE, train_loader, **final_params)
    print(f"Scheduler '{SCHEDULER_TYPE}' attivato per il training finale.")
else:
    final_scheduler = None
    print("Nessuno scheduler attivato per il training finale.")

print("Inizio addestramento del modello finale...")
for epoch in range(1, FINAL_EPOCHS + 1):
    train_loss, train_f1 = train_one_epoch(
        final_model, train_loader, criterion, optimizer, scaler, device,
        l1_lambda=final_params['l1_lambda'], l2_lambda=final_params['l2_lambda']
    )

    # Aggiorna lo scheduler alla fine di ogni epoca
    if final_scheduler is not None:
        final_scheduler.step()

    if epoch % 10 == 0 or epoch == 1:
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch {epoch}/{FINAL_EPOCHS} | Train Loss: {train_loss:.4f}, Train F1: {train_f1:.4f}, LR: {current_lr:.6f}")

print("Addestramento finale completato.")

# --- 4. Salva il modello finale ---
torch.save(final_model.state_dict(), f"models/{EXPERIMENT_NAME}_final_model.pt")
print(f"Modello finale salvato in: models/{EXPERIMENT_NAME}_final_model.pt")

Scaler salvato in: models/lstm_conv1d_simpleNetwork_onlyTraining_final_scaler.pkl
Scheduler 'cosine' attivato per il training finale.
Inizio addestramento del modello finale...
Epoch 1/300 | Train Loss: 22.1117, Train F1: 0.0192, LR: 0.000090
Epoch 10/300 | Train Loss: 19.1375, Train F1: 0.4902, LR: 0.000090
Epoch 20/300 | Train Loss: 16.9091, Train F1: 0.6508, LR: 0.000090
Epoch 30/300 | Train Loss: 15.5994, Train F1: 0.7634, LR: 0.000089
Epoch 40/300 | Train Loss: 14.7356, Train F1: 0.8216, LR: 0.000089
Epoch 50/300 | Train Loss: 14.0711, Train F1: 0.8547, LR: 0.000088
Epoch 60/300 | Train Loss: 13.5063, Train F1: 0.8837, LR: 0.000087
Epoch 70/300 | Train Loss: 12.9837, Train F1: 0.9066, LR: 0.000086
Epoch 80/300 | Train Loss: 12.5510, Train F1: 0.9153, LR: 0.000085
Epoch 90/300 | Train Loss: 12.0232, Train F1: 0.9361, LR: 0.000083
Epoch 100/300 | Train Loss: 11.5378, Train F1: 0.9425, LR: 0.000082
Epoch 110/300 | Train Loss: 11.1091, Train F1: 0.9522, LR: 0.000080
Epoch 120/300 | Tr

## **Inference on kaggle dataset**

In [82]:

model = RecurrentClassifier(
    continuous_input_size=len(final_params['continuous_cols']),
    categorical_cardinalities=categorical_cardinalities,
    embedding_dims=final_params['embedding_dims'],
    hidden_size=final_params['hidden_size'],
    num_layers=final_params['hidden_layers'],
    num_classes=num_classes,
    rnn_type=final_params['rnn_type'],
    bidirectional=final_params['bidirectional'],
    dropout_rate=final_params['dropout_rate'],
    use_conv=final_params.get('use_conv', False),
    conv_num_filters=final_params.get('conv_num_filters', 64),
    conv_kernel_size=final_params.get('conv_kernel_size', 5),
    conv_num_layers=final_params.get('conv_num_layers', 1),
    conv_stride=final_params.get('conv_stride', 1),
    conv_pool=final_params.get('conv_pool', 2),
    conv_batch_norm=final_params.get('conv_batch_norm', True)
).to(device)

# 3) Carica i pesi del modello migliore e lo scaler associato
model_path = f"models/{EXPERIMENT_NAME}_final_model.pt"
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

scaler_path = f"models/{EXPERIMENT_NAME}_final_scaler.pkl"
scaler = joblib.load(scaler_path)

print(f"Modello caricato da {model_path}")
print(f"Scaler caricato da {scaler_path}")

Modello caricato da models/lstm_conv1d_simpleNetwork_onlyTraining_final_model.pt
Scaler caricato da models/lstm_conv1d_simpleNetwork_onlyTraining_final_scaler.pkl


In [83]:
submission_path = f"submissions/{EXPERIMENT_NAME}_final_submission.csv"

In [84]:
# 4) Funzione per creare le sequenze per l'inferenza (senza etichette)
def build_sequences_inference(df, continuous_cols, categorical_cols, window=200, stride=200,
                              padding_strategy='adaptive', lookback_steps=10):
    """
    Build sequences for inference with adaptive padding (no labels).

    Args:
        df: DataFrame with time series data
        continuous_cols: List of continuous feature columns
        categorical_cols: List of categorical feature columns
        window: Window size for sequences
        stride: Stride for sliding window
        padding_strategy: 'adaptive' (mean/mode), 'repeat' (repeat last), or 'zero' (zeros)
        lookback_steps: Number of timesteps for computing padding statistics

    Returns:
        X_continuous, X_categorical, sample_owners (array of sample_index for each window)
    """
    assert window % stride == 0, "Window must be divisible by stride"

    X_cont, X_cat, owners = [], [], []

    # Pre-compute global statistics for fallback (only if adaptive)
    if padding_strategy == 'adaptive':
        global_cont_mean = df[continuous_cols].mean().values.astype('float32')
        global_cat_mode = df[categorical_cols].mode().iloc[0].values.astype('int8')

    for sid, g in df.groupby('sample_index'):
        cont = g[continuous_cols].values.astype('float32')
        cat = g[categorical_cols].values.astype('int64')

        pad = (window - (len(cont) % window)) % window

        if pad > 0:
            if padding_strategy == 'adaptive':
                # Use statistics from last timesteps
                lookback = min(lookback_steps, len(cont))

                if lookback > 0:
                    pad_cont_value = np.mean(cont[-lookback:], axis=0, keepdims=True)
                    pad_cat_value = np.array([
                        np.bincount(cat[-lookback:, i]).argmax()
                        for i in range(cat.shape[1])
                    ]).reshape(1, -1)
                else:
                    pad_cont_value = global_cont_mean.reshape(1, -1)
                    pad_cat_value = global_cat_mode.reshape(1, -1)

                padding_cont = np.repeat(pad_cont_value, pad, axis=0).astype('float32')
                padding_cat = np.repeat(pad_cat_value, pad, axis=0).astype('int64')

            elif padding_strategy == 'repeat':
                if len(cont) > 0:
                    padding_cont = np.repeat(cont[-1:], pad, axis=0)
                    padding_cat = np.repeat(cat[-1:], pad, axis=0)
                else:
                    padding_cont = np.zeros((pad, cont.shape[1]), dtype='float32')
                    padding_cat = np.zeros((pad, cat.shape[1]), dtype='int64')
            else:  # 'zero' or default
                padding_cont = np.zeros((pad, cont.shape[1]), dtype='float32')
                padding_cat = np.zeros((pad, cat.shape[1]), dtype='int64')

            cont = np.concatenate([cont, padding_cont], axis=0)
            cat = np.concatenate([cat, padding_cat], axis=0)

        # Build windows
        i = 0
        while i + window <= len(cont):
            X_cont.append(cont[i:i + window])
            X_cat.append(cat[i:i + window])
            owners.append(sid)
            i += stride

    return (np.asarray(X_cont, dtype=np.float32),
            np.asarray(X_cat, dtype=np.int64),
            np.asarray(owners, dtype=np.int32))

In [85]:
# 5) Normalizza il kaggle test con lo scaler CORRETTO (caricato sopra), quello relativo allo split migliore
kaggle_test_df_reduced[features_to_normalize] = scaler.transform(kaggle_test_df_reduced[features_to_normalize])

In [86]:
from collections import Counter

# 6) Costruisci le sequenze per Kaggle test
Xk_cont, Xk_cat, owners = build_sequences_inference(
    kaggle_test_df_reduced,
    continuous_cols=CONTINUOUS_COLS_REDUCED,
    categorical_cols=CATEGORICAL_COLS_REDUCED,
    window=final_params['window_size'],
    stride=final_params['stride'],
    padding_strategy=final_params.get('padding_strategy', 'adaptive'),
    lookback_steps=final_params.get('padding_lookback_steps', 10)
)

# 7) Inference sui windows
kaggle_ds = TensorDataset(
    torch.from_numpy(Xk_cont).float(),
    torch.from_numpy(Xk_cat).long()
)
kaggle_loader = make_loader(kaggle_ds, batch_size=final_params['batch_size'], shuffle=False, drop_last=False)

all_preds = []
with torch.no_grad():
    for xb_cont, xb_cat in kaggle_loader:
        xb_cont = xb_cont.to(device)
        xb_cat = xb_cat.to(device)
        logits = model(xb_cont, xb_cat)
        preds = logits.argmax(dim=1).detach().cpu().numpy()
        all_preds.append(preds)

all_preds = np.concatenate(all_preds) if len(all_preds) else np.array([], dtype=np.int64)

# 8) Aggrega per sample_index (maggioranza)
preds_per_sample = {}
for sid, p in zip(owners, all_preds):
    preds_per_sample.setdefault(int(sid), []).append(int(p))

final_idx = {sid: Counter(v).most_common(1)[0][0] for sid, v in preds_per_sample.items()}

# 9) Mappa a etichette testuali e crea submission
inv_label_map = {0: 'no_pain', 1: 'low_pain', 2: 'high_pain'}
submission = pd.DataFrame({
    'sample_index': list(final_idx.keys()),
    'label': [inv_label_map[int(v)] for v in final_idx.values()]
}).sort_values('sample_index', kind='stable')

os.makedirs("submissions", exist_ok=True)
submission.to_csv(submission_path, index=False)

print(f"Submission salvata in {submission_path}")



Submission salvata in submissions/lstm_conv1d_simpleNetwork_onlyTraining_final_submission.csv


In [87]:
submission.head(2000)

Unnamed: 0,sample_index,label
0,0,no_pain
1,1,no_pain
2,2,no_pain
3,3,no_pain
4,4,no_pain
...,...,...
1319,1319,no_pain
1320,1320,no_pain
1321,1321,no_pain
1322,1322,no_pain
