# 🏴‍☠️ Ensemble con Optuna - Pipeline Completa\n\nQuesto notebook integra **Optuna optimization** con **ensemble approach**:\n1. Preprocessing e creazione finestre\n2. **Optuna optimization** per trovare i migliori iperparametri\n3. **Selezione automatica** dei migliori N modelli dai trial\n4. **Training ensemble** con i modelli selezionati\n5. **EWA weighting** basato su validation F1\n6. **Predizione test** con ensemble\n\n## Vantaggi\n- 🔍 Optuna trova automaticamente i migliori iperparametri\n- 🎯 Ensemble usa i top N modelli invece di configurazioni manuali\n- ⚖️ EWA pesa i modelli basandosi sulle performance\n- 🚀 Pipeline completamente automatizzata

In [1]:
# Core libraries
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Stats and ML
from statsmodels.tsa.stattools import acf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight

# Set seeds
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print(f'PyTorch: {torch.__version__}')
print(f'Device: {device}')
print('✅ Environment ready!')

  from .autonotebook import tqdm as notebook_tqdm


PyTorch: 2.9.0+cpu
Device: cpu
✅ Environment ready!


## 1. Caricamento Dati

In [2]:
# Load dataset
X_train = pd.read_csv('pirate_pain_train.csv')
y_train = pd.read_csv('pirate_pain_train_labels.csv')

print('📊 Dataset Shape:')
print(f'  Features: {X_train.shape}')
print(f'  Labels: {y_train.shape}')
print(f'  Samples: {X_train["sample_index"].nunique()}')
print(f'  Timesteps/sample: {X_train.groupby("sample_index").size().iloc[0]}')

# Feature groups
pain_survey_cols = [c for c in X_train.columns if 'pain_survey' in c]
categorical_cols = ['n_legs', 'n_hands', 'n_eyes']
joint_cols = [c for c in X_train.columns if 'joint_' in c]

print(f'\n📋 Features: {len(pain_survey_cols)} pain_survey + {len(categorical_cols)} categorical + {len(joint_cols)} joints')

# ADVICE 08/11: Check class imbalance
print(f'\n🏷️ Labels (IMBALANCED - need class weighting):')
for label, count in y_train['label'].value_counts().items():
    print(f'  {label}: {count} ({100*count/len(y_train):.1f}%)')

📊 Dataset Shape:
  Features: (105760, 40)
  Labels: (661, 2)
  Samples: 661
  Timesteps/sample: 160

📋 Features: 4 pain_survey + 3 categorical + 31 joints

🏷️ Labels (IMBALANCED - need class weighting):
  no_pain: 511 (77.3%)
  low_pain: 94 (14.2%)
  high_pain: 56 (8.5%)


## 2. ADVICE 11/11: Determinare WINDOW_SIZE

*"Its own echo, the series sings."*

Usiamo autocorrelazione per scegliere window size basata sui dati.

In [3]:
# ADVICE 11/11: Analyze autocorrelation to determine optimal window
print('🔍 Analyzing autocorrelation...')
samples_analyze = X_train['sample_index'].unique()[:10]
key_features = joint_cols[:6]

optimal_lags = {}
for feature in key_features:
    sample_lags = []
    for sid in samples_analyze:
        data = X_train[X_train['sample_index']==sid][feature].values
        if len(data) >= 50:
            max_lags = min(len(data)//2-1, 80)
            acf_vals = acf(data, nlags=max_lags)
            sig_bound = 1.96/np.sqrt(len(data))
            for lag in range(1, len(acf_vals)):
                if abs(acf_vals[lag]) < sig_bound:
                    sample_lags.append(lag)
                    break
            else:
                sample_lags.append(max_lags)
    if sample_lags:
        optimal_lags[feature] = int(np.median(sample_lags))

if optimal_lags:
    suggested = int(np.median(list(optimal_lags.values())))
    WINDOW_SIZE = max(min(suggested, 100), 40)
else:
    WINDOW_SIZE = 60

WINDOW_STRIDE = WINDOW_SIZE // 2

print(f'✅ WINDOW_SIZE from autocorrelation: {WINDOW_SIZE}')
print(f'   STRIDE: {WINDOW_STRIDE}')
print(f'💡 ADVICE 11/11: Data-driven window size!')

🔍 Analyzing autocorrelation...
✅ WINDOW_SIZE from autocorrelation: 40
   STRIDE: 20
💡 ADVICE 11/11: Data-driven window size!


## 3. Preprocessing con ADVICE 07/11 e 12/11

**ADVICE 07/11**: Map categorical per embeddings  
**ADVICE 12/11**: Aggiungi time features ciclici

In [4]:
# ADVICE 07/11: Map categorical features
cat_map = {
    'n_legs': {'two': 0, 'one+peg_leg': 1},
    'n_hands': {'two': 0, 'one+hook_hand': 1},
    'n_eyes': {'two': 0, 'one+eye_patch': 1}
}

X_proc = X_train.copy()
for col, mapping in cat_map.items():
    X_proc[col] = X_proc[col].map(mapping).fillna(0).astype(int)

# ADVICE 12/11: Add cyclical time features
max_time = X_proc['time'].max()
X_proc['time_sin'] = np.sin(2*np.pi*X_proc['time']/max_time)
X_proc['time_cos'] = np.cos(2*np.pi*X_proc['time']/max_time)
X_proc['time_norm'] = X_proc['time']/max_time

print('✅ Preprocessing done:')
print('   - ADVICE 07/11: Categorical mapped')
print('   - ADVICE 12/11: Time features (sin, cos, norm) added')
print(f'   Shape: {X_proc.shape}')

✅ Preprocessing done:
   - ADVICE 07/11: Categorical mapped
   - ADVICE 12/11: Time features (sin, cos, norm) added
   Shape: (105760, 43)


## 4. Creazione Finestre

In [5]:
# Create sliding windows
def create_windows(df, sample_idx, window_size, stride):
    sample = df[df['sample_index']==sample_idx].sort_values('time')
    feat_cols = [c for c in sample.columns if c not in ['sample_index','time']]
    features = sample[feat_cols].values
    
    windows = []
    for start in range(0, max(1, len(features)-window_size+1), stride):
        end = min(start+window_size, len(features))
        win = features[start:end]
        if len(win) < window_size:
            pad = np.zeros((window_size-len(win), win.shape[1]))
            win = np.vstack([win, pad])
        windows.append(win)
    return windows

print('🔄 Creating windows...')
all_windows = []
all_labels = []

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_train['label'])

for sid, label in zip(y_train['sample_index'], y_encoded):
    wins = create_windows(X_proc, sid, WINDOW_SIZE, WINDOW_STRIDE)
    all_windows.extend(wins)
    all_labels.extend([label]*len(wins))

X_windows = np.array(all_windows, dtype=np.float32)
y_windows = np.array(all_labels, dtype=np.int64)

print(f'✅ Windows: {X_windows.shape}')
print(f'   Labels: {y_windows.shape}')

🔄 Creating windows...
✅ Windows: (4627, 40, 41)
   Labels: (4627,)


In [6]:
# Split and normalize
X_tr, X_val, y_tr, y_val = train_test_split(
    X_windows, y_windows, test_size=0.2, random_state=SEED, stratify=y_windows
)

scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr.reshape(-1, X_tr.shape[-1])).reshape(X_tr.shape)
X_val = scaler.transform(X_val.reshape(-1, X_val.shape[-1])).reshape(X_val.shape)

print(f'📊 Split: Train {X_tr.shape}, Val {X_val.shape}')

# ADVICE 08/11: Compute class weights
class_weights_array = compute_class_weight('balanced', classes=np.unique(y_tr), y=y_tr)
class_weights_tensor = torch.FloatTensor(class_weights_array).to(device)

print(f'\n⚖️ ADVICE 08/11 - Class Weights:')
for i, w in enumerate(class_weights_array):
    print(f'   {label_encoder.classes_[i]}: {w:.3f}')

# Create DataLoaders
train_dataset = TensorDataset(
    torch.FloatTensor(X_tr),
    torch.LongTensor(y_tr)
)
val_dataset = TensorDataset(
    torch.FloatTensor(X_val),
    torch.LongTensor(y_val)
)

BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f'\n✅ DataLoaders ready (batch_size={BATCH_SIZE})')

📊 Split: Train (3701, 40, 41), Val (926, 40, 41)

⚖️ ADVICE 08/11 - Class Weights:
   high_pain: 3.929
   low_pain: 2.345
   no_pain: 0.431

✅ DataLoaders ready (batch_size=16)


## 5. ADVICE 13/11: Conv1D + LSTM

*"A pattern in time, like a pattern in space it is."*

In [7]:
# ADVICE 13/11: Build Conv1D + LSTM model
class ConvLSTMClassifier(nn.Module):
    """Hybrid CNN-LSTM for time series classification."""
    
    def __init__(self, input_size, num_classes, 
                 conv_filters=[64, 64], lstm_units=128, dropout=0.5):
        super().__init__()
        
        # Conv1D layers for local pattern extraction
        self.conv_layers = nn.ModuleList()
        in_channels = input_size
        
        for filters in conv_filters:
            self.conv_layers.append(nn.Sequential(
                nn.Conv1d(in_channels, filters, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.BatchNorm1d(filters),
                nn.MaxPool1d(2),
                nn.Dropout(dropout)
            ))
            in_channels = filters
        
        # LSTM for temporal dependencies
        self.lstm = nn.LSTM(
            input_size=conv_filters[-1],
            hidden_size=lstm_units,
            batch_first=True,
            bidirectional=True
        )
        
        # Classification head
        self.fc1 = nn.Linear(lstm_units * 2, 64)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(64, num_classes)
    
    def forward(self, x):
        # x: (batch, seq_len, features)
        x = x.transpose(1, 2)  # -> (batch, features, seq_len) for Conv1D
        
        # Apply Conv1D layers
        for conv in self.conv_layers:
            x = conv(x)
        
        # Back to (batch, seq_len, features) for LSTM
        x = x.transpose(1, 2)
        
        # LSTM
        lstm_out, (h_n, c_n) = self.lstm(x)
        x = torch.cat([h_n[0], h_n[1]], dim=1)  # Concatenate bidirectional
        
        # Classification
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

# Initialize model
n_features = X_tr.shape[2]
n_classes = len(label_encoder.classes_)

model = ConvLSTMClassifier(
    input_size=n_features,
    num_classes=n_classes,
    conv_filters=[32, 32],
    lstm_units=64
).to(device)

print('✅ ADVICE 13/11: Conv1D + LSTM created')
print(f'   Input: ({WINDOW_SIZE}, {n_features})')
print(f'   Output: {n_classes} classes')
print(f'   Parameters: {sum(p.numel() for p in model.parameters()):,}')

✅ ADVICE 13/11: Conv1D + LSTM created
   Input: (40, 41)
   Output: 3 classes
   Parameters: 65,827


## 6. ADVICE 09/11 + 08/11: Loss Function

**ADVICE 09/11**: Label smoothing  
**ADVICE 08/11**: Class weighting

In [8]:
# ADVICE 09/11: Label Smoothing + ADVICE 08/11: Class Weighting
class WeightedLabelSmoothingCE(nn.Module):
    """Combines label smoothing (ADVICE 09/11) and class weighting (ADVICE 08/11)."""
    
    def __init__(self, class_weights, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        self.register_buffer('class_weights', class_weights)
    
    def forward(self, pred, target):
        pred = pred.log_softmax(dim=-1)
        
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (pred.size(-1) - 1))
            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        
        # Apply class weights
        weights = self.class_weights[target]
        return torch.mean(weights * torch.sum(-true_dist * pred, dim=-1))

# Initialize loss function
criterion = WeightedLabelSmoothingCE(
    class_weights=class_weights_tensor,
    smoothing=0.1
)

print('✅ Loss function initialized:')
print('   - ADVICE 09/11: Label smoothing (0.1)')
print('   - ADVICE 08/11: Class weights integrated')

✅ Loss function initialized:
   - ADVICE 09/11: Label smoothing (0.1)
   - ADVICE 08/11: Class weights integrated


## 7. Training con ADVICE 10/11: Gradient Clipping

*"A step too great, from the precipice fall it makes you."*

In [9]:
# ADVICE 10/11: Training with gradient clipping
def train_epoch(model, loader, criterion, optimizer, device, max_grad_norm=1.0):
    """Train one epoch with gradient clipping."""
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        loss.backward()
        
        # ADVICE 10/11: Gradient clipping
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        
        optimizer.step()
        
        total_loss += loss.item()
        all_preds.extend(outputs.argmax(1).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(loader)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return avg_loss, f1

def eval_epoch(model, loader, criterion, device):
    """Evaluate one epoch."""
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            total_loss += loss.item()
            all_preds.extend(outputs.argmax(1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(loader)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return avg_loss, f1, all_preds, all_labels

print('✅ Training functions defined with ADVICE 10/11 (gradient clipping)')

✅ Training functions defined with ADVICE 10/11 (gradient clipping)


### Configurazioni per la Grid Search

Definisci qui le combinazioni di iperparametri da testare prima di eseguire il training finale.


### Grid Search sugli iperparametri

Eseguiamo un ciclo sulle configurazioni definite, valutando l'F1 macro su validation per selezionare la migliore.

In [10]:
import optuna

# ============================
# 1) Objective per Optuna
# ============================
def objective(trial):

    # --- spazio degli iperparametri (adattalo come vuoi) ---
    conv_filters = trial.suggest_categorical(
        "conv_filters",
        [
            [160, 160],
            [48, 48],
            [64, 64],
            [48, 96],
            [64, 128],
            [64, 32],
            [128, 128],
            [128,64]
        ]
    )
    lstm_units = trial.suggest_categorical("lstm_units", [32,64, 128, 160, 192])

    dropout = trial.suggest_float("dropout", 0.10, 0.40)
    lr = trial.suggest_float("lr", 3e-4, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    label_smoothing = trial.suggest_float("label_smoothing", 0.0, 0.15)

    scheduler_factor = trial.suggest_float("scheduler_factor", 0.2, 0.6)
    scheduler_patience = trial.suggest_int("scheduler_patience", 2, 5)

    early_stop_patience = trial.suggest_int("early_stop_patience", 4, 8)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.5, 2.0)

    # puoi anche far scegliere a Optuna il numero di epoche di "search"
    epochs_to_run = trial.suggest_int("search_epochs", 15, 50)

    # ============================
    # 2) Modello + training loop
    # ============================
    model_gs = ConvLSTMClassifier(
        input_size=n_features,
        num_classes=n_classes,
        conv_filters=conv_filters,
        lstm_units=lstm_units,
        dropout=dropout
    ).to(device)

    criterion_gs = WeightedLabelSmoothingCE(
        class_weights=class_weights_tensor,
        smoothing=label_smoothing
    )

    optimizer_gs = AdamW(
        model_gs.parameters(),
        lr=lr,
        weight_decay=weight_decay
    )

    scheduler_gs = ReduceLROnPlateau(
        optimizer_gs,
        mode='max',
        factor=scheduler_factor,
        patience=scheduler_patience
    )

    best_f1_cfg = -np.inf
    patience_counter = 0

    for epoch in range(epochs_to_run):
        train_loss, train_f1 = train_epoch(
            model_gs, train_loader, criterion_gs, optimizer_gs, device, max_grad_norm
        )
        val_loss, val_f1, _, _ = eval_epoch(
            model_gs, val_loader, criterion_gs, device
        )
        scheduler_gs.step(val_f1)

        '''print(
            f"  Trial {trial.number:03d} | "
            f"Epoch {epoch+1:02d}/{epochs_to_run} | "
            f"Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f}",
            end="\r"
        )'''

        # per Optuna: log dello stato intermedio (utile per pruning)
        trial.report(val_f1, epoch)

        # pruning: se il trial va male, lo stoppa prima
        if trial.should_prune():
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            raise optuna.TrialPruned()

        # early stopping "classico"
        if val_f1 > best_f1_cfg:
            best_f1_cfg = val_f1
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= early_stop_patience:
                break

    print()  # newline

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Optuna massimizza questo
    return best_f1_cfg


# ============================
# 3) Lancio dello studio Optuna
# ============================
n_trials = 300  # quante combinazioni vuoi provare

study = optuna.create_study(
    direction="maximize",
    study_name="conv_lstm_opt"
)

study.optimize(objective, n_trials=n_trials)

# ============================
# 4) Risultati e tabella tipo grid_search_df
# ============================
print("\n🏁 Optuna terminato")
print(f"🔝 Best F1 val: {study.best_value:.4f}")
print("🔧 Best params:")
for k, v in study.best_params.items():
    print(f"   {k}: {v}")

# dataframe con tutti i trial, simile al tuo grid_search_df
optuna_df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
optuna_df = optuna_df.sort_values(by="value", ascending=False)
display(optuna_df)


[I 2025-11-14 00:24:15,735] A new study created in memory with name: conv_lstm_opt
[I 2025-11-14 00:25:59,845] Trial 0 finished with value: 0.9750621817660924 and parameters: {'conv_filters': [128, 128], 'lstm_units': 160, 'dropout': 0.3188154855045433, 'lr': 0.0005257104915418183, 'weight_decay': 5.348466673482687e-05, 'label_smoothing': 0.05221552572589589, 'scheduler_factor': 0.5311983430756677, 'scheduler_patience': 2, 'early_stop_patience': 4, 'max_grad_norm': 1.8829834674647157, 'search_epochs': 34}. Best is trial 0 with value: 0.9750621817660924.





[I 2025-11-14 00:28:26,576] Trial 1 finished with value: 0.977643662100553 and parameters: {'conv_filters': [160, 160], 'lstm_units': 192, 'dropout': 0.2879502720669003, 'lr': 0.0003513282523471821, 'weight_decay': 3.3827573301545166e-05, 'label_smoothing': 0.060489889516022716, 'scheduler_factor': 0.23067077583006365, 'scheduler_patience': 4, 'early_stop_patience': 7, 'max_grad_norm': 1.8296309975184604, 'search_epochs': 38}. Best is trial 1 with value: 0.977643662100553.





[I 2025-11-14 00:29:19,961] Trial 2 finished with value: 0.9683570845690784 and parameters: {'conv_filters': [48, 96], 'lstm_units': 160, 'dropout': 0.25304796310425004, 'lr': 0.0004439580730743453, 'weight_decay': 1.8626376096072425e-06, 'label_smoothing': 0.12698416152502218, 'scheduler_factor': 0.3725835304005193, 'scheduler_patience': 4, 'early_stop_patience': 4, 'max_grad_norm': 1.1281720753296962, 'search_epochs': 15}. Best is trial 1 with value: 0.977643662100553.





[I 2025-11-14 00:30:37,640] Trial 3 finished with value: 0.9748121525112244 and parameters: {'conv_filters': [160, 160], 'lstm_units': 160, 'dropout': 0.15283124140580828, 'lr': 0.00032075567174013936, 'weight_decay': 0.0005679855731283927, 'label_smoothing': 0.03595769370905515, 'scheduler_factor': 0.2750153842733519, 'scheduler_patience': 2, 'early_stop_patience': 6, 'max_grad_norm': 0.6647068225263149, 'search_epochs': 16}. Best is trial 1 with value: 0.977643662100553.





[I 2025-11-14 00:31:47,428] Trial 4 finished with value: 0.9685542702229952 and parameters: {'conv_filters': [128, 128], 'lstm_units': 32, 'dropout': 0.36669511134607335, 'lr': 0.0004928276657608646, 'weight_decay': 0.00011516755857050543, 'label_smoothing': 0.03963365155691585, 'scheduler_factor': 0.29585880716810575, 'scheduler_patience': 5, 'early_stop_patience': 8, 'max_grad_norm': 1.857705442176708, 'search_epochs': 22}. Best is trial 1 with value: 0.977643662100553.





[I 2025-11-14 00:31:56,308] Trial 5 pruned. 
[I 2025-11-14 00:31:59,823] Trial 6 pruned. 
[I 2025-11-14 00:32:09,474] Trial 7 pruned. 
[I 2025-11-14 00:32:12,613] Trial 8 pruned. 
[I 2025-11-14 00:32:16,815] Trial 9 pruned. 
[I 2025-11-14 00:32:47,628] Trial 10 pruned. 
[I 2025-11-14 00:32:58,110] Trial 11 pruned. 
[I 2025-11-14 00:33:00,817] Trial 12 pruned. 
[I 2025-11-14 00:33:04,500] Trial 13 pruned. 
[I 2025-11-14 00:33:21,644] Trial 14 pruned. 
[I 2025-11-14 00:33:25,841] Trial 15 pruned. 
[I 2025-11-14 00:33:33,200] Trial 16 pruned. 
[I 2025-11-14 00:33:36,854] Trial 17 pruned. 
[I 2025-11-14 00:33:39,611] Trial 18 pruned. 
[I 2025-11-14 00:33:45,896] Trial 19 pruned. 
[I 2025-11-14 00:34:00,943] Trial 20 pruned. 
[I 2025-11-14 00:35:21,798] Trial 21 finished with value: 0.9799949737834286 and parameters: {'conv_filters': [160, 160], 'lstm_units': 160, 'dropout': 0.1009876525994633, 'lr': 0.0003070928970945703, 'weight_decay': 0.0005963826391651269, 'label_smoothing': 0.03095790




[I 2025-11-14 00:35:36,241] Trial 22 pruned. 
[I 2025-11-14 00:37:31,608] Trial 23 finished with value: 0.9840655741374453 and parameters: {'conv_filters': [128, 128], 'lstm_units': 160, 'dropout': 0.10719836946533634, 'lr': 0.00030160348105263263, 'weight_decay': 0.0008306955221071393, 'label_smoothing': 0.0514445601302129, 'scheduler_factor': 0.24164889313027996, 'scheduler_patience': 3, 'early_stop_patience': 6, 'max_grad_norm': 0.955732164499852, 'search_epochs': 37}. Best is trial 23 with value: 0.9840655741374453.





[I 2025-11-14 00:37:41,701] Trial 24 pruned. 
[I 2025-11-14 00:38:29,862] Trial 25 pruned. 
[I 2025-11-14 00:40:17,947] Trial 26 finished with value: 0.9820912831158299 and parameters: {'conv_filters': [128, 128], 'lstm_units': 160, 'dropout': 0.10452605719751837, 'lr': 0.0003669110520955091, 'weight_decay': 0.0006590080484897349, 'label_smoothing': 0.03501941933057649, 'scheduler_factor': 0.3134768445690985, 'scheduler_patience': 3, 'early_stop_patience': 7, 'max_grad_norm': 0.7253641125155368, 'search_epochs': 25}. Best is trial 23 with value: 0.9840655741374453.





[I 2025-11-14 00:40:31,149] Trial 27 pruned. 
[I 2025-11-14 00:40:40,270] Trial 28 pruned. 
[I 2025-11-14 00:40:49,007] Trial 29 pruned. 
[I 2025-11-14 00:40:57,998] Trial 30 pruned. 
[I 2025-11-14 00:42:47,094] Trial 31 finished with value: 0.9769902553185862 and parameters: {'conv_filters': [160, 160], 'lstm_units': 160, 'dropout': 0.11910357885479522, 'lr': 0.00034555236604470816, 'weight_decay': 0.0001293799618000982, 'label_smoothing': 0.0810353421471178, 'scheduler_factor': 0.23182722578161408, 'scheduler_patience': 4, 'early_stop_patience': 6, 'max_grad_norm': 0.5192842236392128, 'search_epochs': 33}. Best is trial 23 with value: 0.9840655741374453.





[I 2025-11-14 00:42:50,820] Trial 32 pruned. 
[I 2025-11-14 00:42:55,121] Trial 33 pruned. 
[I 2025-11-14 00:42:58,739] Trial 34 pruned. 
[I 2025-11-14 00:43:01,349] Trial 35 pruned. 
[I 2025-11-14 00:43:04,655] Trial 36 pruned. 
[I 2025-11-14 00:43:13,260] Trial 37 pruned. 
[I 2025-11-14 00:43:22,179] Trial 38 pruned. 
[I 2025-11-14 00:43:25,225] Trial 39 pruned. 
[I 2025-11-14 00:43:33,916] Trial 40 pruned. 
[I 2025-11-14 00:43:43,956] Trial 41 pruned. 
[I 2025-11-14 00:43:53,539] Trial 42 pruned. 
[I 2025-11-14 00:44:31,898] Trial 43 pruned. 
[I 2025-11-14 00:44:45,006] Trial 44 pruned. 
[I 2025-11-14 00:44:48,362] Trial 45 pruned. 
[I 2025-11-14 00:44:50,865] Trial 46 pruned. 
[I 2025-11-14 00:45:06,929] Trial 47 pruned. 
[I 2025-11-14 00:45:11,744] Trial 48 pruned. 
[I 2025-11-14 00:45:19,643] Trial 49 pruned. 
[I 2025-11-14 00:45:22,146] Trial 50 pruned. 
[I 2025-11-14 00:45:30,809] Trial 51 pruned. 
[I 2025-11-14 00:45:39,538] Trial 52 pruned. 
[I 2025-11-14 00:45:43,768] Trial 




[I 2025-11-14 00:56:08,209] Trial 123 pruned. 
[I 2025-11-14 00:56:17,738] Trial 124 pruned. 
[I 2025-11-14 00:56:30,790] Trial 125 pruned. 
[I 2025-11-14 00:56:34,657] Trial 126 pruned. 
[I 2025-11-14 00:56:39,519] Trial 127 pruned. 
[I 2025-11-14 00:56:43,237] Trial 128 pruned. 
[I 2025-11-14 00:56:49,256] Trial 129 pruned. 
[I 2025-11-14 00:56:56,386] Trial 130 pruned. 
[I 2025-11-14 00:57:13,056] Trial 131 pruned. 
[I 2025-11-14 00:57:27,795] Trial 132 pruned. 
[I 2025-11-14 00:57:37,372] Trial 133 pruned. 
[I 2025-11-14 00:57:46,859] Trial 134 pruned. 
[I 2025-11-14 00:57:49,652] Trial 135 pruned. 
[I 2025-11-14 00:57:54,357] Trial 136 pruned. 
[I 2025-11-14 00:58:03,024] Trial 137 pruned. 
[I 2025-11-14 00:58:06,809] Trial 138 pruned. 
[I 2025-11-14 00:58:11,612] Trial 139 pruned. 
[I 2025-11-14 00:58:25,849] Trial 140 pruned. 
[I 2025-11-14 00:58:33,019] Trial 141 pruned. 
[I 2025-11-14 00:58:43,457] Trial 142 pruned. 
[I 2025-11-14 00:58:52,096] Trial 143 pruned. 
[I 2025-11-14




[I 2025-11-14 01:13:27,930] Trial 224 pruned. 
[I 2025-11-14 01:14:00,299] Trial 225 pruned. 
[I 2025-11-14 01:14:09,209] Trial 226 pruned. 
[I 2025-11-14 01:14:13,543] Trial 227 pruned. 
[I 2025-11-14 01:14:22,129] Trial 228 pruned. 
[I 2025-11-14 01:14:29,730] Trial 229 pruned. 
[I 2025-11-14 01:14:39,281] Trial 230 pruned. 
[I 2025-11-14 01:14:43,404] Trial 231 pruned. 
[I 2025-11-14 01:14:47,424] Trial 232 pruned. 
[I 2025-11-14 01:14:59,171] Trial 233 pruned. 
[I 2025-11-14 01:15:07,459] Trial 234 pruned. 
[I 2025-11-14 01:15:10,731] Trial 235 pruned. 
[I 2025-11-14 01:15:27,822] Trial 236 pruned. 
[I 2025-11-14 01:15:32,861] Trial 237 pruned. 
[I 2025-11-14 01:15:38,135] Trial 238 pruned. 
[I 2025-11-14 01:15:41,875] Trial 239 pruned. 
[I 2025-11-14 01:15:44,582] Trial 240 pruned. 
[I 2025-11-14 01:15:52,373] Trial 241 pruned. 
[I 2025-11-14 01:16:08,228] Trial 242 pruned. 
[I 2025-11-14 01:16:12,333] Trial 243 pruned. 
[I 2025-11-14 01:16:16,442] Trial 244 pruned. 
[I 2025-11-14


🏁 Optuna terminato
🔝 Best F1 val: 0.9847
🔧 Best params:
   conv_filters: [128, 128]
   lstm_units: 128
   dropout: 0.11775262021902702
   lr: 0.0005057161812291795
   weight_decay: 1.2290249626719205e-06
   label_smoothing: 0.03464710093006009
   scheduler_factor: 0.21208684021448007
   scheduler_patience: 3
   early_stop_patience: 6
   max_grad_norm: 1.0677414965036354
   search_epochs: 23


Unnamed: 0,number,value,params_conv_filters,params_dropout,params_early_stop_patience,params_label_smoothing,params_lr,params_lstm_units,params_max_grad_norm,params_scheduler_factor,params_scheduler_patience,params_search_epochs,params_weight_decay,state
223,223,0.984681,"[128, 128]",0.117753,6,0.034647,0.000506,128,1.067741,0.212087,3,23,0.000001,COMPLETE
23,23,0.984066,"[128, 128]",0.107198,6,0.051445,0.000302,160,0.955732,0.241649,3,37,0.000831,COMPLETE
122,122,0.983217,"[160, 160]",0.114202,7,0.051782,0.000335,160,1.155704,0.349190,4,29,0.000030,COMPLETE
26,26,0.982091,"[128, 128]",0.104526,7,0.035019,0.000367,160,0.725364,0.313477,3,25,0.000659,COMPLETE
21,21,0.979995,"[160, 160]",0.100988,6,0.030958,0.000307,160,0.806692,0.255116,2,17,0.000596,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,196,0.635553,"[48, 96]",0.123893,4,0.044955,0.000390,160,1.114788,0.271237,4,19,0.000130,PRUNED
264,264,0.628963,"[128, 128]",0.340786,7,0.070958,0.000475,160,0.598498,0.306801,2,41,0.000051,PRUNED
50,50,0.618562,"[64, 32]",0.291972,5,0.038385,0.000338,64,0.761416,0.206530,4,22,0.000166,PRUNED
12,12,0.614011,"[64, 64]",0.315921,5,0.007548,0.000305,64,1.661958,0.491541,3,31,0.000216,PRUNED


---\n## 🎯 Ensemble: Selezione Automatica dai Migliori Trial Optuna\n\nOra selezioniamo i migliori N modelli dai trial di Optuna e creiamo un ensemble.

In [None]:
# ============================\n# Selezione dei migliori N trial\n# ============================\nimport gc\nfrom copy import deepcopy\nfrom scipy.special import softmax\n\n# Numero di modelli da includere nell'ensemble\nN_MODELS_ENSEMBLE = 9\n\n# Ottieni i migliori trial ordinati per F1\ntrials_df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))\ntrials_df = trials_df[trials_df['state'] == 'COMPLETE']\ntrials_df = trials_df.sort_values(by='value', ascending=False)\n\n# Seleziona i top N\nbest_trials = trials_df.head(N_MODELS_ENSEMBLE)\n\nprint(f"\n📊 Selezionati i migliori {N_MODELS_ENSEMBLE} trial da Optuna:")\nprint("="*70)\nfor idx, row in best_trials.iterrows():\n    print(f"Trial {row['number']:3d}: F1 = {row['value']:.4f}")\nprint("="*70)\n\nprint(f"\n🔍 Range F1 scores: {best_trials['value'].min():.4f} - {best_trials['value'].max():.4f}")\nprint(f"   Media F1: {best_trials['value'].mean():.4f}")

In [None]:
# ============================\n# Training ensemble models\n# ============================\n\ndef train_model_from_trial(trial_params, trial_number, X_tr, y_tr, X_val, y_val, \n                          device, class_weights_tensor, n_features, n_classes):\n    \"\"\"Train a single model with parameters from Optuna trial.\"\"\"\n    \n    print(f"\\n{'='*60}")\n    print(f"  Training model from Trial {trial_number}")\n    print(f"{'='*60}")\n    \n    # Extract parameters\n    conv_filters = trial_params['params_conv_filters']\n    lstm_units = trial_params['params_lstm_units']\n    dropout = trial_params['params_dropout']\n    lr = trial_params['params_lr']\n    weight_decay = trial_params['params_weight_decay']\n    label_smoothing = trial_params['params_label_smoothing']\n    scheduler_factor = trial_params['params_scheduler_factor']\n    scheduler_patience = trial_params['params_scheduler_patience']\n    early_stop_patience = trial_params['params_early_stop_patience']\n    max_grad_norm = trial_params['params_max_grad_norm']\n    max_epochs = trial_params['params_search_epochs']\n    \n    print(f"  Config: conv={conv_filters}, lstm={lstm_units}, dropout={dropout:.3f}")\n    print(f"  LR: {lr:.6f}, epochs: {max_epochs}")\n    \n    # Create model\n    model = ConvLSTMClassifier(\n        input_size=n_features,\n        num_classes=n_classes,\n        conv_filters=conv_filters,\n        lstm_units=lstm_units,\n        dropout=dropout\n    ).to(device)\n    \n    criterion = WeightedLabelSmoothingCE(\n        class_weights=class_weights_tensor,\n        smoothing=label_smoothing\n    )\n    \n    optimizer = AdamW(\n        model.parameters(),\n        lr=lr,\n        weight_decay=weight_decay\n    )\n    \n    scheduler = ReduceLROnPlateau(\n        optimizer,\n        mode='max',\n        factor=scheduler_factor,\n        patience=scheduler_patience\n    )\n    \n    # Training loop\n    best_f1 = -np.inf\n    best_state = None\n    patience_counter = 0\n    history = []\n    \n    for epoch in range(max_epochs):\n        # Train\n        train_loss, train_f1 = train_epoch(\n            model, train_loader, criterion, optimizer, device, max_grad_norm\n        )\n        \n        # Validate\n        val_loss, val_f1, val_preds, val_labels = eval_epoch(\n            model, val_loader, criterion, device\n        )\n        \n        scheduler.step(val_f1)\n        \n        history.append({\n            'epoch': epoch + 1,\n            'train_loss': train_loss,\n            'train_f1': train_f1,\n            'val_loss': val_loss,\n            'val_f1': val_f1\n        })\n        \n        print(f"  Epoch {epoch+1:02d}/{max_epochs} | "\n              f"Train: loss={train_loss:.4f} F1={train_f1:.4f} | "\n              f"Val: loss={val_loss:.4f} F1={val_f1:.4f}")\n        \n        # Early stopping\n        if val_f1 > best_f1:\n            best_f1 = val_f1\n            best_state = deepcopy(model.state_dict())\n            patience_counter = 0\n            print(f"    ✅ New best F1: {best_f1:.4f}")\n        else:\n            patience_counter += 1\n            if patience_counter >= early_stop_patience:\n                print(f"    ⏹️ Early stopping at epoch {epoch+1}")\n                break\n    \n    # Restore best weights\n    if best_state is not None:\n        model.load_state_dict(best_state)\n    \n    print(f"  Final best F1: {best_f1:.4f}\\n")\n    \n    return model, best_f1, history\n\n\n# Train all ensemble models\nprint("\\n" + "="*70)\nprint("🚀 TRAINING ENSEMBLE MODELS")\nprint("="*70)\n\nensemble_models = []\nensemble_f1_scores = []\nensemble_histories = []\nensemble_trial_numbers = []\n\nfor idx, row in best_trials.iterrows():\n    trial_number = int(row['number'])\n    \n    model, f1_score, history = train_model_from_trial(\n        row, trial_number, X_tr, y_tr, X_val, y_val,\n        device, class_weights_tensor, n_features, n_classes\n    )\n    \n    ensemble_models.append(model)\n    ensemble_f1_scores.append(f1_score)\n    ensemble_histories.append(history)\n    ensemble_trial_numbers.append(trial_number)\n    \n    gc.collect()\n    if torch.cuda.is_available():\n        torch.cuda.empty_cache()\n\nprint("\\n" + "="*70)\nprint(f"✅ Training completato! {len(ensemble_models)} modelli allenati.")\nprint("="*70)

In [None]:
# ============================\n# EWA Weighting\n# ============================\n\nensemble_f1_array = np.array(ensemble_f1_scores)\n\nprint("\\n" + "="*70)\nprint("📊 VALUTAZIONE ENSEMBLE E CALCOLO PESI EWA")\nprint("="*70)\n\nprint("\\nF1 scores dei singoli modelli:")\nfor i, (trial_num, f1) in enumerate(zip(ensemble_trial_numbers, ensemble_f1_scores)):\n    print(f"  Modello {i} (Trial {trial_num}): F1 = {f1:.4f}")\n\n# EWA weights\neta = 15.0\nlosses = 1.0 - ensemble_f1_array\nraw_weights = np.exp(-eta * losses)\nweights = raw_weights / raw_weights.sum()\n\nprint(f"\\nPesi EWA (eta={eta}):")\nfor i, (trial_num, w) in enumerate(zip(ensemble_trial_numbers, weights)):\n    print(f"  Modello {i} (Trial {trial_num}): peso = {w:.4f}")\n\nprint(f"\\nStatistiche:")\nprint(f"  F1 medio singoli: {ensemble_f1_array.mean():.4f}")\nprint(f"  F1 min: {ensemble_f1_array.min():.4f}")\nprint(f"  F1 max: {ensemble_f1_array.max():.4f}")\nprint(f"  Somma pesi: {weights.sum():.4f}")

In [None]:
# ============================\n# Ensemble Prediction Functions\n# ============================\n\n@torch.no_grad()\ndef predict_proba_batch(model, X_batch, device):\n    \"\"\"Get probability predictions for a batch.\"\"\"\n    model.eval()\n    X_tensor = torch.FloatTensor(X_batch).to(device)\n    logits = model(X_tensor)\n    probs = F.softmax(logits, dim=1)\n    return probs.cpu().numpy()\n\n\ndef ensemble_predict_weighted(models, X_data, weights, device, batch_size=256):\n    \"\"\"Make weighted ensemble predictions.\"\"\"\n    n_samples = X_data.shape[0]\n    n_classes = len(label_encoder.classes_)\n    \n    # Initialize weighted probabilities\n    weighted_probs = np.zeros((n_samples, n_classes))\n    \n    # Accumulate weighted predictions from each model\n    for model, weight in zip(models, weights):\n        model_probs = np.zeros((n_samples, n_classes))\n        \n        # Process in batches\n        for i in range(0, n_samples, batch_size):\n            batch_end = min(i + batch_size, n_samples)\n            X_batch = X_data[i:batch_end]\n            batch_probs = predict_proba_batch(model, X_batch, device)\n            model_probs[i:batch_end] = batch_probs\n        \n        weighted_probs += weight * model_probs\n    \n    # Apply softmax and get predictions\n    weighted_probs = softmax(weighted_probs, axis=1)\n    predictions = np.argmax(weighted_probs, axis=1)\n    \n    return predictions, weighted_probs\n\n\nprint("✅ Funzioni di predizione ensemble definite")\n

In [None]:
# ============================\n# Valutazione Ensemble su Validation\n# ============================\n\nprint("\\n" + "="*70)\nprint("🧪 VALUTAZIONE ENSEMBLE SU VALIDATION SET")\nprint("="*70)\n\n# Ensemble predictions\ny_val_pred_ensemble, y_val_probs_ensemble = ensemble_predict_weighted(\n    ensemble_models, X_val, weights, device\n)\n\n# Calculate metrics\nensemble_f1 = f1_score(y_val, y_val_pred_ensemble, average='macro')\nensemble_acc = (y_val_pred_ensemble == y_val).mean()\n\nprint(f"\\n📊 Risultati Ensemble:")\nprint(f"  F1 Score (macro): {ensemble_f1:.4f}")\nprint(f"  Accuracy: {ensemble_acc:.4f}")\nprint(f"\\n  Confronto con singoli modelli:")\nprint(f"    F1 medio singoli: {ensemble_f1_array.mean():.4f}")\nprint(f"    F1 best singolo: {ensemble_f1_array.max():.4f}")\nprint(f"    Miglioramento su media: {(ensemble_f1 - ensemble_f1_array.mean())*100:.2f}%")\nprint(f"    Miglioramento su best: {(ensemble_f1 - ensemble_f1_array.max())*100:.2f}%")\n\n# Classification report\nprint("\\n📋 Classification Report (Ensemble):")\nprint(classification_report(y_val, y_val_pred_ensemble, \n                          target_names=label_encoder.classes_, \n                          digits=4))\n\n# Confusion matrix\nprint("\\n📊 Confusion Matrix:")\ncm = confusion_matrix(y_val, y_val_pred_ensemble)\nprint(cm)

In [None]:
# ============================\n# Test Prediction con Ensemble\n# ============================\n\nprint("\\n" + "="*70)\nprint("🔮 PREDIZIONE TEST CON ENSEMBLE")\nprint("="*70)\n\n# Load and preprocess test data (già fatto nelle celle precedenti dal notebook originale)\n# X_test dovrebbe già essere caricato\n\ntest_sample_indices = X_test['sample_index'].unique()\nsample_predictions = {}\n\nfor sid in tqdm(test_sample_indices, desc='Predicting with ensemble'):\n    # Create windows\n    windows = create_windows(X_test_proc, sid, WINDOW_SIZE, WINDOW_STRIDE)\n    \n    if len(windows) > 0:\n        X_sample = np.array(windows, dtype=np.float32)\n        X_sample = scaler.transform(\n            X_sample.reshape(-1, X_sample.shape[-1])\n        ).reshape(X_sample.shape)\n        \n        # Ensemble prediction\n        preds, probs = ensemble_predict_weighted(\n            ensemble_models, X_sample, weights, device, batch_size=256\n        )\n        \n        # Average probabilities across windows\n        avg_probs = probs.mean(axis=0)\n        final_pred = np.argmax(avg_probs)\n        \n        sample_predictions[sid] = final_pred\n\nprint(f"\\n✅ Predizioni completate per {len(sample_predictions)} campioni")

In [None]:
# ============================\n# Creazione Submission\n# ============================\n\nsubmission_data = []\nfor sid in sorted(sample_predictions.keys()):\n    pred_class = sample_predictions[sid]\n    pred_label = label_encoder.classes_[pred_class]\n    submission_data.append({\n        'sample_index': sid,\n        'label': pred_label\n    })\n\nsubmission = pd.DataFrame(submission_data)\nsubmission.to_csv('submission_optuna_ensemble.csv', index=False)\n\nprint('✅ Submission created!')\nprint(f'   File: submission_optuna_ensemble.csv')\nprint(f'   Shape: {submission.shape}')\nprint(f'\\n📊 Predicted label distribution:')\nprint(submission['label'].value_counts())\n\nprint("\\n" + "="*70)\nprint("🎉 PIPELINE COMPLETATA CON SUCCESSO!")\nprint("="*70)\nprint(f"\\n📈 Riepilogo:")\nprint(f"  - {len(ensemble_models)} modelli nell'ensemble (dai migliori trial Optuna)")\nprint(f"  - F1 ensemble su validation: {ensemble_f1:.4f}")\nprint(f"  - Miglioramento rispetto al best single model: {(ensemble_f1 - ensemble_f1_array.max())*100:.2f}%")\nprint(f"  - File submission: submission_optuna_ensemble.csv")

In [None]:
# ============================\n# Visualizzazioni\n# ============================\n\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\nfig, axes = plt.subplots(1, 2, figsize=(15, 5))\n\n# Plot 1: F1 scores dei modelli\nax1 = axes[0]\nx_pos = np.arange(len(ensemble_f1_scores))\nax1.bar(x_pos, ensemble_f1_scores, alpha=0.7, label='Modelli individuali')\nax1.axhline(y=ensemble_f1, color='r', linestyle='--', linewidth=2, \n           label=f'Ensemble F1: {ensemble_f1:.4f}')\nax1.axhline(y=ensemble_f1_array.mean(), color='g', linestyle=':', linewidth=2,\n           label=f'Media F1: {ensemble_f1_array.mean():.4f}')\nax1.set_xlabel('Modello Index')\nax1.set_ylabel('F1 Score (macro)')\nax1.set_title('F1 Scores: Modelli Individuali vs Ensemble')\nax1.legend()\nax1.grid(True, alpha=0.3)\n\n# Add trial numbers as labels\nax1.set_xticks(x_pos)\nax1.set_xticklabels([f'T{tn}' for tn in ensemble_trial_numbers], rotation=45)\n\n# Plot 2: Pesi EWA\nax2 = axes[1]\nax2.bar(x_pos, weights, alpha=0.7, color='orange')\nax2.set_xlabel('Modello Index')\nax2.set_ylabel('Peso EWA')\nax2.set_title(f'Pesi EWA (η={eta})')\nax2.grid(True, alpha=0.3)\nax2.set_xticks(x_pos)\nax2.set_xticklabels([f'T{tn}' for tn in ensemble_trial_numbers], rotation=45)\n\nplt.tight_layout()\nplt.savefig('ensemble_optuna_results.png', dpi=150, bbox_inches='tight')\nplt.show()\n\nprint("\\n✅ Grafici salvati in 'ensemble_optuna_results.png'")