In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
from sklearn.metrics import accuracy_score, recall_score
from tqdm import tqdm
from utils import load_labels_from_dataset, get_audio_paths

In [7]:
class CNNMLP(nn.Module):
    """
    CNN+MLP architecture described in the paper.

    The network consumes raw 1D audio waveform segments shaped
    ``(batch, 1, num_samples)`` and returns one sigmoid probability per
    segment, shaped ``(batch, 1)``.

    Three Conv1d -> BatchNorm -> ReLU -> MaxPool -> Dropout blocks extract
    features; the flattened feature map is fed to a two-layer MLP head.

    The size of the flattened feature map depends on the input segment
    length, so the first MLP layer is an ``nn.LazyLinear`` whose weight is
    materialised *in place* on the first forward pass. Because the parameter
    objects are never swapped for new ones, an optimizer created from
    ``model.parameters()`` before the first forward pass still updates this
    layer, and a saved ``state_dict`` can be loaded into a fresh instance.

    Args:
        dropout_rate: Dropout probability used after every convolutional
            block and inside the MLP head.
    """
    def __init__(self, dropout_rate=0.5):
        super(CNNMLP, self).__init__()
        self.conv_block1 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=64, stride=1),
            nn.BatchNorm1d(num_features=16),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Dropout(p=dropout_rate)
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=32, stride=1),
            nn.BatchNorm1d(num_features=32),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Dropout(p=dropout_rate)
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=16, stride=1),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Dropout(p=dropout_rate)
        )
        self.flatten = nn.Flatten()
        self.mlp_block = nn.Sequential(
            # in_features is inferred from the first batch; the lazy layer
            # keeps its parameter objects, unlike replacing the module.
            nn.LazyLinear(out_features=128),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(in_features=128, out_features=1),
            nn.Sigmoid()
        )
        # Kept for backward compatibility: True once the first forward pass
        # has run and the head is known to be materialised.
        self._mlp_initialized = False

    def forward(self, x):
        """
        Run the network on a batch of waveform segments.

        Args:
            x: Float tensor of shape ``(batch, 1, num_samples)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x_flattened = self.flatten(x)
        if not self._mlp_initialized:
            first_weight = self.mlp_block[0].weight
            # Only announce when the layer is really about to be materialised
            # (it may already be initialised, e.g. after load_state_dict).
            if isinstance(first_weight, nn.parameter.UninitializedParameter):
                in_features = x_flattened.shape[1]
                print(f"MLP inizializzato dinamicamente con {in_features} feature di input.")
            self._mlp_initialized = True
        output = self.mlp_block(x_flattened)
        return output

In [8]:
class AudioSegmentDataset(Dataset):
    """
    Generic PyTorch dataset that loads audio files and slices them into
    fixed-length, overlapping waveform segments.

    All files are segmented eagerly in the constructor. Each file is loaded
    at the requested sample rate ``sr`` (resampling if needed) so that
    ``segment_ms`` / ``hop_ms`` always correspond to the intended duration,
    then peak-normalised. Every segment inherits the label of its file.

    Args:
        file_paths: Sequence of audio file paths.
        labels: Per-file labels, indexed positionally (``labels[i]`` belongs
            to ``file_paths[i]``).
        sr: Target sample rate in Hz used for loading and for converting
            milliseconds to samples.
        segment_ms: Segment duration in milliseconds.
        hop_ms: Hop between consecutive segment starts in milliseconds.

    Raises:
        ValueError: If ``segment_ms`` or ``hop_ms`` converts to fewer than
            one sample (a zero hop would otherwise never terminate).
    """
    def __init__(self, file_paths, labels, sr=16000, segment_ms=250, hop_ms=50):
        self.file_paths = file_paths
        self.labels = labels
        self.sr = sr
        self.segment_length = int(sr * (segment_ms / 1000.0))
        self.hop_length = int(sr * (hop_ms / 1000.0))
        if self.segment_length <= 0:
            raise ValueError(
                f"segment_ms={segment_ms} at sr={sr} yields a non-positive segment length"
            )
        if self.hop_length <= 0:
            raise ValueError(
                f"hop_ms={hop_ms} at sr={sr} yields a non-positive hop length"
            )
        self.segments = []
        self.segment_labels = []
        self._create_segments()

    def _create_segments(self):
        """Load every file and append its segments and labels to the dataset."""
        print("Creazione dei segmenti dal dataset...")
        for i, file_path in enumerate(self.file_paths):
            label = self.labels[i]
            try:
                # Load at the configured rate so sample counts match
                # segment_length / hop_length, which are derived from self.sr.
                waveform, _ = librosa.load(file_path, sr=self.sr)
                # Peak-normalise; silent or empty files are left untouched.
                peak = np.max(np.abs(waveform)) if waveform.size else 0.0
                if peak > 0:
                    waveform = waveform / peak
                last_start = len(waveform) - self.segment_length
                # Empty range when the file is shorter than one segment.
                for start in range(0, last_start + 1, self.hop_length):
                    self.segments.append(waveform[start : start + self.segment_length])
                    self.segment_labels.append(label)
            except Exception as e:
                # Best effort: a broken file is reported and skipped.
                print(f"Errore durante l'elaborazione del file {file_path}: {e}")
        print(f"Creati {len(self.segments)} segmenti totali.")

    def __len__(self):
        """Return the total number of segments across all files."""
        return len(self.segments)

    def __getitem__(self, idx):
        """
        Return ``(segment, label)`` for the segment at ``idx``.

        The segment is a float32 tensor of shape ``(1, segment_length)``
        (a channel dimension is added); the label is a float32 scalar tensor.
        """
        segment = self.segments[idx]
        label = self.segment_labels[idx]
        segment_tensor = torch.tensor(segment, dtype=torch.float32).unsqueeze(0)
        label_tensor = torch.tensor(label, dtype=torch.float32)
        return segment_tensor, label_tensor

In [None]:
# --- Settings and hyperparameters ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Dispositivo in uso: {DEVICE}")

dataset_name = "datasets/DAIC-WOZ-Cleaned"
LEARNING_RATE = 0.0001
BATCH_SIZE = 32
NUM_EPOCHS = 20
DROPOUT_RATE = 0.5
SR = 16000
SEGMENT_MS = 250
HOP_MS = 50
SEED = 42

# Seed the RNGs so weight init, dropout and DataLoader shuffling are reproducible.
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Load the split DataFrames ---
# Only the CSV reads are guarded: a FileNotFoundError raised later (e.g. while
# evaluating) must not be misreported as a missing split file.
try:
    train_df = pd.read_csv(os.path.join(dataset_name, 'train_split_Depression_AVEC2017.csv'))
    dev_df = pd.read_csv(os.path.join(dataset_name, 'dev_split_Depression_AVEC2017.csv'))
    test_df = pd.read_csv(os.path.join(dataset_name, 'full_test_split.csv'))
except FileNotFoundError:
    print(f"\nERRORE: Uno dei file CSV non è stato trovato nella cartella '{dataset_name}'.")
    print("Assicurati che il percorso sia corretto e che i file 'train_split_Depression_AVEC2017.csv',")
    print("'dev_split_Depression_AVEC2017.csv' e 'full_test_split.csv' esistano.")
else:
    # --- Extract paths and labels ---
    y_train = load_labels_from_dataset(train_df)
    y_dev = load_labels_from_dataset(dev_df)
    y_test = load_labels_from_dataset(test_df)

    train_paths = get_audio_paths(train_df, dataset_name)
    dev_paths = get_audio_paths(dev_df, dataset_name)
    test_paths = get_audio_paths(test_df, dataset_name)

    print(f"\nFile per training: {len(train_paths)}, Etichette: {len(y_train)}")
    print(f"File per validation: {len(dev_paths)}, Etichette: {len(y_dev)}")
    print(f"File per test: {len(test_paths)}, Etichette: {len(y_test)}")

    # --- Build Dataset and DataLoader ---
    train_dataset = AudioSegmentDataset(
        train_paths, y_train, sr=SR, segment_ms=SEGMENT_MS, hop_ms=HOP_MS
    )
    train_loader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2
    )

    # --- Build model, loss and optimizer ---
    model = CNNMLP(dropout_rate=DROPOUT_RATE).to(DEVICE)

    # Dry run: the first MLP layer is sized on the first forward pass, so it
    # must exist before the optimizer collects model.parameters(); otherwise
    # that layer would never be updated. eval() + no_grad() keep the BatchNorm
    # running statistics and autograd state untouched.
    model.eval()
    with torch.no_grad():
        model(torch.zeros(1, 1, train_dataset.segment_length, device=DEVICE))
    model.train()

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # --- Training and evaluation loop ---
    # NOTE(review): train_loop and evaluate are not defined in the cells shown
    # here; they are assumed to come from another cell — confirm they exist on a
    # fresh kernel (Restart & Run All).
    print("\n--- Inizio Addestramento ---")
    for epoch in range(NUM_EPOCHS):
        avg_train_loss = train_loop(model, train_loader, criterion, optimizer, DEVICE)

        # Evaluate on the development set at the end of every epoch.
        acc, sens, spec = evaluate(model, dev_paths, y_dev, DEVICE, sr=SR, segment_ms=SEGMENT_MS, hop_ms=HOP_MS)

        print(f"\nEpoca {epoch + 1}/{NUM_EPOCHS}")
        print(f"  Loss di Addestramento: {avg_train_loss:.4f}")
        print(f"  Metriche di Sviluppo (Dev) -> Accuratezza: {acc:.4f} | Sensitività: {sens:.4f} | Specificità: {spec:.4f}")

    print("\n--- Addestramento Completato ---")

    # --- Final evaluation on the test set ---
    print("\n--- Valutazione Finale sul Test Set ---")
    test_acc, test_sens, test_spec = evaluate(model, test_paths, y_test, DEVICE, sr=SR, segment_ms=SEGMENT_MS, hop_ms=HOP_MS)
    print(f"Accuratezza Test: {test_acc:.4f}")
    print(f"Sensitività Test (Recall classe 1): {test_sens:.4f}")
    print(f"Specificità Test (Recall classe 0): {test_spec:.4f}")

Dispositivo in uso: cpu

File per training: 107, Etichette: 107
File per validation: 35, Etichette: 35
File per test: 47, Etichette: 47
Creazione dei segmenti dal dataset...
Creati 1050247 segmenti totali.

--- Inizio Addestramento ---
