In [37]:
import os
import torch
import librosa
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
from sklearn.metrics import f1_score
from utils import load_labels_from_dataset, get_audio_paths
from collections import defaultdict
from sklearn.metrics import accuracy_score, confusion_matrix

# Fix the NumPy and PyTorch seeds so repeated runs are reproducible
seed_value = 42
torch.manual_seed(seed_value)
np.random.seed(seed_value)
if torch.cuda.is_available():
    # Ask cuDNN for deterministic kernels and switch off its autotuner
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(seed_value)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [38]:
class AudioDepressionDataset(Dataset):
    """Map-style dataset of fixed-length, overlapping waveform segments.

    Each audio file is cut into windows of ``segment_ms`` milliseconds that
    advance by ``hop_ms`` milliseconds. The constructor only builds an index
    of (file, start sample, label) entries from the frame count reported by
    ``torchaudio.info``; the samples themselves are read in ``__getitem__``.
    A trailing window that does not fit entirely in the file is dropped.

    Args:
        audio_paths: Sequence of audio file paths.
        labels: Sequence of per-file labels, aligned position by position
            with ``audio_paths``.
        return_filename: When True, every item also carries the base name of
            its source file under the ``'filename'`` key.
        sample_rate: Sampling rate in Hz used to turn the millisecond
            durations into sample counts.
        segment_ms: Length of one segment in milliseconds.
        hop_ms: Distance between the starts of consecutive segments, in
            milliseconds.

    Raises:
        ValueError: If ``audio_paths`` and ``labels`` differ in length, or if
            the segment or hop length rounds down to zero samples.
    """

    def __init__(self, audio_paths, labels, return_filename=False, sample_rate=16_000, segment_ms=250, hop_ms=50):
        if len(audio_paths) != len(labels):
            raise ValueError(
                f"audio_paths and labels must have the same length "
                f"(got {len(audio_paths)} and {len(labels)})"
            )

        self.audio_paths = audio_paths
        self.labels = labels
        self.return_filename = return_filename
        self.sample_rate = sample_rate
        self.segment_samples = int(sample_rate * (segment_ms / 1000.0))
        self.hop_samples = int(sample_rate * (hop_ms / 1000.0))
        # A zero hop would never advance the window and loop forever below.
        if self.segment_samples <= 0 or self.hop_samples <= 0:
            raise ValueError(
                "segment_ms and hop_ms must each correspond to at least one sample "
                f"(got segment_samples={self.segment_samples}, hop_samples={self.hop_samples})"
            )
        self.segments = []

        # Pair each path with its label by position. Looking the label up via
        # audio_paths.index(path) is quadratic and returns the first match's
        # label when the same path appears more than once.
        # NOTE(review): files are not resampled; the sample counts assume the
        # audio is stored at `sample_rate` - confirm for this dataset.
        for audio_path, label in zip(self.audio_paths, self.labels):
            num_frames = torchaudio.info(audio_path).num_frames
            filename = os.path.basename(audio_path)

            # One index entry per full window that fits in the file.
            last_start = num_frames - self.segment_samples
            for start in range(0, last_start + 1, self.hop_samples):
                self.segments.append({
                    "path": audio_path,
                    "filename": filename,
                    "start_sample": start,
                    "label": label
                })

    def __len__(self):
        """Return the total number of segments across all files."""
        return len(self.segments)

    def __getitem__(self, idx):
        """Load one segment from disk.

        Returns:
            dict with ``'input_values'`` (the waveform returned by
            ``torchaudio.load`` for this window) and ``'label'`` (a float32
            tensor of shape ``[1]``), plus ``'filename'`` when
            ``return_filename`` is True.
        """
        segment_info = self.segments[idx]

        # Read only the requested window instead of the whole file.
        waveform_segment, _ = torchaudio.load(
            segment_info["path"],
            frame_offset=segment_info["start_sample"],
            num_frames=self.segment_samples
        )

        item = {
            'input_values': waveform_segment,
            'label': torch.tensor([segment_info["label"]], dtype=torch.float32)
        }
        if self.return_filename:
            item['filename'] = segment_info["filename"]
        return item

In [39]:
class CNNMLP(nn.Module):
    """1-D CNN feature extractor followed by an MLP head ending in a sigmoid.

    Three Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d stages (kernel sizes
    64, 32 and 16) feed a two-layer MLP with dropout. The first linear layer
    is fixed at 30464 input features, which corresponds to a single-channel
    input of 4000 samples (250 ms at 16 kHz).

    Args:
        dropout_rate: Dropout probability used before each linear layer.
    """

    def __init__(self, dropout_rate=0.25):
        super().__init__()

        def conv_stage(in_channels, out_channels, kernel_size):
            # Conv -> BatchNorm -> ReLU -> halve the time axis
            return nn.Sequential(
                nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1),
                nn.BatchNorm1d(out_channels),
                nn.ReLU(),
                nn.MaxPool1d(kernel_size=2, stride=2),
            )

        # Convolutional stages
        self.conv_block1 = conv_stage(1, 16, 64)
        self.conv_block2 = conv_stage(16, 32, 32)
        self.conv_block3 = conv_stage(32, 64, 16)

        # MLP head
        self.flatten = nn.Flatten()
        self.mlp_block = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(30464, 128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        )

    def forward(self, batch):
        """Return one sigmoid output per sample for ``batch['input_values']``.

        Expected time-axis lengths for a [batch_size, 1, 4000] input:
            conv_block1: 4000 -> 3937 (conv) -> 1968 (pool)
            conv_block2: 1968 -> 1937 (conv) ->  968 (pool)
            conv_block3:  968 ->  953 (conv) ->  476 (pool)
        so the flattened feature vector has 64 * 476 = 30464 entries.
        """
        features = batch['input_values']
        for stage in (self.conv_block1, self.conv_block2, self.conv_block3):
            features = stage(features)
        return self.mlp_block(self.flatten(features))

In [40]:
dataset_name = "datasets/DAIC-WOZ-Cleaned"

# Split tables for the training and development partitions
train_df = pd.read_csv(os.path.join(dataset_name, 'train_split_Depression_AVEC2017.csv'))
dev_df = pd.read_csv(os.path.join(dataset_name, 'dev_split_Depression_AVEC2017.csv'))

# Per-file labels, then per-file audio paths (helpers come from utils)
y_train, y_dev = load_labels_from_dataset(train_df), load_labels_from_dataset(dev_df)
train_paths, dev_paths = get_audio_paths(train_df, dataset_name), get_audio_paths(dev_df, dataset_name)

# Segment-level datasets with the default window and hop sizes
train_dataset = AudioDepressionDataset(audio_paths=train_paths, labels=y_train)
dev_dataset = AudioDepressionDataset(audio_paths=dev_paths, labels=y_dev)

# DataLoaders: only the training loader shuffles
batch_size = 200
num_workers = 0

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

# Model
model = CNNMLP()

KeyboardInterrupt: 

In [None]:
def print_model_summary(model):
    """Print a table with one row per named parameter of ``model``.

    Each row shows the parameter name, its element count and whether it
    requires gradients. The footer shows the summed element count of the
    parameters that require gradients.
    """
    rule = "-" * 89
    print(rule)
    print("| Layer Name                                              | # of Parameters | Trainable |")
    print(rule)

    trainable_count = 0
    for name, param in model.named_parameters():
        size = param.numel()
        trainable = param.requires_grad
        # Frozen parameters are listed but left out of the footer total
        trainable_count += size if trainable else 0
        print(f"| {name:<55} | {size:<15} | {str(trainable):<9} |")

    print(rule)
    print(f"| Total # of Parameters: {trainable_count:<62} |" )
    print(rule)

# Show the per-parameter table for `model`
print_model_summary(model)

-----------------------------------------------------------------------------------------
| Layer Name                                              | # of Parameters | Trainable |
-----------------------------------------------------------------------------------------
| conv_block1.0.weight                                    | 1024            | True      |
| conv_block1.0.bias                                      | 16              | True      |
| conv_block1.1.weight                                    | 16              | True      |
| conv_block1.1.bias                                      | 16              | True      |
| conv_block2.0.weight                                    | 16384           | True      |
| conv_block2.0.bias                                      | 32              | True      |
| conv_block2.1.weight                                    | 32              | True      |
| conv_block2.1.bias                                      | 32              | True      |
| conv_blo

In [None]:
class EarlyStopping:
    """Signal when a monitored score has stopped improving.

    Args:
        patience: Number of consecutive non-improving calls after which
            ``__call__`` returns True.
        min_delta: Amount by which a score must beat the best one so far to
            count as an improvement.
        mode: ``'max'`` when larger scores are better; any other value treats
            smaller scores as better.
    """

    def __init__(self, patience=3, min_delta=0.0, mode='max'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        # Start from the worst possible score so the first call always improves
        self.best_score = np.inf if mode != 'max' else -np.inf

    def __call__(self, current_score):
        """Record ``current_score`` and return True when training should stop."""
        if self.mode == 'max':
            gain = current_score - self.best_score
        else:
            gain = self.best_score - current_score

        if gain > self.min_delta:
            # New best: remember it and restart the patience window
            self.best_score = current_score
            self.counter = 0
            return False

        self.counter += 1
        if self.counter >= self.patience:
            return True  # Early stop
        return False

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, scheduler, device, epoch, num_epochs):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Module called with the whole batch dict; its output is
            thresholded at 0.5 to obtain predictions.
        data_loader: Iterable of batch dicts with ``'input_values'`` and
            ``'label'`` tensors; must support ``len()``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional scheduler stepped once per batch; skipped when falsy.
        device: Device the batch tensors are moved to.
        epoch: Zero-based epoch index (progress-bar text only).
        num_epochs: Total number of epochs (progress-bar text only).

    Returns:
        ``(avg_loss, accuracy)``: the mean of the per-batch losses as a float,
        and the fraction of correctly classified samples as a float64 tensor.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model.train()
    total_loss = 0.0
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    samples_seen = 0
    num_batches = 0

    # Iterate the loader that was passed in, not a notebook-level global.
    train_pbar = tqdm(enumerate(data_loader),
                      total=len(data_loader),
                      desc=f"Epoch {epoch+1}/{num_epochs} - Training")
    for batch_idx, batch in train_pbar:
        # Move the batch tensors to the target device
        batch['input_values'] = batch['input_values'].to(device)
        batch['label'] = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(batch)

        loss = loss_fn(outputs, batch['label'])
        batch_loss = loss.item()
        total_loss += batch_loss

        # Threshold the outputs to get hard predictions
        preds = (outputs > 0.5).float()
        correct_predictions += torch.sum(preds == batch['label'])
        # Count real samples so a short final batch does not skew the running accuracy
        samples_seen += batch['label'].size(0)
        num_batches += 1

        # Backpropagation and weight update
        loss.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()  # Per-batch learning-rate schedule

        train_pbar.set_postfix(loss=batch_loss, accuracy=correct_predictions.item() / samples_seen)

    if samples_seen == 0:
        raise ValueError("data_loader yielded no samples")

    avg_loss = total_loss / num_batches
    accuracy = correct_predictions.double() / samples_seen
    return avg_loss, accuracy

def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Returns:
        ``(avg_loss, accuracy, f1)``: the mean of the per-batch losses, the
        number of correct predictions divided by ``len(data_loader.dataset)``,
        and the macro-averaged F1 score over all segments.
    """
    model.eval()
    running_loss, n_correct = 0, 0
    all_preds, all_targets = [], []

    with torch.no_grad():
        for batch in data_loader:
            # Move the batch tensors to the evaluation device (updates the dict in place)
            batch['input_values'] = batch['input_values'].to(device)
            batch['label'] = batch['label'].to(device)
            labels = batch['label']

            outputs = model(batch)
            running_loss += loss_fn(outputs, labels).item()

            # Hard predictions at the 0.5 threshold
            preds = (outputs > 0.5).float()
            n_correct += (preds == labels).sum()

            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(labels.cpu().numpy())

    macro_f1 = f1_score(all_targets, all_preds, average='macro')
    mean_loss = running_loss / len(data_loader)
    accuracy = n_correct.double() / len(data_loader.dataset)
    return mean_loss, accuracy, macro_f1

In [None]:
# Move the model to the target device before the optimizer is built on its parameters
model = model.to(device)

# Optimizer and loss
optimizer = optim.AdamW(model.parameters(), lr=0.01)
criterion = nn.BCELoss()
early_stopping = EarlyStopping(patience=5, min_delta=0.005, mode='max')
model_save_path = "depression_classifier_best.pth"

# Learning-rate schedule bookkeeping (no scheduler is currently used)
num_epochs = 100
total_steps  = len(train_dataloader) * num_epochs
num_warmup_steps = total_steps // 10
scheduler = None
best_val_acc = -np.inf
best_model_weights = None

for epoch in range(num_epochs):
    # Training
    train_loss, train_acc = train_epoch(
        model,
        train_dataloader,
        criterion,
        optimizer,
        scheduler,
        device,
        epoch,
        num_epochs
    )

    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

    # Validation
    val_loss, val_acc, val_f1 = eval_model(
        model,
        dev_dataloader,
        criterion,
        device
    )
    # Work with a plain float so comparisons and bookkeeping do not hold on to tensors
    val_acc = float(val_acc)

    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}, Validation F1: {val_f1:.4f}")
    # Keep a snapshot of the best model so far
    if val_acc > best_val_acc + early_stopping.min_delta:
        best_val_acc = val_acc
        # state_dict() returns references to the live tensors and dict.copy() is
        # shallow, so each tensor is cloned; otherwise later epochs would
        # overwrite the "best" weights in place.
        best_model_weights = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"Nuovo miglior Accuracy: {best_val_acc:.4f}")

    if early_stopping(val_acc):
        print(f"Early stopping attivato. Miglior accuracy: {best_val_acc:.4f}")
        break

print("Training Completato")
print(f"Miglior Accuracy: {best_val_acc:.4f}")
# Persist the best snapshot; fall back to the current weights only if no epoch ever improved
torch.save(
    best_model_weights if best_model_weights is not None else model.state_dict(),
    model_save_path
)


=== Epoch 1/100 ===


Epoch 1/100 - Training:   0%|          | 0/5252 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def eval_model_test(model, data_loader, device):
    """Evaluate ``model`` at file level by averaging its segment scores.

    Every batch must provide ``'input_values'``, ``'label'`` and
    ``'filename'`` (one file name per segment). The model outputs of all
    segments that share a file name are averaged, and the file is predicted
    positive when that mean exceeds 0.5.

    Args:
        model: Module called with the whole batch dict, returning one score
            per segment.
        data_loader: Iterable of batch dicts as described above.
        device: Device the input and label tensors are moved to.

    Returns:
        ``(accuracy, sensitivity, specificity)`` computed over files.
        Sensitivity or specificity is 0.0 when its class has no files.

    Raises:
        ValueError: If ``data_loader`` yields no segments.
    """
    model.eval()

    # Predicted scores collected per file; defaultdict creates the list on first use
    file_scores = defaultdict(list)
    file_labels = {}  # Ground-truth label of each file

    with torch.no_grad():
        for batch in data_loader:
            batch['input_values'] = batch['input_values'].to(device)
            batch['label'] = batch['label'].to(device)
            # File names are plain strings, not tensors, so they are not moved to a device
            filenames = batch['filename']

            outputs = model(batch)

            # One score and one label per segment, as Python numbers
            scores = outputs.detach().reshape(-1).cpu().tolist()
            labels = batch['label'].reshape(-1).cpu().tolist()

            for filename, score, label in zip(filenames, scores, labels):
                file_scores[filename].append(score)
                # The first label seen for a file is kept for that file
                file_labels.setdefault(filename, int(label))

    if not file_scores:
        raise ValueError("data_loader yielded no segments to evaluate")

    final_predictions = []
    true_labels = []

    # Iterate files in sorted order so predictions and labels stay aligned
    for filename in sorted(file_scores.keys()):
        avg_score = np.mean(file_scores[filename])
        final_predictions.append(1 if avg_score > 0.5 else 0)
        true_labels.append(file_labels[filename])

    accuracy = accuracy_score(true_labels, final_predictions)

    # Force a 2x2 matrix even when only one class occurs, so it always
    # unpacks into TN, FP, FN, TP
    tn, fp, fn, tp = confusion_matrix(true_labels, final_predictions, labels=[0, 1]).ravel()

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0

    return accuracy, sensitivity, specificity

In [None]:
# Build the test split the same way as the train/dev splits
test_df = pd.read_csv(os.path.join(dataset_name, 'full_test_split.csv'))
y_test = load_labels_from_dataset(test_df)
test_paths = get_audio_paths(test_df, dataset_name)

# File names are needed so segment scores can be aggregated per file
test_dataset = AudioDepressionDataset(
    audio_paths=test_paths,
    labels=y_test,
    return_filename=True,
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers
)

# Load the best model
model.load_state_dict(best_model_weights)
model.eval()

# eval_model_test takes (model, data_loader, device) and returns
# file-level accuracy, sensitivity and specificity
test_acc, test_sensitivity, test_specificity = eval_model_test(
    model,
    test_dataloader,
    device
)

print(f"\n=== Test Results ===")
print(f"Test Accuracy: {test_acc:.4f}, Test Sensitivity: {test_sensitivity:.4f}, Test Specificity: {test_specificity:.4f}")