In [6]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, f1_score
from tqdm import tqdm
import random
from utils import load_labels_from_dataset, get_audio_paths

# Set random seeds for reproducibility
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [7]:
class CNNMLP(nn.Module):
    """
    Implementazione dell'architettura CNN+MLP descritta nel paper.
    La rete accetta in input segmenti di waveform audio (1D).
    """
    def __init__(self, dropout_rate=0.5, num_classes=2):
        super(CNNMLP, self).__init__()
        self.conv_block1 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=64, stride=1),
            nn.BatchNorm1d(num_features=16),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Dropout(p=dropout_rate)
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=32, stride=1),
            nn.BatchNorm1d(num_features=32),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Dropout(p=dropout_rate)
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=16, stride=1),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Dropout(p=dropout_rate)
        )
        self.flatten = nn.Flatten()
        self.mlp_block = nn.Sequential(
            nn.Linear(in_features=1, out_features=128), # Will be initialized dynamically
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(in_features=128, out_features=num_classes)
        )
        self._mlp_initialized = False

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x_flattened = self.flatten(x)
        if not self._mlp_initialized:
            in_features = x_flattened.shape[1]
            self.mlp_block[0] = nn.Linear(in_features, 128).to(x.device)
            print(f"MLP inizializzato dinamicamente con {in_features} feature di input.")
            self._mlp_initialized = True
        output = self.mlp_block(x_flattened)
        return output

In [8]:
class AudioSegmentDataset(Dataset):
    """
    Dataset PyTorch generico per caricare e segmentare file audio.
    """
    def __init__(self, file_paths, labels, sr=16000, segment_ms=250, hop_ms=50):
        self.file_paths = file_paths
        self.labels = labels
        self.sr = sr
        self.segment_length = int(sr * (segment_ms / 1000.0))
        self.hop_length = int(sr * (hop_ms / 1000.0))
        self.segments = []
        self.segment_labels = []
        self.file_indices = []  # To track which file each segment belongs to
        self._create_segments()

    def _create_segments(self):
        print("Creazione dei segmenti dal dataset...")
        for i, file_path in enumerate(self.file_paths):
            label = self.labels[i]
            try:
                waveform, original_sr = librosa.load(file_path, sr=self.sr)
                if np.max(np.abs(waveform)) > 0:
                    waveform = waveform / np.max(np.abs(waveform))
                start = 0
                while start + self.segment_length <= len(waveform):
                    segment = waveform[start : start + self.segment_length]
                    self.segments.append(segment)
                    self.segment_labels.append(label)
                    self.file_indices.append(i)
                    start += self.hop_length
            except Exception as e:
                print(f"Errore durante l'elaborazione del file {file_path}: {e}")
        print(f"Creati {len(self.segments)} segmenti totali.")

    def get_segments_for_file(self, file_path):
        """Get all segments for a specific file"""
        try:
            waveform, _ = librosa.load(file_path, sr=self.sr)
            if np.max(np.abs(waveform)) > 0:
                waveform = waveform / np.max(np.abs(waveform))
            
            segments = []
            start = 0
            while start + self.segment_length <= len(waveform):
                segment = waveform[start : start + self.segment_length]
                segments.append(segment)
                start += self.hop_length
            
            if segments:
                segments_tensor = torch.stack([torch.tensor(seg, dtype=torch.float32).unsqueeze(0) for seg in segments])
                return segments_tensor, None
            else:
                return torch.empty(0), None
        except Exception as e:
            print(f"Errore durante l'elaborazione del file {file_path}: {e}")
            return torch.empty(0), None

    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        segment = self.segments[idx]
        label = self.segment_labels[idx]
        segment_tensor = torch.tensor(segment, dtype=torch.float32).unsqueeze(0)
        label_tensor = torch.tensor(label, dtype=torch.long)
        return segment_tensor, label_tensor

In [9]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.0, mode='max'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = -np.inf if mode == 'max' else np.inf

    def __call__(self, current_score):
        if self.mode == 'max':
            improvement = (current_score - self.best_score) > self.min_delta
        else:
            improvement = (self.best_score - current_score) > self.min_delta

        if improvement:
            self.best_score = current_score
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                return True  # Early stop
        return False

In [10]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, epoch, num_epochs):
    model.train()
    total_loss, correct_predictions = 0, 0
    train_pbar = tqdm(enumerate(data_loader), 
                      total=len(data_loader),
                      desc=f"Epoch {epoch+1}/{num_epochs} - Training")
    
    for batch_idx, (batch_segments, batch_labels) in train_pbar:
        batch_segments = batch_segments.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(batch_segments)
        
        # Calculate loss
        loss = loss_fn(outputs, batch_labels)
        total_loss += loss.item()

        # Calculate predictions for multi-class classification
        preds = torch.argmax(outputs, dim=1)
        correct_predictions += torch.sum(preds == batch_labels)

        # Backward pass
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(data_loader)
    accuracy = correct_predictions.double() / len(data_loader.dataset)
    return avg_loss, accuracy

def evaluate_on_full_files(model, file_paths, labels, device, sr=16000, segment_ms=250, hop_ms=50):
    """Evaluate model on complete audio files by averaging segment predictions"""
    model.eval()
    all_targets = []
    all_predictions = []
    
    segment_length = int(sr * (segment_ms / 1000.0))
    hop_length = int(sr * (hop_ms / 1000.0))

    with torch.no_grad():
        for file_path, label in tqdm(zip(file_paths, labels), desc="Evaluating on full files", total=len(file_paths)):
            try:
                # Load and segment the audio file
                waveform, _ = librosa.load(file_path, sr=sr)
                if np.max(np.abs(waveform)) > 0:
                    waveform = waveform / np.max(np.abs(waveform))
                
                segments = []
                start = 0
                while start + segment_length <= len(waveform):
                    segment = waveform[start : start + segment_length]
                    segments.append(segment)
                    start += hop_length
                
                if not segments:
                    continue
                
                # Convert to tensor and add channel dimension
                segments_tensor = torch.stack([torch.tensor(seg, dtype=torch.float32).unsqueeze(0) for seg in segments])
                segments_tensor = segments_tensor.to(device)
                
                # Get predictions for all segments
                segment_outputs = model(segments_tensor)
                segment_probs = torch.softmax(segment_outputs, dim=1)
                
                # Average probabilities across segments
                avg_probs = torch.mean(segment_probs, dim=0)
                final_prediction = torch.argmax(avg_probs).item()
                
                all_predictions.append(final_prediction)
                all_targets.append(label)
                
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")
                continue

    return np.array(all_targets), np.array(all_predictions)

def eval_model(model, file_paths, labels, loss_fn, device, sr=16000, segment_ms=250, hop_ms=50):
    """Evaluation function similar to SSL fine-tuning"""
    targets, predictions = evaluate_on_full_files(model, file_paths, labels, device, sr, segment_ms, hop_ms)
    
    if len(targets) == 0:
        return 0.0, 0.0, 0.0
    
    accuracy = accuracy_score(targets, predictions)
    f1 = f1_score(targets, predictions, average='macro')
    
    # Calculate a dummy loss (since we don't have segments with labels for full files)
    loss = 0.0
    
    return loss, accuracy, f1

In [11]:
# Dataset configuration
dataset_name = "datasets/DAIC-WOZ-Cleaned"
num_classes = 2

# Load data splits
train_df = pd.read_csv(os.path.join(dataset_name, 'train_split_Depression_AVEC2017.csv'))
dev_df = pd.read_csv(os.path.join(dataset_name, 'dev_split_Depression_AVEC2017.csv'))
test_df = pd.read_csv(os.path.join(dataset_name, 'full_test_split.csv'))

# Extract labels and paths
y_train = load_labels_from_dataset(train_df)
y_dev = load_labels_from_dataset(dev_df) 
y_test = load_labels_from_dataset(test_df)

train_paths = get_audio_paths(train_df, dataset_name)
dev_paths = get_audio_paths(dev_df, dataset_name)
test_paths = get_audio_paths(test_df, dataset_name)

print(f"Training files: {len(train_paths)}, Labels: {len(y_train)}")
print(f"Validation files: {len(dev_paths)}, Labels: {len(y_dev)}")
print(f"Test files: {len(test_paths)}, Labels: {len(y_test)}")

# Audio processing parameters
SR = 16000
SEGMENT_MS = 250
HOP_MS = 50

# Create datasets
train_dataset = AudioSegmentDataset(
    train_paths, y_train, sr=SR, segment_ms=SEGMENT_MS, hop_ms=HOP_MS
)

# Create data loaders
BATCH_SIZE = 32
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2
)

# Initialize model
DROPOUT_RATE = 0.5
model = CNNMLP(dropout_rate=DROPOUT_RATE, num_classes=num_classes).to(device)

print(f"\nModel initialized with {sum(p.numel() for p in model.parameters())} parameters")

Training files: 107, Labels: 107
Validation files: 35, Labels: 35
Test files: 47, Labels: 47
Creazione dei segmenti dal dataset...
Creati 1050247 segmenti totali.

Model initialized with 51026 parameters
Creati 1050247 segmenti totali.

Model initialized with 51026 parameters


In [None]:
# Training configuration (similar to SSL fine-tuning)
LEARNING_RATE = 1e-4
NUM_EPOCHS = 20

# Optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()
early_stopping = EarlyStopping(patience=5, min_delta=0.005, mode='max')
model_save_path = "depression_cnn_classifier_best.pth"

# Training loop (similar structure to SSL fine-tuning)
best_val_f1 = -np.inf
best_model_weights = None

print("=== Starting Training ===")

for epoch in range(NUM_EPOCHS):
    print(f"\n=== Epoch {epoch + 1}/{NUM_EPOCHS} ===")
    
    # Training
    train_loss, train_acc = train_epoch(
        model, 
        train_loader, 
        criterion, 
        optimizer, 
        device,
        epoch,
        NUM_EPOCHS
    )
    
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")

    # Validation on full files
    val_loss, val_acc, val_f1 = eval_model(
        model, 
        dev_paths, 
        y_dev, 
        criterion, 
        device,
        sr=SR,
        segment_ms=SEGMENT_MS,
        hop_ms=HOP_MS
    )
    
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}, Validation F1: {val_f1:.4f}")
    
    # Save best model
    if val_f1 > best_val_f1 + early_stopping.min_delta:
        best_val_f1 = val_f1 
        best_model_weights = model.state_dict().copy()
        print(f"New best F1: {best_val_f1:.4f}")

    # Early stopping check
    if early_stopping(val_f1):
        print(f"Early stopping activated. Best F1: {best_val_f1:.4f}")
        break

print("Training Completed")
print(f"Best F1 Score: {best_val_f1:.4f}")

# Save the best model
if best_model_weights:
    torch.save(best_model_weights, model_save_path)
    print(f"Best model saved to {model_save_path}")

=== Starting Training ===

=== Epoch 1/20 ===


In [None]:
# Test evaluation
print("\n=== Test Evaluation ===")

# Load best model if available
if best_model_weights:
    model.load_state_dict(best_model_weights)
    print("Best model weights loaded for testing")

# Evaluate on test set
test_loss, test_acc, test_f1 = eval_model(
    model, 
    test_paths, 
    y_test, 
    criterion, 
    device,
    sr=SR,
    segment_ms=SEGMENT_MS,
    hop_ms=HOP_MS
)

print(f"\n=== Test Results ===")
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}, Test F1: {test_f1:.4f}")

# Additional detailed evaluation
targets, predictions = evaluate_on_full_files(
    model, test_paths, y_test, device, sr=SR, segment_ms=SEGMENT_MS, hop_ms=HOP_MS
)

if len(targets) > 0:
    from sklearn.metrics import classification_report, confusion_matrix
    
    print("\n=== Detailed Test Results ===")
    print("Classification Report:")
    print(classification_report(targets, predictions, target_names=['Non-Depressed', 'Depressed']))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(targets, predictions)
    print(cm)
    
    # Calculate sensitivity and specificity
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    print(f"\nSensitivity (Recall for Depressed): {sensitivity:.4f}")
    print(f"Specificity (Recall for Non-Depressed): {specificity:.4f}")