In [60]:
import numpy as np
import os
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [61]:
# Disabled one-off cleanup step. The whole cell is a bare triple-quoted string,
# so none of the statements inside it execute; evaluating the cell only echoes
# the string back as the cell output.
# The text inside describes: read full_test_split.csv, drop the rows that are
# entirely empty/NaN, and overwrite the same file in place with the result.
'''
# Percorso corretto per il file
file_path = os.path.join('datasets', 'DAIC-WOZ', 'full_test_split.csv')

# Carica il file con le righe vuote
df = pd.read_csv(file_path)

# Rimuovi le righe vuote/NaN
df = df.dropna(how='all')

# Sovrascrive il file originale pulito
df.to_csv(file_path, index=False)
'''

"\n# Percorso corretto per il file\nfile_path = os.path.join('datasets', 'DAIC-WOZ', 'full_test_split.csv')\n\n# Carica il file con le righe vuote\ndf = pd.read_csv(file_path)\n\n# Rimuovi le righe vuote/NaN\ndf = df.dropna(how='all')\n\n# Sovrascrive il file originale pulito\ndf.to_csv(file_path, index=False)\n"

In [62]:
def load_features_for_dataset(df, dataset_name):
    """
    Load the articulation features and binary labels for every row of ``df``.

    For each participant the file
    ``features/<dataset_name>/<Participant_ID>_P/articulation_features.npy``
    is read and flattened into a 1-D vector.

    Args:
        df: DataFrame with a ``Participant_ID`` column and a binary label
            column (``PHQ8_Binary`` when present, otherwise ``PHQ_Binary``).
        dataset_name: Name of the dataset folder under ``features``.

    Returns:
        tuple: ``(X_features, y_labels)`` as numpy arrays, in the row order
        of ``df``.
    """
    # Prefer the PHQ8 column name; fall back to the alternative spelling.
    if 'PHQ8_Binary' in df.columns:
        label_col = 'PHQ8_Binary'
    else:
        label_col = 'PHQ_Binary'

    feature_vectors = []
    targets = []

    for _, record in df.iterrows():
        pid = int(record['Participant_ID'])
        target = int(record[label_col])  # 0 = non-depressed, 1 = depressed

        # Per-participant folder is named "<id>_P".
        npy_path = os.path.join("features", dataset_name, f"{pid}_P", "articulation_features.npy")

        loaded = np.load(npy_path)
        feature_vectors.append(loaded.flatten())
        targets.append(target)

    X_features = np.array(feature_vectors)
    y_labels = np.array(targets)
    return X_features, y_labels

In [63]:
# Read the participant lists for the three DAIC-WOZ splits (train, dev, test).
train_df, dev_df, test_df = (
    pd.read_csv(os.path.join('datasets', 'DAIC-WOZ', split_csv))
    for split_csv in (
        'train_split_Depression_AVEC2017.csv',
        'dev_split_Depression_AVEC2017.csv',
        'full_test_split.csv',
    )
)

dataset_name = "DAIC-WOZ-Cleaned"

# Load features and labels for each split independently.
X_train, y_train = load_features_for_dataset(train_df, dataset_name)
X_dev, y_dev = load_features_for_dataset(dev_df, dataset_name)
X_test, y_test = load_features_for_dataset(test_df, dataset_name)

# Report split sizes, then per-class counts (index 0 = label 0, index 1 = label 1).
print(f"Training set: {len(X_train)} samples")
print(f"Dev set: {len(X_dev)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Training label distribution: {np.bincount(y_train)}")
print(f"Dev label distribution: {np.bincount(y_dev)}")
print(f"Test label distribution: {np.bincount(y_test)}")

# Standardise every split using statistics fitted on the training split only,
# so no information from dev/test leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_dev_scaled, X_test_scaled = (
    scaler.transform(split_features)
    for split_features in (X_train, X_dev, X_test)
)

Training set: 107 samples
Dev set: 35 samples
Test set: 47 samples
Training label distribution: [77 30]
Dev label distribution: [23 12]
Test label distribution: [33 14]


In [64]:
# Hyperparameter tuning: manual grid search with model selection on the dev set.
# No cross-validation is performed here: every candidate is fitted once on the
# training split and scored once on the dev split. The test split is only used
# for the final evaluation of the selected model.

# Candidate values to try
param_grid = {
    'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
    'gamma': [1e-3, 1e-2, 1e-1, 1, 'scale', 'auto'],
    'kernel': ['rbf', 'linear']
}


# Start below any reachable accuracy so the first candidate is always accepted;
# starting at 0 with a strict ">" would leave best_model as None if every
# candidate scored 0.0 on the dev set.
best_score = -np.inf
best_params = None
best_model = None

for C in param_grid['C']:
    for gamma in param_grid['gamma']:
        for kernel in param_grid['kernel']:
            # The linear kernel does not use gamma, so fit it only once per C
            # (at the 'scale' entry) instead of once per gamma value.
            if kernel == 'linear' and gamma != 'scale':
                continue

            # Fit on the training split
            svm = SVC(C=C, gamma=gamma, kernel=kernel, random_state=42, class_weight='balanced')
            svm.fit(X_train_scaled, y_train)

            # Score on the dev split
            y_dev_pred = svm.predict(X_dev_scaled)
            dev_accuracy = accuracy_score(y_dev, y_dev_pred)

            # Strict ">" keeps the earliest candidate on ties.
            # NOTE(review): plain accuracy is the selection metric; if the dev
            # split is class-imbalanced this favours the majority class --
            # consider balanced accuracy or F1 if minority recall matters.
            if dev_accuracy > best_score:
                best_score = dev_accuracy
                best_params = {'C': C, 'gamma': gamma, 'kernel': kernel}
                best_model = svm

print(f"Best parameters: {best_params}")
print(f"Best dev accuracy: {best_score:.3f}")

# Final, one-off evaluation of the selected model on the held-out test split.
y_test_pred = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Final Test Accuracy: {test_accuracy:.3f}")
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

# labels=[0, 1] forces a 2x2 matrix even when one class is missing from both
# y_test and the predictions, so the four-way unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred, labels=[0, 1]).ravel()

# Sensitivity = recall on class 1, specificity = recall on class 0.
# Report NaN rather than dividing by zero when a class has no true samples.
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else float('nan')
specificity = tn / (tn + fp) if (tn + fp) > 0 else float('nan')

print(f"Sensitivity: {sensitivity:.3f}")
print(f"Specificity: {specificity:.3f}")

Best parameters: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
Best dev accuracy: 0.686
Final Test Accuracy: 0.660

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.85      0.78        33
           1       0.38      0.21      0.27        14

    accuracy                           0.66        47
   macro avg       0.55      0.53      0.53        47
weighted avg       0.62      0.66      0.63        47

Sensitivity: 0.214
Specificity: 0.848
