In [None]:
# ! pip install librosa
# ! pip install praat-parselmouth  # PyPI name of the library imported as `parselmouth`
# ! pip install pyAudioAnalysis
# ! pip install torch
# ! pip install transformers
# ! pip install pandas
# ! pip install tqdm
# ! pip install scikit-learn
# ! pip install nolds

In [None]:
%cd Project

[Errno 2] No such file or directory: 'Project'
/content/Project


In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torchaudio
import librosa
import parselmouth
from parselmouth.praat import call
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score, roc_curve, auc
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoFeatureExtractor, WhisperModel, AutoModelForAudioClassification
import logging
import gc
from torch.utils.data import Dataset, DataLoader
import nolds
from sklearn.model_selection import GroupKFold
from pathlib import Path
import pickle
import joblib

In [None]:
# Root folder of the filtered corpus and the class label assigned to each sub-folder.
base_path = "ParkCeleb_filtered"
groups = {"PD": 1, "CN": 0}

parkinson_files = []
control_files = []

# Gather every .wav file found at any depth below each group folder.
for group, label in groups.items():
    group_path = os.path.join(base_path, group)
    wav_files = glob(os.path.join(group_path, "**", "*.wav"), recursive=True)
    (parkinson_files if label == 1 else control_files).extend(wav_files)

# Parkinson files come first, controls second; the labels follow the same order.
all_files = [*parkinson_files, *control_files]
labels = [1 for _ in parkinson_files] + [0 for _ in control_files]

print(f"Found {len(parkinson_files)} Parkinson files and {len(control_files)} Control files.")


Found 8708 Parkinson files and 5349 Control files.


# Load ParkCeleb dataset and extract speaker IDs

In [None]:
def load_parkceleb_dataset_with_speakers(dataset_path):
    """Collect every .wav file of the dataset together with its label and speaker ID.

    The files are searched recursively below ``<dataset_path>/PD`` (label 1) and
    ``<dataset_path>/CN`` (label 0). The speaker ID is the first directory below
    the group folder, e.g. ``pd_01`` for ``<dataset_path>/PD/pd_01/utterance_1.wav``.

    Args:
        dataset_path: Root folder that contains the ``PD`` and ``CN`` sub-folders.

    Returns:
        Tuple ``(audio_files, labels, speaker_ids)`` of three equally long lists.
        Files lying directly in a group folder (no speaker directory in between)
        are reported on stdout and left out of all three lists.
    """
    groups = {"PD": 1, "CN": 0}
    audio_files = []
    labels = []
    speaker_ids = []

    # Loop over each group (PD, CN)
    for group, label in groups.items():
        group_path = os.path.join(dataset_path, group)
        # glob() yields files in an unspecified order; sorting keeps runs reproducible
        wav_paths = sorted(glob(os.path.join(group_path, "**", "*.wav"), recursive=True))

        for audio_file in wav_paths:
            # Path relative to the group folder: <speaker>/.../<file>.wav
            path_parts = os.path.relpath(audio_file, group_path).split(os.sep)

            # A single component means the file sits directly in the group folder,
            # so that component is the file name, not a speaker directory.
            if len(path_parts) < 2:
                print(f"Could not extract speaker ID from path: {audio_file}")
                continue

            audio_files.append(audio_file)
            labels.append(label)
            speaker_ids.append(path_parts[0])

    return audio_files, labels, speaker_ids

# Create whisper features

In [None]:
# Read time before diagnosis from speakers_info.csv
def get_time_before_diagnosis(speaker_path):
    """Look up the years-before-diagnosis value in ``<speaker_path>/speakers_info.csv``.

    Only rows with ``status == 'target'``, ``before_after_diagnosis == 'before'``
    and ``years_from_diagnosis`` strictly between 0 and 11 qualify.

    Returns:
        ``years_from_diagnosis`` of the first qualifying row, or ``None`` when no
        row qualifies or the file cannot be read (the error is printed).
    """
    info_path = os.path.join(speaker_path, "speakers_info.csv")

    try:
        info = pd.read_csv(info_path)

        # One boolean mask per condition, combined below
        is_target = info['status'] == 'target'
        recorded_before = info['before_after_diagnosis'] == 'before'
        in_window = info['years_from_diagnosis'].between(0, 11, inclusive='neither')

        matches = info[is_target & recorded_before & in_window]
        if matches.empty:
            return None
        return matches['years_from_diagnosis'].iloc[0]
    except Exception as e:
        print(f"Error reading {info_path}: {e}")
        return None

def extract_whisper_features(audio_paths, model_name="openai/whisper-small"):
    """Embed every audio file with the Whisper encoder and collect per-file metadata.

    Each file is loaded at 16 kHz mono, run through the Whisper feature extractor
    and model, and represented by the mean over the sequence dimension of the last
    encoder hidden state.

    Args:
        audio_paths: Iterable of paths to audio files.
        model_name: Hugging Face identifier of the Whisper checkpoint to load.

    Returns:
        Tuple ``(features, labels, ids, times, lengths, successful_paths)``.
        ``features`` is a NumPy array with one row per successfully processed file;
        the five lists are aligned with its rows. Files that raise are reported on
        stdout and skipped. If no file could be processed, the same six-element
        tuple is returned with an empty array and empty lists.
    """
    model = WhisperModel.from_pretrained(model_name)
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    # Whisper's forward pass requires decoder input even though only the encoder
    # output is used; the same 100 dummy tokens serve every file, so build them once.
    decoder_input_ids = torch.tensor([[1] * 100]).to(device)

    features = []
    labels = []
    ids = []
    times = []
    lengths = []
    successful_paths = []

    for audio_path in tqdm(audio_paths, desc="Extracting Whisper features"):
        try:
            audio_array, sr = librosa.load(audio_path, sr=16000, mono=True)

            # Extract features using the feature extractor
            inputs = feature_extractor(
                audio_array,
                sampling_rate=16000,
                return_tensors="pt"
            )

            # Move inputs to the correct device
            inputs = {key: value.to(device) for key, value in inputs.items()}

            # Forward pass to get hidden states
            with torch.no_grad():
                outputs = model(**inputs, decoder_input_ids=decoder_input_ids, output_hidden_states=True)

            # Last encoder hidden state, averaged across the sequence dimension
            embeddings = outputs.encoder_hidden_states[-1].mean(dim=1).squeeze().cpu().numpy()

            # NOTE(review): parts[2] is the patient folder (pd_01 etc.) only for paths
            # shaped like <dataset>/<group>/<patient>/... — confirm against the layout.
            patient_id = Path(audio_path).parts[2]

            # speakers_info.csv is looked up two directory levels above the audio file
            speaker_dir = Path(audio_path).parent
            audio_dir = Path(speaker_dir).parent
            time_before_diagnosis = get_time_before_diagnosis(audio_dir)

            # The label comes from the name of that same directory
            group = Path(audio_path).parent.parent.name
            label = 1 if group == "PD" else 0

            duration = librosa.get_duration(y=audio_array, sr=sr)

            # Append only once every value exists, so the result lists stay aligned
            # even if one of the steps above raises.
            features.append(embeddings)
            successful_paths.append(audio_path)
            labels.append(label)
            ids.append(patient_id)
            times.append(time_before_diagnosis)
            lengths.append(duration)

        except Exception as e:
            print(f"Failed to process {audio_path}: {e}")

    if not features:
        # Same arity as the regular return so callers can always unpack six values
        return np.array([]), [], [], [], [], []

    return np.array(features), list(labels), list(ids), list(times), list(lengths), successful_paths


# Extract traditional features

In [None]:
def extract_traditional_features(audio_paths):
    """Compute a 29-value hand-crafted acoustic feature vector for every audio file.

    Feature layout, in order: jitter (local, local absolute, rap, ppq5), shimmer
    (local, local dB), mean harmonicity, mean RMS energy, 13 MFCC means, mean
    spectral centroid and bandwidth, F0 mean/std/range, and three nonlinear
    measures requested from ``nolds`` (``rpde``, ``d2``, ``dfa``). A feature group
    whose computation fails contributes zeros.

    Args:
        audio_paths: Iterable of paths to audio files.

    Returns:
        Tuple ``(features, labels, ids, times, lengths, successful_paths)`` where
        ``features`` is a float32 NumPy array with one row per processed file and
        the lists are aligned with its rows. Files that raise are reported on
        stdout and skipped.
    """
    features = []
    labels = []
    ids = []
    times = []
    lengths = []
    successful_paths = []
    # nolds measures whose failure was already reported (each is reported once)
    reported_nolds_failures = set()

    for audio_path in tqdm(audio_paths, desc="Extracting traditional features"):
        try:
            # Load audio ONCE with librosa
            audio_array, sr = librosa.load(audio_path, sr=16000)

            # NOTE(review): parts[2] is the patient folder only for paths shaped like
            # <dataset>/<group>/<patient>/... — confirm against the dataset layout.
            patient_id = Path(audio_path).parts[2]
            speaker_dir = Path(audio_path).parent
            audio_dir = Path(speaker_dir).parent
            time_before_diagnosis = get_time_before_diagnosis(audio_dir)
            duration = librosa.get_duration(y=audio_array, sr=sr)
            group = Path(audio_path).parent.parent.name
            label = 1 if group == "PD" else 0

            # Create parselmouth Sound object from librosa's data
            try:
                # Ensure 'audio_array' is float64 for parselmouth, librosa might return float32
                sound = parselmouth.Sound(audio_array.astype(np.float64), sampling_frequency=sr)
            except Exception as e:
                print(f"Failed to create parselmouth.Sound from array for {audio_path}: {e}")
                # Skip this file if parselmouth object creation fails
                continue

            feature_vector = []

            # Jitter features
            # Reset per file: otherwise a failure here would leave the previous
            # file's point process in scope for the shimmer computation below.
            pointProcess = None
            try:
                pointProcess = call(sound, "To PointProcess (periodic, cc)", 75, 600)
                jitter_local = call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
                jitter_local_absolute = call(pointProcess, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3)
                jitter_rap = call(pointProcess, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
                jitter_ppq5 = call(pointProcess, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3)
                feature_vector.extend([jitter_local, jitter_local_absolute, jitter_rap, jitter_ppq5])
            except Exception as e:
                feature_vector.extend([0, 0, 0, 0])

            # Shimmer features (need the point process of THIS file)
            if pointProcess is None:
                feature_vector.extend([0, 0])
            else:
                try:
                    shimmer_local = call([sound, pointProcess], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
                    shimmer_local_db = call([sound, pointProcess], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
                    feature_vector.extend([shimmer_local, shimmer_local_db])
                except Exception as e:
                    feature_vector.extend([0, 0])

            # Harmonics-to-noise ratio
            try:
                harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
                hnr = call(harmonicity, "Get mean", 0, 0)
                feature_vector.append(hnr)
            except Exception as e:
                feature_vector.append(0)

            # Mean RMS energy over all frames
            rms_frames = librosa.feature.rms(y=audio_array).mean()
            feature_vector.append(rms_frames)

            # MFCCs
            try:
                mfccs = librosa.feature.mfcc(y=audio_array, sr=sr, n_mfcc=13)
                mfcc_means = np.mean(mfccs, axis=1)
                feature_vector.extend(mfcc_means)
            except Exception as e:
                feature_vector.extend([0] * 13)

            # Spectral features
            try:
                spectral_centroid = librosa.feature.spectral_centroid(y=audio_array, sr=sr).mean()
                spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio_array, sr=sr).mean()
                feature_vector.extend([spectral_centroid, spectral_bandwidth])
            except Exception as e:
                feature_vector.extend([0, 0])

            # F0 Statistics (Prosodic Features)
            f0_values = []
            try:
                pitch = sound.to_pitch()
                f0_values = pitch.selected_array['frequency']
                f0_values = f0_values[f0_values != 0] # Exclude unvoiced frames (0 Hz)
                if len(f0_values) > 0:
                    f0_mean = np.mean(f0_values)
                    f0_std = np.std(f0_values)
                    f0_range = np.max(f0_values) - np.min(f0_values)
                else:
                    f0_mean, f0_std, f0_range = 0, 0, 0 # Handle case with no voiced frames

                feature_vector.extend([f0_mean, f0_std, f0_range])
            except Exception as e:
                feature_vector.extend([0, 0, 0])

            # Nonlinear Dynamics (RPDE, D2, DFA), each defaulting to 0.
            # Every measure is computed on its own so that one failing call no longer
            # zeroes the other two, and the first failure per measure is reported.
            # NOTE(review): the nolds documentation lists `corr_dim` and `dfa` but no
            # `rpde` or `d2` — confirm against the installed version.
            # Between 11 and 4999 voiced frames: enough points for nolds, small
            # enough to stay cheap.
            enough_points = 10 < len(f0_values) < 5000
            nonlinear_values = []
            for measure_name in ("rpde", "d2", "dfa"):
                measure_value = 0
                if enough_points:
                    try:
                        measure_value = getattr(nolds, measure_name)(f0_values)
                    except Exception as e:
                        if measure_name not in reported_nolds_failures:
                            reported_nolds_failures.add(measure_name)
                            print(f"nolds.{measure_name} failed for {audio_path}; using 0 "
                                  f"(further failures of this measure are not reported): {e}")
                nonlinear_values.append(measure_value)

            feature_vector.extend(nonlinear_values)

            # Check if feature vector has consistent length
            expected_length = 29 # Update if adding/removing features
            if len(feature_vector) == expected_length:
                features.append(feature_vector)
                labels.append(label)
                ids.append(patient_id)
                times.append(time_before_diagnosis)
                lengths.append(duration)  # Using duration instead of audio_length
                successful_paths.append(audio_path)
            else:
                print(f"Skipping {audio_path} due to inconsistent feature vector length ({len(feature_vector)} != {expected_length})")

            # Force garbage collection to free memory
            gc.collect()

        except Exception as e:
            print(f"An error occurred processing file: {audio_path}. Error: {e}")

    features_array = np.array(features, dtype=np.float32)
    return features_array, labels, ids, times, lengths, successful_paths  # Return all data, not just features

# Loading CSV helper functions

In [None]:
# New function to load features from CSV
def load_features_from_csv(filename):
    """Read a feature CSV written by ``save_features`` back into arrays.

    Returns:
        Tuple ``(features, labels, speaker_ids, times, lengths, df)``: ``features``
        is a float32 array built from every column except the four metadata
        columns, with NaNs replaced by ``np.nan_to_num``; ``df`` is the full
        DataFrame. Six ``None`` values are returned when the file does not exist
        or a metadata column is missing.
    """
    nothing_loaded = (None, None, None, None, None, None)

    if not os.path.exists(filename):
        print(f"File not found: {filename}")
        return nothing_loaded

    print(f"Loading features from: {filename}")
    df = pd.read_csv(filename)

    # Metadata columns that must accompany the numeric feature columns
    required_columns = {'label', 'id', 'time_before_diagnosis', 'length'}
    if required_columns - set(df.columns):
        print(f"Error: One or more required columns {required_columns} missing in {filename}")
        return nothing_loaded

    # Everything that is not metadata is a feature column
    feature_frame = df.drop(columns=['label', 'id', 'time_before_diagnosis', 'length'])
    features = np.nan_to_num(feature_frame.values.astype(np.float32))

    labels = df['label'].values.astype(int)
    speaker_ids = df['id'].values
    times = df['time_before_diagnosis'].values
    lengths = df['length'].values

    return features, labels, speaker_ids, times, lengths, df



# Save features function
def save_features(features, labels, ids, time_before_diagnosis, audio_length, filename):
    """Save features and their per-sample metadata to a CSV file.

    Args:
        features: 2-D array-like with one row per sample.
        labels, ids, time_before_diagnosis, audio_length: Sequences (lists or
            NumPy arrays) aligned with the rows of ``features``.
        filename: Destination CSV path.

    Nothing is written, and a warning is printed, when ``features`` or ``labels``
    is empty. Feature columns are named by position; the metadata is stored in
    the ``label``, ``id``, ``time_before_diagnosis`` and ``length`` columns.
    """
    features = np.asarray(features)
    # len() rather than truthiness: `not labels` raises ValueError for NumPy arrays
    if features.size == 0 or len(labels) == 0:
        print(f"Warning: No features or labels to save for {filename}")
        return
    df = pd.DataFrame(features)
    df['label'] = labels
    df['id'] = ids
    df['time_before_diagnosis'] = time_before_diagnosis
    df['length'] = audio_length
    df.to_csv(filename, index=False)
    print(f"Saved features to {filename}")


# Neural Networks

In [None]:
class DeepNN(nn.Module):
    """Fully connected binary classifier: input -> 256 -> 128 -> 64 -> 1 with sigmoid output.

    The first two hidden layers are followed by ReLU and dropout (p=0.3), the
    third by ReLU only. The layers live in ``self.model`` so parameter names stay
    ``model.<index>.*``.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 256), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128, 64), nn.ReLU(),
            nn.Linear(64, 1), nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return one sigmoid probability per input row."""
        return self.model(x)


class ImprovedDeepNN(nn.Module):
    """Four-stage MLP (512 -> 256 -> 128 -> 64) with batch normalisation and dropout.

    Every stage applies Linear -> BatchNorm1d -> ReLU -> Dropout. With
    ``num_classes == 1`` the output is passed through a sigmoid (binary case);
    otherwise raw logits are returned for use with CrossEntropyLoss.
    """

    def __init__(self, input_dim, num_classes=1, dropout=0.3):
        super().__init__()

        # Register fc1/bn1 ... fc4/bn4 in order, so parameter names and
        # initialisation order match a hand-written layer list.
        widths = (input_dim, 512, 256, 128, 64)
        for stage in range(1, len(widths)):
            setattr(self, f"fc{stage}", nn.Linear(widths[stage - 1], widths[stage]))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(widths[stage]))

        # 1 output unit for binary, >1 for multi-class (depends on number of bins)
        self.output = nn.Linear(widths[-1], num_classes)
        self.dropout = nn.Dropout(dropout)

        self.num_classes = num_classes

    def forward(self, x):
        """Run the four hidden stages and the output layer on a batch ``x``."""
        for stage in (1, 2, 3, 4):
            linear = getattr(self, f"fc{stage}")
            norm = getattr(self, f"bn{stage}")
            x = self.dropout(F.relu(norm(linear(x))))

        scores = self.output(x)
        if self.num_classes != 1:
            return scores  # raw logits for CrossEntropyLoss
        return torch.sigmoid(scores)  # binary classification

# Model Training/Testing/Comparison Helper Functions

In [None]:
def calculate_metrics(y_true, y_pred_prob):
    """Compute binary classification metrics from predicted probabilities.

    Args:
        y_true: Ground-truth labels (0/1) as a list, array or column vector.
        y_pred_prob: Predicted probability of class 1, same length as ``y_true``;
            an ``(n, 1)`` column vector is accepted. Values above 0.5 are
            classified as 1.

    Returns:
        Dict with ``accuracy``, ``roc_auc`` (NaN when ``y_true`` holds a single
        class), ``sensitivity``, ``specificity``, ``f1_score`` and a 2x2
        ``confusion_matrix`` with rows/columns ordered as labels 0, 1.
    """
    # Flatten to 1-D arrays so lists and (n, 1) model outputs are handled alike
    y_true = np.asarray(y_true).ravel()
    y_pred_prob = np.asarray(y_pred_prob).ravel()
    y_pred = (y_pred_prob > 0.5).astype(int)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_pred_prob) if len(np.unique(y_true)) > 1 else np.nan,
        "sensitivity": recall_score(y_true, y_pred),
        # Specificity is the recall of the negative class, obtained by flipping the labels
        "specificity": recall_score(1 - y_true, 1 - y_pred),
        "f1_score": f1_score(y_true, y_pred),
        # Fixed label order keeps the matrix 2x2 even when a fold holds one class
        "confusion_matrix": confusion_matrix(y_true, y_pred, labels=[0, 1])
    }

def calculate_comprehensive_statistics(results_list):
    """Summarise per-fold metric dicts into descriptive statistics per metric.

    Args:
        results_list: List of dicts mapping metric name to a numeric value, one
            dict per fold. The ``confusion_matrix`` key is ignored and NaN values
            are left out. The metric names are taken from the first dict.

    Returns:
        Dict mapping each metric name to a dict with ``mean``, ``std`` and
        ``variance`` (sample, ddof=1), ``cv`` (coefficient of variation in
        percent), ``min``, ``max``, ``range``, ``median``, ``iqr``, ``n_valid``
        and the 95% t-interval of the mean as ``ci_lower``/``ci_upper``.
        All statistics are NaN (``n_valid`` 0) for a metric without valid values.
        An empty ``results_list`` gives ``{}``.
    """
    if not results_list:
        return {}

    stats_dict = {}
    metric_keys = [key for key in results_list[0].keys() if key != 'confusion_matrix']

    for key in metric_keys:
        values = np.array(
            [res[key] for res in results_list if not np.isnan(res[key])],
            dtype=float,
        )
        n = len(values)

        if n == 0:
            stats_dict[key] = {
                'mean': np.nan, 'std': np.nan, 'variance': np.nan,
                'cv': np.nan, 'ci_lower': np.nan, 'ci_upper': np.nan,
                'min': np.nan, 'max': np.nan, 'range': np.nan,
                'median': np.nan, 'iqr': np.nan, 'n_valid': 0
            }
            continue

        mean_val = np.mean(values)
        std_val = np.std(values, ddof=1) if n > 1 else 0

        # Basic statistics
        stats_dict[key] = {
            'mean': mean_val,
            'std': std_val,
            'variance': np.var(values, ddof=1) if n > 1 else 0,
            'cv': (std_val / mean_val * 100) if mean_val != 0 else np.nan,  # Coefficient of variation as percentage
            'min': np.min(values),
            'max': np.max(values),
            'range': np.max(values) - np.min(values),
            'median': np.median(values),
            'iqr': np.percentile(values, 75) - np.percentile(values, 25),
            'n_valid': n
        }

        # 95% confidence interval for the mean.
        # With one value, or identical values in every fold, the standard error is
        # 0; t.interval gives NaN bounds for a zero scale, so the interval is
        # collapsed onto the mean instead.
        standard_error = stats.sem(values) if n > 1 else 0
        if standard_error > 0:
            ci_lower, ci_upper = stats.t.interval(0.95, n - 1, loc=mean_val, scale=standard_error)
        else:
            ci_lower, ci_upper = mean_val, mean_val
        stats_dict[key]['ci_lower'] = ci_lower
        stats_dict[key]['ci_upper'] = ci_upper

    return stats_dict

def print_comprehensive_statistics(feature_type, stats_dict):
    """Write a text report of the summary statistics of one feature type to stdout.

    Only metrics present in ``stats_dict`` are reported, always in the order
    accuracy, ROC AUC, sensitivity, specificity, F1 score.
    """
    print(f"\n=== {feature_type} Features - Comprehensive Statistics ===")

    # (key in stats_dict, heading shown in the report), in display order
    report_order = (
        ('accuracy', 'Accuracy'),
        ('roc_auc', 'ROC AUC'),
        ('sensitivity', 'Sensitivity (Recall)'),
        ('specificity', 'Specificity'),
        ('f1_score', 'F1 Score'),
    )

    for metric, display_name in report_order:
        if metric not in stats_dict:
            continue

        entry = stats_dict[metric]
        print(f"\n{display_name}:")
        print(f"  Mean ± SD:        {entry['mean']:.4f} ± {entry['std']:.4f}")
        print(f"  95% CI:           [{entry['ci_lower']:.4f}, {entry['ci_upper']:.4f}]")
        print(f"  Variance:         {entry['variance']:.6f}")
        print(f"  CV:               {entry['cv']:.2f}%")  # Coefficient of variation
        print(f"  Range:            [{entry['min']:.4f}, {entry['max']:.4f}] (span: {entry['range']:.4f})")
        print(f"  Median (IQR):     {entry['median']:.4f} ({entry['iqr']:.4f})")

def create_visualizations(feature_types, all_fold_metrics, all_stats, save_folder, save_figures):
    """Plot cross-validation results: a feature-type comparison and per-type box plots.

    Args:
        feature_types: Names of the feature types to plot.
        all_fold_metrics: ``{feature_type: {metric_key: [value per fold, ...]}}``;
            feature types missing here get no box plot.
        all_stats: ``{feature_type: {metric_key: {'mean', 'std', 'cv', ...}}}``;
            read only for the comparison figure.
        save_folder: Folder the PNG files are written to when ``save_figures`` is set.
        save_figures: Save and close the figures if true, otherwise show them.

    With more than one feature type, a two-panel figure is produced (means with
    standard-deviation error bars, and coefficients of variation). For every
    feature type with fold data, a box plot of the metric distribution across
    folds is produced, with the mean marked on each box.
    """
    metric_names = ['Accuracy', 'AUC', 'Sensitivity', 'Specificity', 'F1 Score']
    metric_keys = ['accuracy', 'roc_auc', 'sensitivity', 'specificity', 'f1_score']

    if len(feature_types) > 1:
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # 0.35 for the usual two feature types; narrower when more bar groups
        # have to share one tick slot, so the bars never overlap.
        width = min(0.35, 0.8 / len(feature_types))
        x = np.arange(len(metric_names))
        # Horizontal shift per feature type so each group is centred on its tick
        offsets = [(i - 0.5 * (len(feature_types) - 1)) * width
                   for i in range(len(feature_types))]

        # Main comparison with error bars
        for offset, feature_type in zip(offsets, feature_types):
            means = [all_stats[feature_type][key]['mean'] for key in metric_keys]
            stds = [all_stats[feature_type][key]['std'] for key in metric_keys]

            ax1.bar(x + offset, means,
                    width, label=feature_type, alpha=0.8,
                    yerr=stds, capsize=5, error_kw=dict(alpha=0.6))

            # Add value labels with mean ± std
            for j, (mean, std) in enumerate(zip(means, stds)):
                ax1.text(x[j] + offset,
                         mean + std + 0.02,
                         f'{mean:.3f}±{std:.3f}',
                         ha='center', fontsize=8, rotation=0)

        ax1.axhline(y=0.5, color='r', linestyle='--', alpha=0.3, label='Baseline')
        ax1.set_xlabel('Metric')
        ax1.set_ylabel('Score')
        ax1.set_title('Performance Comparison with Standard Deviation')
        ax1.set_xticks(x)
        ax1.set_xticklabels(metric_names)
        ax1.set_ylim(0, 1.1)
        ax1.legend()
        ax1.grid(axis='y', linestyle='--', alpha=0.3)

        # Coefficient of variation comparison
        for offset, feature_type in zip(offsets, feature_types):
            cvs = [all_stats[feature_type][key]['cv'] for key in metric_keys]
            ax2.bar(x + offset, cvs, width, label=feature_type, alpha=0.8)

            # Add value labels
            for j, cv in enumerate(cvs):
                if not np.isnan(cv):
                    ax2.text(x[j] + offset,
                             cv + 0.5, f'{cv:.1f}%',
                             ha='center', fontsize=8)

        ax2.set_xlabel('Metric')
        ax2.set_ylabel('Coefficient of Variation (%)')
        ax2.set_title('Model Stability (Lower CV = More Stable)')
        ax2.set_xticks(x)
        ax2.set_xticklabels(metric_names)
        ax2.legend()
        ax2.grid(axis='y', linestyle='--', alpha=0.3)

        plt.tight_layout()
        if save_figures:
            plt.savefig(f'{save_folder}/feature_comparison.png',
                        bbox_inches='tight', dpi=300)
            plt.close()
        else:
            plt.show()

    # Box plots for each feature type showing distribution across folds
    for feature_type in feature_types:
        if feature_type not in all_fold_metrics:
            continue

        fold_data = all_fold_metrics[feature_type]

        # One box per metric that has at least one valid fold value
        box_data = []
        box_labels = []
        for key, display_name in zip(metric_keys, metric_names):
            if key not in fold_data:
                continue
            # Drop NaN folds (e.g. an undefined ROC AUC) before plotting
            fold_values = [value for value in fold_data[key] if not np.isnan(value)]
            if fold_values:
                box_data.append(fold_values)
                box_labels.append(display_name)

        if box_data:
            plt.figure(figsize=(10, 6))
            bp = plt.boxplot(box_data, labels=box_labels, patch_artist=True)

            # Color the boxes
            colors = plt.cm.Set3(np.linspace(0, 1, len(box_data)))
            for patch, color in zip(bp['boxes'], colors):
                patch.set_facecolor(color)
                patch.set_alpha(0.7)

            plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.3, label='Baseline')
            plt.ylabel('Score')
            plt.title(f'{feature_type} - Distribution of Metrics Across Folds')
            plt.grid(axis='y', linestyle='--', alpha=0.3)
            plt.ylim(0, 1.05)

            # Mean markers. Box k is drawn at x = k (1-based), so iterate over
            # box_data itself; that keeps each marker on its own box even when a
            # metric was left out above.
            for position, fold_values in enumerate(box_data, start=1):
                plt.scatter(position, np.mean(fold_values), color='red', s=50, marker='D',
                            label='Mean' if position == 1 else '', zorder=5)

            plt.legend()
            plt.xticks(rotation=45)

            if save_figures:
                plt.savefig(f'{save_folder}/{feature_type}_distribution_boxplot.png',
                            bbox_inches='tight', dpi=300)
                plt.close()
            else:
                plt.show()


def average_metrics(results_list):
    """Backward compatibility - returns means only.

    Averages every key of the per-fold metric dicts, ignoring NaN values. The
    keys are taken from the first dict. Array-valued entries such as
    ``confusion_matrix`` are averaged element-wise across folds rather than
    collapsed into a single number. An empty ``results_list`` gives ``{}``.
    """
    if not results_list:
        return {}
    return {
        # axis=0 averages across folds: scalars stay scalars, matrices keep their shape
        key: np.nanmean([res[key] for res in results_list], axis=0)
        for key in results_list[0].keys()
    }

def empty_metrics():
    """Return a placeholder metrics dict: NaN for every score and an all-zero 2x2 confusion matrix."""
    score_names = ("accuracy", "roc_auc", "sensitivity", "specificity", "f1_score")
    placeholder = dict.fromkeys(score_names, np.nan)
    placeholder["confusion_matrix"] = np.zeros((2, 2), dtype=int)
    return placeholder



# Training, Testing, and Saving DNN

In [None]:
def get_model_save_path(base_folder, feature_type, deep_features, model_name, fold, model_to_use):
    """Generate the path for saving/loading the model of one fold.

    The folder ``<base_folder>/models/<family>/<feature_type>_<model_name>/`` is
    created if needed. ``<family>`` is ``deep_features`` lower-cased with spaces
    replaced by underscores for the "Whisper" feature type, and ``traditional``
    otherwise.

    Args:
        base_folder: Root folder of the experiment outputs.
        feature_type: Feature family name, e.g. "Whisper".
        deep_features: Name of the deep feature extractor, e.g. "Whisper Small".
        model_name: Display name of the model, used in the folder name.
        fold: Fold identifier used in the file name.
        model_to_use: Model class or factory; decides the file extension.

    Returns:
        ``Path`` to ``fold_<fold>.pth`` for PyTorch models and ``fold_<fold>.pkl``
        for everything else.
    """
    if feature_type == "Whisper":
        family_folder = deep_features.replace(" ", "_").lower()
    else:
        family_folder = "traditional"

    model_folder = Path(base_folder) / "models" / family_folder / f"{feature_type}_{model_name}"
    model_folder.mkdir(parents=True, exist_ok=True)

    if isinstance(model_to_use, type):
        # Only nn.Module subclasses are stored with torch.save; other classes
        # (RandomForest and any other estimator) are pickled via joblib.
        is_torch_model = issubclass(model_to_use, nn.Module)
    else:
        # Non-class factories keep the historical name-based rule
        is_torch_model = (hasattr(model_to_use, '__name__')
                          and not model_to_use.__name__.startswith('RandomForestClassifier'))

    extension = ".pth" if is_torch_model else ".pkl"
    return model_folder / f"fold_{fold}{extension}"

def save_model_and_scaler(model, scaler, model_path, model_to_use):
    """Persist a trained model together with its fitted scaler.

    Objects exposing ``state_dict`` are written with ``torch.save`` (weights,
    model class, input dimension, scaler, model type); anything else is written
    with ``joblib.dump``.

    Returns:
        ``True`` on success, ``False`` if saving raised (the error is printed).
    """
    try:
        # Name recorded next to the weights for later inspection
        if hasattr(model_to_use, '__name__'):
            model_type = model_to_use.__name__
        else:
            model_type = str(type(model_to_use))

        if not hasattr(model, 'state_dict'):
            # Sklearn model: pickle the estimator itself
            joblib.dump({'model': model, 'scaler': scaler, 'model_type': model_type}, model_path)
        else:
            # PyTorch model: store the weights; the input width falls back to the
            # second dimension of the first entry of the state dict
            state = model.state_dict()
            checkpoint = {
                'model_state_dict': state,
                'model_class': model.__class__,
                'input_dim': getattr(model, 'input_size', None) or state[list(state.keys())[0]].shape[1],
                'scaler': scaler,
                'model_type': model_type,
            }
            torch.save(checkpoint, model_path)

        print(f"Model saved to: {model_path}")
        return True
    except Exception as e:
        print(f"Error saving model to {model_path}: {e}")
        return False

def load_model_and_scaler(model_path, model_to_use, input_dim=None):
    """Load a model and its scaler previously written by ``save_model_and_scaler``.

    Args:
        model_path: File to load; ``.pth`` files are read with ``torch.load``,
            everything else with ``joblib.load``.
        model_to_use: Callable that builds the PyTorch model from ``input_dim``
            (only used for ``.pth`` files).
        input_dim: Input width of the network; taken from the checkpoint when omitted.

    Returns:
        ``(model, scaler)``, or ``(None, None)`` when the file does not exist or
        loading fails (the error is printed). PyTorch models are returned in
        eval mode on the available device.
    """
    try:
        if not os.path.exists(model_path):
            return None, None

        # Handle PyTorch models (.pth files)
        if str(model_path).endswith('.pth'):
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            # The checkpoint holds the fitted scaler and the model class besides the
            # tensors, which the restricted unpickler (default since PyTorch 2.6)
            # rejects, so full unpickling is requested explicitly.
            # SECURITY: this executes pickled code; only load files you created yourself.
            try:
                save_data = torch.load(model_path, map_location=device, weights_only=False)
            except TypeError:
                # PyTorch versions that predate the weights_only argument
                save_data = torch.load(model_path, map_location=device)
            scaler = save_data['scaler']

            if input_dim is None:
                input_dim = save_data.get('input_dim')
            if input_dim is None:
                raise ValueError("input_dim required for PyTorch model loading")

            model = model_to_use(input_dim).to(device)
            model.load_state_dict(save_data['model_state_dict'])
            model.eval()

        else:  # Sklearn models (.pkl files)
            save_data = joblib.load(model_path)
            model = save_data['model']
            scaler = save_data['scaler']

        print(f"Model loaded from: {model_path}")
        return model, scaler

    except Exception as e:
        print(f"Error loading model from {model_path}: {e}")
        return None, None

def _is_torch_model_factory(model_to_use):
    """Tell whether ``model_to_use`` builds a PyTorch network rather than a scikit-learn estimator."""
    if isinstance(model_to_use, type):
        return issubclass(model_to_use, nn.Module)
    # Factories that are not classes keep the historical name-based rule
    return not getattr(model_to_use, '__name__', '').startswith('RandomForestClassifier')


def train_evaluate_model(X_train, y_train, X_test, y_test, feature_type, model_to_use,
                        model_save_path=None, force_retrain=False):
    """Train (or load) a model for one feature type and evaluate it on the test split.

    Args:
        X_train, X_test: 2-D feature arrays.
        y_train, y_test: Binary labels aligned with the feature rows.
        feature_type: Name used in log messages.
        model_to_use: ``nn.Module`` subclass (or factory) taking the input
            dimension, or a scikit-learn classifier class offering ``predict_proba``.
        model_save_path: Where the model and scaler are cached; ``None`` disables caching.
        force_retrain: Ignore an existing cached model.

    Returns:
        The metrics dict of ``calculate_metrics``, or ``empty_metrics()`` when
        either feature array is empty.
    """
    if X_train.size == 0 or X_test.size == 0:
        print(f"Warning: Empty feature sets for {feature_type}. Skipping evaluation.")
        return empty_metrics()

    uses_torch = _is_torch_model_factory(model_to_use)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Try to load existing model if save path is provided and force_retrain is False
    model, scaler = None, None
    if model_save_path and not force_retrain:
        model, scaler = load_model_and_scaler(model_save_path, model_to_use, X_train.shape[1])

    # If model not loaded, train a new one
    if model is None or scaler is None:
        print(f"Training new model for {feature_type}...")

        # Standardize features (statistics come from the training split only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # For neural network models
        if uses_torch:
            # Create model with appropriate input dimensions
            input_dim = X_train_scaled.shape[1]
            model = model_to_use(input_dim).to(device)

            criterion = torch.nn.BCELoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

            # Convert data to tensors
            X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
            y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1).to(device)

            # Training loop
            batch_size = 32
            dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
            dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
            if len(dataset) < batch_size:
                # drop_last discards the only (incomplete) batch, so nothing is learned
                print(f"Warning: only {len(dataset)} training samples for {feature_type}; "
                      f"fewer than one batch of {batch_size}, the model will stay untrained.")

            for epoch in range(100):
                model.train()
                for batch_X, batch_y in dataloader:
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

        # For Random Forest
        elif model_to_use.__name__ == 'RandomForestClassifier':
            # Create and train the random forest model
            model = model_to_use(
                n_estimators=100,
                max_depth=None,
                min_samples_split=2,
                min_samples_leaf=1,
                max_features='sqrt',
                bootstrap=True,
                random_state=37,
                n_jobs=-1,
                class_weight='balanced'
            )

            model.fit(X_train_scaled, y_train)

        # For other sklearn models
        else:
            model = model_to_use()
            model.fit(X_train_scaled, y_train)

        # Save the trained model and scaler
        if model_save_path:
            save_model_and_scaler(model, scaler, model_save_path, model_to_use)

    else:
        print(f"Using loaded model for {feature_type}...")
        # Use the loaded scaler to transform test data
        X_test_scaled = scaler.transform(X_test)

    # Evaluation (same for both loaded and newly trained models)
    if uses_torch:
        # Neural network evaluation
        model.eval()
        X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

        with torch.no_grad():
            y_pred_prob = model(X_test_tensor).cpu().numpy()
    else:
        # Sklearn model evaluation: probability of the positive class as a column vector
        y_pred_prob = model.predict_proba(X_test_scaled)[:, 1].reshape(-1, 1)

    return calculate_metrics(y_test, y_pred_prob)


# Train model + Feature comparison

In [None]:
def crossval_compare_features(dataset_path, k=5, save_figures=True, min_audio_length=0,
                            deep_features="Whisper Small", model=None, force_retrain=False,
                            base_save_folder='experiment_results'):
    """Cross-validate one classifier on Whisper and on traditional features.

    For each feature set the samples are split with a speaker-level
    ``StratifiedKFold`` (all samples of a speaker land in the same fold), a
    model is trained/evaluated per fold through ``train_evaluate_model`` and
    per-fold as well as aggregated figures and statistics are produced.

    Args:
        dataset_path: Dataset root handed to
            ``load_parkceleb_dataset_with_speakers``. The returned audio paths
            are only used when a feature CSV has to be (re)generated.
        k: Number of cross-validation folds (splits are made over speakers).
        save_figures: If True, figures are written below ``base_save_folder``
            and closed; otherwise they are shown with ``plt.show()``.
        min_audio_length: Samples whose ``lengths`` value is below this
            threshold are dropped (same unit as the ``lengths`` column of the
            feature CSVs, presumably seconds; confirm against the extractor).
        deep_features: Which Whisper feature CSV to use, either
            ``"Whisper Small"`` or ``"Whisper Medium"``.
        model: ``"random_forest"`` or a model class/callable (e.g. ``DeepNN``,
            ``RandomForestClassifier``). Its name is used in the output paths.
        force_retrain: Forwarded to ``train_evaluate_model``.
        base_save_folder: Root folder for figures and saved models.

    Returns:
        Tuple ``(results, all_comprehensive_stats)``: per-feature-type averaged
        metrics and per-feature-type comprehensive statistics.

    Raises:
        ValueError: If ``model`` is None or an unknown string, or if
            ``deep_features`` is not one of the supported variants.
    """
    # ---- Validate the arguments before touching the filesystem ----
    if model is None:
        raise ValueError('`model` must be "random_forest" or a model class '
                         '(e.g. DeepNN, RandomForestClassifier), not None.')
    if isinstance(model, str) and model != "random_forest":
        raise ValueError(f'Unknown model name {model!r}; the only supported string is "random_forest".')

    whisper_csv_by_variant = {
        "Whisper Small": "whisper_small_all_features.csv",
        "Whisper Medium": "whisper_medium_all_features.csv",
    }
    if deep_features not in whisper_csv_by_variant:
        raise ValueError(f"Unsupported deep_features {deep_features!r}; "
                         f"expected one of {sorted(whisper_csv_by_variant)}.")
    whisper_csv = whisper_csv_by_variant[deep_features]
    traditional_csv = "traditional_all_features.csv"

    # Determine which model to use
    if isinstance(model, str):
        model_to_use = RandomForestClassifier
        model_name = "RandomForest"
    else:
        model_to_use = model
        # Classes and functions expose __name__; instances fall back to their class name.
        model_name = getattr(model, '__name__', model.__class__.__name__)

    save_folder = f'{base_save_folder}/{deep_features}/visualization_results_{model_name}'
    # Create directory for saving figures if needed
    os.makedirs(save_folder, exist_ok=True)

    # Load data with proper alignment (labels / speaker ids are re-read from the CSVs below)
    all_audio_paths, _all_labels, _all_speaker_ids = load_parkceleb_dataset_with_speakers(dataset_path)

    # Load cached features
    whisper_data = load_features_from_csv(whisper_csv)
    traditional_data = load_features_from_csv(traditional_csv)

    if whisper_data[0] is None:
        print("Whisper data is None. Generating features.")
        # NOTE(review): extract_whisper_features takes no model argument here, so it
        # presumably has to be configured for the Whisper variant selected by
        # `deep_features` - confirm before relying on the regenerated cache.
        features, labels, ids, times, lengths, _ = extract_whisper_features(all_audio_paths)
        # Write to (and reload from) the CSV that belongs to the requested variant.
        save_features(features, labels, ids, times, lengths, whisper_csv)
        whisper_data = load_features_from_csv(whisper_csv)

    if traditional_data[0] is None:
        print("Traditional data is None. Generating features.")
        features, labels, ids, times, lengths, _ = extract_traditional_features(all_audio_paths)
        save_features(features, labels, ids, times, lengths, traditional_csv)
        traditional_data = load_features_from_csv(traditional_csv)

    # Dictionary to store all results
    results = {}
    all_fold_metrics = {}  # Store metrics for each fold for later visualization
    all_comprehensive_stats = {}  # Store comprehensive statistics

    for feature_type, (features, labels, speakers, times, lengths, _) in [
        ("Whisper", whisper_data),
        ("Traditional", traditional_data)
    ]:
        if features is None:
            continue

        # Drop samples shorter than the requested minimum length.
        length_mask = lengths >= min_audio_length
        features = features[length_mask]
        labels = labels[length_mask]
        speakers = speakers[length_mask]

        print(f"\n=== Processing {feature_type} Features with {model_name} ===")
        print(f"Feature dimension: {features.shape[1]}")
        print(f"Class distribution: {np.bincount(labels)}")

        # Speaker-level stratified K-Fold: the class of a speaker is read from its id.
        unique_speakers = np.unique(speakers)
        speaker_labels = np.array([1 if 'pd' in s else 0 for s in unique_speakers])

        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=37)
        fold_results = []
        # NOTE(review): 'y_pred_prob' and 'feature_importance' are never filled in this
        # function; they are kept because create_visualizations receives this dict.
        fold_metrics_dict = {
            'accuracy': [], 'roc_auc': [], 'sensitivity': [],
            'specificity': [], 'f1_score': [], 'confusion_matrices': [],
            'y_test': [], 'y_pred_prob': [], 'feature_importance': []
        }

        for fold, (train_idx, test_idx) in enumerate(skf.split(unique_speakers, speaker_labels)):
            print(f"\n=== Fold {fold+1} ===")
            train_speakers = set(unique_speakers[train_idx])
            test_speakers = set(unique_speakers[test_idx])

            # Select data for current fold
            train_mask = np.isin(speakers, list(train_speakers))
            test_mask = np.isin(speakers, list(test_speakers))

            X_train, X_test = features[train_mask], features[test_mask]
            y_train, y_test = labels[train_mask], labels[test_mask]

            # Check for valid class distribution
            if len(np.unique(y_test)) < 2:
                print(f"Skipping fold {fold+1} - single class in test set")
                continue

            # Get model save path for this fold
            model_save_path = get_model_save_path(base_save_folder, feature_type, deep_features, model_name, fold+1, model_to_use)

            # Train and Eval
            metrics = train_evaluate_model(X_train, y_train, X_test, y_test, feature_type,
                                         model_to_use, model_save_path, force_retrain)
            fold_results.append(metrics)

            # Store detailed metrics for visualization
            for key in ['accuracy', 'roc_auc', 'sensitivity', 'specificity', 'f1_score']:
                fold_metrics_dict[key].append(metrics[key])
            fold_metrics_dict['confusion_matrices'].append(metrics['confusion_matrix'])
            fold_metrics_dict['y_test'].append(y_test)

            # Visualize confusion matrix for this fold
            plt.figure(figsize=(6, 5))
            cm = metrics['confusion_matrix']
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                        xticklabels=['Control', 'PD'], yticklabels=['Control', 'PD'])
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title(f'{feature_type} - Fold {fold+1} Confusion Matrix')
            if save_figures:
                plt.savefig(f'{save_folder}/{feature_type}_fold{fold+1}_confusion.png', bbox_inches='tight')
                plt.close()
            else:
                plt.show()

        # Store results
        if fold_results:
            results[feature_type] = average_metrics(fold_results)
            all_fold_metrics[feature_type] = fold_metrics_dict

            # Calculate comprehensive statistics
            comprehensive_stats = calculate_comprehensive_statistics(fold_results)
            all_comprehensive_stats[feature_type] = comprehensive_stats

            # Print comprehensive statistics
            print_comprehensive_statistics(feature_type, comprehensive_stats)

            # Visualize metrics across folds
            metrics_across_folds = pd.DataFrame({
                'Accuracy': fold_metrics_dict['accuracy'],
                'AUC': fold_metrics_dict['roc_auc'],
                'Sensitivity': fold_metrics_dict['sensitivity'],
                'Specificity': fold_metrics_dict['specificity'],
                'F1 Score': fold_metrics_dict['f1_score']
            })

            metrics_across_folds.index = [f'Fold {i+1}' for i in range(len(metrics_across_folds))]
            # Draw on an explicit Axes: DataFrame.plot() without `ax` opens its own
            # figure, which used to leave an empty 1200x600 figure behind.
            fig, ax = plt.subplots(figsize=(12, 6))
            metrics_across_folds.plot(kind='bar', ax=ax)

            # Mean and standard deviation per metric, reported in the legend
            means = metrics_across_folds.mean()
            stds = metrics_across_folds.std()

            plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.3)  # baseline
            plt.title(f'{feature_type} - Performance Metrics Across Folds\n'
                     f'Mean ± SD shown in legend')
            plt.ylabel('Score')
            plt.ylim(0, 1.05)
            plt.grid(axis='y', linestyle='--', alpha=0.7)

            # Update legend to include mean ± std
            handles, legend_labels = ax.get_legend_handles_labels()
            new_labels = [f'{name}: {means[name]:.3f}±{stds[name]:.3f}'
                         for name in legend_labels]
            plt.legend(handles, new_labels, loc='lower center',
                      bbox_to_anchor=(0.5, -0.35), ncol=3)

            # Add value labels on bars
            for container in ax.containers:
                ax.bar_label(container, fmt='%.3f', fontsize=7)

            if save_figures:
                plt.savefig(f'{save_folder}/{feature_type}_metrics_by_fold.png',
                           bbox_inches='tight', dpi=300)
                plt.close(fig)
            else:
                plt.show()

            # Visualize average confusion matrix
            plt.figure(figsize=(6, 5))
            avg_cm = np.mean(fold_metrics_dict['confusion_matrices'], axis=0)
            std_cm = np.std(fold_metrics_dict['confusion_matrices'], axis=0)

            # Create annotations with mean ± std
            annotations = [
                [f'{avg_cm[i,j]:.1f}±{std_cm[i,j]:.1f}' for j in range(avg_cm.shape[1])]
                for i in range(avg_cm.shape[0])
            ]

            sns.heatmap(avg_cm, annot=annotations, fmt='', cmap='Blues',
                        xticklabels=['Control', 'PD'], yticklabels=['Control', 'PD'])
            plt.xlabel('Predicted')
            plt.ylabel('True')
            plt.title(f'{feature_type} - Average Confusion Matrix\n(Mean ± SD)')
            if save_figures:
                plt.savefig(f'{save_folder}/{feature_type}_avg_confusion.png',
                           bbox_inches='tight', dpi=300)
                plt.close()
            else:
                plt.show()

        else:
            results[feature_type] = empty_metrics()
            all_comprehensive_stats[feature_type] = {}

    # Create visualizations for every feature type that produced at least one fold.
    # (all_fold_metrics is only filled in that case; an `is not np.nan` identity
    # test on the averaged accuracy is not a reliable way to detect this.)
    feature_types_with_data = [ft for ft in results if ft in all_fold_metrics]
    if feature_types_with_data:
        create_visualizations(feature_types_with_data, all_fold_metrics,
                                     all_comprehensive_stats, save_folder, save_figures)

    # Print final summary with comprehensive statistics
    print("\n" + "="*80)
    print("COMPREHENSIVE CROSS-VALIDATION RESULTS SUMMARY")
    print("="*80)

    for feature_type in feature_types_with_data:
        print(f"\n{feature_type} Features Summary:")
        # Named summary_stats so the imported scipy `stats` module is not shadowed.
        summary_stats = all_comprehensive_stats[feature_type]

        print("┌─────────────┬─────────────┬───────────────┬──────────┬──────────┬──────────┐")
        print("│ Metric      │ Mean±SD     │ 95% CI        │ CV (%)   │ Range    │ Median   │")
        print("├─────────────┼─────────────┼───────────────┼──────────┼──────────┼──────────┤")

        for key, display_name in [('accuracy', 'Accuracy'), ('roc_auc', 'ROC AUC'),
                                 ('sensitivity', 'Sensitivity'), ('specificity', 'Specificity'),
                                 ('f1_score', 'F1 Score')]:
            if key in summary_stats:
                s = summary_stats[key]
                print(f"│ {display_name:<11} │ {s['mean']:.3f}±{s['std']:.3f} │ "
                      f"[{s['ci_lower']:.3f},{s['ci_upper']:.3f}] │ {s['cv']:6.1f}   │ "
                      f"{s['range']:.3f}    │ {s['median']:.3f}    │")

        print("└─────────────┴─────────────┴───────────────┴──────────┴──────────┴──────────┘")

    return results, all_comprehensive_stats

In [None]:
# DeepNN classifier: Whisper Small vs. traditional features (default 5-fold speaker-level CV).
DeepNN_small_results = crossval_compare_features(
    "parkceleb_filtered",
    min_audio_length=0.1,
    deep_features="Whisper Small",
    model=DeepNN,
)

Loading features from: whisper_small_all_features.csv
Loading features from: traditional_all_features.csv

=== Processing Whisper Features with DeepNN ===
Feature dimension: 768
Class distribution: [5239 8614]

=== Fold 1 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_DeepNN/fold_1.pth
Using loaded model for Whisper...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 2 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_DeepNN/fold_2.pth
Using loaded model for Whisper...

=== Fold 3 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/whisper_small/Whisper_DeepNN/fold_3.pth
Using loaded model for Whisper...

=== Fold 4 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_DeepNN/fold_4.pth
Using loaded model for Whisper...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 5 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_DeepNN/fold_5.pth
Using loaded model for Whisper...

=== Whisper Features - Comprehensive Statistics ===

Accuracy:
  Mean ± SD:        0.5539 ± 0.0330
  95% CI:           [0.5129, 0.5949]
  Variance:         0.001091
  CV:               5.96%
  Range:            [0.5142, 0.6007] (span: 0.0866)
  Median (IQR):     0.5563 (0.0328)

ROC AUC:
  Mean ± SD:        0.5663 ± 0.0726
  95% CI:           [0.4762, 0.6564]
  Variance:         0.005264
  CV:               12.81%
  Range:            [0.4912, 0.6519] (span: 0.1607)
  Median (IQR):     0.5819 (0.1225)

Sensitivity (Recall):
  Mean ± SD:        0.5785 ± 0.0348
  95% CI:           [0.5353, 0.6217]
  Variance:         0.001211
  CV:               6.02%
  Range:            [0.5321, 0.6185] (span: 0.0863)
  Median (IQR):     0.5788 (0.0465)

Specificity:
  Mean ± SD:        0.5132 ± 0.1092
  95% CI:           [0.3776, 0.6488]
  Variance:         0.011928
  C

  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/traditional/Traditional_DeepNN/fold_2.pth
Using loaded model for Traditional...

=== Fold 3 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/traditional/Traditional_DeepNN/fold_3.pth
Using loaded model for Traditional...

=== Fold 4 ===
Model loaded from: experiment_results/models/traditional/Traditional_DeepNN/fold_4.pth
Using loaded model for Traditional...

=== Fold 5 ===
Model loaded from: experiment_results/models/traditional/Traditional_DeepNN/fold_5.pth
Using loaded model for Traditional...

=== Traditional Features - Comprehensive Statistics ===

Accuracy:
  Mean ± SD:        0.5801 ± 0.0646
  95% CI:           [0.4998, 0.6603]
  Variance:         0.004180
  CV:               11.15%
  Range:            [0.5021, 0.6750] (span: 0.1729)
  Median (IQR):     0.5602 (0.0496)

ROC AUC:
  Mean ± SD:        0.6088 ± 0.0711
  95% CI:           [0.5206, 0.6971]
  Variance:         0.005053
  CV:               11.68%
  Range:            [0.5502, 0.7092] (span: 0.1590)
  Median (IQR):     0.5730 (0.1039)

Sensitivity (Recall):
  Mean ± SD:        0.5758 ± 0.0696
  95% CI:           [0

  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



COMPREHENSIVE CROSS-VALIDATION RESULTS SUMMARY

Whisper Features Summary:
┌─────────────┬─────────────┬───────────────┬──────────┬──────────┬──────────┐
│ Metric      │ Mean±SD     │ 95% CI        │ CV (%)   │ Range    │ Median   │
├─────────────┼─────────────┼───────────────┼──────────┼──────────┼──────────┤
│ Accuracy    │ 0.554±0.033 │ [0.513,0.595] │    6.0   │ 0.087    │ 0.556    │
│ ROC AUC     │ 0.566±0.073 │ [0.476,0.656] │   12.8   │ 0.161    │ 0.582    │
│ Sensitivity │ 0.579±0.035 │ [0.535,0.622] │    6.0   │ 0.086    │ 0.579    │
│ Specificity │ 0.513±0.109 │ [0.378,0.649] │   21.3   │ 0.252    │ 0.551    │
│ F1 Score    │ 0.614±0.027 │ [0.581,0.647] │    4.3   │ 0.065    │ 0.602    │
└─────────────┴─────────────┴───────────────┴──────────┴──────────┴──────────┘

Traditional Features Summary:
┌─────────────┬─────────────┬───────────────┬──────────┬──────────┬──────────┐
│ Metric      │ Mean±SD     │ 95% CI        │ CV (%)   │ Range    │ Median   │
├─────────────┼──────────

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [None]:
# ImprovedDeepNN classifier: Whisper Small vs. traditional features (default 5-fold speaker-level CV).
ImprovedDeepNN_small_results = crossval_compare_features(
    "parkceleb_filtered",
    min_audio_length=0.1,
    deep_features="Whisper Small",
    model=ImprovedDeepNN,
)

Loading features from: whisper_small_all_features.csv
Loading features from: traditional_all_features.csv

=== Processing Whisper Features with ImprovedDeepNN ===
Feature dimension: 768
Class distribution: [5239 8614]

=== Fold 1 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_ImprovedDeepNN/fold_1.pth
Using loaded model for Whisper...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 2 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_ImprovedDeepNN/fold_2.pth
Using loaded model for Whisper...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 3 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_ImprovedDeepNN/fold_3.pth
Using loaded model for Whisper...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 4 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_ImprovedDeepNN/fold_4.pth
Using loaded model for Whisper...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 5 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_ImprovedDeepNN/fold_5.pth
Using loaded model for Whisper...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Whisper Features - Comprehensive Statistics ===

Accuracy:
  Mean ± SD:        0.5329 ± 0.0310
  95% CI:           [0.4945, 0.5714]
  Variance:         0.000959
  CV:               5.81%
  Range:            [0.4949, 0.5757] (span: 0.0808)
  Median (IQR):     0.5358 (0.0323)

ROC AUC:
  Mean ± SD:        0.5464 ± 0.0623
  95% CI:           [0.4690, 0.6238]
  Variance:         0.003885
  CV:               11.41%
  Range:            [0.4808, 0.6329] (span: 0.1521)
  Median (IQR):     0.5612 (0.0761)

Sensitivity (Recall):
  Mean ± SD:        0.5217 ± 0.0776
  95% CI:           [0.4254, 0.6180]
  Variance:         0.006020
  CV:               14.87%
  Range:            [0.4015, 0.6042] (span: 0.2026)
  Median (IQR):     0.5366 (0.0687)

Specificity:
  Mean ± SD:        0.5312 ± 0.1222
  95% CI:           [0.3795, 0.6829]
  Variance:         0.014929
  CV:               23.00%
  Range:            [0.3927, 0.6543] (span: 0.2615)
  Median (IQR):     0.5543 (0.2229)

F1 Score:
  Mean ± SD

  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 2 ===
Model loaded from: experiment_results/models/traditional/Traditional_ImprovedDeepNN/fold_2.pth
Using loaded model for Traditional...

=== Fold 3 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/traditional/Traditional_ImprovedDeepNN/fold_3.pth
Using loaded model for Traditional...

=== Fold 4 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/traditional/Traditional_ImprovedDeepNN/fold_4.pth
Using loaded model for Traditional...

=== Fold 5 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/traditional/Traditional_ImprovedDeepNN/fold_5.pth
Using loaded model for Traditional...

=== Traditional Features - Comprehensive Statistics ===

Accuracy:
  Mean ± SD:        0.5668 ± 0.0746
  95% CI:           [0.4742, 0.6594]
  Variance:         0.005563
  CV:               13.16%
  Range:            [0.4574, 0.6380] (span: 0.1807)
  Median (IQR):     0.5693 (0.0956)

ROC AUC:
  Mean ± SD:        0.6037 ± 0.0858
  95% CI:           [0.4972, 0.7103]
  Variance:         0.007361
  CV:               14.21%
  Range:            [0.4929, 0.7016] (span: 0.2087)
  Median (IQR):     0.5917 (0.1195)

Sensitivity (Recall):
  Mean ± SD:        0.5706 ± 0.1198
  95% CI:           [0.4218, 0.7193]
  Variance:         0.014357
  CV:               21.00%
  Range:            [0.4207, 0.7377] (span: 0.3170)
  Median (IQR):     0.5459 (0.1130)

Specificity:
  Mean ± SD:        0.5876 ± 0.0458
  95% CI:           [0.5307, 0.6445]
  Variance:         0.002099

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [None]:
# Random forest classifier: Whisper Small vs. traditional features (default 5-fold speaker-level CV).
RF_small_results = crossval_compare_features(
    "parkceleb_filtered",
    min_audio_length=0.1,
    deep_features="Whisper Small",
    model=RandomForestClassifier,
)

Loading features from: whisper_small_all_features.csv
Loading features from: traditional_all_features.csv

=== Processing Whisper Features with RandomForestClassifier ===
Feature dimension: 768
Class distribution: [5239 8614]

=== Fold 1 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_RandomForestClassifier/fold_1.pkl
Using loaded model for Whisper...

=== Fold 2 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_RandomForestClassifier/fold_2.pkl
Using loaded model for Whisper...

=== Fold 3 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_RandomForestClassifier/fold_3.pkl
Using loaded model for Whisper...

=== Fold 4 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_RandomForestClassifier/fold_4.pkl
Using loaded model for Whisper...

=== Fold 5 ===
Model loaded from: experiment_results/models/whisper_small/Whisper_RandomForestClassifier/fold_5.pkl
Using loaded model for Whisper...

=== Whisper Featur

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [None]:
# DeepNN classifier: Whisper Medium vs. traditional features (default 5-fold speaker-level CV).
DeepNN_med_results = crossval_compare_features(
    "parkceleb_filtered",
    min_audio_length=0.1,
    deep_features="Whisper Medium",
    model=DeepNN,
)

Loading features from: whisper_medium_all_features.csv
Loading features from: traditional_all_features.csv

=== Processing Whisper Features with DeepNN ===
Feature dimension: 1024
Class distribution: [5239 8614]

=== Fold 1 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_DeepNN/fold_1.pth
Using loaded model for Whisper...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 2 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_DeepNN/fold_2.pth
Using loaded model for Whisper...

=== Fold 3 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/whisper_medium/Whisper_DeepNN/fold_3.pth
Using loaded model for Whisper...

=== Fold 4 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_DeepNN/fold_4.pth
Using loaded model for Whisper...

=== Fold 5 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/whisper_medium/Whisper_DeepNN/fold_5.pth
Using loaded model for Whisper...

=== Whisper Features - Comprehensive Statistics ===

Accuracy:
  Mean ± SD:        0.5573 ± 0.0406
  95% CI:           [0.5070, 0.6077]
  Variance:         0.001644
  CV:               7.28%
  Range:            [0.5080, 0.6007] (span: 0.0927)
  Median (IQR):     0.5738 (0.0622)

ROC AUC:
  Mean ± SD:        0.5798 ± 0.0503
  95% CI:           [0.5173, 0.6422]
  Variance:         0.002529
  CV:               8.67%
  Range:            [0.5168, 0.6373] (span: 0.1205)
  Median (IQR):     0.5655 (0.0699)

Sensitivity (Recall):
  Mean ± SD:        0.5698 ± 0.0695
  95% CI:           [0.4836, 0.6561]
  Variance:         0.004825
  CV:               12.19%
  Range:            [0.5017, 0.6779] (span: 0.1761)
  Median (IQR):     0.5553 (0.0701)

Specificity:
  Mean ± SD:        0.5435 ± 0.0871
  95% CI:           [0.4353, 0.6517]
  Variance:         0.007589
  CV:             

  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/traditional/Traditional_DeepNN/fold_2.pth
Using loaded model for Traditional...

=== Fold 3 ===
Model loaded from: experiment_results/models/traditional/Traditional_DeepNN/fold_3.pth
Using loaded model for Traditional...

=== Fold 4 ===
Model loaded from: experiment_results/models/traditional/Traditional_DeepNN/fold_4.pth
Using loaded model for Traditional...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 5 ===
Model loaded from: experiment_results/models/traditional/Traditional_DeepNN/fold_5.pth
Using loaded model for Traditional...

=== Traditional Features - Comprehensive Statistics ===

Accuracy:
  Mean ± SD:        0.5801 ± 0.0646
  95% CI:           [0.4998, 0.6603]
  Variance:         0.004180
  CV:               11.15%
  Range:            [0.5021, 0.6750] (span: 0.1729)
  Median (IQR):     0.5602 (0.0496)

ROC AUC:
  Mean ± SD:        0.6088 ± 0.0711
  95% CI:           [0.5206, 0.6971]
  Variance:         0.005053
  CV:               11.68%
  Range:            [0.5502, 0.7092] (span: 0.1590)
  Median (IQR):     0.5730 (0.1039)

Sensitivity (Recall):
  Mean ± SD:        0.5758 ± 0.0696
  95% CI:           [0.4894, 0.6623]
  Variance:         0.004845
  CV:               12.09%
  Range:            [0.4743, 0.6457] (span: 0.1715)
  Median (IQR):     0.5879 (0.0876)

Specificity:
  Mean ± SD:        0.5960 ± 0.0930
  95% CI:           [0.4805, 0.7115]
  Variance:         

  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



COMPREHENSIVE CROSS-VALIDATION RESULTS SUMMARY

Whisper Features Summary:
┌─────────────┬─────────────┬───────────────┬──────────┬──────────┬──────────┐
│ Metric      │ Mean±SD     │ 95% CI        │ CV (%)   │ Range    │ Median   │
├─────────────┼─────────────┼───────────────┼──────────┼──────────┼──────────┤
│ Accuracy    │ 0.557±0.041 │ [0.507,0.608] │    7.3   │ 0.093    │ 0.574    │
│ ROC AUC     │ 0.580±0.050 │ [0.517,0.642] │    8.7   │ 0.120    │ 0.566    │
│ Sensitivity │ 0.570±0.069 │ [0.484,0.656] │   12.2   │ 0.176    │ 0.555    │
│ Specificity │ 0.544±0.087 │ [0.435,0.652] │   16.0   │ 0.201    │ 0.577    │
│ F1 Score    │ 0.611±0.027 │ [0.578,0.644] │    4.4   │ 0.069    │ 0.609    │
└─────────────┴─────────────┴───────────────┴──────────┴──────────┴──────────┘

Traditional Features Summary:
┌─────────────┬─────────────┬───────────────┬──────────┬──────────┬──────────┐
│ Metric      │ Mean±SD     │ 95% CI        │ CV (%)   │ Range    │ Median   │
├─────────────┼──────────

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [None]:
# ImprovedDeepNN classifier: Whisper Medium vs. traditional features (default 5-fold speaker-level CV).
ImprovedDeepNN_med_results = crossval_compare_features(
    "parkceleb_filtered",
    min_audio_length=0.1,
    deep_features="Whisper Medium",
    model=ImprovedDeepNN,
)

Loading features from: whisper_medium_all_features.csv
Loading features from: traditional_all_features.csv

=== Processing Whisper Features with ImprovedDeepNN ===
Feature dimension: 1024
Class distribution: [5239 8614]

=== Fold 1 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_ImprovedDeepNN/fold_1.pth
Using loaded model for Whisper...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 2 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_ImprovedDeepNN/fold_2.pth
Using loaded model for Whisper...

=== Fold 3 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/whisper_medium/Whisper_ImprovedDeepNN/fold_3.pth
Using loaded model for Whisper...

=== Fold 4 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/whisper_medium/Whisper_ImprovedDeepNN/fold_4.pth
Using loaded model for Whisper...

=== Fold 5 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/whisper_medium/Whisper_ImprovedDeepNN/fold_5.pth
Using loaded model for Whisper...

=== Whisper Features - Comprehensive Statistics ===

Accuracy:
  Mean ± SD:        0.5501 ± 0.0559
  95% CI:           [0.4807, 0.6195]
  Variance:         0.003123
  CV:               10.16%
  Range:            [0.4829, 0.6083] (span: 0.1254)
  Median (IQR):     0.5425 (0.0938)

ROC AUC:
  Mean ± SD:        0.5841 ± 0.0703
  95% CI:           [0.4969, 0.6714]
  Variance:         0.004937
  CV:               12.03%
  Range:            [0.5056, 0.6593] (span: 0.1538)
  Median (IQR):     0.5533 (0.1130)

Sensitivity (Recall):
  Mean ± SD:        0.5500 ± 0.0769
  95% CI:           [0.4545, 0.6455]
  Variance:         0.005915
  CV:               13.98%
  Range:            [0.4408, 0.6546] (span: 0.2138)
  Median (IQR):     0.5489 (0.0394)

Specificity:
  Mean ± SD:        0.5505 ± 0.1344
  95% CI:           [0.3836, 0.7174]
  Variance:         0.018069
  CV:   

  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 2 ===
Model loaded from: experiment_results/models/traditional/Traditional_ImprovedDeepNN/fold_2.pth
Using loaded model for Traditional...


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')



=== Fold 3 ===
Model loaded from: experiment_results/models/traditional/Traditional_ImprovedDeepNN/fold_3.pth
Using loaded model for Traditional...

=== Fold 4 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')
  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/traditional/Traditional_ImprovedDeepNN/fold_4.pth
Using loaded model for Traditional...

=== Fold 5 ===


  save_data = torch.load(model_path, map_location='cuda' if torch.cuda.is_available() else 'cpu')


Model loaded from: experiment_results/models/traditional/Traditional_ImprovedDeepNN/fold_5.pth
Using loaded model for Traditional...

=== Traditional Features - Comprehensive Statistics ===

Accuracy:
  Mean ± SD:        0.5668 ± 0.0746
  95% CI:           [0.4742, 0.6594]
  Variance:         0.005563
  CV:               13.16%
  Range:            [0.4574, 0.6380] (span: 0.1807)
  Median (IQR):     0.5693 (0.0956)

ROC AUC:
  Mean ± SD:        0.6037 ± 0.0858
  95% CI:           [0.4972, 0.7103]
  Variance:         0.007361
  CV:               14.21%
  Range:            [0.4929, 0.7016] (span: 0.2087)
  Median (IQR):     0.5917 (0.1195)

Sensitivity (Recall):
  Mean ± SD:        0.5706 ± 0.1198
  95% CI:           [0.4218, 0.7193]
  Variance:         0.014357
  CV:               21.00%
  Range:            [0.4207, 0.7377] (span: 0.3170)
  Median (IQR):     0.5459 (0.1130)

Specificity:
  Mean ± SD:        0.5876 ± 0.0458
  95% CI:           [0.5307, 0.6445]
  Variance:         0.002099

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [None]:
# Random forest classifier: Whisper Medium vs. traditional features (default 5-fold speaker-level CV).
RF_med_results = crossval_compare_features(
    "parkceleb_filtered",
    min_audio_length=0.1,
    deep_features="Whisper Medium",
    model=RandomForestClassifier,
)

Loading features from: whisper_medium_all_features.csv
Loading features from: traditional_all_features.csv

=== Processing Whisper Features with RandomForestClassifier ===
Feature dimension: 1024
Class distribution: [5239 8614]

=== Fold 1 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_RandomForestClassifier/fold_1.pkl
Using loaded model for Whisper...

=== Fold 2 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_RandomForestClassifier/fold_2.pkl
Using loaded model for Whisper...

=== Fold 3 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_RandomForestClassifier/fold_3.pkl
Using loaded model for Whisper...

=== Fold 4 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_RandomForestClassifier/fold_4.pkl
Using loaded model for Whisper...

=== Fold 5 ===
Model loaded from: experiment_results/models/whisper_medium/Whisper_RandomForestClassifier/fold_5.pkl
Using loaded model for Whisper...

=== Whisper

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

# Sanity checks
I messed up the save-to-CSV step of the Whisper feature generation, so the saved files had to be repaired. The cells below perform that one-off repair; the saving function itself has since been fixed.

In [None]:
# Sanity check: list the distinct speaker IDs stored in the Whisper Small feature CSV.
df = pd.read_csv("whisper_small_all_features.csv")
print("Unique speaker IDs:", pd.unique(df['id']))

Unique speaker IDs: ['pd_01' 'pd_03' 'pd_05' 'pd_06' 'pd_07' 'pd_08' 'pd_10' 'pd_11' 'pd_12'
 'pd_13' 'pd_14' 'pd_16' 'pd_17' 'pd_18' 'pd_19' 'pd_20' 'pd_23' 'pd_24'
 'pd_25' 'pd_27' 'pd_28' 'pd_29' 'pd_30' 'pd_31' 'pd_32' 'pd_33' 'pd_34'
 'pd_35' 'pd_36' 'pd_37' 'pd_38' 'cn_01' 'cn_03' 'cn_05' 'cn_06' 'cn_07'
 'cn_08' 'cn_10' 'cn_11' 'cn_12' 'cn_13' 'cn_14' 'cn_16' 'cn_17' 'cn_18'
 'cn_19' 'cn_20' 'cn_23' 'cn_24' 'cn_25' 'cn_27' 'cn_28' 'cn_29' 'cn_30'
 'cn_31' 'cn_32' 'cn_33' 'cn_34' 'cn_35' 'cn_36' 'cn_37' 'cn_38']


In [None]:
# The import used to be commented out, which made this cell fail with
# "NameError: name 'shutil' is not defined".
import shutil

# Keep untouched copies of the feature CSVs before their labels are rewritten in place.
shutil.copy2("whisper_all_features.csv", "whisper_all_features_BACKUP.csv")
shutil.copy2("traditional_all_features.csv", "traditional_all_features_BACKUP.csv")

NameError: name 'shutil' is not defined

In [None]:
# Sanity check: report how many PD (label 1) and CN (label 0) rows each feature CSV holds.
for csv_name in ("whisper_all_features.csv", "traditional_all_features.csv"):
    df = pd.read_csv(csv_name)
    print("PD count:", int((df['label'] == 1).sum()))
    print("CN count:", int((df['label'] == 0).sum()))

In [None]:
def fix_labels_in_csv(csv_path):
    """Recompute the ``label`` column of a feature CSV from its ``id`` column.

    A row gets label 1 when its ``id`` contains "pd" (case-insensitive) and
    label 0 otherwise. The file at ``csv_path`` is overwritten in place, so
    back up the original before calling this.
    """
    frame = pd.read_csv(csv_path)

    # The speaker id encodes the group, so the label is derived from it.
    is_parkinson = frame['id'].str.contains('pd', case=False)
    frame['label'] = is_parkinson.astype(int)

    # Overwrite the CSV with the corrected labels.
    frame.to_csv(csv_path, index=False)
    print(f"Updated labels in {csv_path}. Class distribution: {frame['label'].value_counts().to_dict()}")

# Rewrite the labels of the Whisper Medium feature CSV; the call for the
# traditional CSV is currently disabled.
fix_labels_in_csv("whisper_medium_all_features.csv")
# fix_labels_in_csv("traditional_all_features.csv")

# Re-read the repaired file and report the resulting class counts.
df = pd.read_csv("whisper_medium_all_features.csv")
print("PD count:", int((df['label'] == 1).sum()))
print("CN count:", int((df['label'] == 0).sum()))