# Clinical Validation of Multi-Disorder Voice Screening Platform

This notebook provides comprehensive clinical validation of the voice-based mental health screening platform, including performance metrics, ROC curves, and comparison with standardized clinical assessments.

## Overview
- **Objective**: Validate the accuracy and clinical utility of voice-based screening
- **Disorders**: Depression, Anxiety, PTSD, Cognitive Decline
- **Datasets**: Voiceome, DAIC-WOZ, Vocal Mind
- **Validation Methods**: Cross-validation, clinical correlation, subgroup analysis


In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Machine learning and evaluation
from sklearn.metrics import (
    roc_auc_score, roc_curve, precision_recall_curve, auc,
    confusion_matrix, classification_report, accuracy_score,
    precision_score, recall_score, f1_score
)
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.calibration import calibration_curve

# Statistical analysis
from scipy import stats
from scipy.stats import pearsonr, spearmanr
import scipy.stats as stats

# Visualization
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Set style.
# matplotlib 3.6 renamed its bundled seaborn style sheets from 'seaborn*' to
# 'seaborn-v0_8*'. Prefer the new name and fall back to the legacy one so the
# notebook also runs on older matplotlib installs instead of raising OSError.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)
sns.set_palette("husl")

# Set random seed for reproducibility (seeds NumPy's global random state,
# which np.random.* calls draw from)
np.random.seed(42)

print("Libraries imported successfully")


## 1. Data Loading and Preparation


In [None]:
# Load the extracted feature table, or synthesize a stand-in when it is absent.
# Paths are relative to the notebook's working directory.
features_path = "../data/features/extracted_features.csv"
metadata_path = "../data/processed/unified_metadata.csv"

if not Path(features_path).exists():
    print("Features file not found. Creating sample data for demonstration.")
    n_samples, n_features = 200, 50

    # NOTE: every draw below uses NumPy's global random state, and the order
    # of the draws (features, labels, age, gender, dataset) determines the
    # values produced for a given seed. Keep that order when editing.
    feature_data = np.random.randn(n_samples, n_features)
    feature_names = [f'feature_{i}' for i in range(n_features)]

    # Class names and sampling probabilities for the synthetic labels:
    # cognitive status has 3 classes, every other disorder has 4.
    disorders = ['depression', 'anxiety', 'ptsd', 'cognitive']
    cognitive_spec = (
        ['normal', 'mild_impairment', 'moderate_impairment'],
        [0.7, 0.25, 0.05],
    )
    severity_spec = (
        ['none', 'mild', 'moderate', 'severe'],
        [0.6, 0.25, 0.12, 0.03],
    )

    labels = {}
    for name in disorders:
        classes, probs = cognitive_spec if name == 'cognitive' else severity_spec
        labels[f'{name}_label'] = np.random.choice(classes, n_samples, p=probs)

    # Feature columns first, then participant metadata, then one label column
    # per disorder. Keyword arguments are evaluated left to right, which keeps
    # both the random draw order and the resulting column order.
    features_df = pd.DataFrame(feature_data, columns=feature_names).assign(
        participant_id=[f'P{i:03d}' for i in range(n_samples)],
        age=np.random.randint(18, 80, n_samples),
        gender=np.random.choice(['M', 'F'], n_samples),
        dataset=np.random.choice(['voiceome', 'daic', 'vocal_mind'], n_samples, p=[0.6, 0.25, 0.15]),
        **labels,
    )
else:
    features_df = pd.read_csv(features_path)
    print(f"Loaded features: {features_df.shape}")

# Quick summary: overall shape, the first ten column names, and every column
# whose name contains 'label'.
preview_columns = features_df.columns[:10].tolist()
label_columns = [col for col in features_df.columns if 'label' in col]
print(f"Dataset shape: {features_df.shape}")
print(f"Features: {preview_columns}...")
print(f"Disorders: {label_columns}")
