# 🧠 Psychiatric Disorder Detection - Model Training

This notebook trains ML models for psychiatric disorder severity classification using the DASS-42 questionnaire.

**Pipeline:**
1. Download DASS-42 dataset from Kaggle
2. Preprocess and select 30 RFE-identified features
3. Apply feature scaling (StandardScaler)
4. Train 4 classifiers (LogReg, RF, SVM, GB)
5. Compare using comprehensive metrics
6. Save best model + scaler for deployment

---

## 🔗 Step 0: Mount Google Drive (Optional)

**Run this if using VS Code + Colab extension.** Skip if using browser Colab.

In [None]:
import os

from google.colab import drive

# Attach Google Drive to the Colab filesystem.
drive.mount('/content/drive')

# Folder on Drive that receives the trained artifacts (read again in Step 12A).
DRIVE_OUTPUT = '/content/drive/MyDrive/PDD_Models'
os.makedirs(DRIVE_OUTPUT, exist_ok=True)

print(f"‚úÖ Drive mounted. Output: {DRIVE_OUTPUT}")

## 📦 Step 1: Install Dependencies

In [None]:
# Install kagglehub, which Step 4 uses to download the dataset (-q keeps pip's output quiet).
%pip install kagglehub -q

## 📚 Step 2: Import Libraries

In [None]:
import os
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score
)

# One seed shared by the split, the CV folds and every estimator below,
# so repeated runs give the same results.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported!")

## ⚙️ Step 3: Configuration

In [None]:
# Questionnaire answer columns used as model inputs, named "Q<item>A".
# The pipeline overview describes these 30 items as the RFE-selected subset.
SELECTED_FEATURES = [
    f"Q{item}A"
    for item in (
        1, 3, 4, 5, 7, 8, 9, 10, 11, 12,
        13, 16, 17, 20, 21, 22, 24, 26, 27,
        28, 29, 30, 32, 33, 34, 36, 38, 39,
        40, 41,
    )
]

# Integer class id -> human-readable severity label.
CLASS_LABELS = dict(enumerate(("None", "Mild", "Moderate", "Severe")))

# Fraction of the rows held out for testing.
TEST_SIZE = 0.2

print(f"Features: {len(SELECTED_FEATURES)}, Classes: {len(CLASS_LABELS)}")

## 📥 Step 4: Download Dataset

In [None]:
import kagglehub

# Download the DASS responses dataset; `path` is the local directory that
# kagglehub returns for it.
path = kagglehub.dataset_download("lucasgreenwell/depression-anxiety-stress-scales-responses")

# Sort the matches so the chosen file is deterministic, and fail with a clear
# message (instead of a bare IndexError) when the directory holds no CSV.
csv_files = sorted(Path(path).glob("*.csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset directory: {path}")

DATA_PATH = csv_files[0]
print(f"Dataset: {DATA_PATH}")

## 🔄 Step 5: Load and Preprocess Data

In [None]:
# The file located in Step 4 has a .csv extension but is read as tab-separated.
df = pd.read_csv(DATA_PATH, sep='\t')
print(f"Dataset shape: {df.shape}")

# Use the configured features that exist in this file. Report any that are
# absent so a silently smaller feature set cannot go unnoticed, and stop early
# when nothing is left to train on.
available_features = [f for f in SELECTED_FEATURES if f in df.columns]
missing_features = [f for f in SELECTED_FEATURES if f not in df.columns]
if missing_features:
    print(f"WARNING: {len(missing_features)} configured features not in dataset: {missing_features}")
if not available_features:
    raise ValueError("None of SELECTED_FEATURES are present in the dataset columns")
X = df[available_features].copy()

# Create target from total DASS score (quartile-based): the total is the
# row-wise sum of every column named "Q...A".
q_cols = [col for col in df.columns if col.endswith('A') and col.startswith('Q')]
if not q_cols:
    raise ValueError("No 'Q*A' answer columns found; cannot compute the total score")
total_score = df[q_cols].sum(axis=1)

# Bin edges are 0, the three quartiles, and max + 1, so every observed total
# from 0 upward lands in one of the four classes 0..3.
y = pd.cut(total_score,
           bins=[0, total_score.quantile(0.25), total_score.quantile(0.50),
                 total_score.quantile(0.75), total_score.max() + 1],
           labels=[0, 1, 2, 3], include_lowest=True)

# Replace missing answers with the per-column median.
X = X.fillna(X.median())

# Handle NaN in target (totals that fall outside the bins above)
if y.isna().any():
    print(f"‚ö†Ô∏è Dropping {y.isna().sum()} NaN targets")
    mask = ~y.isna()
    X, y = X[mask], y[mask]

# Convert the categorical bin labels into plain integer class ids.
y = y.astype(int)
print(f"Features: {X.shape}, Target: {y.shape}")

## 📊 Step 6: Class Distribution

In [None]:
# Count samples per class in CLASS_LABELS order. A class with no samples is
# kept with a count of 0 so the bars, ticks and colours below always line up
# (a hard-coded range(4) breaks as soon as one class is absent from y).
class_ids = list(CLASS_LABELS)
class_counts = y.value_counts().reindex(class_ids, fill_value=0)
for cls, count in class_counts.items():
    print(f"  {CLASS_LABELS[cls]}: {count:,} ({count/len(y)*100:.1f}%)")

positions = range(len(class_ids))
plt.figure(figsize=(8, 5))
plt.bar(positions, class_counts.values, color=['#22c55e', '#eab308', '#f97316', '#ef4444'])
plt.xticks(positions, [CLASS_LABELS[cls_id] for cls_id in class_ids])
plt.title('Class Distribution')
plt.tight_layout()
plt.show()

## ✂️ Step 7: Train/Test Split + Scaling

In [None]:
# Stratified hold-out split, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train: {X_train.shape[0]}, Test: {X_test.shape[0]}")
print("‚úÖ Scaler fitted")

## 🤖 Step 8: Train and Compare Models

In [None]:
# Candidate classifiers, all seeded with RANDOM_STATE. SVC needs
# probability=True so that predict_proba is available for ROC-AUC.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'SVM': SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE)
}

# Per-model metrics plus the fitted estimator and its test predictions.
results = {}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

for name, model in models.items():
    print(f"\n--- {name} ---")
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)

    f1 = f1_score(y_test, y_pred, average='weighted')

    # ROC-AUC may not be computable for every split; keep going, but say why
    # instead of silently recording None.
    try:
        roc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')
    except (ValueError, TypeError) as exc:
        roc = None
        print(f"  ROC-AUC unavailable: {exc}")

    # NOTE: X_train_scaled comes from a scaler fitted on the whole training
    # split (Step 7), so each fold's validation rows contributed to the
    # scaling statistics.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='f1_weighted')

    results[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1_weighted': f1,
        'f1_macro': f1_score(y_test, y_pred, average='macro'),
        'roc_auc': roc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'model': model,
        'y_pred': y_pred
    }

    # Compare against None explicitly: a legitimate score of 0.0 is falsy and
    # would otherwise be printed as "N/A".
    roc_str = f"{roc:.4f}" if roc is not None else "N/A"
    print(f"  F1: {f1:.4f}, ROC-AUC: {roc_str}, CV: {cv_scores.mean():.4f}")

## 🏆 Step 9: Select Best Model

In [None]:
# Pick the model with the highest weighted F1 on the test split; on a tie the
# model that was trained first wins.
best_name = max(results, key=lambda model_name: results[model_name]['f1_weighted'])
best_model, best_score = (results[best_name][field] for field in ('model', 'f1_weighted'))

print(f"\nüèÜ BEST: {best_name} (F1={best_score:.4f})")

## 📈 Step 10: Confusion Matrices

In [None]:
# Fix the class order explicitly so every matrix is len(CLASS_LABELS) square
# and lines up with the tick labels, even if a class is absent from both
# y_test and a model's predictions.
class_ids = list(CLASS_LABELS)
class_names = [CLASS_LABELS[cls_id] for cls_id in class_ids]

# One heatmap per trained model on a 2x2 grid (four models are trained above).
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for idx, (name, m) in enumerate(results.items()):
    ax = axes.flat[idx]
    cm = confusion_matrix(y_test, m['y_pred'], labels=class_ids)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=class_names, yticklabels=class_names)
    ax.set_title(f"{name}\nF1={m['f1_weighted']:.4f}")

plt.tight_layout()
# Written to the working directory; Steps 12A/12B pick the file up from there.
plt.savefig('confusion_matrices.png', dpi=150)
plt.show()

## 💾 Step 11: Save Model, Scaler, and Metadata

In [None]:
# Save model
joblib.dump(best_model, 'psychiatric_model.joblib')
print("‚úÖ psychiatric_model.joblib")

# Save scaler (IMPORTANT for inference)
joblib.dump(scaler, 'scaler.joblib')
print("‚úÖ scaler.joblib")

# Save feature names
with open('feature_names.json', 'w') as f:
    json.dump(available_features, f)
print("‚úÖ feature_names.json")

# Save training report
report = {'best_model': best_name, 'models': {}}
for name, m in results.items():
    report['models'][name] = {
        k: float(v) if isinstance(v, (int, float)) and v is not None else v
        for k, v in m.items() if k not in ['model', 'y_pred']
    }

with open('training_report.json', 'w') as f:
    json.dump(report, f, indent=2)
print("‚úÖ training_report.json")

## 📤 Step 12A: Save to Google Drive

In [None]:
import shutil

# DRIVE_OUTPUT only exists if Step 0 was run; without it there is no Drive
# folder to copy into.
if 'DRIVE_OUTPUT' not in globals():
    raise RuntimeError("Run Step 0 first, or use Step 12B")

# Copy each artifact from the working directory into the Drive folder.
for f in (
    'psychiatric_model.joblib',
    'scaler.joblib',
    'feature_names.json',
    'training_report.json',
    'confusion_matrices.png',
):
    shutil.copy(f, f'{DRIVE_OUTPUT}/{f}')

print(f"‚úÖ Saved to {DRIVE_OUTPUT}")

# Remind the user which of the files are meant for the backend.
print("\nüìÅ Copy to backend/models/:")
print("   - psychiatric_model.joblib")
print("   - scaler.joblib")
print("   - feature_names.json")

## 📥 Step 12B: Download Files (Browser Colab)

In [None]:
from google.colab import files

# Download each artifact from the working directory through the browser.
for f in (
    'psychiatric_model.joblib',
    'scaler.joblib',
    'feature_names.json',
    'training_report.json',
    'confusion_matrices.png',
):
    files.download(f)

# Remind the user which of the files are meant for the backend.
print("\n‚úÖ Downloaded! Place in backend/models/:")
print("   - psychiatric_model.joblib")
print("   - scaler.joblib")
print("   - feature_names.json")

## 🧪 Step 13: Test Prediction

In [None]:
# Test with sample input
sample = [[2] * len(available_features)]
sample_scaled = scaler.transform(sample)

pred = best_model.predict(sample_scaled)[0]
probs = best_model.predict_proba(sample_scaled)[0]

print(f"Sample (all 2s) ‚Üí {CLASS_LABELS[pred]} ({max(probs)*100:.1f}% confidence)")

---
## ✅ Done!

**Files to copy to `backend/models/`:**
1. `psychiatric_model.joblib`
2. `scaler.joblib`
3. `feature_names.json`

The backend will automatically load the scaler and apply it before predictions.