In [None]:
# Colab setup
# Installs third-party packages into the kernel's environment (%pip, not !pip).
# Several of these (e.g. xgboost, lightgbm, shap) are not imported directly in this
# notebook — presumably they are needed by the local `src` modules; confirm.
# NOTE(review): versions are unpinned, so results may drift as new releases appear.
%pip install -q xgboost lightgbm shap pyarrow scikit-learn matplotlib seaborn joblib


# Robust Validation for Alzheimer's Disease Models

This notebook validates the Alzheimer's disease models using stratified K-fold cross-validation, learning curves, and bootstrap confidence intervals to assess model reliability and generalization.

## Features:
- Fresh train/validation/test splits
- Stratified K-fold cross-validation
- Learning curves analysis
- Bootstrap confidence intervals (20 iterations, interrupt-safe)
- Overfitting detection
- Validation summary reports

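## Outputs:
- Validation plots and learning curves (displayed inline)
- `results/validation_summary_<timestamp>.json` with comprehensive metrics
- `results/validation_summary_<timestamp>.csv` with a per-model summary table
- Bootstrap confidence intervals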


In [None]:
# Setup
# Configures the process (warnings, import path, thread limits), imports the
# dependencies used by later cells, and makes sure the output directory exists.
import os
import sys
import warnings
# Silences ALL warnings for the rest of the session — this also hides any
# numerical or convergence warnings emitted while models are being fitted.
warnings.filterwarnings('ignore')

# Add src to path
# NOTE(review): this puts ./src itself on sys.path, yet the project imports below
# use the `src.` package prefix, which needs the directory that CONTAINS `src` to be
# importable — presumably the notebook's working directory; confirm.
sys.path.append('./src')

# Set thread limits for stability
# These are set before the numeric libraries are imported below; such variables are
# typically read when the native thread pools initialise, so the order matters.
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['NUMEXPR_MAX_THREADS'] = '1'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json

# Import our validation module
# RobustValidator builds the splits and the validation report in later cells;
# AdvancedAlzheimerModel is the model object handed to that report.
from src.validation import RobustValidator
from src.advanced_model import AdvancedAlzheimerModel

# Create results directory
# exist_ok=True makes this safe to re-run when the directory is already there.
os.makedirs('results', exist_ok=True)

print("✅ Setup complete - Ready for robust validation")


In [None]:
# Load Data
print("📊 Loading data for validation...")

# Initialize validator
# This single instance is reused by later cells, both to create the
# train/validation/test splits and to produce the validation report.
# random_state=42 is forwarded to RobustValidator — presumably to make its
# splitting/resampling repeatable; confirm in src/validation.py.
validator = RobustValidator(random_state=42)

# Load data
# Preferred source: the preprocessed NPZ archive, whose train and test parts are
# merged back into one dataset (fresh splits are created in a later cell).
# If the archive cannot be read for any reason, the cell deliberately falls back to
# random synthetic data so the remaining cells can still execute.
# WARNING: metrics computed on the synthetic fallback are meaningless — check the
# messages printed by this cell before trusting any downstream result.
try:
    # Try NPZ first
    # SECURITY: allow_pickle=True lets np.load unpickle object arrays, which can run
    # arbitrary code — only load archives that come from a trusted source.
    # The context manager closes the archive's file handle once the arrays are read.
    with np.load('data/processed/preprocessed_alz_data.npz', allow_pickle=True) as data:
        X = np.vstack([data['X_train'], data['X_test']])
        y = np.concatenate([data['y_train'], data['y_test']])

    # Handle multi-dimensional y: a single column is flattened, several columns are
    # treated as per-class scores / one-hot rows and reduced to the argmax index.
    if y.ndim > 1:
        if y.shape[1] == 1:
            y = y.ravel()
        else:
            y = np.argmax(y, axis=1)

    print(f"✅ Loaded NPZ data: {X.shape}")

except Exception as e:
    print(f"⚠️ NPZ loading failed: {e}")
    print("🔄 Creating sample data...")

    # Create sample data (seeded, so the fallback is identical on every run)
    np.random.seed(42)
    X = np.random.randn(1200, 50)
    y = np.random.choice([0, 1, 2], 1200)
    print(f"✅ Created sample data: {X.shape}")

# np.unique counts work for any label dtype, whereas np.bincount only accepts
# non-negative integers; for labels 0..k-1 the printed counts are the same.
class_labels, class_counts = np.unique(y, return_counts=True)

print(f"📊 Data shape: {X.shape}")
print(f"📊 Target distribution: {class_counts}")
print(f"📊 Classes: {len(class_labels)}")


In [None]:
# Create Fresh Splits
# Progress message for the cell that cleans X and then partitions the combined
# dataset into train / validation / test subsets.
print("🔄 Creating fresh train/validation/test splits...")

# Clean data helper
def clean_data(X, max_abs=1e10):
    """Return a float64 copy of ``X`` with unusable entries median-imputed.

    An entry is treated as unusable when it is NaN, infinite, or larger in
    magnitude than ``max_abs``. Unusable entries are replaced by the median of
    the usable values in the same column.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix; anything ``np.array(..., dtype=float64)`` accepts.
    max_abs : float, default 1e10
        Magnitude above which a finite value is considered corrupt.

    Returns
    -------
    numpy.ndarray
        The cleaned matrix. The input is never modified.

    Notes
    -----
    With scikit-learn's default settings, ``SimpleImputer`` discards columns
    that contain no usable value at all, so the result can have fewer columns
    than the input.
    """
    # Local import keeps the helper usable even if the cell is run on its own.
    from sklearn.impute import SimpleImputer

    # np.array always copies here, so masking in place cannot touch the caller's data.
    X = np.array(X, dtype=np.float64)

    # One combined mask: ~isfinite covers NaN and +/-inf, the second term covers
    # finite but implausibly large magnitudes.
    unusable = ~np.isfinite(X) | (np.abs(X) > max_abs)
    X[unusable] = np.nan

    return SimpleImputer(strategy='median').fit_transform(X)

# Clean the data
# NOTE(review): clean_data fits its median imputer on the full dataset before the
# split, so validation/test rows contribute to the imputation statistics (mild
# leakage). Fitting on the training split only would be stricter.
X = clean_data(X)

# Create fresh splits
X_train, X_val, X_test, y_train, y_val, y_test = validator.create_fresh_splits(
    X, y, test_size=0.2, val_size=0.2
)

# Name -> (features, labels), so both reports below iterate over the same splits.
splits = {
    'Train': (X_train, y_train),
    'Validation': (X_val, y_val),
    'Test': (X_test, y_test),
}

print("✅ Fresh splits created:")
for split_name, (X_part, y_part) in splits.items():
    print(f"  {split_name}: {X_part.shape} (labels: {y_part.shape})")

# Check class distribution
# Counts are taken against one shared class list, so the three vectors always have
# the same length and order even if a split lacks a class. Unlike np.bincount this
# also works for labels that are not non-negative integers.
all_classes = np.unique(np.concatenate([np.asarray(y_part) for _, y_part in splits.values()]))

print("\n📊 Class distribution:")
for split_name, (_, y_part) in splits.items():
    class_counts = np.array([np.sum(np.asarray(y_part) == cls) for cls in all_classes])
    print(f"  {split_name}: {class_counts}")


In [None]:
# Comprehensive Validation Report
print("📊 Running comprehensive validation...")

# Model object handed to the validator
advanced_model = AdvancedAlzheimerModel(random_state=42)

# Build the full report: the validator receives all three splits together with
# the cross-validation fold count and the number of bootstrap rounds.
validation_results = validator.comprehensive_validation_report(
    advanced_model,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    cv_folds=3,
    n_bootstrap=20,
)

print("\n📊 Validation Results Summary:")
print("-" * 50)

# One text block per model: the lines are collected first, then emitted together.
for model_name, results in validation_results.items():
    report_lines = [
        f"\n{model_name}:",
        f"  Train Score: {results['train_score']:.4f}",
        f"  Val Score: {results['val_score']:.4f}",
        f"  Test Score: {results['test_score']:.4f}",
        f"  CV Mean: {results['cv_mean']:.4f} ± {results['cv_std']:.4f}",
        f"  Gap (Train-Val): {results['train_val_gap']:.4f}",
        f"  Gap (Train-Test): {results['train_test_gap']:.4f}",
    ]

    # The bootstrap interval is optional and only reported when present.
    if 'bootstrap_ci' in results:
        ci = results['bootstrap_ci']
        report_lines.append(
            f"  Bootstrap CI (95%): [{ci['ci_95_lower']:.4f}, {ci['ci_95_upper']:.4f}]"
        )

    print("\n".join(report_lines))


In [None]:
# Learning Curves Analysis
print("📈 Generating learning curves...")

# Plot learning curves for key models
key_models = ['Random Forest (Regularized)', 'XGBoost (Regularized)', 'Logistic Regression (L1)']


def _plot_learning_curve(ax, model_name, lc_data):
    """Draw one model's learning curve on ``ax``.

    ``lc_data`` is the model's 'learning_curve' entry; it is read through the keys
    'train_sizes', 'train_scores', 'val_scores', 'train_scores_std' and
    'val_scores_std'. Each score series is drawn as a line with a shaded band of
    +/- the matching ``*_std`` values.
    """
    sizes = np.asarray(lc_data['train_sizes'])
    series = (
        ('train_scores', 'train_scores_std', 'Training Score'),
        ('val_scores', 'val_scores_std', 'Validation Score'),
    )
    for mean_key, std_key, label in series:
        mean = np.asarray(lc_data[mean_key])
        spread = np.asarray(lc_data[std_key])
        (line,) = ax.plot(sizes, mean, 'o-', label=label, alpha=0.7)
        # Reuse the line's colour explicitly so the band always matches its curve.
        ax.fill_between(sizes, mean - spread, mean + spread,
                        color=line.get_color(), alpha=0.1)

    ax.set_title(f'Learning Curve - {model_name}')
    ax.set_xlabel('Training Set Size')
    ax.set_ylabel('Score')
    ax.legend()
    ax.grid(True, alpha=0.3)


# Only models that are actually in the report AND carry learning-curve data get a
# panel; this avoids empty slots (or a completely blank figure) when names differ.
available_models = [
    name for name in key_models
    if name in validation_results and 'learning_curve' in validation_results[name]
]
skipped_models = [name for name in key_models if name not in available_models]

if skipped_models:
    print(f"⚠️ No learning-curve data for: {', '.join(skipped_models)}")

if available_models:
    # 5 inches per panel keeps the original 15x5 size when all three models exist.
    fig, axes = plt.subplots(
        1, len(available_models),
        figsize=(5 * len(available_models), 5),
        squeeze=False,  # always a 2-D array, even for a single panel
    )
    for ax, model_name in zip(axes[0], available_models):
        _plot_learning_curve(ax, model_name, validation_results[model_name]['learning_curve'])

    fig.tight_layout()
    plt.show()

    print("✅ Learning curves generated")
else:
    print("⚠️ No learning curves to plot")


In [None]:
# Overfitting Analysis
print("🔍 Analyzing overfitting patterns...")

# Extract data for plotting
model_names = list(validation_results.keys())
# Tick labels are computed once and shared by all four panels.
short_names = [name.replace(' (Regularized)', '') for name in model_names]


def _metric(key):
    """Collect ``results[key]`` for every model, in ``model_names`` order."""
    return [validation_results[name][key] for name in model_names]


def _label_model_axis(ax, ylabel, title):
    """Apply the x-axis model labels, axis titles and grid common to every panel."""
    ax.set_xlabel('Models')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(short_names, rotation=45)
    ax.grid(True, alpha=0.3)


train_scores = _metric('train_score')
val_scores = _metric('val_score')
test_scores = _metric('test_score')
gaps = _metric('train_val_gap')
cv_means = _metric('cv_mean')
cv_stds = _metric('cv_std')

x = np.arange(len(model_names))
width = 0.25

# Create overfitting analysis plot (explicit axes instead of pyplot's implicit state)
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
ax_scores, ax_gaps = axes[0]
ax_cv, ax_ci = axes[1]

# Plot 1: Score comparison — three bars per model, side by side
ax_scores.bar(x - width, train_scores, width, label='Train', alpha=0.7)
ax_scores.bar(x, val_scores, width, label='Validation', alpha=0.7)
ax_scores.bar(x + width, test_scores, width, label='Test', alpha=0.7)
_label_model_axis(ax_scores, 'Score', 'Model Performance Comparison')
ax_scores.legend()

# Plot 2: Overfitting gaps
ax_gaps.bar(x, gaps, alpha=0.7, color='red')
_label_model_axis(ax_gaps, 'Train-Val Gap', 'Overfitting Analysis (Train-Val Gap)')

# Plot 3: CV scores with error bars
ax_cv.errorbar(x, cv_means, yerr=cv_stds, fmt='o', capsize=5)
_label_model_axis(ax_cv, 'CV Score', 'Cross-Validation Scores')

# Plot 4: Bootstrap confidence intervals (only for models that report one)
for i, name in enumerate(model_names):
    if 'bootstrap_ci' in validation_results[name]:
        ci = validation_results[name]['bootstrap_ci']
        # errorbar rejects negative error lengths; clamp in case the reported mean
        # falls marginally outside its own interval.
        lower_err = max(ci['mean'] - ci['ci_95_lower'], 0.0)
        upper_err = max(ci['ci_95_upper'] - ci['mean'], 0.0)
        ax_ci.errorbar(i, ci['mean'], yerr=[[lower_err], [upper_err]],
                       fmt='o', capsize=5)
_label_model_axis(ax_ci, 'Bootstrap Score', 'Bootstrap Confidence Intervals')

fig.tight_layout()
plt.show()

print("✅ Overfitting analysis complete")


In [None]:
# Save Validation Results
print("💾 Saving validation results...")

# Save comprehensive validation summary
# The timestamp (local time, one-second resolution) is embedded in the output file
# names, so runs started at different seconds write to different files. The same
# value is reused later in this cell for the CSV file name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
json_path = f'results/validation_summary_{timestamp}.json'

# Convert numpy types to Python types for JSON serialization
def convert_numpy_types(obj):
    """Recursively replace NumPy values with plain Python equivalents.

    ``json.dump`` cannot serialise NumPy integers, booleans or arrays, so this
    walks ``obj`` and converts:

    * ``np.bool_`` -> ``bool``, ``np.integer`` -> ``int``, ``np.floating`` -> ``float``
    * ``np.ndarray`` -> (nested) ``list``
    * ``dict`` -> ``dict`` with converted values; NumPy-scalar keys are converted too
    * ``list`` / ``tuple`` -> ``list`` with converted items (JSON has no tuple type)

    Any other object is returned unchanged.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        # tolist() already yields Python scalars for numeric dtypes; the recursive
        # call additionally cleans object arrays that hold NumPy values.
        return convert_numpy_types(obj.tolist())
    if isinstance(obj, dict):
        # json only accepts str/int/float/bool/None keys, so NumPy-scalar keys are
        # unwrapped; other keys are left alone to avoid creating unhashable ones.
        return {
            (key.item() if isinstance(key, np.generic) else key): convert_numpy_types(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_types(item) for item in obj]
    return obj

# Clean the results for JSON serialization
clean_results = convert_numpy_types(validation_results)

# Explicit encoding keeps the file identical regardless of the platform default.
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(clean_results, f, indent=2)

print(f"✅ Validation summary saved to: {json_path}")

# Create summary table
# CSV column name -> key in each model's result dict (order defines column order).
summary_columns = {
    'Train_Score': 'train_score',
    'Val_Score': 'val_score',
    'Test_Score': 'test_score',
    'CV_Mean': 'cv_mean',
    'CV_Std': 'cv_std',
    'Train_Val_Gap': 'train_val_gap',
    'Train_Test_Gap': 'train_test_gap',
}

summary_data = [
    {
        'Model': model_name.replace(' (Regularized)', ''),
        **{column: results[key] for column, key in summary_columns.items()},
    }
    for model_name, results in validation_results.items()
]

# Passing the columns explicitly keeps the frame well-formed (and sortable) even
# when there are no model results at all.
summary_df = pd.DataFrame(summary_data, columns=['Model', *summary_columns])
summary_df = summary_df.sort_values('Test_Score', ascending=False)

# Save summary CSV
csv_path = f'results/validation_summary_{timestamp}.csv'
summary_df.to_csv(csv_path, index=False)
print(f"✅ Validation summary CSV saved to: {csv_path}")

print("\n🎉 Validation complete!")
if summary_df.empty:
    # Without this guard, .iloc[0] raises IndexError on an empty table.
    print("⚠️ No model results were available to summarise")
else:
    # NOTE(review): "best" is ranked by test score; if this ranking is used to choose
    # a model, the test score stops being an unbiased estimate of performance.
    best_row = summary_df.iloc[0]
    print(f"📊 Best performing model: {best_row['Model']} (Test Score: {best_row['Test_Score']:.4f})")
print("📊 All validation results saved to results/ directory")
