# 🧠 Alzheimer's Disease Prediction - Colab Training Notebook

This notebook provides a complete training pipeline for Alzheimer's Disease prediction with:
- Data loading with automatic fallback (NPZ file → CSV file → generated sample data)
- Runtime validation (GPU detection, version checks)
- Training and comparison of multiple ML models (Random Forest, XGBoost, LightGBM, SVM, Logistic Regression)
- Bootstrap confidence intervals

## 📋 Colab Setup Instructions

1. **Restart Runtime**: Runtime → Restart runtime (or press Ctrl+M, then `.`)
2. **Enable GPU**: Runtime → Change runtime type → Hardware accelerator → GPU
3. **Mount Google Drive** (optional): Only needed if your data files are stored in Google Drive
   ```python
   from google.colab import drive
   drive.mount('/content/drive')
   ```

4. **Install Dependencies**: Run the setup cell below

## 🔧 Setup & Installation

In [None]:
# Install the third-party packages this notebook relies on.
# The leading `!` is notebook syntax that runs the line as a shell command;
# `-q` keeps pip's output quiet.
# NOTE(review): optuna is only version-checked and matplotlib/seaborn are not
# imported by any visible cell -- confirm they are still needed.
!pip install -q scikit-learn xgboost lightgbm optuna numpy pandas matplotlib seaborn

## 🔍 Runtime Validation

In [None]:
import sys
import platform
import warnings
# Silence library warnings so the validation report below stays readable.
warnings.filterwarnings('ignore')

print("🔍 Runtime Validation")
print("=" * 50)

# Interpreter and operating-system details
python_version = sys.version.split()[0]
print(f"🐍 Python version: {python_version}")
print(f"💻 Platform: {platform.system()} {platform.release()}")

# GPU detection: PyTorch is asked first; TensorFlow is consulted only when
# PyTorch cannot be imported.
try:
    import torch

    if not torch.cuda.is_available():
        print("⚠️ No GPU detected - using CPU")
    else:
        device_name = torch.cuda.get_device_name(0)
        print(f"⚡ GPU detected: {device_name}")
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        print(f"📊 GPU memory: {total_bytes / 1e9:.2f} GB")
except ImportError:
    try:
        import tensorflow as tf

        tf_gpus = tf.config.list_physical_devices('GPU')
        if tf_gpus:
            print(f"⚡ GPU detected: {tf_gpus[0]}")
        else:
            print("⚠️ No GPU detected - using CPU")
    except ImportError:
        # Neither framework is available, so no GPU query is possible.
        print("⚠️ PyTorch/TensorFlow not installed - cannot detect GPU")
        print("💡 To enable GPU: Runtime → Change runtime type → GPU")

# Report the installed version of every ML package the notebook trains with.
print("\n📦 Package Versions:")
packages = ['sklearn', 'xgboost', 'lightgbm', 'optuna']
for package_name in packages:
    try:
        package_module = __import__(package_name)
    except ImportError:
        print(f"  ❌ {package_name}: not installed")
        continue
    package_version = getattr(package_module, '__version__', 'unknown')
    print(f"  ✅ {package_name}: {package_version}")

print("\n✅ Runtime validation complete")

## 📊 Data Loading with Fallback

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

print("📦 Loading data...")

X = None
y = None
data_source = None

# Try loading NPZ file first
try:
    # NOTE(security): allow_pickle=True lets np.load execute code embedded in a
    # malicious archive -- only load files you created or fully trust.
    # The context manager closes the archive's file handle once the arrays
    # have been read into memory.
    with np.load('preprocessed_data.npz', allow_pickle=True) as data:
        if 'X' in data and 'y' in data:
            X = data['X']
            y = data['y']
            data_source = 'NPZ'
            print(f"✅ Data loaded from preprocessed_data.npz")
            print(f"   Shape: X={X.shape}, y={y.shape}")
        else:
            print("⚠️ NPZ file found but missing 'X' or 'y' keys")
            print(f"   Available keys: {list(data.keys())}")
            print("   Trying CSV fallback...")
except FileNotFoundError:
    print("⚠️ preprocessed_data.npz not found, trying CSV fallback...")
except Exception as e:
    print(f"⚠️ Error loading NPZ: {e}")
    print("   Trying CSV fallback...")

# Fallback to CSV
if X is None or y is None:
    try:
        df = pd.read_csv('fallback_data.csv')
        print(f"✅ Data loaded from fallback_data.csv")
        print(f"   Shape: {df.shape}")

        # Assume last column is target
        y = df.iloc[:, -1].values
        X = df.iloc[:, :-1].values
        data_source = 'CSV'
        print(f"   Extracted: X={X.shape}, y={y.shape}")
    except FileNotFoundError:
        print("❌ fallback_data.csv not found")
        print("💡 Creating sample data for demonstration...")
        # Random features and labels: useful for a smoke test of the pipeline,
        # meaningless as a model benchmark.
        np.random.seed(42)
        X = np.random.randn(1000, 50)
        y = np.random.choice([0, 1, 2], 1000)
        data_source = 'SAMPLE'
        print(f"   Generated sample: X={X.shape}, y={y.shape}")
    except Exception as e:
        print(f"❌ Error loading CSV: {e}")
        raise

# Handle multi-dimensional y: flatten a column vector, decode one-hot rows
y = np.asarray(y)
if y.ndim > 1:
    if y.shape[1] == 1:
        y = y.ravel()
    else:
        y = np.argmax(y, axis=1)

# Data preprocessing
print(f"\n🔧 Preprocessing data (source: {data_source})...")

# The NaN/inf checks and the scaler need a float matrix; object or string
# columns would otherwise fail later with an obscure TypeError.
try:
    X = np.asarray(X, dtype=np.float64)
except (TypeError, ValueError) as e:
    raise ValueError(
        "Feature matrix contains non-numeric values; "
        "encode or drop those columns before training"
    ) from e

# Treat +/-inf like missing values so the imputer can fill them
non_finite = ~np.isfinite(X)
has_missing = bool(non_finite.any())
if has_missing:
    print("   Cleaning NaN and infinity values...")
    X = np.where(non_finite, np.nan, X)

# Split BEFORE fitting any transformer: fitting the imputer/scaler on the full
# matrix would leak test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

if has_missing:
    imputer = SimpleImputer(strategy='median')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    X = imputer.transform(X)

# Scale features with statistics learned from the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep the full matrix in the same feature space as the splits
X = scaler.transform(X)

# np.unique works for any label type (np.bincount only accepts non-negative ints)
class_labels, class_counts = np.unique(y_train, return_counts=True)
class_distribution = dict(zip(class_labels.tolist(), class_counts.tolist()))

print(f"\n✅ Data preprocessing complete")
print(f"   Train: X={X_train.shape}, y={y_train.shape}")
print(f"   Test: X={X_test.shape}, y={y_test.shape}")
print(f"   Classes: {len(class_labels)} (distribution: {class_distribution})")

## 🤖 Model Training Loop

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import lightgbm as lgb

import traceback

print("🤖 Training Models")
print("=" * 50)

# 'logloss' is a binary metric; multi-class problems need 'mlogloss'.
n_classes = len(np.unique(y_train))
xgb_eval_metric = 'mlogloss' if n_classes > 2 else 'logloss'

# Define models (fixed seeds keep runs reproducible)
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    ),
    # `use_label_encoder` is intentionally not passed: it is deprecated and
    # no longer a recognised XGBClassifier parameter in current releases.
    # NOTE: XGBoost expects integer labels 0..n_classes-1; other label sets
    # make this model fail, which the loop below reports and skips.
    'XGBoost': xgb.XGBClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        eval_metric=xgb_eval_metric,
        verbosity=0
    ),
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        verbose=-1
    ),
    'SVM': SVC(
        kernel='rbf',
        probability=True,
        random_state=42
    ),
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    )
}

# Train and evaluate each model; a failing model is reported and skipped so
# the remaining models still get trained.
results = {}

for name, model in models.items():
    print(f"\n🔁 Training {name}...")

    try:
        # Train
        model.fit(X_train, y_train)

        # Predict
        y_pred = model.predict(X_test)

        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)

        results[name] = {
            'model': model,
            'accuracy': accuracy,
            'predictions': y_pred
        }

        print(f"   ✅ Accuracy: {accuracy:.4f}")

    except Exception as e:
        print(f"   ❌ Error: {e}")
        traceback.print_exc()

print("\n" + "=" * 50)
print("📊 Model Performance Summary:")
print("=" * 50)

# Sort by accuracy, best first
sorted_results = sorted(results.items(), key=lambda x: x[1]['accuracy'], reverse=True)

for name, res in sorted_results:
    print(f"{name:20s}: {res['accuracy']:.4f}")

if sorted_results:
    best_name, best_result = sorted_results[0]
    print(f"\n🏆 Best Model: {best_name} (Accuracy: {best_result['accuracy']:.4f})")

## 📈 Bootstrap Confidence Intervals (Optional)

In [None]:
def bootstrap_confidence_interval(model, X_test, y_test, n_bootstrap=300, confidence=0.95,
                                  random_state=None):
    """
    Calculate a percentile-bootstrap confidence interval for model accuracy.

    The fitted model is evaluated once on the test set; the per-sample
    "prediction was correct" flags are then resampled with replacement.
    For a fixed model this gives the same accuracies as re-predicting every
    resample, without repeating the prediction work.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: Test features, passed to ``model.predict`` unchanged.
        y_test: True labels, one per test sample (column vectors are flattened).
        n_bootstrap: Number of bootstrap resamples (at least 1).
        confidence: Confidence level, strictly between 0 and 1.
        random_state: Optional seed for reproducible resampling. ``None``
            (default) draws from NumPy's global random state.

    Returns:
        dict with keys ``mean``, ``std``, ``lower``, ``upper`` (floats
        describing the bootstrap accuracy distribution) and ``confidence``.

    Raises:
        ValueError: If the test set is empty, ``n_bootstrap`` < 1,
            ``confidence`` is outside (0, 1), or the model returns a
            different number of predictions than there are labels.
    """
    y_true = np.ravel(np.asarray(y_test))
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("y_test is empty; cannot bootstrap an empty test set")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    # Predictions do not depend on the resample, so compute them once.
    y_hat = np.ravel(np.asarray(model.predict(X_test)))
    if y_hat.shape != y_true.shape:
        raise ValueError(
            f"model returned {y_hat.shape[0]} predictions for {n_samples} labels"
        )
    is_correct = y_hat == y_true

    # The global np.random module and RandomState share the `choice` API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    print(f"🔄 Running {n_bootstrap} bootstrap iterations...")

    accuracies = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        # Bootstrap sample: draw n_samples indices with replacement
        indices = rng.choice(n_samples, size=n_samples, replace=True)

        # Accuracy of the resample = fraction of correct flags drawn
        accuracies[i] = is_correct[indices].mean()

        # Progress indicator
        if (i + 1) % 50 == 0:
            print(f"   Completed {i + 1}/{n_bootstrap} iterations")

    # Percentile interval: cut alpha/2 from each tail of the distribution
    alpha = 1 - confidence
    lower = np.percentile(accuracies, 100 * alpha / 2)
    upper = np.percentile(accuracies, 100 * (1 - alpha / 2))

    return {
        'mean': float(np.mean(accuracies)),
        'std': float(np.std(accuracies)),
        'lower': float(lower),
        'upper': float(upper),
        'confidence': confidence
    }

# Bootstrap the test accuracy of the best-ranked models (at most three)
if not results:
    print("⚠️ No trained models available for bootstrap analysis")
else:
    print("\n📊 Bootstrap Confidence Intervals for Top Models")
    print("=" * 50)

    # sorted_results is ordered best-first, so the head holds the top models
    top_models = sorted_results[:3]

    # Maps model name -> interval statistics for that model
    bootstrap_results = {}

    for model_name, model_info in top_models:
        print(f"\n🔍 Analyzing {model_name}...")
        interval = bootstrap_confidence_interval(
            model_info['model'], X_test, y_test, n_bootstrap=300
        )
        bootstrap_results[model_name] = interval

        confidence_pct = int(interval['confidence']*100)
        print(f"   Mean Accuracy: {interval['mean']:.4f} ± {interval['std']:.4f}")
        print(f"   {confidence_pct}% CI: [{interval['lower']:.4f}, {interval['upper']:.4f}]")

    print("\n✅ Bootstrap analysis complete")

## 💾 Save Results (Optional)

In [None]:
import joblib
import json
from datetime import datetime

# Persist the best model and a JSON summary of every trained model
if not results:
    print("⚠️ No results to save")
else:
    best_name, best_result = sorted_results[0]

    # Serialize the winning estimator; spaces in its name become underscores
    model_filename = 'best_model_{}.pkl'.format(best_name.replace(" ", "_"))
    joblib.dump(best_result['model'], model_filename)
    print(f"💾 Saved best model: {model_filename}")

    # Build the summary step by step; plain floats keep it JSON-serializable
    summary = {}
    summary['timestamp'] = datetime.now().isoformat()
    summary['best_model'] = best_name
    summary['best_accuracy'] = float(best_result['accuracy'])
    summary['all_results'] = {
        model_name: float(model_info['accuracy'])
        for model_name, model_info in results.items()
    }

    # Bootstrap statistics exist only if the optional bootstrap cell was run
    if 'bootstrap_results' in locals():
        summary['bootstrap'] = {
            model_name: {
                stat: float(value)
                for stat, value in interval.items()
                if stat != 'confidence'
            }
            for model_name, interval in bootstrap_results.items()
        }

    summary_filename = 'training_results.json'
    with open(summary_filename, 'w') as summary_file:
        json.dump(summary, summary_file, indent=2)

    print(f"💾 Saved results summary: {summary_filename}")
    print("\n📁 Files saved to current directory")
    print("   💡 To download: Right-click file → Download")

## 📝 Notes

- **Runtime Management**: If you encounter memory issues, restart the runtime (Runtime → Restart runtime)
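- **GPU Usage**: To enable a GPU, go to Runtime → Change runtime type → Hardware accelerator → GPU. Note that the models in this notebook are configured with their default (CPU) settings, so the GPU is only detected by the validation cell and is not used for training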
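- **Data Upload**: Upload `preprocessed_data.npz` (containing arrays `X` and `y`) or `fallback_data.csv` (target in the last column) to the notebook's working directory using the Colab file browser. If neither file is found, the notebook generates random sample data for demonstration only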
- **Download Results**: Right-click on saved files in the file browser to download them
- **Long Training**: If training outlasts a standard Colab session, consider Colab Pro, which offers longer runtime sessions