In [None]:
# Step 1: Clone repository (Colab only - skip if running locally)
try:
    import google.colab
    !git clone https://github.com/Arnabs-ops/Alzheimer-s.git
    %cd Alzheimer-s
    print("✅ Repository cloned and directory changed")
except:
    print("ℹ️ Running locally - skipping git clone")


In [None]:
# Step 2: Install dependencies (Colab setup)
# -q keeps pip's output quiet; %pip targets the environment of the running kernel.
# NOTE(review): no versions are pinned, so each run may resolve different
# releases of these packages — consider pinning for reproducibility.
%pip install -q xgboost lightgbm shap pyarrow category_encoders scikit-learn matplotlib seaborn joblib


# Core AI Training for Alzheimer's Disease Prediction

This notebook trains baseline machine learning models on preprocessed genomic data and provides comprehensive evaluation with interpretability analysis.

## Features:
- Load preprocessed NPZ data, or fall back to CSV (and, if that also fails, to randomly generated sample data)
- Train multiple models (RF, XGBoost, LightGBM, SVM, Logistic Regression)
- Cross-validation and performance metrics
- SHAP analysis for interpretability
- ROC curves and confusion matrices
- Save best model and results

## Models:
- Random Forest (Regularized)
- XGBoost (Regularized)
- LightGBM (Regularized)
- SVM (Regularized)
- Logistic Regression (L1/L2)
- MLP neural network (added when the large-training override cell is enabled)


In [None]:
# Setup
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Colab path detection and setup
# IN_COLAB records whether the google.colab module can be imported.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True
    # Only go looking for the project root when src/ is not already visible
    # from the current working directory.
    if not os.path.exists('src'):
        candidate_roots = ('/content/Alzheimer-s', '/content/Alzheimer-s/Alzhemiers', '.')
        # First candidate that actually contains a src/ folder, if any.
        project_root = next(
            (root for root in candidate_roots if os.path.exists(os.path.join(root, 'src'))),
            None,
        )
        if project_root is not None:
            os.chdir(project_root)
            print(f"✅ Changed working directory to: {project_root}")

# Add src to path (try multiple ways)
def add_to_sys_path(*directories):
    """Move each directory to the front of sys.path as an absolute path.

    Directories are processed in order, so later arguments end up ahead of
    earlier ones. Any existing entry for the same absolute path is dropped
    first, which keeps sys.path free of duplicates when the cell is re-run.
    Absolute paths stay valid even if the working directory changes later.
    """
    for directory in directories:
        directory = os.path.abspath(directory)
        sys.path[:] = [entry for entry in sys.path if entry != directory]
        sys.path.insert(0, directory)


current_dir = os.getcwd()
if os.path.exists('src'):
    # src/ sits next to the notebook's working directory.
    add_to_sys_path(current_dir, os.path.join(current_dir, 'src'))
elif os.path.exists(os.path.join('..', 'src')):
    # src/ sits one level up.
    parent_dir = os.path.abspath('..')
    add_to_sys_path(parent_dir, os.path.join(parent_dir, 'src'))
else:
    # Try to find src folder anywhere below the working directory; the first
    # match in os.walk order wins.
    for root, dirs, files in os.walk('.'):
        if 'src' in dirs:
            src_path = os.path.join(root, 'src')
            add_to_sys_path(root, src_path)
            print(f"✅ Found src at: {src_path}")
            break
    else:
        # Say so explicitly instead of continuing silently with an unchanged path.
        print("⚠️ No src folder found under the current directory")

print(f"📂 Current directory: {os.getcwd()}")
print(f"📂 Python path includes: {sys.path[:3]}")

# Set thread limits for stability
# Each of these environment variables is pinned to the string '1'.
os.environ.update(
    dict.fromkeys(
        (
            'OMP_NUM_THREADS',
            'MKL_NUM_THREADS',
            'OPENBLAS_NUM_THREADS',
            'NUMEXPR_MAX_THREADS',
        ),
        '1',
    )
)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Import our modules
try:
    from src.model import get_models, train_and_eval
    from src.utils import load_data, split_data, ensure_dirs, save_artifacts, plot_roc_curves, plot_confusion
    from src.interpretability import plot_shap_summary, plot_feature_importance
    print("✅ Successfully imported src modules")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("💡 Trying direct imports...")
    # Fallback: define minimal stand-ins, but only for names that are still
    # missing, so anything that did import before the failure is kept.
    if 'get_models' not in globals():
        def get_models(random_state=42):
            """Stand-in for src.model.get_models; always raises NotImplementedError.

            Raising (rather than returning an empty dict) lets a caller that
            wraps this call in try/except switch to its own model set.
            """
            raise NotImplementedError("src.model.get_models could not be imported")

    if 'train_and_eval' not in globals():
        def train_and_eval(*args, **kwargs):
            """Stand-in for src.model.train_and_eval; always raises NotImplementedError.

            Returning an empty dict here would look like a successful run with
            zero results, so no caller would ever try an alternative trainer.
            """
            raise NotImplementedError("src.model.train_and_eval could not be imported")

    if 'ensure_dirs' not in globals():
        def ensure_dirs(path):
            """Create `path` (and any missing parents); do nothing if it exists."""
            os.makedirs(path, exist_ok=True)

    print("⚠️ Using fallback functions - some features may be limited")

# Create results directory
ensure_dirs('results')

print("✅ Setup complete - Ready for model training")


In [None]:
# Override models for Colab with larger training values (n_estimators=1000, max_iter=1000)
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb

# NOTE(review): this flag is set unconditionally, so the override below also
# applies when the notebook is not running on Colab.
COLAB_LARGE_TRAINING = True

# Template estimators. get_models() below hands out fresh copies of these.
models_large = {
    'Random Forest': RandomForestClassifier(
        n_estimators=1000,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=1000,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=1.0,
        eval_metric='logloss',
        use_label_encoder=False,
        verbosity=0,
        n_jobs=-1,
        random_state=42
    ),
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=1000,
        max_depth=-1,
        learning_rate=0.1,
        subsample=0.8,
        # LightGBM only applies `subsample` (bagging) when the bagging
        # frequency is non-zero; without this the 0.8 above has no effect.
        subsample_freq=1,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=1.0,
        random_state=42,
        n_jobs=-1
    ),
    'SVM': SVC(
        C=1.0,
        kernel='rbf',
        gamma='scale',
        probability=True,
        random_state=42
    ),
    # One-vs-rest is expressed with the OneVsRestClassifier wrapper because
    # LogisticRegression's own `multi_class` argument is deprecated in
    # scikit-learn and the install cell does not pin a version.
    'Logistic Regression': OneVsRestClassifier(
        LogisticRegression(
            max_iter=1000,
            C=1.0,
            penalty='l2',
            solver='lbfgs',
            random_state=42
        )
    ),
    'MLP': MLPClassifier(
        hidden_layer_sizes=(128, 64),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        learning_rate='adaptive',
        max_iter=1000,
        early_stopping=True,
        validation_fraction=0.1,
        random_state=42
    )
}

# Monkey-patch get_models to return the large config if desired
if COLAB_LARGE_TRAINING:
    def get_models(random_state=42):
        """Return fresh, unfitted copies of the `models_large` estimators.

        Args:
            random_state: Seed applied to every `random_state` parameter of
                each estimator, including those of a wrapped estimator.

        Returns:
            dict mapping model name to a new estimator instance. The templates
            in `models_large` are never fitted or modified by callers, so
            calling this again always yields clean models.
        """
        fresh_models = {}
        for name, template in models_large.items():
            estimator = clone(template)
            seed_params = {
                key: random_state
                for key in estimator.get_params(deep=True)
                if key == 'random_state' or key.endswith('__random_state')
            }
            if seed_params:
                estimator.set_params(**seed_params)
            fresh_models[name] = estimator
        return fresh_models


In [None]:
# Load Data
print("📊 Loading preprocessed data...")


def to_label_vector(y):
    """Collapse a target array with more than one dimension to class labels.

    A 2-D array with a single column is flattened with ravel(); any other
    multi-dimensional array is reduced with argmax along axis 1.
    # NOTE(review): presumably the wide case is one-hot or per-class scores — confirm.
    One-dimensional input is returned unchanged (as an ndarray).
    """
    y = np.asarray(y)
    if y.ndim > 1:
        return y.ravel() if y.shape[1] == 1 else np.argmax(y, axis=1)
    return y


# Set to True only when every real data source failed and random data is used,
# so later cells (or a reader) can tell that the metrics are meaningless.
USING_SAMPLE_DATA = False

# Try to load NPZ data first
try:
    # NOTE(review): allow_pickle=True can execute arbitrary code while loading;
    # only use it with archives from a trusted source.
    # The context manager closes the archive once the arrays have been read.
    with np.load('data/processed/preprocessed_alz_data.npz', allow_pickle=True) as data:
        X_train = data['X_train']
        X_test = data['X_test']
        y_train = data['y_train']
        y_test = data['y_test']

    # Handle multi-dimensional y; each split is normalised from its own shape.
    y_train = to_label_vector(y_train)
    y_test = to_label_vector(y_test)

    print(f"✅ Loaded NPZ data: Train {X_train.shape}, Test {X_test.shape}")

except Exception as e:
    print(f"⚠️ NPZ loading failed: {e}")
    print("🔄 Loading CSV fallback...")

    # Fallback to CSV
    try:
        df = load_data('data/processed/alz_clean.csv')
        X, y = split_data(df, 'Phenotype-derived')
        # NOTE(review): assumes split_data returns (train, test) pairs for X and y — confirm.
        X_train, X_test, y_train, y_test = X[0], X[1], y[0], y[1]
        print(f"✅ Loaded CSV data: Train {X_train.shape}, Test {X_test.shape}")
    except Exception as e2:
        print(f"❌ CSV loading failed: {e2}")
        print("🔄 Creating sample data...")

        # Create sample data: random features and random labels from {0, 1, 2}.
        np.random.seed(42)
        X_train = np.random.randn(1000, 50)
        X_test = np.random.randn(200, 50)
        y_train = np.random.choice([0, 1, 2], 1000)
        y_test = np.random.choice([0, 1, 2], 200)
        USING_SAMPLE_DATA = True
        print(f"✅ Created sample data: Train {X_train.shape}, Test {X_test.shape}")
        print("⚠️ Using random sample data - any metrics below are NOT meaningful")

# np.unique works for integer, float and string labels alike, whereas
# np.bincount only accepts non-negative integers.
class_values, class_counts = np.unique(y_train, return_counts=True)
print(f"📊 Target distribution: {dict(zip(class_values.tolist(), class_counts.tolist()))}")
print(f"📊 Classes: {len(class_values)}")


In [None]:
# Train Models
print("🤖 Training baseline models...")


def train_models_inline(models, X_train, y_train, X_test, y_test, cv_folds=3):
    """Fit and score every model with plain scikit-learn calls.

    Used when train_and_eval is unavailable, fails, or returns nothing.

    Args:
        models: dict mapping model name to an unfitted estimator.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.
        cv_folds: number of cross-validation folds on the training split.

    Returns:
        dict mapping model name to a dict with keys 'model' (fitted on the
        full training split), 'pred' (test predictions), 'accuracy',
        'cv_mean' and 'cv_std'. Models that raise are reported and left out.
    """
    # Imported here because they are only needed on this fallback path.
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import accuracy_score
    import traceback

    trained = {}
    for name, model in models.items():
        print(f"  Training {name}...")
        try:
            # Fit model
            model.fit(X_train, y_train)

            # Predictions
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)

            # Cross-validation (scored on clones; `model` stays fitted as above)
            cv_scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring='accuracy', n_jobs=-1)

            trained[name] = {
                'model': model,
                'pred': y_pred,
                'accuracy': acc,
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std()
            }
            print(f"    ✅ Accuracy: {acc:.4f}, CV: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
        except Exception as e2:
            # One failing model must not stop the remaining ones.
            print(f"    ❌ {name} failed: {e2}")
            traceback.print_exc()
    return trained


def summarize_results(results):
    """Build the per-model summary table, best accuracy first, and print it.

    Args:
        results: dict mapping model name to a result dict. Entries that are
            not dicts or lack an 'accuracy' key are skipped with a warning.
            Missing 'cv_mean' / 'cv_std' values become NaN instead of
            raising KeyError.

    Returns:
        DataFrame with columns Model, Accuracy, CV_Mean, CV_Std, or an empty
        DataFrame when there is nothing valid to summarise.
    """
    if not results:
        print("⚠️ No models were trained successfully")
        print("Debug: results is empty or None")
        return pd.DataFrame()  # Empty dataframe

    summary_data = []
    for name, res in results.items():
        print(f"Processing {name}: {type(res)}, keys: {res.keys() if isinstance(res, dict) else 'not dict'}")
        if isinstance(res, dict) and 'accuracy' in res:
            summary_data.append({
                'Model': name,
                'Accuracy': res['accuracy'],
                'CV_Mean': res.get('cv_mean', float('nan')),
                'CV_Std': res.get('cv_std', float('nan'))
            })
        else:
            print(f"  ⚠️ Skipping {name} - invalid result format")

    if not summary_data:
        print("⚠️ No valid results to display")
        print(f"Debug: summary_data length = {len(summary_data)}")
        return pd.DataFrame()  # Empty dataframe

    summary = pd.DataFrame(summary_data).sort_values('Accuracy', ascending=False)
    print(summary.to_string(index=False))
    return summary


# Get models
try:
    models = get_models(random_state=42)
    print(f"✅ Got {len(models)} models from get_models()")
except Exception as e:
    print(f"⚠️ get_models() failed: {e}")
    print("🔄 Using models_large from previous cell...")
    try:
        models = models_large
        print(f"✅ Using {len(models)} models from models_large")
    except NameError:
        # Only "the variable does not exist" is expected here.
        print("❌ models_large not available")
        models = {}

print(f"📊 Models to train: {list(models.keys())}")

# Train and evaluate
results = {}
if not models:
    print("❌ No models available to train!")
else:
    try:
        results = train_and_eval(models, X_train, y_train, X_test, y_test, cv_folds=3)
        print(f"✅ train_and_eval completed, got {len(results)} results")
    except Exception as e:
        print(f"⚠️ train_and_eval failed: {e}")
        results = {}

    # An empty result is treated like a failure: a stub or broken
    # train_and_eval must not end the run with nothing trained.
    if not results:
        print("🔄 Using inline training function...")
        results = train_models_inline(models, X_train, y_train, X_test, y_test, cv_folds=3)

print(f"\n📊 Total results collected: {len(results)}")
print(f"📊 Results keys: {list(results.keys())}")

print("\n📊 Model Performance Summary:")
print("-" * 50)

# Create summary DataFrame
summary_df = summarize_results(results)


In [None]:
# Visualizations
# Draws the accuracy / cross-validation comparison, then ROC curves and a
# confusion matrix for the top row of summary_df (sorted best-first upstream).
if not summary_df.empty:
    print("📈 Generating visualizations...")

    # Explicit figure and axes instead of the implicit pyplot "current axes".
    fig, (ax_accuracy, ax_cv) = plt.subplots(1, 2, figsize=(12, 6))

    # Accuracy comparison
    summary_df.plot(x='Model', y='Accuracy', kind='bar', ax=ax_accuracy)
    ax_accuracy.set_title('Model Accuracy Comparison')
    ax_accuracy.tick_params(axis='x', rotation=45)
    ax_accuracy.set_ylabel('Accuracy')

    # CV scores with error bars
    positions = list(range(len(summary_df)))
    ax_cv.errorbar(positions, summary_df['CV_Mean'],
                   yerr=summary_df['CV_Std'], fmt='o', capsize=5)
    ax_cv.set_xticks(positions)
    ax_cv.set_xticklabels(summary_df['Model'], rotation=45)
    ax_cv.set_title('Cross-Validation Scores')
    ax_cv.set_ylabel('CV Score')
    ax_cv.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

    # ROC Curves (if function available)
    if results and 'plot_roc_curves' in globals():
        try:
            print("\n📊 ROC Curves:")
            plot_roc_curves(results, y_test)
        except Exception as e:
            print(f"⚠️ ROC curves failed: {e}")

    # Confusion Matrix for best model
    if results:
        try:
            best_model_name = summary_df.iloc[0]['Model']
            best_entry = results.get(best_model_name)
            # Only the stored test predictions are needed here; skip quietly
            # when the entry has none rather than failing with a KeyError.
            if isinstance(best_entry, dict) and 'model' in best_entry and 'pred' in best_entry:
                best_pred = best_entry['pred']

                if 'plot_confusion' in globals():
                    print(f"\n📊 Confusion Matrix - {best_model_name}:")
                    plot_confusion(y_test, best_pred, normalize=True)
                else:
                    # Fallback confusion matrix (raw counts)
                    from sklearn.metrics import confusion_matrix
                    cm = confusion_matrix(y_test, best_pred)
                    fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
                    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
                    ax_cm.set_title(f'Confusion Matrix - {best_model_name}')
                    ax_cm.set_xlabel('Predicted')
                    ax_cm.set_ylabel('True')
                    plt.show()
        except Exception as e:
            print(f"⚠️ Confusion matrix failed: {e}")
else:
    print("⚠️ No results available for visualization")


In [None]:
# SHAP Analysis (for tree-based models)
print("🔍 SHAP Analysis for interpretability...")

# Result keys of the tree-based models; shared by both loops below.
tree_models = ['Random Forest', 'XGBoost', 'LightGBM']

try:
    import shap
except ImportError:
    print("⚠️ SHAP not installed. Install with: pip install shap")
else:
    X_explain = X_test[:100]  # Sample for speed

    for model_name in tree_models:
        if model_name not in results:
            continue

        print(f"\n🔍 Analyzing {model_name}...")
        model = results[model_name]['model']

        if not hasattr(model, 'predict_proba'):
            print(f"⚠️ {model_name} doesn't support SHAP analysis")
            continue

        # Errors are handled per model so that one failing explainer does not
        # skip the analysis of the remaining models.
        try:
            # Create SHAP explainer
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_explain)

            # Summary plot
            plt.figure(figsize=(10, 6))
            shap.summary_plot(shap_values, X_explain, show=False)
            plt.title(f'SHAP Summary - {model_name}')
            plt.tight_layout()
            plt.show()

            print(f"✅ SHAP analysis complete for {model_name}")
        except Exception as e:
            print(f"⚠️ SHAP analysis failed for {model_name}: {e}")

# Feature Importance for tree models
print("\n📊 Feature Importance Analysis:")
if 'plot_feature_importance' not in globals():
    # The helper comes from the project's src package; without it the call
    # below would raise NameError and abort the cell.
    print("⚠️ plot_feature_importance is not available - skipping feature importance plots")
else:
    for model_name in tree_models:
        if model_name not in results:
            continue
        try:
            plot_feature_importance(results[model_name]['model'], model_name, top_n=20)
        except Exception as e:
            print(f"⚠️ Feature importance failed for {model_name}: {e}")


In [None]:
# Save Results
# Writes three artifacts into results/, all sharing one timestamp: the summary
# table (CSV), the best model (joblib pickle) and the per-model metrics (JSON).
if not summary_df.empty and len(results) > 0:
    print("💾 Saving results and best model...")

    # Make sure the output directory exists even if it was never created or
    # the working directory changed since it was.
    os.makedirs('results', exist_ok=True)

    # Save metrics CSV
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = f'results/model_summary_{timestamp}.csv'
    summary_df.to_csv(csv_path, index=False)
    print(f"✅ Metrics saved to: {csv_path}")

    # summary_df is non-empty here, so the first row is the model to report.
    best_row = summary_df.iloc[0]
    best_model_name = best_row['Model']

    # Save best model
    try:
        if best_model_name in results and 'model' in results[best_model_name]:
            best_model = results[best_model_name]['model']
            model_path = f'results/best_model_{timestamp}.pkl'

            import joblib
            joblib.dump(best_model, model_path)
            print(f"✅ Best model ({best_model_name}) saved to: {model_path}")
        else:
            print(f"⚠️ Best model '{best_model_name}' not found in results")
    except Exception as e:
        print(f"⚠️ Could not save best model: {e}")
        import traceback
        traceback.print_exc()

    # Save detailed results JSON
    try:
        import json
        metric_keys = ('accuracy', 'cv_mean', 'cv_std')
        detailed_results = {}
        for name, res in results.items():
            if isinstance(res, dict) and 'accuracy' in res:
                # Export whichever metrics the entry has, so one missing
                # cross-validation value does not abort the whole export.
                detailed_results[name] = {
                    key: float(res[key]) for key in metric_keys if key in res
                }

        if detailed_results:
            json_path = f'results/detailed_results_{timestamp}.json'
            with open(json_path, 'w') as f:
                json.dump(detailed_results, f, indent=2)
            print(f"✅ Detailed results saved to: {json_path}")
        else:
            print("⚠️ No valid results to save in JSON")
    except Exception as e:
        print(f"⚠️ Could not save JSON: {e}")

    print(f"\n🎉 Training complete!")
    print(f"🏆 Best model: {best_model_name} (Accuracy: {best_row['Accuracy']:.4f})")
    print(f"📊 All results saved to results/ directory")
else:
    print("⚠️ No results to save")
    print(f"Debug: summary_df.empty = {summary_df.empty if 'summary_df' in locals() else 'not defined'}")
    print(f"Debug: len(results) = {len(results) if 'results' in locals() else 'not defined'}")
