In [None]:
# Colab setup
# Installs the third-party packages this notebook relies on into the running
# kernel's environment (-q keeps pip's output quiet). No versions are pinned,
# so a fresh run picks up whatever releases are current at install time.
%pip install -q xgboost lightgbm shap optuna pyarrow category_encoders scikit-learn matplotlib seaborn joblib


# Hyperparameter Tuning for Alzheimer's Disease Prediction

This notebook performs hyperparameter tuning using Optuna (with pruning and timeout) and also includes a quick RandomizedSearchCV path for fast experiments. It uses the shared utilities in `src/` and saves results to `results/`.

Note: This notebook is set up to run end-to-end. Running all cells executes both the full Optuna loop and the quick RandomizedSearchCV tuner; the comparison cell then ranks the results from both.


In [None]:
# Setup: silence warnings, expose the project sources, and cap native
# thread pools at one thread.
import os, sys, warnings
warnings.filterwarnings('ignore')

# Make modules that live under ./src importable
sys.path.append('./src')

# Single-threaded OpenMP / MKL / OpenBLAS / numexpr for stability; these are
# set ahead of the numeric imports that follow.
os.environ.update({
    'OMP_NUM_THREADS': '1',
    'MKL_NUM_THREADS': '1',
    'OPENBLAS_NUM_THREADS': '1',
    'NUMEXPR_MAX_THREADS': '1',
})

import numpy as np
import pandas as pd
from datetime import datetime

from src.hyper_tuning import run_optuna_tuning, run_random_search
from src.advanced_model import load_real_data
from src.validation import RobustValidator

# Optional: install optuna if missing (uncomment to run)
# %pip install optuna


In [None]:
# Load the dataset, then carve out train / validation / test partitions
print('Loading data...')
X, y = load_real_data()
print(f'Shape X: {X.shape}, y: {y.shape}')

# Seeded validator; its six return values are unpacked in a single step
validator = RobustValidator(random_state=42)
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = validator.create_fresh_splits(X, y, test_size=0.2, val_size=0.2)

# Clean data helper
def clean_data(X, imputer=None, fit=True):
    """Coerce ``X`` to float64, mask unusable values, and median-impute them.

    Infinite values and values whose magnitude exceeds 1e10 are replaced with
    NaN before imputation.

    Parameters
    ----------
    X : array-like
        Feature matrix convertible to a float64 NumPy array.
    imputer : sklearn.impute.SimpleImputer, optional
        Imputer to use. When omitted, a fresh median imputer is created and
        fitted on ``X`` (the original single-argument behaviour).
    fit : bool, default True
        If True, fit ``imputer`` on ``X`` before transforming. If False, only
        apply an imputer that was fitted earlier.

    Returns
    -------
    numpy.ndarray
        The cleaned, imputed array.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``imputer`` was supplied.
    """
    from sklearn.impute import SimpleImputer

    X = np.array(X, dtype=np.float64)
    X = np.where(np.isinf(X), np.nan, X)
    X = np.where(np.abs(X) > 1e10, np.nan, X)

    if imputer is None:
        if not fit:
            raise ValueError("fit=False requires an already-fitted imputer")
        imputer = SimpleImputer(strategy='median')

    return imputer.fit_transform(X) if fit else imputer.transform(X)


from sklearn.impute import SimpleImputer

# Fit the imputer on the training split only and reuse it for validation and
# test. Fitting a separate imputer per split would fill evaluation data with
# its own medians and could drop different all-NaN columns in each split.
feature_imputer = SimpleImputer(strategy='median')
X_train = clean_data(X_train, imputer=feature_imputer, fit=True)
X_val = clean_data(X_val, imputer=feature_imputer, fit=False)
X_test = clean_data(X_test, imputer=feature_imputer, fit=False)


In [None]:
# Define Optuna model builders

def build_xgboost_model(trial):
    """Build an ``XGBClassifier`` whose hyperparameters are drawn from ``trial``.

    ``trial`` must provide ``suggest_int`` and ``suggest_float``. Suggestions
    are requested in a fixed order, followed by the constant settings shared by
    every trial.
    """
    from xgboost import XGBClassifier

    # Search space (dict insertion order == order of the suggest calls)
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300, step=25),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 5.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    # Settings that are not tuned
    fixed = {
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': 1,
    }
    return XGBClassifier(**sampled, **fixed)


def build_random_forest_model(trial):
    """Build a ``RandomForestClassifier`` with hyperparameters drawn from ``trial``.

    ``trial`` must provide ``suggest_int`` and ``suggest_categorical``; the
    seed and ``n_jobs`` are held constant across trials.
    """
    from sklearn.ensemble import RandomForestClassifier

    n_estimators = trial.suggest_int('n_estimators', 50, 300, step=25)
    max_depth = trial.suggest_int('max_depth', 4, 15)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    # Mixes named strategies with explicit feature fractions
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', 0.3, 0.5, 0.7])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    return RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs=1,
    )


def build_lightgbm_model(trial):
    """Build an ``LGBMClassifier`` with hyperparameters drawn from ``trial``.

    ``trial`` must provide ``suggest_int`` and ``suggest_float``. The seed,
    verbosity, ``n_jobs`` and bagging frequency are held constant.
    """
    import lightgbm as lgb
    return lgb.LGBMClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300, step=25),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.03, 0.2, log=True),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; with the default of 0 the tuned `subsample` is ignored.
        subsample_freq=1,
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 2.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.1, 5.0),
        num_leaves=trial.suggest_int('num_leaves', 10, 100),
        min_child_samples=trial.suggest_int('min_child_samples', 5, 50),
        random_state=42, verbose=-1, n_jobs=1
    )


def build_svm_model(trial):
    """Build an ``SVC`` (with probability estimates) from ``trial`` suggestions.

    The kernel is chosen first. ``degree`` is only sampled when the kernel is
    ``'poly'``; every other kernel gets the fixed value 3.
    """
    from sklearn.svm import SVC

    kernel = trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid'])
    if kernel == 'poly':
        degree = trial.suggest_int('degree', 2, 5)
    else:
        degree = 3

    regularisation = trial.suggest_float('C', 0.01, 100, log=True)
    # Either one of sklearn's named heuristics or an explicit coefficient
    kernel_coef = trial.suggest_categorical('gamma', ['scale', 'auto', 0.001, 0.01, 0.1, 1.0])

    return SVC(
        C=regularisation,
        gamma=kernel_coef,
        kernel=kernel,
        degree=degree,
        probability=True,
        random_state=42,
    )


def build_logistic_regression_model(trial):
    """Build a ``LogisticRegression`` from ``trial`` suggestions.

    The solver follows the sampled penalty: ``'saga'`` for ``'l1'`` and
    ``'elasticnet'``, ``'liblinear'`` otherwise. ``l1_ratio`` is only sampled
    for the elastic-net penalty and is ``None`` in every other case.
    """
    from sklearn.linear_model import LogisticRegression

    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
    if penalty in ('l1', 'elasticnet'):
        solver = 'saga'
    else:
        solver = 'liblinear'

    inverse_reg = trial.suggest_float('C', 0.001, 10, log=True)

    l1_ratio = None
    if penalty == 'elasticnet':
        l1_ratio = trial.suggest_float('l1_ratio', 0.1, 0.9)

    return LogisticRegression(
        C=inverse_reg,
        penalty=penalty,
        solver=solver,
        l1_ratio=l1_ratio,
        max_iter=500,
        random_state=42,
        n_jobs=1,
    )


def build_mlp_model(trial):
    """Build an ``MLPClassifier`` from ``trial`` suggestions.

    The number of hidden layers is sampled first, then one width per layer;
    the remaining hyperparameters follow. Iteration cap, early stopping,
    validation fraction and seed are held constant.
    """
    from sklearn.neural_network import MLPClassifier

    depth = trial.suggest_int('n_layers', 1, 3)
    widths = []
    for i in range(depth):
        widths.append(trial.suggest_int(f'layer_{i}_size', 20, 200))

    activation = trial.suggest_categorical('activation', ['relu', 'tanh', 'logistic'])
    solver = trial.suggest_categorical('solver', ['adam', 'lbfgs'])
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
    schedule = trial.suggest_categorical('learning_rate', ['constant', 'adaptive'])

    return MLPClassifier(
        hidden_layer_sizes=tuple(widths),
        activation=activation,
        solver=solver,
        alpha=alpha,
        learning_rate=schedule,
        max_iter=500,
        early_stopping=True,
        validation_fraction=0.1,
        random_state=42,
    )

# Registry mapping a display name to the builder that turns an Optuna trial
# into an estimator; insertion order is the order models are tuned in.
models_to_tune = dict([
    ('XGBoost', build_xgboost_model),
    ('Random Forest', build_random_forest_model),
    ('LightGBM', build_lightgbm_model),
    ('SVM', build_svm_model),
    ('Logistic Regression', build_logistic_regression_model),
    ('MLP', build_mlp_model),
])


## Optuna Tuning (Full)
- 30 trials per model, 10 minute timeout per model
- Pruning enabled to skip weak trials
- Uses 3-fold Stratified CV
- Saves best params and validation/test scores


In [None]:
# Optuna Tuning (Full)
# Tunes every registered model on the training split, then scores the winner
# of each study on the validation and test splits.
print("🔧 Starting Optuna hyperparameter tuning...")

optuna_results = {}

for model_name, model_builder in models_to_tune.items():
    print(f"\n🔧 Tuning {model_name}...")

    try:
        best_model, tuning_info = run_optuna_tuning(
            model_builder=model_builder,
            model_name=model_name,
            X=X_train,
            y=y_train,
            n_trials=30,
            timeout=600,  # seconds, i.e. 10 minutes per model
            cv_folds=3,
            n_jobs=1,
            random_state=42,
            enable_pruning=True,
            verbose=True,
        )

        # Nothing to score when the study produced no usable model
        if best_model is None:
            print(f"⚠️ {model_name} tuning failed - no valid trials")
            continue

        val_score = best_model.score(X_val, y_val)
        test_score = best_model.score(X_test, y_test)

        optuna_results[model_name] = dict(
            best_model=best_model,
            val_score=val_score,
            test_score=test_score,
            tuning_info=tuning_info,
        )

        print(f"✅ {model_name} tuned successfully!")
        print(f"  Validation Score: {val_score:.4f}")
        print(f"  Test Score: {test_score:.4f}")
        print(f"  Best Score: {tuning_info['best_score']:.4f}")
        print(f"  Trials: {tuning_info['n_trials_completed']}")
        print(f"  Runtime: {tuning_info['runtime_seconds']:.2f}s")

    except Exception as e:
        # One failing model is reported and skipped so the rest still run
        print(f"❌ {model_name} tuning failed: {e}")

print(f"\n✅ Optuna tuning complete! {len(optuna_results)} models tuned successfully.")


In [None]:
# Quick Tuning (RandomizedSearchCV)
print("⚡ Starting quick RandomizedSearchCV tuning...")

quick_results = {}

# Select a subset of models for quick tuning
quick_models = ['XGBoost', 'Random Forest', 'LightGBM']


def make_quick_base_model(model_name):
    """Return an untuned estimator to hand to ``run_random_search``.

    The builders in ``models_to_tune`` call ``trial.suggest_*`` and therefore
    need a real Optuna trial; calling them with ``None`` raises
    ``AttributeError``. This factory constructs the estimator directly with
    the same fixed (non-tuned) settings those builders use.

    Raises
    ------
    ValueError
        If no base model is defined for ``model_name``.
    """
    if model_name == 'XGBoost':
        import xgboost as xgb
        return xgb.XGBClassifier(
            eval_metric='logloss', use_label_encoder=False,
            random_state=42, verbosity=0, n_jobs=1
        )
    if model_name == 'Random Forest':
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(random_state=42, n_jobs=1)
    if model_name == 'LightGBM':
        import lightgbm as lgb
        return lgb.LGBMClassifier(random_state=42, verbose=-1, n_jobs=1)
    raise ValueError(f"No quick-tuning base model defined for {model_name!r}")


for model_name in quick_models:
    if model_name in models_to_tune:
        print(f"\n⚡ Quick tuning {model_name}...")

        try:
            # Base estimator with default hyperparameters
            base_model = make_quick_base_model(model_name)

            best_model, tuning_info = run_random_search(
                model=base_model,
                model_name=model_name,
                X=X_train,
                y=y_train,
                n_iter=15,  # Fewer iterations for speed
                cv_folds=3,
                n_jobs=1,
                random_state=42
            )

            if best_model is not None:
                # Score the tuned model on the held-out splits
                val_score = best_model.score(X_val, y_val)
                test_score = best_model.score(X_test, y_test)

                quick_results[model_name] = {
                    'best_model': best_model,
                    'val_score': val_score,
                    'test_score': test_score,
                    'tuning_info': tuning_info
                }

                print(f"✅ {model_name} quick tuned!")
                print(f"  Validation Score: {val_score:.4f}")
                print(f"  Test Score: {test_score:.4f}")
                print(f"  Best Score: {tuning_info['best_score']:.4f}")
            else:
                print(f"⚠️ {model_name} quick tuning failed")

        except Exception as e:
            # Report and continue so one failure does not stop the others
            print(f"❌ {model_name} quick tuning failed: {e}")

print(f"\n✅ Quick tuning complete! {len(quick_results)} models tuned.")


In [None]:
# Compare Results
print("📊 Comparing tuning results...")

# Merge both tuning paths into one mapping, labelling each entry by its origin
all_results = {}

for model_name, results in optuna_results.items():
    all_results[f"{model_name} (Optuna)"] = results

for model_name, results in quick_results.items():
    all_results[f"{model_name} (Quick)"] = results

if all_results:
    # One row per tuned model
    comparison_data = []
    for model_name, results in all_results.items():
        comparison_data.append({
            'Model': model_name,
            'Val_Score': results['val_score'],
            'Test_Score': results['test_score'],
            'Best_CV_Score': results['tuning_info']['best_score'],
            'Tuning_Time': results['tuning_info'].get('runtime_seconds', 0)
        })

    comparison_df = pd.DataFrame(comparison_data)
    # Rank on the validation score. Picking the winner by test score would
    # turn the test set into a selection set and make the reported test score
    # optimistically biased.
    comparison_df = comparison_df.sort_values('Val_Score', ascending=False)

    print("\n📊 Tuning Results Comparison:")
    print("-" * 60)
    print(comparison_df.to_string(index=False))

    # Best model by validation score; its test score is reported, not used
    # for selection
    best_model_name = comparison_df.iloc[0]['Model']
    best_model = all_results[best_model_name]['best_model']
    best_score = comparison_df.iloc[0]['Test_Score']

    print(f"\n🏆 Best tuned model: {best_model_name}")
    print(f"📊 Test Score: {best_score:.4f}")

else:
    print("⚠️ No tuning results available for comparison")


In [None]:
# Save Results
# `json` was never imported by the notebook, so it is imported here alongside
# joblib to keep this cell runnable.
import json
import joblib

print("💾 Saving hyperparameter tuning results...")


def _json_default(value):
    """Fallback encoder for values ``json`` cannot serialise natively.

    NumPy scalars become Python scalars and arrays become lists; anything else
    is stored as its string representation.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


# Timestamp shared by every artefact written below
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

if all_results:
    # The output directory may not exist on a fresh checkout / Colab session
    os.makedirs('results', exist_ok=True)

    # Save comparison CSV
    csv_path = f'results/hyperparameter_tuning_summary_{timestamp}.csv'
    comparison_df.to_csv(csv_path, index=False)
    print(f"✅ Comparison results saved to: {csv_path}")

    # Save best tuned model
    best_model_path = f'results/best_tuned_model_{timestamp}.pkl'
    joblib.dump(best_model, best_model_path)
    print(f"✅ Best tuned model saved to: {best_model_path}")

    # Save detailed tuning info (scores cast to plain floats for JSON)
    detailed_info = {}
    for model_name, results in all_results.items():
        detailed_info[model_name] = {
            'val_score': float(results['val_score']),
            'test_score': float(results['test_score']),
            'best_cv_score': float(results['tuning_info']['best_score']),
            'tuning_time': float(results['tuning_info'].get('runtime_seconds', 0)),
            'best_params': results['tuning_info'].get('best_params', {})
        }

    json_path = f'results/hyperparameter_tuning_details_{timestamp}.json'
    with open(json_path, 'w') as f:
        # best_params may contain NumPy scalars, hence the fallback encoder
        json.dump(detailed_info, f, indent=2, default=_json_default)
    print(f"✅ Detailed tuning info saved to: {json_path}")

    print(f"\n🎉 Hyperparameter tuning complete!")
    print(f"🏆 Best model: {best_model_name} (Test Score: {best_score:.4f})")
    print(f"📊 All results saved to results/ directory")

else:
    print("⚠️ No results to save")


## Recap: Optuna Tuning Settings
- 30 trials per model, 10 minute timeout per model
- Pruning enabled to skip weak trials
- Uses 3-fold Stratified CV
- Saves best params and validation/test scores
