# XGBoost Grid Search

This notebook performs a basic experiment for hyperparameter tuning for XGBoost models using grid search.

In [1]:
# Import libraries
import pandas as pd
from xgboost_grid_search import train_best_model

In [2]:

files = {'Dataset_1/2visit_CN_MCI.csv','Dataset_1/2visit_MCI_AD.csv', 'Dataset_1/3visit_CN_MCI.csv','Dataset_1/3visit_MCI_AD.csv', 'Dataset_1/4visit_CN_MCI.csv','Dataset_1/4visit_MCI_AD.csv'
         'Dataset_1/5visit_CN_MCI.csv','Dataset_1/5visit_MCI_AD.csv'}

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

In [None]:
# Batch grid search across datasets and progression types, saving artifacts and reports
import os
from datetime import datetime
from xgboost_grid_search import train_best_model

# Datasets and their progression type
files = [
    ("Dataset_1/2visit_CN_MCI.csv", "MCI"),
    ("Dataset_1/2visit_MCI_AD.csv", "AD"),
    ("Dataset_1/3visit_CN_MCI.csv", "MCI"),
    ("Dataset_1/3visit_MCI_AD.csv", "AD"),
    ("Dataset_1/4visit_CN_MCI.csv", "MCI"),
    ("Dataset_1/4visit_MCI_AD.csv", "AD"),
    ("Dataset_1/5visit_CN_MCI.csv", "MCI"),
    ("Dataset_1/5visit_MCI_AD.csv", "AD"),
]

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

results_dir = "grid_results"
os.makedirs(results_dir, exist_ok=True)

for path, prog in files:
    try:
        df = pd.read_csv(path)
        base = os.path.splitext(os.path.basename(path))[0]
        csv_out = os.path.join(results_dir, f"{base}_{prog}_cv_scores.csv")
        model_base = f"{base}"
        print(f"\n=== Running grid search for {base} ({prog}) ===")
        model, cols = train_best_model(
            df,
            progression_type=prog,
            param_grid=param_grid,
            csv_path=csv_out,
            save_dir="saved_models",
            model_base_name=model_base,
            save_artifacts=True,
        )
    except Exception as e:
        print(f"Error processing {path}: {e}")


=== Running grid search for 2visit_CN_MCI (MCI) ===
Using StratifiedKFold with n_splits=5
Using StratifiedKFold with n_splits=5
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       209
           1       0.31      0.25      0.28        16

    accuracy                           0.91       225
   macro avg       0.63      0.60      0.61       225
weighted avg       0.90      0.91      0.90       225


ROC AUC Score: 0.7593
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       209
           1       0.31      0.25      0.28        16

    accuracy                           0.91       225
   macro avg       0.63      0.60      0.61       225
weighted avg       0.90      0.91      0.90       225


ROC AUC Score: 0.7593
Bootstrap classification metrics: attempted=1000, auc_valid=1000, auc_skipped=0

Bootstrap 95% CI (n=1000):
- Accuracy: 0.907

In [5]:
from xgboost_grid_search import preprocess_data, create_delta_features
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTENC
import numpy as np


path, prog = "Dataset_1/5visit_MCI_AD.csv", "AD"
df = pd.read_csv(path)
processed_df, _, _ = preprocess_data(create_delta_features(df), prog)

X = processed_df.drop('target', axis=1)
y = processed_df['target']

# Identify categorical column indices BEFORE transforms
categorical_cols = ['SEX', 'NACCFAM', 'CVHATT', 'CVAFIB', 'DIABETES', 
                    'HYPERCHO', 'HYPERTEN', 'B12DEF', 'DEPD', 'ANX', 'NACCTBI', 'RACE']
categorical_cols = [col for col in categorical_cols if col in X.columns.tolist()]
cat_indices = [X.columns.get_loc(col) for col in categorical_cols]

# 1. Impute (use most_frequent to keep categoricals as ints)
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

# 2. Scale BEFORE SMOTENC (SMOTENC handles categorical cols internally)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# 3. Apply SMOTENC on scaled data
#    Pass categorical_features as keyword; use lower k_neighbors if minority class is small
sm = SMOTENC(categorical_features=cat_indices, k_neighbors=3, random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_scaled, y)

# Check results
print(f"Total samples: {len(y_resampled)}")
print(f"Class 0: {(y_resampled == 0).sum()}")
print(f"Class 1: {(y_resampled == 1).sum()}")
# Save resampled data to CSV with highlighting of synthetic rows
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled['target'] = y_resampled

# Add a column to mark synthetic vs original rows
# Original rows: first 138 (length of original y)
# Synthetic rows: after index 138
df_resampled['is_synthetic'] = ['Original'] * len(y) + ['SMOTE_Generated'] * (len(y_resampled) - len(y))

df_resampled.to_csv('smote_resampled_data.csv', index=False)
print(f"\nResampled data saved to 'smote_resampled_data.csv'")
print(f"Original samples: {len(y)}")
print(f"SMOTE generated samples: {len(y_resampled) - len(y)}")
print("\nSample of SMOTE-generated instances:")
print(df_resampled[df_resampled['is_synthetic'] == 'SMOTE_Generated'].head(10))

Total samples: 254
Class 0: 127
Class 1: 127

Resampled data saved to 'smote_resampled_data.csv'
Original samples: 138
SMOTE generated samples: 116

Sample of SMOTE-generated instances:
          SEX      EDUC   ALCOHOL  NACCFAM  CVHATT    CVAFIB  DIABETES  \
138 -0.929981 -1.140183 -0.204808  0.83887     0.0 -0.279508 -0.470360   
139  1.075291 -0.677870 -0.204808  0.83887     0.0 -0.279508 -0.470360   
140 -0.929981  0.212983 -0.204808  0.83887     0.0 -0.279508 -0.470360   
141 -0.929981 -0.733847 -0.204808  0.83887     0.0 -0.279508 -0.470360   
142 -0.929981 -0.930840 -0.204808  0.83887     0.0 -0.279508 -0.470360   
143  1.075291 -0.281641  3.433847  0.83887     0.0 -0.279508  2.126029   
144 -0.929981 -0.210307 -0.204808  0.83887     0.0 -0.279508 -0.470360   
145 -0.929981  0.560667 -0.204808  0.83887     0.0 -0.279508 -0.470360   
146  1.075291 -0.537461  1.786694  0.83887     0.0 -0.279508  2.126029   
147  1.075291 -0.930524  5.444951  0.83887     0.0 -0.279508  2.126029   


## Experiment #2 

What's changed: 
1. SMOTE for synthetic oversampling within training folds. 
2. Additional patients added to 5 visit cohort
3. GDS and categorical longitudinal. 
3. Hearing and vision variables changed. 
4. MMSE imputation corrected. 

In [2]:
# Experiment #2: Dataset_2 with post-split MMSE imputation + SMOTENC
# Reload module to pick up changes
import importlib
import xgboost_grid_search
importlib.reload(xgboost_grid_search)
from xgboost_grid_search import train_best_model

import os
import pandas as pd

# Dataset_2 files (MMSE not pre-imputed; will be imputed post-split)
files = [
    ("Dataset_2/2visit_CN_MCI.csv", "MCI"),
    ("Dataset_2/2visit_MCI_AD.csv", "AD"),
    ("Dataset_2/3visit_CN_MCI.csv", "MCI"),
    ("Dataset_2/3visit_MCI_AD.csv", "AD"),
    ("Dataset_2/4visit_CN_MCI.csv", "MCI"),
    ("Dataset_2/4visit_MCI_AD.csv", "AD"),
    ("Dataset_2/5visit_CN_MCI.csv", "MCI"),
    ("Dataset_2/5visit_MCI_AD.csv", "AD"),
]

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

results_dir = "grid_results"
os.makedirs(results_dir, exist_ok=True)

for path, prog in files:
    try:
        df = pd.read_csv(path)
        base = os.path.splitext(os.path.basename(path))[0]
        csv_out = os.path.join(results_dir, f"{base}_{prog}_cv_scores.csv")
        model_base = f"{base}"
        print(f"\n{'='*60}")
        print(f"=== Running grid search for {base} ({prog}) ===")
        print(f"{'='*60}")
        model, cols = train_best_model(
            df,
            progression_type=prog,
            param_grid=param_grid,
            csv_path=csv_out,
            save_dir="saved_models_2",
            model_base_name=model_base,
            save_artifacts=True,
        )
    except Exception as e:
        import traceback
        print(f"Error processing {path}: {e}")
        traceback.print_exc()


=== Running grid search for 2visit_CN_MCI (MCI) ===
No MMSE NaN values found; skipping MMSE imputation.
Using StratifiedKFold with n_splits=5
Using StratifiedKFold with n_splits=5
SMOTE resampling: 900 -> 1670 training samples
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       209
           1       0.35      0.44      0.39        16

    accuracy                           0.90       225
   macro avg       0.65      0.69      0.67       225
weighted avg       0.91      0.90      0.91       225


ROC AUC Score: 0.6953
SMOTE resampling: 900 -> 1670 training samples
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       209
           1       0.35      0.44      0.39        16

    accuracy                           0.90       225
   macro avg       0.65      0.69      0.67       225
weighted avg       0.91      0.90      0.91       225


R



MMSE imputation complete (train fit, test transformed).
Using StratifiedKFold with n_splits=5
Using StratifiedKFold with n_splits=5
SMOTE resampling: 470 -> 880 training samples
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       111
           1       0.33      0.14      0.20         7

    accuracy                           0.93       118
   macro avg       0.64      0.56      0.58       118
weighted avg       0.91      0.93      0.92       118


ROC AUC Score: 0.9524
SMOTE resampling: 470 -> 880 training samples
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       111
           1       0.33      0.14      0.20         7

    accuracy                           0.93       118
   macro avg       0.64      0.56      0.58       118
weighted avg       0.91      0.93      0.92       118


ROC AUC Score: 0.9524
Bootstrap: valid=1000, total_a