# XGBoost Grid Search

This notebook runs a basic hyperparameter-tuning experiment for XGBoost models using grid search.

In [1]:
# Import libraries
import pandas as pd
from xgboost_grid_search import train_best_model

In [2]:

# Dataset CSVs to tune on, in a fixed (reproducible) order.
# One path per line, each followed by a comma: Python silently concatenates
# adjacent string literals, so a single missing comma merges two paths into
# one non-existent file name.
files = [
    'Dataset_1/2visit_CN_MCI.csv',
    'Dataset_1/2visit_MCI_AD.csv',
    'Dataset_1/3visit_CN_MCI.csv',
    'Dataset_1/3visit_MCI_AD.csv',
    'Dataset_1/4visit_CN_MCI.csv',
    'Dataset_1/4visit_MCI_AD.csv',
    'Dataset_1/5visit_CN_MCI.csv',
    'Dataset_1/5visit_MCI_AD.csv',
]

# Hyperparameter grid: 3 candidate values for each of 5 parameters
# (3**5 = 243 combinations per dataset).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

In [None]:
# TODO: In this cell, apply the tested SMOTE resampling method and then re-run the grid search.


In [None]:
# Batch grid search across datasets and progression types, saving artifacts and reports
import os
from datetime import datetime
from xgboost_grid_search import train_best_model

# Batch driver: run train_best_model once per dataset and keep every outcome.
#
# Each entry pairs a dataset CSV with the progression type that is forwarded
# to train_best_model as `progression_type`.
files = [
    ("Dataset_1/2visit_CN_MCI.csv", "MCI"),
    ("Dataset_1/2visit_MCI_AD.csv", "AD"),
    ("Dataset_1/3visit_CN_MCI.csv", "MCI"),
    ("Dataset_1/3visit_MCI_AD.csv", "AD"),
    ("Dataset_1/4visit_CN_MCI.csv", "MCI"),
    ("Dataset_1/4visit_MCI_AD.csv", "AD"),
    ("Dataset_1/5visit_CN_MCI.csv", "MCI"),
    ("Dataset_1/5visit_MCI_AD.csv", "AD"),
]

# Hyperparameter grid shared by all datasets (3**5 = 243 combinations).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Output locations: per-dataset CV score CSVs and saved model artifacts.
results_dir = "grid_results"
models_dir = "saved_models"
os.makedirs(results_dir, exist_ok=True)

# Outcome of every run, so nothing is lost when the loop variables are
# overwritten by the next iteration:
#   trained_models: dataset base name -> (model, cols) as returned by train_best_model
#   failed_runs:    dataset path      -> exception raised while processing it
trained_models = {}
failed_runs = {}

for path, prog in files:
    # Pure string manipulation; cannot fail, so it stays outside the try block.
    base = os.path.splitext(os.path.basename(path))[0]
    csv_out = os.path.join(results_dir, f"{base}_{prog}_cv_scores.csv")
    model_base = base
    try:
        df = pd.read_csv(path)
        print(f"\n=== Running grid search for {base} ({prog}) ===")
        model, cols = train_best_model(
            df,
            progression_type=prog,
            param_grid=param_grid,
            csv_path=csv_out,
            save_dir=models_dir,
            model_base_name=model_base,
            save_artifacts=True,
        )
    except Exception as e:
        # Best-effort batch: one bad dataset must not abort the remaining runs,
        # but the failure is recorded so it cannot go unnoticed in the output.
        print(f"Error processing {path}: {e}")
        failed_runs[path] = e
    else:
        trained_models[base] = (model, cols)

# Final summary so failures are visible without scrolling through training logs.
print(f"\nGrid search finished: {len(trained_models)} succeeded, {len(failed_runs)} failed.")
for failed_path, error in failed_runs.items():
    print(f"  FAILED {failed_path}: {type(error).__name__}: {error}")


=== Running grid search for 2visit_CN_MCI (MCI) ===
Using StratifiedKFold with n_splits=5
Using StratifiedKFold with n_splits=5
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       209
           1       0.31      0.25      0.28        16

    accuracy                           0.91       225
   macro avg       0.63      0.60      0.61       225
weighted avg       0.90      0.91      0.90       225


ROC AUC Score: 0.7593
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       209
           1       0.31      0.25      0.28        16

    accuracy                           0.91       225
   macro avg       0.63      0.60      0.61       225
weighted avg       0.90      0.91      0.90       225


ROC AUC Score: 0.7593
Bootstrap classification metrics: attempted=1000, auc_valid=1000, auc_skipped=0

Bootstrap 95% CI (n=1000):
- Accuracy: 0.907

In [None]:
from xgboost_grid_search import preprocess_data, create_delta_features
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTENC
import numpy as np


# Prototype: balance a single dataset with SMOTENC.
#
# NOTE(review): the recorded run of this cell ended with
#   TypeError: Input should have at least 1 dimension ... got scalar `np.False_`
# The traceback was not kept, so the failing call is unknown. The assertions
# below make a malformed X / y fail early with a readable message; re-run on a
# fresh kernel and confirm whether the error persists.
path, prog = "Dataset_1/5visit_MCI_AD.csv", "AD"
df = pd.read_csv(path)
processed_df, _, _ = preprocess_data(create_delta_features(df), prog)

X = processed_df.drop('target', axis=1)
y = processed_df['target']

# Sanity checks on the inputs handed to the resampler.
assert X.columns.is_unique, "duplicate feature names make positional categorical indices ambiguous"
assert getattr(y, "ndim", 0) == 1, "expected a single 1-D 'target' column"
assert len(X) == len(y) and len(y) > 0, "X and y must be non-empty and the same length"

# Positional indices of the categorical columns that are actually present.
categorical_cols = ['SEX', 'NACCFAM', 'CVHATT', 'CVAFIB', 'DIABETES',
                    'HYPERCHO', 'HYPERTEN', 'B12DEF', 'DEPD', 'ANX', 'NACCTBI', 'RACE']
categorical_cols = [col for col in categorical_cols if col in X.columns]
cat_indices = [int(X.columns.get_loc(col)) for col in categorical_cols]
if not cat_indices:
    raise ValueError("None of the expected categorical columns are present; SMOTENC needs at least one.")
cat_index_set = set(cat_indices)
cont_indices = [i for i in range(X.shape[1]) if i not in cat_index_set]

# 1. Impute (most_frequent keeps categorical codes on their original values).
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)
# SimpleImputer discards columns that are entirely missing, which would shift
# every positional index computed above.
assert X_imputed.shape[1] == X.shape[1], "imputer dropped all-NaN columns; categorical indices no longer line up"

# 2. Standardise the continuous columns only. Categorical code columns are
#    passed to SMOTENC unchanged so their category values stay intact.
scaler = StandardScaler()
X_scaled = np.array(X_imputed, dtype=float)
if cont_indices:
    X_scaled[:, cont_indices] = scaler.fit_transform(X_scaled[:, cont_indices])

# 3. Apply SMOTENC. k_neighbors is capped below the minority-class size, since
#    each minority sample needs k_neighbors other minority samples as neighbours.
minority_count = int(y.value_counts().min())
k_neighbors = min(3, minority_count - 1)
if k_neighbors < 1:
    raise ValueError(f"Minority class has only {minority_count} sample(s); too few for SMOTENC.")
sm = SMOTENC(categorical_features=cat_indices, k_neighbors=k_neighbors, random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_scaled, y)

# Check results
print(f"Total samples: {len(y_resampled)}")
print(f"Class 0: {(y_resampled == 0).sum()}")
print(f"Class 1: {(y_resampled == 1).sum()}")

TypeError: Input should have at least 1 dimension i.e. satisfy `len(x.shape) > 0`, got scalar `np.False_` instead.