<a href="https://colab.research.google.com/github/GVSU-CIS635/projects-team-1-1/blob/njuek-patch-1/Kelvin's_XG_Boost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Loading and cleaning the dataset

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K  [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [9]:
import pandas as pd
import numpy as np
import optuna

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping


from google.colab import drive
drive.mount('/content/drive')

# Read the semicolon-delimited training file from the mounted Drive folder.
file_path = '/content/drive/MyDrive/train.csv'
df = pd.read_csv(file_path, sep=';', quotechar='"', skipinitialspace=True)
print(df.head(2))

# Clean-up expressed as one chain:
#   1. normalise headers (trim whitespace, lower-case),
#   2. derive two features from `pdays`, where -1 is the sentinel tested below:
#        contacted_before -> 1 when pdays is not -1, else 0
#        pdays_numeric    -> pdays with the -1 sentinel replaced by 0,
#   3. drop `duration`, which this notebook treats as a leakage column.
# NOTE(review): `contacted_before` is created here but is not listed in any of
# the feature groups defined in the next section -- confirm whether it should be.
df = (
    df.rename(columns=lambda name: name.strip().lower())
    .assign(
        contacted_before=lambda frame: (frame['pdays'] != -1).astype(int),
        pdays_numeric=lambda frame: frame['pdays'].replace(-1, 0),
    )
    .drop(columns=['duration'])
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   age         job marital  education default  balance housing loan  contact  \
0   58  management married   tertiary       no     2143     yes   no  unknown  
1   44  technician  single  secondary       no       29     yes   no  unknown  

  day month  duration  campaign  pdays  previous poutcome   y  
0   5   may       261         1     -1         0  unknown  no  
1   5   may       151         1     -1         0  unknown  no  


# 2. Define feature groups

In [3]:
# Column groups consumed by the preprocessing transformer and the X selection.
num_features = ["age", "balance", "day", "campaign", "pdays_numeric", "previous"]
cat_features = ["job", "marital", "education", "contact", "month", "poutcome"]
bin_features = ["default", "housing", "loan"]  # yes/no strings

# Map binary yes/no (and target y) → 0/1.
# The dtype guard makes this cell idempotent: without it, re-running the cell
# would feed the already-numeric 0/1 values through the yes/no dictionary and
# turn every entry into NaN (neither 0 nor 1 is a key of the mapping).
for col in bin_features + ["y"]:
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map({"yes": 1, "no": 0})

# 3. Train/Test split

In [4]:
# Hold out 30% of train.csv as an internal test set (the separate test.csv stays untouched).
# The split is stratified on the target and uses a fixed seed.
X, y = df[num_features + cat_features + bin_features], df["y"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1972,
    stratify=y,
    test_size=0.30,
)

# 4. Preprocessing transformer

In [5]:
# Column-wise preprocessing:
#   num -> standardised with StandardScaler
#   cat -> one-hot encoded as a dense array; categories unseen at fit time are ignored
#   bin -> passed through unchanged
# Any column not named in one of the three groups is dropped.
preprocessor = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('num', StandardScaler(), num_features),
        (
            'cat',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            cat_features,
        ),
        ('bin', 'passthrough', bin_features),
    ],
)

# 5. Model & Optuna Optimization

In [17]:
import numpy as np
import optuna
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    f1_score, roc_auc_score, accuracy_score, precision_recall_curve,
    classification_report, confusion_matrix
)
from xgboost import XGBClassifier

# Class imbalance measured on TRAIN only.
# minlength=2 guarantees two counts, so the unpacking below still works when
# y_train happens to contain no positive rows.
neg, pos = np.bincount(y_train, minlength=2)
# Negatives per positive; max(pos, 1) avoids a division by zero.
global_scale_pos_weight = neg / max(pos, 1)

def build_pipeline(trial):
    """Build an unfitted preprocessing + XGBoost pipeline for one Optuna trial.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float``; the search-space
        bounds are declared inline below. The notebook also passes
        ``study.best_trial`` here to rebuild the winning configuration.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps ``'preprocess'`` (a fresh clone of the module-level
        ``preprocessor``) and ``'xgb'`` (an ``XGBClassifier``).
    """
    # Local import so the function does not depend on another cell's imports.
    from sklearn.base import clone

    params = {
        "random_state": 1972,
        # CPU histogram algorithm. On XGBoost >= 2.0 add device="cuda" to run on
        # a GPU; the older 'gpu_hist' value is deprecated.
        "tree_method": "hist",
        "eval_metric": "auc",
        # Negatives-per-positive ratio computed from the training labels.
        "scale_pos_weight": global_scale_pos_weight,
        "n_estimators": trial.suggest_int("n_estimators", 300, 1200),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_float("min_child_weight", 1.0, 10.0),
        "gamma": trial.suggest_float("gamma", 0.0, 0.5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1.0, 10.0),
        "n_jobs": -1,
    }
    model = XGBClassifier(**params)
    # Clone the shared transformer: Pipeline.fit fits its steps in place, so
    # reusing the single module-level object would let a later fit of any other
    # pipeline silently overwrite the scaler/encoder state inside this one.
    return Pipeline(steps=[('preprocess', clone(preprocessor)), ('xgb', model)])

def _best_threshold_for_f1(y_true, y_prob):
    """Return the probability cut-off that maximises F1 on the given labels.

    F1 is computed at every point of the precision-recall curve. When the best
    point is the curve's final entry, which has no associated threshold,
    0.5 is returned instead.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    # The tiny constant keeps the denominator non-zero when precision and recall are both 0.
    f1_curve = 2 * (precision * recall) / (precision + recall + 1e-12)
    best = int(np.nanargmax(f1_curve))
    if best < len(thresholds):
        return float(thresholds[best])
    return 0.5

def _metrics_at_threshold(y_true, y_prob, thr):
    """Return ``(f1, accuracy, roc_auc)`` for scores cut at ``thr``.

    F1 and accuracy use the hard labels ``y_prob >= thr``; ROC-AUC is computed
    from the raw scores and therefore does not depend on ``thr``.
    """
    hard_labels = (y_prob >= thr).astype(int)
    f1 = f1_score(y_true, hard_labels)
    accuracy = accuracy_score(y_true, hard_labels)
    auc = roc_auc_score(y_true, y_prob)
    return (f1, accuracy, auc)

def objective(trial):
    """Optuna objective: mean F1 over 5 stratified folds of the training set.

    For every fold the pipeline is refit on the training part, the F1-optimal
    threshold is chosen on the validation part, and F1 / accuracy / ROC-AUC are
    measured on that same validation part. Mean accuracy and mean AUC are
    stored on the trial as user attributes; the mean F1 is returned.

    NOTE(review): the threshold is selected and scored on the same fold, so
    the returned F1 is an optimistic estimate.
    """
    pipeline = build_pipeline(trial)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1972)

    fold_scores = []  # one (f1, accuracy, auc) tuple per fold
    for fit_rows, holdout_rows in cv.split(X_train, y_train):
        pipeline.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])

        holdout_y = y_train.iloc[holdout_rows]
        holdout_prob = pipeline.predict_proba(X_train.iloc[holdout_rows])[:, 1]
        cutoff = _best_threshold_for_f1(holdout_y, holdout_prob)
        fold_scores.append(_metrics_at_threshold(holdout_y, holdout_prob, cutoff))

    # Transpose the per-fold tuples into per-metric columns and average each.
    mean_f1, mean_acc, mean_auc = (
        float(np.mean(column)) for column in zip(*fold_scores)
    )
    trial.set_user_attr("mean_accuracy", mean_acc)
    trial.set_user_attr("mean_auc", mean_auc)
    return mean_f1

# Seed the sampler so the sequence of suggested hyperparameters (and therefore
# the selected best trial) is repeatable across kernel restarts. TPESampler is
# the sampler Optuna uses by default, so only the seeding changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=1972),
)
study.optimize(objective, n_trials=30, show_progress_bar=True)

# Summary of the best trial: tuned parameters plus the CV metrics recorded by objective().
print("Best trial params:", study.best_trial.params)
print("CV-best F1 (opt objective):", study.best_value)
print("CV mean Accuracy (best):", study.best_trial.user_attrs.get("mean_accuracy"))
print("CV mean AUC-ROC (best):", study.best_trial.user_attrs.get("mean_auc"))

[I 2025-11-12 22:30:07,811] A new study created in memory with name: no-name-9e96b5e7-226c-43ea-a980-efc3734ab019


  0%|       | 0/30 [00:00<?, ?it/s]

[I 2025-11-12 22:30:18,077] Trial 0 finished with value: 0.4818050146996572 and parameters: {'n_estimators': 818, 'max_depth': 3, 'learning_rate': 0.1895460717282038, 'subsample': 0.7760959058602337, 'colsample_bytree': 0.6278730149530939, 'min_child_weight': 9.000643492998568, 'gamma': 0.30421756311597986, 'reg_alpha': 0.11935239492663985, 'reg_lambda': 9.54392347558503}. Best is trial 0 with value: 0.4818050146996572.
[I 2025-11-12 22:30:24,816] Trial 1 finished with value: 0.4958607828404954 and parameters: {'n_estimators': 626, 'max_depth': 3, 'learning_rate': 0.026404217551412566, 'subsample': 0.7718061062174465, 'colsample_bytree': 0.7107697513702507, 'min_child_weight': 9.142534192022461, 'gamma': 0.4794151435532946, 'reg_alpha': 0.40939010059350045, 'reg_lambda': 1.598683736273029}. Best is trial 1 with value: 0.4958607828404954.
[I 2025-11-12 22:30:38,064] Trial 2 finished with value: 0.4981249994972307 and parameters: {'n_estimators': 849, 'max_depth': 5, 'learning_rate': 0.0

# 6. Refit best pipeline + internal test evaluation

In [18]:
# Rebuild an unfitted pipeline from the hyperparameters of the best Optuna trial.
best_pipe = build_pipeline(study.best_trial)

# small val split to tune threshold
# (a single stratified 15% slice of the training set, fixed seed)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=1972)
(tr_idx, va_idx) = next(sss.split(X_train, y_train))
Xtr, Xva = X_train.iloc[tr_idx], X_train.iloc[va_idx]
ytr, yva = y_train.iloc[tr_idx], y_train.iloc[va_idx]

# NOTE(review): the model is fit on the remaining 85% of the training rows only
# and is not refit on all of X_train afterwards -- confirm this is intended.
best_pipe.fit(Xtr, ytr)

# Choose the probability cut-off that maximises F1 on the validation slice.
val_prob = best_pipe.predict_proba(Xva)[:, 1]
best_thr = _best_threshold_for_f1(yva, val_prob)

# Internal 30% test metrics
# (hard labels use the threshold tuned above; AUC uses the raw probabilities)
test_prob = best_pipe.predict_proba(X_test)[:, 1]
test_pred = (test_prob >= best_thr).astype(int)

print("\n=== Internal 30% Test (Optuna-tuned, thr tuned on val) ===")
print("F1:      {:.4f}".format(f1_score(y_test, test_pred)))
print("Accuracy: {:.4f}".format(accuracy_score(y_test, test_pred)))
print("AUC-ROC:  {:.4f}".format(roc_auc_score(y_test, test_prob)))
print("\nClassification report:\n", classification_report(y_test, test_pred, digits=4))


=== Internal 30% Test (Optuna-tuned, thr tuned on val) ===
F1:        0.4822
Accuracy: 0.8669
AUC-ROC:  0.8001

Classification report:
              precision    recall  f1-score   support

           0     0.9360    0.9115    0.9236     11977
           1     0.4424    0.5299    0.4822      1587

    accuracy                         0.8669     13564
   macro avg     0.6892    0.7207    0.7029     13564
weighted avg     0.8783    0.8669    0.8720     13564



# 7. External final test

In [19]:
def prepare_external_minimal(df_ext: pd.DataFrame, binary_cols=None) -> pd.DataFrame:
    """Apply the training-time cleaning steps to an external data frame.

    Works on a copy, so ``df_ext`` itself is never modified.

    Steps:
      * strip and lower-case the column names,
      * drop ``duration`` when present,
      * derive ``contacted_before`` (1 when ``pdays`` is not -1, else 0) and
        ``pdays_numeric`` (``pdays`` with -1 replaced by 0),
      * map ``"yes"``/``"no"`` to 1/0 in the binary columns.

    Parameters
    ----------
    df_ext : pd.DataFrame
        Raw frame; must contain a ``pdays`` column after header normalisation.
    binary_cols : iterable of str, optional
        Columns holding yes/no strings. Defaults to the notebook-level
        ``bin_features`` plus the target ``"y"``. Names missing from the frame
        are skipped.

    Returns
    -------
    pd.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If the frame has no ``pdays`` column.
    """
    if binary_cols is None:
        binary_cols = bin_features + ["y"]

    dfx = df_ext.copy()
    dfx.columns = dfx.columns.str.strip().str.lower()
    if "duration" in dfx.columns:
        dfx = dfx.drop(columns=["duration"])
    dfx["contacted_before"] = (dfx["pdays"] != -1).astype(int)
    dfx["pdays_numeric"] = dfx["pdays"].replace(-1, 0)
    for col in binary_cols:
        # Skip columns that are already numeric: mapping 0/1 through the yes/no
        # dictionary would turn every value into NaN, which made the function
        # unsafe to call on an already-prepared frame.
        if col in dfx.columns and not pd.api.types.is_numeric_dtype(dfx[col]):
            dfx[col] = dfx[col].map({"yes": 1, "no": 0})
    return dfx

# Load external validation file
# (same CSV dialect as the training file: ';' separator, '"' quote character)
file_path = '/content/drive/MyDrive/test.csv'
df_ext = pd.read_csv(file_path, sep=';', quotechar='"', skipinitialspace=True)
df_ext = prepare_external_minimal(df_ext)

# Select the same feature columns, in the same order, that the pipeline was given for training.
X_ext = df_ext[num_features + cat_features + bin_features]
y_ext = df_ext['y'].astype(int)

# Score with the already-fitted pipeline and reuse the threshold tuned on the
# train-validation slice; nothing is refit or retuned on the external data.
ext_prob = best_pipe.predict_proba(X_ext)[:, 1]
ext_pred = (ext_prob >= best_thr).astype(int)

print("\n=== External Final Test (threshold from train-val) ===")
print("F1:      {:.4f}".format(f1_score(y_ext, ext_pred)))
print("Accuracy: {:.4f}".format(accuracy_score(y_ext, ext_pred)))
print("AUC-ROC:  {:.4f}".format(roc_auc_score(y_ext, ext_prob)))
print("\nClassification report:\n", classification_report(y_ext, ext_pred, digits=4))


=== External Final Test (threshold from train-val) ===
F1:        0.4820
Accuracy: 0.8693
AUC-ROC:  0.8151

Classification report:
              precision    recall  f1-score   support

           0     0.9369    0.9137    0.9252      4000
           1     0.4435    0.5278    0.4820       521

    accuracy                         0.8693      4521
   macro avg     0.6902    0.7208    0.7036      4521
weighted avg     0.8801    0.8693    0.8741      4521

