In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from optuna import Trial, create_study
from sklearn.model_selection import train_test_split
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
labeled_data_path = "https://www.nbi.dk/~koskinen/Teaching/AdvancedMethodsInAppliedStatistics2016/data/breast-cancer-wisconsin_train-test.txt"
data_labeled = pd.read_csv(
    labeled_data_path,
    delimiter="\s+",
    names=[
        "ID",
        "Clump Thickness",
        "Uniformity of Cell Size",
        "Uniformity of Cell Shape",
        "Marginal Adhesion",
        "Single Epithelial Cell Size",
        "Bare Nuclei",
        "Bland Chromatin",
        "Normal Nucleoli",
        "Mitoses",
        "Class",
    ]
)
data_labeled["Class"] = data_labeled["Class"].replace({2: 0, 4: 1})

In [3]:
class XGBObjective:
    def __init__(self, X_train, X_test, y_train, y_test, objective):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def __call__(self, trial: Trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "learning_rate": trial.suggest_float("learning_rate", 0, 1),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 20),
        }
        bst = XGBClassifier(**params)
        bst.fit(self.X_train, self.y_train)
        bst_pred = bst.predict(self.X_test)
        score = accuracy_score(self.y_test, bst_pred)
        return score

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data_labeled.iloc[:, :-1], data_labeled.iloc[:, -1], test_size=0.5)
objective = XGBObjective(X_train, X_test, y_train, y_test)
study = create_study(direction = "maximize")
study.optimize(objective, n_trials = 100)
trial = study.best_trial
optimal_params = trial.params

[I 2024-03-09 13:17:18,458] A new study created in memory with name: no-name-c44a715d-d87d-439b-865a-d0e33005e52d


[I 2024-03-09 13:17:18,670] Trial 0 finished with value: 0.95 and parameters: {'n_estimators': 530, 'learning_rate': 0.6693627951856326, 'max_depth': 5, 'lambda': 0.0008144732177082368, 'alpha': 0.00796809696945198, 'subsample': 0.689949826003817, 'colsample_bytree': 0.682728057424874, 'gamma': 11.245316822906561}. Best is trial 0 with value: 0.95.
[I 2024-03-09 13:17:18,770] Trial 1 finished with value: 0.93 and parameters: {'n_estimators': 332, 'learning_rate': 0.520262702167543, 'max_depth': 9, 'lambda': 3.2268498160882096e-05, 'alpha': 0.03318166839682001, 'subsample': 0.5884292241587352, 'colsample_bytree': 0.7389465902788817, 'gamma': 15.843821737247074}. Best is trial 0 with value: 0.95.
[I 2024-03-09 13:17:18,955] Trial 2 finished with value: 0.99 and parameters: {'n_estimators': 858, 'learning_rate': 0.03425111350659127, 'max_depth': 9, 'lambda': 1.154790854266402e-06, 'alpha': 1.615194674186914e-08, 'subsample': 0.3103984063791833, 'colsample_bytree': 0.20852780611718985, 'ga

In [5]:
pd.DataFrame(optimal_params, index=[0])

Unnamed: 0,n_estimators,learning_rate,max_depth,lambda,alpha,subsample,colsample_bytree,gamma
0,858,0.034251,9,1e-06,1.615195e-08,0.310398,0.208528,9.04558


In [6]:
bst = XGBClassifier(**optimal_params)
bst.fit(X_train, y_train)
prob_train = bst.predict_proba(X_train)[:, 1]
prob_test = bst.predict_proba(X_test)[:, 1]
pred_train = bst.predict(X_train)
pred_test = bst.predict(X_test)
auc_train = roc_auc_score(y_train, prob_train)
auc_test = roc_auc_score(y_test, prob_test)
print(f"\nROC AUC training sample: {auc_train:.3f}\nROC AUC test sample: {auc_test:.3f}")


ROC AUC training sample: 0.996
ROC AUC test sample: 0.989


In [7]:
TPR_train = (pred_train == y_train.values).sum() / len(y_train)
TPR_test = (pred_test == y_test.values).sum() / len(y_test)
print(f"\nTPR training sample: {TPR_train:.3f}\nTPR test sample: {TPR_test:.3f}")


TPR training sample: 0.980
TPR test sample: 0.990


### Re-doing exercise 2 using Optuna
Let us try to re-do exercise 2 that we tuned using a GridSearch but now using the new hyper-parameter tuning in Optuna.

In [8]:
data_path_ex2 = "https://www.nbi.dk/~koskinen/Teaching/data/BDT_16var.txt"
df_ex2 = pd.read_csv(
    data_path_ex2,
    delimiter="\s+",
    names=[
        "var1",
        "var2",
        "var3",
        "var4",
        "var5",
        "var6",
        "var7",
        "var8",
        "var9",
        "var10",
        "var11",
        "var12",
        "var13",
        "var14",
        "var15",
        "var16",
    ],
    index_col=0,
    engine="python",
)
ex2_sig = df_ex2.iloc[::2]
ex2_bg = df_ex2.iloc[1::2]
ex2_bg_sig = pd.concat((ex2_bg, ex2_sig)).reset_index(drop=True)
ex2_y = np.append(np.zeros(len(ex2_bg)), np.ones(len(ex2_sig)))
X_train_ex2, X_test_ex2, y_train_ex2, y_test_ex2 = train_test_split(
    ex2_bg_sig, ex2_y, test_size=0.4
)

In [9]:
objective_ex2 = XGBObjective(X_train_ex2, X_test_ex2, y_train_ex2, y_test_ex2)
study_ex2 = create_study(direction = "maximize")
study_ex2.optimize(objective_ex2, n_trials = 100)
trial_ex2 = study_ex2.best_trial
optimal_params_ex2 = trial_ex2.params

[I 2024-03-09 13:17:34,716] A new study created in memory with name: no-name-ef5349cc-f776-4bb2-9b74-27e3fb605111
[I 2024-03-09 13:17:34,844] Trial 0 finished with value: 0.8665 and parameters: {'n_estimators': 191, 'learning_rate': 0.8847264828046679, 'max_depth': 5, 'lambda': 5.713768169811667e-07, 'alpha': 2.0211762610872702e-05, 'subsample': 0.9647418205171212, 'colsample_bytree': 0.2781414246593378, 'gamma': 8.288710816354515}. Best is trial 0 with value: 0.8665.
[I 2024-03-09 13:17:35,193] Trial 1 finished with value: 0.868 and parameters: {'n_estimators': 685, 'learning_rate': 0.10636886447304938, 'max_depth': 4, 'lambda': 3.817066143582005e-06, 'alpha': 0.053278081601770555, 'subsample': 0.9661212501636769, 'colsample_bytree': 0.9164527351301437, 'gamma': 16.918804906468775}. Best is trial 1 with value: 0.868.
[I 2024-03-09 13:17:35,585] Trial 2 finished with value: 0.87175 and parameters: {'n_estimators': 698, 'learning_rate': 0.6197153430239828, 'max_depth': 8, 'lambda': 0.00

In [10]:
bst_ex2 = XGBClassifier(**optimal_params_ex2)
bst_ex2.fit(X_train_ex2, y_train_ex2)
prob_train_ex2 = bst_ex2.predict_proba(X_train_ex2)[:, 1]
prob_test_ex2 = bst_ex2.predict_proba(X_test_ex2)[:, 1]
pred_train_ex2 = bst_ex2.predict(X_train_ex2)
pred_test_ex2 = bst_ex2.predict(X_test_ex2)
auc_train_ex2 = roc_auc_score(y_train_ex2, prob_train_ex2)
auc_test_ex2 = roc_auc_score(y_test_ex2, prob_test_ex2)
print(f"\nROC AUC training sample: {auc_train_ex2:.3f}\nROC AUC test sample: {auc_test_ex2:.3f}")


ROC AUC training sample: 0.985
ROC AUC test sample: 0.950


In [11]:
TPR_train_ex2 = (pred_train_ex2 == y_train_ex2).sum() / len(y_train_ex2)
TPR_test_ex2 = (pred_test_ex2 == y_test_ex2).sum() / len(y_test_ex2)
print(f"\nTPR training sample: {TPR_train_ex2:.3f}\nTPR test sample: {TPR_test_ex2:.3f}")


TPR training sample: 0.934
TPR test sample: 0.881
