In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from optuna import Trial, create_study
from sklearn.model_selection import train_test_split
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
labeled_data_path = "https://www.nbi.dk/~koskinen/Teaching/AdvancedMethodsInAppliedStatistics2016/data/breast-cancer-wisconsin_train-test.txt"
data_labeled = pd.read_csv(
    labeled_data_path,
    delimiter="\s+",
    names=[
        "ID",
        "Clump Thickness",
        "Uniformity of Cell Size",
        "Uniformity of Cell Shape",
        "Marginal Adhesion",
        "Single Epithelial Cell Size",
        "Bare Nuclei",
        "Bland Chromatin",
        "Normal Nucleoli",
        "Mitoses",
        "Class",
    ]
)
data_labeled["Class"] = data_labeled["Class"].replace({2: 0, 4: 1})

In [3]:
class XGBObjective:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def __call__(self, trial: Trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "learning_rate": trial.suggest_float("learning_rate", 0, 1),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 20),
        }
        bst = XGBClassifier(**params)
        bst.fit(self.X_train, self.y_train)
        bst_pred = bst.predict(self.X_test)
        score = accuracy_score(self.y_test, bst_pred)
        return score

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data_labeled.iloc[:, :-1], data_labeled.iloc[:, -1], test_size=0.5)
objective = XGBObjective(X_train, X_test, y_train, y_test)
study = create_study(direction = "maximize")
study.optimize(objective, n_trials = 100)
trial = study.best_trial
optimal_params = trial.params

[I 2024-03-09 13:56:36,143] A new study created in memory with name: no-name-1c4ad091-cb60-43d6-b25e-c8ab588757c4
[I 2024-03-09 13:56:36,571] Trial 0 finished with value: 0.96 and parameters: {'n_estimators': 639, 'learning_rate': 0.46330134242887433, 'max_depth': 9, 'lambda': 0.00016473169870055845, 'alpha': 0.41919535904727684, 'subsample': 0.5002750208536795, 'colsample_bytree': 0.6727276948069671, 'gamma': 18.405412928792742}. Best is trial 0 with value: 0.96.
[I 2024-03-09 13:56:36,681] Trial 1 finished with value: 0.96 and parameters: {'n_estimators': 465, 'learning_rate': 0.910403719738179, 'max_depth': 4, 'lambda': 2.698033308232869e-07, 'alpha': 3.974471701940834e-05, 'subsample': 0.9785101089858428, 'colsample_bytree': 0.558391468534692, 'gamma': 19.973221456185787}. Best is trial 0 with value: 0.96.
[I 2024-03-09 13:56:36,787] Trial 2 finished with value: 0.96 and parameters: {'n_estimators': 140, 'learning_rate': 0.8063159306167302, 'max_depth': 5, 'lambda': 5.7995517025936

In [5]:
pd.DataFrame(optimal_params, index=[0])

Unnamed: 0,n_estimators,learning_rate,max_depth,lambda,alpha,subsample,colsample_bytree,gamma
0,201,0.383735,7,3e-06,1e-06,0.370748,0.779714,2.506857


In [6]:
bst = XGBClassifier(**optimal_params)
bst.fit(X_train, y_train)
prob_train = bst.predict_proba(X_train)[:, 1]
prob_test = bst.predict_proba(X_test)[:, 1]
pred_train = bst.predict(X_train)
pred_test = bst.predict(X_test)
auc_train = roc_auc_score(y_train, prob_train)
auc_test = roc_auc_score(y_test, prob_test)
print(f"\nROC AUC training sample: {auc_train:.3f}\nROC AUC test sample: {auc_test:.3f}")


ROC AUC training sample: 0.997
ROC AUC test sample: 0.988


In [7]:
TPR_train = (pred_train == y_train.values).sum() / len(y_train)
TPR_test = (pred_test == y_test.values).sum() / len(y_test)
print(f"\nTPR training sample: {TPR_train:.3f}\nTPR test sample: {TPR_test:.3f}")


TPR training sample: 0.970
TPR test sample: 0.990


### Re-doing exercise 2 using Optuna
Let us try to re-do exercise 2 that we tuned using a GridSearch but now using the new hyper-parameter tuning in Optuna.

In [8]:
data_path_ex2 = "https://www.nbi.dk/~koskinen/Teaching/data/BDT_16var.txt"
df_ex2 = pd.read_csv(
    data_path_ex2,
    delimiter="\s+",
    names=[
        "var1",
        "var2",
        "var3",
        "var4",
        "var5",
        "var6",
        "var7",
        "var8",
        "var9",
        "var10",
        "var11",
        "var12",
        "var13",
        "var14",
        "var15",
        "var16",
    ],
    index_col=0,
    engine="python",
)
ex2_sig = df_ex2.iloc[::2]
ex2_bg = df_ex2.iloc[1::2]
ex2_bg_sig = pd.concat((ex2_bg, ex2_sig)).reset_index(drop=True)
ex2_y = np.append(np.zeros(len(ex2_bg)), np.ones(len(ex2_sig)))
X_train_ex2, X_test_ex2, y_train_ex2, y_test_ex2 = train_test_split(
    ex2_bg_sig, ex2_y, test_size=0.4
)

In [9]:
objective_ex2 = XGBObjective(X_train_ex2, X_test_ex2, y_train_ex2, y_test_ex2)
study_ex2 = create_study(direction = "maximize")
study_ex2.optimize(objective_ex2, n_trials = 100)
trial_ex2 = study_ex2.best_trial
optimal_params_ex2 = trial_ex2.params

[I 2024-03-09 13:56:49,896] A new study created in memory with name: no-name-8251df50-de9e-479a-83c7-edfa36ce9c92
[I 2024-03-09 13:56:50,306] Trial 0 finished with value: 0.8515 and parameters: {'n_estimators': 870, 'learning_rate': 0.7568343376352018, 'max_depth': 3, 'lambda': 8.75908632954778e-05, 'alpha': 5.199208932713126e-07, 'subsample': 0.4846770986875402, 'colsample_bytree': 0.7101940741757997, 'gamma': 1.9472533048377416}. Best is trial 0 with value: 0.8515.
[I 2024-03-09 13:56:50,821] Trial 1 finished with value: 0.8695 and parameters: {'n_estimators': 638, 'learning_rate': 0.19586393093657462, 'max_depth': 10, 'lambda': 0.00010279290157210648, 'alpha': 1.218106914858315e-06, 'subsample': 0.7709654986129753, 'colsample_bytree': 0.26000708018813123, 'gamma': 0.4675222749520147}. Best is trial 1 with value: 0.8695.
[I 2024-03-09 13:56:50,903] Trial 2 finished with value: 0.874 and parameters: {'n_estimators': 267, 'learning_rate': 0.6531449985549557, 'max_depth': 3, 'lambda': 3

In [10]:
bst_ex2 = XGBClassifier(**optimal_params_ex2)
bst_ex2.fit(X_train_ex2, y_train_ex2)
prob_train_ex2 = bst_ex2.predict_proba(X_train_ex2)[:, 1]
prob_test_ex2 = bst_ex2.predict_proba(X_test_ex2)[:, 1]
pred_train_ex2 = bst_ex2.predict(X_train_ex2)
pred_test_ex2 = bst_ex2.predict(X_test_ex2)
auc_train_ex2 = roc_auc_score(y_train_ex2, prob_train_ex2)
auc_test_ex2 = roc_auc_score(y_test_ex2, prob_test_ex2)
print(f"\nROC AUC training sample: {auc_train_ex2:.3f}\nROC AUC test sample: {auc_test_ex2:.3f}")


ROC AUC training sample: 0.996
ROC AUC test sample: 0.958


In [11]:
TPR_train_ex2 = (pred_train_ex2 == y_train_ex2).sum() / len(y_train_ex2)
TPR_test_ex2 = (pred_test_ex2 == y_test_ex2).sum() / len(y_test_ex2)
print(f"\nTPR training sample: {TPR_train_ex2:.3f}\nTPR test sample: {TPR_test_ex2:.3f}")


TPR training sample: 0.964
TPR test sample: 0.890
