# XGBoost Evaluation

## 1. Generate Random Data

In [1]:
import sys
sys.path.append('../../src')

import pandas as pd
import numpy as np
import random
from random_data_generator import random_data_generator

#tuning
from tuning import find_best_model

#models
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

#metrics
from sklearn.metrics import roc_auc_score

# Seed Python's built-in `random` module so draws made through it repeat across runs.
# NOTE(review): NumPy's generator is not seeded here; if random_data_generator
# draws from NumPy, the generated data will differ between runs — confirm.
random.seed(42)

In [2]:
# Generated data consumed by the evaluation loop below, which reads six items
# from each entry: train, validation and test features/labels.
# NOTE(review): the 5 presumably sets how many datasets are generated — confirm
# against random_data_generator.
dataset = random_data_generator("binary", 5)


## 2. XGBoost

In [3]:
# NOTE(review): this search grid is not passed to find_best_model or to
# XGBClassifier anywhere in this cell. The logged trials also sample
# `min_child_weight` and `max_depth` values of 8, which this grid does not
# cover — confirm whether the grid is still needed or should be wired in.
xgboost_params = {
    'learning_rate': np.arange(0.001, 0.1, 0.005),
    'max_depth': np.arange(2, 8),
    'n_estimators': np.arange(50, 150, 10),
    'subsample': np.arange(0.3, 0.9, 0.1),
    'colsample_bytree': np.arange(0.6, 1.0, 0.05),
    'gamma': np.arange(0.1, 5, 0.1),
    'early_stopping_rounds': np.arange(5, 15, 5),
    'eval_metric':['auc']
}

# For every generated dataset: tune on train/validation, refit with the best
# parameters, and record the ROC AUC obtained on the held-out test split.
scores = []
for split in dataset:
    # The first six items of an entry are the train, validation and test
    # features/labels, in that order.
    x_train, y_train, x_val, y_val, x_test, y_test = split[:6]

    # Hyperparameter search (25 trials); the result exposes the chosen
    # parameters through its `.params` attribute.
    best_params = find_best_model("xgb", x_train, y_train, x_val, y_val, trials=25)

    xgb_clf = XGBClassifier(**best_params.params, random_state=42)
    xgb_clf.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)

    # roc_auc_score needs a score per sample, so use the second probability
    # column rather than hard class predictions.
    y_pred = xgb_clf.predict_proba(x_test)
    scores.append(roc_auc_score(y_test, y_pred[:, 1]))

# Average test AUC across all datasets.
mean = np.mean(scores)
print(scores)
print(mean)

[I 2024-03-13 13:47:45,079] A new study created in memory with name: no-name-7ec6b02e-6c6e-4916-9383-746a4d1e234a


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2024-03-13 13:49:12,416] Trial 0 finished with value: 0.7130515470021098 and parameters: {'n_estimators': 79, 'learning_rate': 0.05414611899791147, 'max_depth': 4, 'subsample': 0.3606311665564293, 'colsample_bytree': 0.7167051793932449, 'min_child_weight': 3, 'gamma': 2.119711757924736}. Best is trial 0 with value: 0.7130515470021098.
[I 2024-03-13 13:49:48,621] Trial 1 finished with value: 0.7152228129077933 and parameters: {'n_estimators': 57, 'learning_rate': 0.09554567839601556, 'max_depth': 2, 'subsample': 0.7760409034403136, 'colsample_bytree': 0.7699128621610842, 'min_child_weight': 12, 'gamma': 4.07594920759284}. Best is trial 1 with value: 0.7152228129077933.
[I 2024-03-13 13:50:45,479] Trial 2 finished with value: 0.7128139172268446 and parameters: {'n_estimators': 94, 'learning_rate': 0.05837445051745511, 'max_depth': 2, 'subsample': 0.6356596026175128, 'colsample_bytree': 0.6434415415322129, 'min_child_weight': 1, 'gamma': 2.8802255900533935}. Best is trial 1 with value:

[I 2024-03-13 15:05:37,426] A new study created in memory with name: no-name-b952db0a-5033-435b-9d4c-33410f6844ab


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2024-03-13 15:07:31,054] Trial 0 finished with value: 0.5875386969371073 and parameters: {'n_estimators': 59, 'learning_rate': 0.01641020015734228, 'max_depth': 5, 'subsample': 0.42999619018823754, 'colsample_bytree': 0.9342389987684656, 'min_child_weight': 1, 'gamma': 3.2155316158057574}. Best is trial 0 with value: 0.5875386969371073.
[I 2024-03-13 15:11:42,002] Trial 1 finished with value: 0.5918342489587132 and parameters: {'n_estimators': 87, 'learning_rate': 0.044196221600772315, 'max_depth': 8, 'subsample': 0.5794242750038319, 'colsample_bytree': 0.954518026532054, 'min_child_weight': 19, 'gamma': 1.0522031913255014}. Best is trial 1 with value: 0.5918342489587132.
[I 2024-03-13 15:15:49,248] Trial 2 finished with value: 0.5967664437017454 and parameters: {'n_estimators': 86, 'learning_rate': 0.08228583787528365, 'max_depth': 8, 'subsample': 0.47918348031698926, 'colsample_bytree': 0.7200577634537271, 'min_child_weight': 7, 'gamma': 0.5003149757892517}. Best is trial 2 with v

[I 2024-03-13 17:08:00,840] A new study created in memory with name: no-name-6f3daa29-49d1-435c-9a4b-f18978379d1b


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2024-03-13 17:09:27,740] Trial 0 finished with value: 0.752365124002296 and parameters: {'n_estimators': 85, 'learning_rate': 0.0937837886178058, 'max_depth': 2, 'subsample': 0.7823335893166139, 'colsample_bytree': 0.7580124949809035, 'min_child_weight': 11, 'gamma': 0.8632422313453947}. Best is trial 0 with value: 0.752365124002296.
[I 2024-03-13 17:15:36,522] Trial 1 finished with value: 0.7707545446487836 and parameters: {'n_estimators': 116, 'learning_rate': 0.05632352874104713, 'max_depth': 8, 'subsample': 0.7090844903925562, 'colsample_bytree': 0.7937782769499305, 'min_child_weight': 7, 'gamma': 3.5640467262831628}. Best is trial 1 with value: 0.7707545446487836.
[I 2024-03-13 17:16:57,416] Trial 2 finished with value: 0.7559226816876181 and parameters: {'n_estimators': 65, 'learning_rate': 0.08821782874176858, 'max_depth': 3, 'subsample': 0.891583494725662, 'colsample_bytree': 0.6795492981502846, 'min_child_weight': 14, 'gamma': 2.9539123312072184}. Best is trial 1 with value

[I 2024-03-13 18:34:51,604] A new study created in memory with name: no-name-c867c841-dcef-4333-bfa6-9ad1106ff892


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2024-03-13 18:38:12,690] Trial 0 finished with value: 0.7275495366174731 and parameters: {'n_estimators': 141, 'learning_rate': 0.05275857661735221, 'max_depth': 3, 'subsample': 0.6513999515532688, 'colsample_bytree': 0.9431866322936762, 'min_child_weight': 5, 'gamma': 2.2537495436000814}. Best is trial 0 with value: 0.7275495366174731.
[I 2024-03-13 18:39:24,103] Trial 1 finished with value: 0.7267690658416364 and parameters: {'n_estimators': 66, 'learning_rate': 0.07932520589005021, 'max_depth': 2, 'subsample': 0.4940398592495343, 'colsample_bytree': 0.9902162474128013, 'min_child_weight': 11, 'gamma': 1.8490224191932985}. Best is trial 0 with value: 0.7275495366174731.
[I 2024-03-13 18:44:09,376] Trial 2 finished with value: 0.7221339830174635 and parameters: {'n_estimators': 83, 'learning_rate': 0.056226239033984944, 'max_depth': 8, 'subsample': 0.8297046447330148, 'colsample_bytree': 0.9680896448169379, 'min_child_weight': 18, 'gamma': 4.347575810699177}. Best is trial 0 with v

[I 2024-03-13 20:31:41,567] A new study created in memory with name: no-name-ba592121-0b89-48a5-ac22-b73281813d88


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2024-03-13 20:35:15,048] Trial 0 finished with value: 0.7964121071574641 and parameters: {'n_estimators': 126, 'learning_rate': 0.03565682330912511, 'max_depth': 4, 'subsample': 0.8650021930414857, 'colsample_bytree': 0.6865402592159283, 'min_child_weight': 11, 'gamma': 1.7243943045342407}. Best is trial 0 with value: 0.7964121071574641.
[I 2024-03-13 20:38:18,073] Trial 1 finished with value: 0.7896907590057096 and parameters: {'n_estimators': 73, 'learning_rate': 0.08220794477543913, 'max_depth': 7, 'subsample': 0.468156611539696, 'colsample_bytree': 0.9623046543344115, 'min_child_weight': 14, 'gamma': 4.559711985500037}. Best is trial 0 with value: 0.7964121071574641.
[I 2024-03-13 20:41:03,757] Trial 2 finished with value: 0.7879269977975951 and parameters: {'n_estimators': 75, 'learning_rate': 0.09269499352795477, 'max_depth': 6, 'subsample': 0.640551843609513, 'colsample_bytree': 0.9973109556885217, 'min_child_weight': 9, 'gamma': 0.4007201244584918}. Best is trial 0 with valu

In [4]:
# Show the per-dataset test AUCs, then their average, one per line.
print(scores, mean, sep="\n")

[0.8105024414129285, 0.8122161479442824, 0.7041972517674223, 0.7790856457886611, 0.717678612206739]
0.7647360198240066
