In [1]:
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index
from sklearn.metrics import classification_report

import optuna
from optuna.samplers import TPESampler
from params_manager import save_params

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from params_manager import INTERNAL_PATH

seed = 42

## Load features

In [4]:
t = pd.read_csv(INTERNAL_PATH / 'train.csv')
t = t[t['outlier'] == 0].drop('outlier', axis=1)

X = t.drop('target', axis=1)
y = t['target']

cat_col = X.select_dtypes(include=['object']).columns.tolist()
y.shape, X.shape

((63636,), (63636, 167))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X.drop(cat_col, axis=1), y,
                                                    test_size=0.2, random_state=seed, stratify=y)

## Hyperparameter Optimization with Optuna

In [6]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 600, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'gamma' : trial.suggest_float('gamma', 1e-9, 0.5),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 1, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 100.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 100.0, log=True),
    }
    params['booster'] = 'gbtree'
    params['grow_policy'] = 'depthwise'
    params['objective'] = 'binary:logistic'
    params["tree_method"] = 'hist'
    params["device"] = 'cuda'
    params["verbosity"] = 0
    
    xgb = XGBClassifier(**params)
    xgb.fit(X_train, y_train)
    
    return concordance_index(y_test, xgb.predict_proba(X_test)[:, 1])

sqlite_db = "sqlite:///xgb_sqlite.db"
study_name = "binary_classification_XGBoost"
study = optuna.create_study(storage=sqlite_db, study_name=study_name, 
                            sampler=TPESampler(n_startup_trials=50, multivariate=True, seed=142),
                            direction="maximize", load_if_exists=True)

study.optimize(objective, n_trials=10)

[I 2024-05-20 20:08:52,136] A new study created in RDB with name: binary_classification_XGBoost
[I 2024-05-20 20:09:17,266] Trial 0 finished with value: 0.7597951268472334 and parameters: {'n_estimators': 4569, 'learning_rate': 0.03612497338391456, 'gamma': 0.3279923530972189, 'subsample': 0.8492242714949125, 'colsample_bytree': 0.27989576854581266, 'max_depth': 4, 'min_child_weight': 78, 'reg_lambda': 5.690809841819152e-05, 'reg_alpha': 1.7302327962557243e-05}. Best is trial 0 with value: 0.7597951268472334.
[I 2024-05-20 20:09:40,850] Trial 1 finished with value: 0.7490717136201045 and parameters: {'n_estimators': 4048, 'learning_rate': 0.06102834897771546, 'gamma': 0.08140430390004257, 'subsample': 0.20360199864761197, 'colsample_bytree': 0.3992693940732932, 'max_depth': 19, 'min_child_weight': 88, 'reg_lambda': 0.005802958114041871, 'reg_alpha': 4.6136314033887355e-05}. Best is trial 0 with value: 0.7597951268472334.
[I 2024-05-20 20:09:56,075] Trial 2 finished with value: 0.769450

In [7]:
print(f"best optimized roc_auc: {study.best_value:0.5f}")

params = study.best_params
params['booster'] = 'gbtree'
params['grow_policy'] = 'depthwise'
params['objective'] = 'binary:logistic'
params["tree_method"] = 'hist'
params["device"] = 'cuda'
params["verbosity"] = 0

params

best optimized roc_auc: 0.77645


{'n_estimators': 612,
 'learning_rate': 0.010782989364623309,
 'gamma': 0.057931026029600444,
 'subsample': 0.8630466550021311,
 'colsample_bytree': 0.4428827911553296,
 'max_depth': 16,
 'min_child_weight': 47,
 'reg_lambda': 4.566774169180873e-06,
 'reg_alpha': 4.9375122547368406e-05,
 'booster': 'gbtree',
 'grow_policy': 'depthwise',
 'objective': 'binary:logistic',
 'tree_method': 'hist',
 'device': 'cuda',
 'verbosity': 0}

In [8]:
save_params(params, 'xbg')

In [9]:
model = XGBClassifier(**params)
model.fit(X_train, y_train)

In [10]:
print(f'Concordance Index: {concordance_index(y_test, model.predict_proba(X_test)[:, 1])}')

Concordance Index: 0.7764508107224112


In [11]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     11658
           1       0.66      0.11      0.19      1070

    accuracy                           0.92     12728
   macro avg       0.79      0.55      0.57     12728
weighted avg       0.90      0.92      0.89     12728
