In [1]:
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index
from sklearn.metrics import classification_report

import optuna
from optuna.samplers import TPESampler
from params_manager import save_params

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from params_manager import INTERNAL_PATH

# Single random seed, reused by the train/test split and by the Optuna sampler.
seed = 142

## Load features

In [4]:
# Read the training table, keep only rows whose `outlier` flag is 0,
# then drop the flag column.
t = pd.read_csv(INTERNAL_PATH / 'train.csv')
t = t.loc[t['outlier'] == 0].drop(columns='outlier')

# Split the label column from the feature matrix.
y = t['target']
X = t.drop(columns='target')

# Names of the object-dtype feature columns (removed before the split below).
cat_col = list(X.select_dtypes(include='object').columns)
y.shape, X.shape

((63636,), (63636, 167))

In [5]:
# Model inputs: every feature column except the object-dtype ones.
X_numeric = X.drop(columns=cat_col)

# Stratified hold-out split (20% test), made repeatable through `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)

## Hyperparameter Optimization with Optuna

In [6]:
# Remember the current Optuna log level, then lower it to WARNING so that
# messages below that level are not printed while the search runs.
old_verbosity = optuna.logging.get_verbosity()
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial):
    """Optuna objective: fit one XGBoost classifier and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the tunable hyperparameters.

    Returns
    -------
    float
        Concordance index between ``y_test`` and the second column of
        ``predict_proba(X_test)``; the study maximizes this value.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.

    NOTE(review): the score is computed on the same hold-out split that the
    later evaluation cells report on, so the final metric is likely
    optimistic. Consider a separate validation split (and a fresh study)
    if an unbiased estimate is needed.
    """
    # Tunable part. Suggestions are drawn in this exact order on purpose:
    # changing the order would change what a seeded sampler produces.
    search_space = dict(
        n_estimators=trial.suggest_int('n_estimators', 600, 5000),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        gamma=trial.suggest_float('gamma', 1e-9, 0.5),
        subsample=trial.suggest_float('subsample', 0.1, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0),
        max_depth=trial.suggest_int('max_depth', 1, 30),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 100),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-9, 100.0, log=True),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-9, 100.0, log=True),
    )

    # Settings that are identical for every trial.
    fixed_settings = {
        'booster': 'gbtree',
        'grow_policy': 'depthwise',
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'device': 'cuda',
        'verbosity': 0,
    }

    clf = XGBClassifier(**search_space, **fixed_settings)
    clf.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the ranking score.
    scores = clf.predict_proba(X_test)[:, 1]
    return concordance_index(y_test, scores)

# The study is persisted in a local SQLite file. With load_if_exists=True an
# existing study of the same name is reopened instead of raising, so every
# execution of this cell appends up to 1000 further trials to it.
sqlite_db = "sqlite:///xgb_sqlite2.db"
study_name = "binary_classification_XGBoost2"
study = optuna.create_study(storage=sqlite_db, study_name=study_name,
                            sampler=TPESampler(n_startup_trials=50, multivariate=True, seed=seed),
                            direction="maximize", load_if_exists=True)

try:
    study.optimize(objective, n_trials=1000)
finally:
    # Restore the log level captured in `old_verbosity` before the search,
    # even when the run is interrupted or a trial raises; otherwise the
    # lowered WARNING level stays in effect for the rest of the session.
    optuna.logging.set_verbosity(old_verbosity)

In [7]:
# Best objective value recorded in the study.
print(f"best optimized roc_auc: {study.best_value:0.5f}")

# Tuned values first, followed by the fixed settings that `objective`
# also applies to every trial.
params = {
    **study.best_params,
    'booster': 'gbtree',
    'grow_policy': 'depthwise',
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'device': 'cuda',
    'verbosity': 0,
}

params

best optimized roc_auc: 0.78947


{'n_estimators': 742,
 'learning_rate': 0.01167163077187354,
 'gamma': 0.461250817551969,
 'subsample': 0.7930625663808353,
 'colsample_bytree': 0.9331617058462893,
 'max_depth': 6,
 'min_child_weight': 3,
 'reg_lambda': 0.027486950836903636,
 'reg_alpha': 1.859654985959253e-05,
 'booster': 'gbtree',
 'grow_policy': 'depthwise',
 'objective': 'binary:logistic',
 'tree_method': 'hist',
 'device': 'cuda',
 'verbosity': 0}

In [8]:
# Store the selected parameter set through the project's params_manager helper.
# NOTE(review): the key is spelled 'xbg2' while the study and database names
# use 'XGBoost2' / 'xgb' -- confirm which spelling consumers expect before
# changing it.
save_params(params, 'xbg2')

In [9]:
# Refit a classifier with the selected parameters on the training split.
model = XGBClassifier(**params)
model.fit(X_train, y_train)

In [10]:
# Score the refit model on the hold-out split, using column 1 of
# predict_proba as the ranking score.
test_scores = model.predict_proba(X_test)[:, 1]
c_index = concordance_index(y_test, test_scores)
print(f'Concordance Index: {c_index}')

Concordance Index: 0.7894705893670545


In [11]:
# Per-class precision / recall / F1 for the model's predicted labels on the hold-out split.
test_labels = model.predict(X_test)
print(classification_report(y_test, test_labels))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96     11658
           1       0.67      0.13      0.22      1070

    accuracy                           0.92     12728
   macro avg       0.80      0.56      0.59     12728
weighted avg       0.90      0.92      0.90     12728
