In [1]:
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index
from sklearn.metrics import classification_report

import json
import optuna
from optuna.samplers import TPESampler
from params_manager import save_params

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from params_manager import INTERNAL_PATH

# Single random seed reused by the train/test split and the Optuna TPE sampler.
seed = 142

## Load features

In [4]:
# Read the training table, keep only the rows whose `outlier` flag is 0, and
# drop the `outlier` and `user_id` columns.
t = (
    pd.read_csv(INTERNAL_PATH / 'train.csv')
    .loc[lambda frame: frame['outlier'] == 0]
    .drop(columns=['outlier', 'user_id'])
)

# Separate the label column from the feature columns.
y = t['target']
X = t.drop(columns='target')

# Names of the object-dtype feature columns.
cat_col = X.select_dtypes(include=['object']).columns.tolist()
y.shape, X.shape

((63636,), (63636, 166))

In [5]:
# Hold out 20% of the rows, stratified on the target. The object-dtype
# columns listed in `cat_col` are removed before splitting.
X_numeric = X.drop(columns=cat_col)
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)

## Hyperparameter Optimization with Optuna

In [6]:
# Only let Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Running state updated by `objective`: the best score seen so far and the
# list of trials that improved on it.
ci_best, bests = 0, []

def objective(trial: optuna.trial.Trial):
    """Train one XGBoost candidate and return its concordance index.

    Samples a hyperparameter set from ``trial``, fits an ``XGBClassifier`` on
    ``X_train``/``y_train`` and scores the positive-class probabilities on
    ``X_test``/``y_test`` with ``concordance_index``.

    Side effects: whenever the score beats the running best ``ci_best``, the
    trial is appended to ``bests`` and the whole list is written to
    ``params/xbgs.json`` straight away, so the file always mirrors ``bests``.

    NOTE(review): the score is computed on ``X_test``/``y_test``, the same
    split the notebook later uses to report the final metrics, so those
    reported metrics are optimistically biased. Consider tuning against a
    separate validation split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The concordance index of this candidate as a float.
    """
    # `bests` is only mutated in place, so only `ci_best` needs `global`.
    global ci_best

    # Searched hyperparameters.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 600, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-9, 0.5),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 1, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 100.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 100.0, log=True),
    }
    # Fixed settings that are not part of the search space.
    params['booster'] = 'gbtree'
    params['grow_policy'] = 'depthwise'
    params['objective'] = 'binary:logistic'
    params["tree_method"] = 'hist'
    params["device"] = 'cuda'
    params["verbosity"] = 0

    xgb = XGBClassifier(**params)
    xgb.fit(X_train, y_train)
    # Cast to a plain float so the value is JSON-serializable whatever scalar
    # type the metric returns.
    CI = float(concordance_index(y_test, xgb.predict_proba(X_test)[:, 1]))

    if CI > ci_best:
        ci_best = CI
        print(f'New best CI on {trial.number} trial: {ci_best}')
        bests.append((trial.number, CI, trial.params, params))
        # Persist on every improvement. The previous condition
        # (`len(bests) % 2 == 0 or trial.number == 4999`) rewrote the file on
        # every trial while the count was even and skipped saving while it was
        # odd, and its final-trial check depended on a hard-coded trial number.
        with open('params/xbgs.json', 'w', encoding='utf-8') as f:
            json.dump(bests, f)

    return CI

# The study is stored in a local SQLite file. With load_if_exists=True an
# existing study of the same name is reused instead of raising an error.
study_name = "binary_classification_XGBoost"
sqlite_db = "sqlite:///xgb_sqlite.db"

tpe_sampler = TPESampler(seed=seed, multivariate=True, n_startup_trials=50)
study = optuna.create_study(
    study_name=study_name,
    storage=sqlite_db,
    sampler=tpe_sampler,
    direction="maximize",
    load_if_exists=True,
)

study.optimize(objective, n_trials=5000)

New best CI on 0 trial: 0.7582451102527966
New best CI on 2 trial: 0.7727479264970667
New best CI on 5 trial: 0.7826822221474002
New best CI on 92 trial: 0.783633275773886
New best CI on 130 trial: 0.7836607327526082
New best CI on 177 trial: 0.7839223957556721
New best CI on 215 trial: 0.7839470068285707
New best CI on 291 trial: 0.7840454511201645
New best CI on 301 trial: 0.78481961767059
New best CI on 329 trial: 0.7851191993625171
New best CI on 1481 trial: 0.7851220452683408
New best CI on 3604 trial: 0.7852052980344811


In [7]:
# Report the best objective value recorded in the study.
print(f"best optimized roc_auc: {study.best_value:0.5f}")

# Rebuild the full XGBoost configuration: the tuned values of the best trial
# followed by the fixed settings that were not part of the search.
params = {
    **study.best_params,
    'booster': 'gbtree',
    'grow_policy': 'depthwise',
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'device': 'cuda',
    'verbosity': 0,
}

params

best optimized roc_auc: 0.78521


{'n_estimators': 616,
 'learning_rate': 0.010527109253362778,
 'gamma': 0.012349661426635947,
 'subsample': 0.8534720350236409,
 'colsample_bytree': 0.8043442648731229,
 'max_depth': 6,
 'min_child_weight': 42,
 'reg_lambda': 0.0004746750410906658,
 'reg_alpha': 0.0005328387762813716,
 'booster': 'gbtree',
 'grow_policy': 'depthwise',
 'objective': 'binary:logistic',
 'tree_method': 'hist',
 'device': 'cuda',
 'verbosity': 0}

In [8]:
# Refit a classifier with the selected configuration on the training split.
# `fit` returns the estimator itself, so the cell still evaluates to the
# fitted model.
model = XGBClassifier(**params).fit(X_train, y_train)
model

In [9]:
# Score the positive-class probabilities of the held-out rows.
test_scores = model.predict_proba(X_test)[:, 1]
test_ci = concordance_index(y_test, test_scores)
print(f'Concordance Index: {test_ci}')

Concordance Index: 0.7852052980344811


In [10]:
# Per-class precision/recall/F1 of the model's label predictions on the
# held-out rows.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.99      0.96     11658
           1       0.66      0.11      0.19      1070

    accuracy                           0.92     12728
   macro avg       0.79      0.55      0.58     12728
weighted avg       0.90      0.92      0.89     12728
