In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
import pickle
import os
import gc
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
import seaborn as sns
import time
import json
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from utilities import (
    RANDOM_STATE, TARGET_COL, N_FOLD,
)

INPUT_PATH = '../input/tabular-playground-series-oct-2021'
PATH_NOTEBOOK = '../input/preprocess-gpu'

In [2]:
train = pd.read_pickle(
    os.path.join(PATH_NOTEBOOK, 'train_unscaled.pkl')
)

In [3]:
with open(os.path.join(PATH_NOTEBOOK, 'feature_dic.pkl'), 'rb') as file:
    feature_dic = pickle.load(file)

In [4]:
#CONSTANT
FEATURE = feature_dic['feature']
CAT_COL = feature_dic['categorical']
NUMERIC_COL = feature_dic['numerical']

FOLD_LIST = list(range(N_FOLD))

gc.collect()

63

In [5]:
#train test split for optuna-study
train_x, test_x, train_y, test_y = train_test_split(
    train[FEATURE], train[TARGET_COL], random_state = RANDOM_STATE, 
    stratify = train[TARGET_COL], test_size = .75
)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x, label=test_y)

gc.collect()

11

In [6]:
def objective(trial):
    params_study = {
        "verbosity": 0,
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'eval_metric': 'auc',
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'seed': RANDOM_STATE,
        "learning_rate": .1,
        "subsample": trial.suggest_float("subsample", .5, 1),
        "colsample_bytree": trial.suggest_float("colsample_bytree", .5, 1),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", .5, 1),
#         "colsample_bynode": trial.suggest_float("colsample_bynode", .5, 1),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0),
        "max_depth": trial.suggest_int("max_depth", 4, 8),
        "gamma": trial.suggest_float("gamma", 1e-8, 3.0),
    }
    
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    model = xgb.train(
        params_study, dtrain, 
        evals=[(dtest, "validation")], callbacks=[pruning_callback], num_boost_round = 10000, 
        early_stopping_rounds = 100, verbose_eval  = 0
    )
    
    preds = model.predict(dtest)
    
    auc = roc_auc_score(test_y, preds)
    return auc

In [7]:
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps = 100, n_startup_trials = 30, n_min_trials = 10), direction="maximize"
)
study.optimize(objective, timeout=3600 * 2.5)

[32m[I 2021-10-15 06:01:13,196][0m A new study created in memory with name: no-name-60fa6963-3ed0-452e-b8de-d990565befd0[0m
[32m[I 2021-10-15 06:01:29,592][0m Trial 0 finished with value: 0.8541851149874561 and parameters: {'subsample': 0.9068742966552055, 'colsample_bytree': 0.6717345324863757, 'lambda': 5.471351738883394, 'alpha': 4.772891162622184, 'max_depth': 4, 'gamma': 2.7550992792844777}. Best is trial 0 with value: 0.8541851149874561.[0m
[32m[I 2021-10-15 06:01:43,142][0m Trial 1 finished with value: 0.852504903662149 and parameters: {'subsample': 0.9865330895207187, 'colsample_bytree': 0.9052206306246804, 'lambda': 7.5208593996556035, 'alpha': 5.838521998195147, 'max_depth': 6, 'gamma': 2.683046754838135}. Best is trial 0 with value: 0.8541851149874561.[0m
[32m[I 2021-10-15 06:01:58,045][0m Trial 2 finished with value: 0.8516092553551637 and parameters: {'subsample': 0.9209961749291721, 'colsample_bytree': 0.7332647742367451, 'lambda': 3.4653579276929594, 'alpha': 

In [8]:
best_score = study.best_trial.values
print(best_score)

[0.8543853430232207]


In [9]:
final_params = study.best_trial.params
print(final_params)

{'subsample': 0.8081548613972851, 'colsample_bytree': 0.5093883996922348, 'lambda': 6.593897401079495, 'alpha': 8.220939046693244, 'max_depth': 4, 'gamma': 0.9917294458236702}


In [10]:
with open("final_xgb_param.pkl", "wb") as file_name:
    pickle.dump(final_params, file_name)
