In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
import pickle
import os
import gc
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
import seaborn as sns
import time
import json
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from utilities import (
    RANDOM_STATE, TARGET_COL, N_FOLD,
)

# Presumably the raw competition data directory; it is not referenced by any
# cell visible in this notebook — TODO confirm whether it is still needed.
INPUT_PATH = '../input/tabular-playground-series-oct-2021'
# Directory the cells below read 'train_unscaled.pkl' and 'feature_dic.pkl' from.
PATH_NOTEBOOK = '../input/preprocess'

In [2]:
# Read the pickled, unscaled training data stored under PATH_NOTEBOOK.
train = pd.read_pickle(os.path.join(PATH_NOTEBOOK, 'train_unscaled.pkl'))

In [3]:
# Load the feature dictionary; the next cell reads its 'feature',
# 'categorical' and 'numerical' entries.
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
feature_dic_path = os.path.join(PATH_NOTEBOOK, 'feature_dic.pkl')
with open(feature_dic_path, 'rb') as file:
    feature_dic = pickle.load(file)

In [4]:
# Constants
# Column groups taken from the loaded feature dictionary.
FEATURE, CAT_COL, NUMERIC_COL = (
    feature_dic[key] for key in ('feature', 'categorical', 'numerical')
)

# Fold indices 0 .. N_FOLD - 1.
FOLD_LIST = [*range(N_FOLD)]

gc.collect()

63

In [5]:
# Stratified hold-out split used only by the Optuna study.
# NOTE(review): test_size=.75 leaves 25% of the rows for fitting and 75% for
# validation — presumably to keep each trial fast; confirm this is intended.
train_x, test_x, train_y, test_y = train_test_split(
    train[FEATURE],
    train[TARGET_COL],
    test_size=.75,
    stratify=train[TARGET_COL],
    random_state=RANDOM_STATE,
)

gc.collect()

21

In [6]:
def objective(trial):
    """Optuna objective: fit one LightGBM model and return its hold-out AUC.

    Samples a hyper-parameter set from ``trial``, trains a binary GBDT on
    ``train_x``/``train_y`` with early stopping monitored on
    ``test_x``/``test_y``, then scores the model's predictions on that same
    hold-out set with ``roc_auc_score``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters; it is also handed to the
        LightGBM pruning callback, which reports the validation "auc".

    Returns
    -------
    float
        ROC AUC of the trained model on ``test_x``/``test_y``.

    Raises
    ------
    optuna.TrialPruned
        May be raised from inside training by the pruning callback when the
        study's pruner decides to stop this trial.
    """
    dtrain = lgb.Dataset(
        train_x, label=train_y,
        categorical_feature=CAT_COL
    )
    dtest = lgb.Dataset(
        test_x, label=test_y,
        categorical_feature=CAT_COL
    )

    params_study = {
        # Fixed settings shared by every trial.
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'learning_rate': 0.1,
        'random_state': RANDOM_STATE,
        'verbose': -1,
        'n_jobs': -1,
        # Search space sampled by Optuna.
        "num_leaves": trial.suggest_int("num_leaves", 2**6, 2**10),
        "bagging_fraction": trial.suggest_float("bagging_fraction", .3, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 5),
        "feature_fraction": trial.suggest_float("feature_fraction", .3, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        "extra_trees": trial.suggest_categorical("extra_trees", [False, True]),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 1e-8, 10),
        "path_smooth": trial.suggest_float("path_smooth", 0, 2),
    }

    # Reports the "auc" of the (default-named) validation set to the trial.
    pruning_callback = optuna.integration.LightGBMPruningCallback(
        trial, "auc"
    )

    # lgb.train's `early_stopping_rounds` / `verbose_eval` keyword arguments
    # were deprecated in LightGBM 3.3 and removed in 4.0 (TypeError), so early
    # stopping and log silencing are expressed as callbacks instead.
    callbacks = [
        pruning_callback,
        lgb.early_stopping(stopping_rounds=100, verbose=False),
    ]
    train_kwargs = {}
    if hasattr(lgb, "log_evaluation"):
        # period=0 disables per-iteration metric logging.
        callbacks.append(lgb.log_evaluation(period=0))
    else:
        # Older LightGBM without log_evaluation: the legacy switch is the
        # only way to silence the per-iteration output there.
        train_kwargs["verbose_eval"] = False

    model = lgb.train(
        params_study, dtrain,
        valid_sets=[dtest],
        callbacks=callbacks,
        # Large upper bound; early stopping decides the actual round count.
        num_boost_round=100000,
        **train_kwargs,
    )

    preds = model.predict(test_x)

    auc = roc_auc_score(test_y, preds)
    return auc

In [7]:
# Seed the sampler so the sequence of suggested hyper-parameters is
# repeatable, in line with the split and LightGBM which already use
# RANDOM_STATE. TPESampler is what create_study picks by default for a
# single-objective study, so only the seeding changes.
# (assumes RANDOM_STATE is an int or None — TODO confirm in utilities)
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    direction="maximize",
)
# Time-boxed search: 30500 s is roughly 8.5 hours. Because the budget is
# wall-clock based, the number of completed trials can still vary between runs.
study.optimize(objective, timeout=30500, show_progress_bar=False)

[32m[I 2021-10-14 06:50:30,284][0m A new study created in memory with name: no-name-b33994bc-130c-4be3-b2a6-88bedd034ab8[0m
[32m[I 2021-10-14 06:54:05,160][0m Trial 0 finished with value: 0.8539932246946582 and parameters: {'num_leaves': 386, 'bagging_fraction': 0.452783871101248, 'bagging_freq': 4, 'feature_fraction': 0.9135261360263458, 'lambda_l1': 3.4263448293227303, 'lambda_l2': 4.60805931254312, 'min_data_in_leaf': 53, 'extra_trees': True, 'min_gain_to_split': 4.163510696281897, 'path_smooth': 0.35467298246731516}. Best is trial 0 with value: 0.8539932246946582.[0m
[32m[I 2021-10-14 07:00:18,516][0m Trial 1 finished with value: 0.8545305342000189 and parameters: {'num_leaves': 108, 'bagging_fraction': 0.9826419035313325, 'bagging_freq': 3, 'feature_fraction': 0.4391703923164765, 'lambda_l1': 9.981933259825583, 'lambda_l2': 4.787223416853744, 'min_data_in_leaf': 47, 'extra_trees': True, 'min_gain_to_split': 1.3870747701430455, 'path_smooth': 1.6530551435631298}. Best is tr

In [8]:
# Objective value(s) of the best trial; `.values` is a list, so the printed
# output below is a one-element list holding the best AUC.
best_score = study.best_trial.values
print(best_score)

[0.8545305342000189]


In [9]:
# Sampled hyper-parameters of the best trial. Only the values suggested through
# `trial.suggest_*` are included; the fixed settings in `objective`
# (objective, metric, learning_rate, ...) are not part of this dict.
final_params = study.best_trial.params
print(final_params)

{'num_leaves': 108, 'bagging_fraction': 0.9826419035313325, 'bagging_freq': 3, 'feature_fraction': 0.4391703923164765, 'lambda_l1': 9.981933259825583, 'lambda_l2': 4.787223416853744, 'min_data_in_leaf': 47, 'extra_trees': True, 'min_gain_to_split': 1.3870747701430455, 'path_smooth': 1.6530551435631298}


In [10]:
# Persist the best hyper-parameters to the working directory.
# NOTE(review): the file name says "xgb" although these are LightGBM
# parameters; left unchanged in case another notebook reads this exact path.
with open("final_xgb_param.pkl", "wb") as param_file:
    pickle.dump(final_params, param_file)
