In [1]:
import pandas as pd
import numpy as np
import catboost as cb
import optuna
import pickle
import os
import gc
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
import seaborn as sns
import time
import json
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from utilities import (
    RANDOM_STATE, TARGET_COL, N_FOLD,
)

# Presumably the raw competition data directory; it is not referenced by any
# cell visible in this notebook -- TODO confirm it is still needed.
INPUT_PATH = '../input/tabular-playground-series-oct-2021'
# Directory holding the artefacts this notebook loads:
# 'train_unscaled.pkl' and 'feature_dic.pkl'.
PATH_NOTEBOOK = '../input/preprocess-gpu'

In [2]:
# Load the pickled training frame stored under PATH_NOTEBOOK.
train_path = os.path.join(PATH_NOTEBOOK, 'train_unscaled.pkl')
train = pd.read_pickle(train_path)

In [3]:
# Load the pickled feature dictionary; it is indexed further down with the
# keys 'feature', 'categorical' and 'numerical'.
feature_dic_path = os.path.join(PATH_NOTEBOOK, 'feature_dic.pkl')
with open(feature_dic_path, mode='rb') as file:
    feature_dic = pickle.load(file)

In [4]:
#CONSTANT
# Column groups read from the pickled feature dictionary.
FEATURE, CAT_COL, NUMERIC_COL = (
    feature_dic[group] for group in ('feature', 'categorical', 'numerical')
)

# One entry per fold index: 0 .. N_FOLD - 1.
FOLD_LIST = [*range(N_FOLD)]

gc.collect()

63

In [5]:
#train test split for optuna-study
# Stratified on the target. test_size=.75 means only 25% of the rows are used
# for fitting and 75% are held out.
# NOTE(review): presumably chosen to keep each trial fast -- confirm intent.
train_x, test_x, train_y, test_y = train_test_split(
    train[FEATURE],
    train[TARGET_COL],
    test_size=.75,
    stratify=train[TARGET_COL],
    random_state=RANDOM_STATE,
)

# CatBoost pools: the fitting part and the held-out part (used as eval_set).
train_pool = cb.Pool(data=train_x, label=train_y, cat_features=CAT_COL)
valid_pool = cb.Pool(data=test_x, label=test_y, cat_features=CAT_COL)

gc.collect()

0

In [6]:
def objective(trial):
    """Optuna objective: fit one CatBoost classifier and score it by ROC AUC.

    A CatBoostClassifier is built from a set of fixed settings plus five
    hyper-parameters suggested by ``trial``. It is fitted on the notebook-level
    ``train_pool`` with ``valid_pool`` as the evaluation set, and the ROC AUC of
    its predictions on the notebook-level ``test_x`` / ``test_y`` is returned.

    Args:
        trial: The Optuna trial used to draw ``depth``, ``bagging_temperature``,
            ``l2_leaf_reg``, ``random_strength`` and ``min_data_in_leaf``.

    Returns:
        The ROC AUC computed by ``roc_auc_score`` on ``test_y``.
    """
    # Settings that stay the same for every trial.
    # NOTE(review): 'verbose' is set here and fit() is also called with
    # verbose=False -- confirm which one is meant to apply.
    fixed_param = {
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        'verbose': 100,
        'random_state':RANDOM_STATE,
        'task_type' : 'GPU',
        'early_stopping_rounds' : 100,
        'iterations': 100000,
        'learning_rate': .1,
        'metric_period': 25,
    }

    # Search space. The suggest_* calls stay in their original order.
    tuned_param = {
        "depth": trial.suggest_int("depth", 4, 8),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 5),
        "random_strength": trial.suggest_float("random_strength", .8, 1.2),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 32),
    }

    classifier = cb.CatBoostClassifier(**fixed_param, **tuned_param)
    classifier.fit(train_pool, eval_set = valid_pool, verbose=False, use_best_model=True)

    # Column 1 of predict_proba is scored; presumably the positive class.
    class1_proba = classifier.predict_proba(test_x)[:, 1]
    return roc_auc_score(test_y, class1_proba)

In [7]:
# Maximise the AUC returned by objective(). The TPE sampler (Optuna's default
# for single-objective studies) is now seeded with RANDOM_STATE so the search
# no longer draws from an unseeded random stream.
# NOTE(review): objective() never calls trial.report() / trial.should_prune(),
# so the MedianPruner has no intermediate values to act on. It is kept only to
# leave the study configuration otherwise unchanged.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    pruner=optuna.pruners.MedianPruner(
        n_warmup_steps = 100, n_startup_trials = 30, n_min_trials = 10
    ),
)
# timeout is in seconds: 3600 * 2.5 is a 2.5-hour budget for the whole study.
study.optimize(objective, timeout=3600 * 2.5)

[32m[I 2021-10-21 09:16:04,470][0m A new study created in memory with name: no-name-f2a5ba13-b1db-409f-b40f-534385e71b02[0m
[32m[I 2021-10-21 09:17:57,268][0m Trial 0 finished with value: 0.8546332989332877 and parameters: {'depth': 6, 'bagging_temperature': 1.4699815466953992, 'l2_leaf_reg': 1.5246259396958362, 'random_strength': 1.1396845596416516, 'min_data_in_leaf': 30}. Best is trial 0 with value: 0.8546332989332877.[0m
[32m[I 2021-10-21 09:19:07,201][0m Trial 1 finished with value: 0.851403024643521 and parameters: {'depth': 5, 'bagging_temperature': 5.951806047831038, 'l2_leaf_reg': 3.916624483168183, 'random_strength': 0.9602220376238143, 'min_data_in_leaf': 12}. Best is trial 0 with value: 0.8546332989332877.[0m
[32m[I 2021-10-21 09:20:41,139][0m Trial 2 finished with value: 0.8511696587916453 and parameters: {'depth': 4, 'bagging_temperature': 8.296548490653032, 'l2_leaf_reg': 2.3428972050801473, 'random_strength': 0.9217618517575213, 'min_data_in_leaf': 3}. Best i

In [8]:
# `values` is a list of objective values, so this prints a list.
best_trial = study.best_trial
best_score = best_trial.values
print(best_score)

[0.8552217566681732]


In [9]:
# Hyper-parameters of the best trial (study.best_params is the shortcut for
# study.best_trial.params).
final_params = study.best_params
print(final_params)

{'depth': 4, 'bagging_temperature': 1.1418824362276156, 'l2_leaf_reg': 4.509209960369067, 'random_strength': 0.8823223562163118, 'min_data_in_leaf': 5}


In [10]:
# Persist the selected hyper-parameters as a pickle in the working directory.
with open("final_cb_param.pkl", mode="wb") as file_name:
    file_name.write(pickle.dumps(final_params))
