In [1]:
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index
from sklearn.metrics import classification_report

import json
import optuna
from optuna.samplers import TPESampler
from params_manager import save_params

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from params_manager import INTERNAL_PATH

seed = 142

## Load features

In [4]:
t = pd.read_csv(INTERNAL_PATH / 'train.csv')
t = t[t['outlier'] == 0].drop(['outlier', 'user_id'], axis=1)

X = t.drop('target', axis=1)
y = t['target']

cat_col = X.select_dtypes(include=['object']).columns.tolist()
y.shape, X.shape

((63636,), (63636, 166))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=seed, stratify=y)

## Hyperparameter Optimization with Optuna

In [6]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

ci_best = 0
bests = []

def objective(trial: optuna.trial.Trial):
    global ci_best, bests
    
    params = {
        'verbose': False,
        'random_seed': seed,
        'eval_metric': "AUC",
        'iterations': trial.suggest_int("iterations", 1000, 2000),
        'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.01, 0.2),
        'depth': trial.suggest_int("depth", 5, 8),
        'bootstrap_type': trial.suggest_categorical("bootstrap_type", ["Bernoulli", "MVS"]),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1, 10),
        'subsample': trial.suggest_float("subsample", 0.1, 1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 30, log=True),
        'random_strength': trial.suggest_float("random_strength", 0.1, 1.7),
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.1),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 12),
    }
    params['cat_features'] = cat_col 
    params['boosting_type'] = "Plain"
    
    clf = CatBoostClassifier(**params)
    clf.fit(X_train, y_train)
    CI = concordance_index(y_test, clf.predict_proba(X_test)[:,1])
    
    if CI > ci_best:
        ci_best = CI
        print(f'New best CI on {trial.number} trial: {ci_best}')
        bests.append((trial.number, CI, trial.params, params))
    
    if len(bests) % 2 == 0 or trial.number == 999:
        with open(f'params/cats.json', 'w', encoding='utf-8') as f:
            json.dump(bests, f)
    
    return CI

sqlite_db = "sqlite:///cat_sqlite.db"
study_name = "binary_classification_CatBoost"
study = optuna.create_study(storage=sqlite_db, study_name=study_name, 
                            sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=seed),
                            direction="maximize", load_if_exists=True)

study.optimize(objective, n_trials=1000)

New best CI on 0 trial: 0.7465176534344071
New best CI on 1 trial: 0.7621092891969415
New best CI on 2 trial: 0.7714642225546454
New best CI on 4 trial: 0.7782734731114008
New best CI on 13 trial: 0.7789253859609462
New best CI on 15 trial: 0.7798642142173439
New best CI on 22 trial: 0.7817162174945447
New best CI on 33 trial: 0.7822154134259415
New best CI on 35 trial: 0.782330211655227
New best CI on 43 trial: 0.7828836802131783
New best CI on 100 trial: 0.7829230418965437
New best CI on 114 trial: 0.7829814030075212
New best CI on 161 trial: 0.7833809521519056
New best CI on 168 trial: 0.7835172349660015
New best CI on 280 trial: 0.7839136576222978
New best CI on 325 trial: 0.7843146497611844
New best CI on 836 trial: 0.7843841539963733


In [7]:
print(f"best optimized roc_auc: {study.best_value:0.5f}")

params = study.best_params
params['cat_features'] = cat_col 
params['boosting_type'] = "Plain"

params

best optimized roc_auc: 0.78438


{'iterations': 1447,
 'colsample_bylevel': 0.18432741186614293,
 'depth': 6,
 'bootstrap_type': 'MVS',
 'l2_leaf_reg': 9.770072739536044,
 'subsample': 0.7632616222061618,
 'scale_pos_weight': 1.7424763558619707,
 'random_strength': 0.1740840663217069,
 'learning_rate': 0.010357781097438023,
 'min_data_in_leaf': 11,
 'cat_features': ['employee_count_nm',
  'most_time_of_day__agg_user',
  'most_season_of_year__agg_user'],
 'boosting_type': 'Plain'}

In [8]:
model = CatBoostClassifier(**params)
model.fit(X_train, y_train)

0:	learn: 0.6849836	total: 42.2ms	remaining: 1m
1:	learn: 0.6767761	total: 75ms	remaining: 54.2s
2:	learn: 0.6685281	total: 112ms	remaining: 54.1s
3:	learn: 0.6606026	total: 147ms	remaining: 53s
4:	learn: 0.6528705	total: 217ms	remaining: 1m 2s
5:	learn: 0.6453543	total: 267ms	remaining: 1m 4s
6:	learn: 0.6381149	total: 310ms	remaining: 1m 3s
7:	learn: 0.6310804	total: 347ms	remaining: 1m 2s
8:	learn: 0.6242500	total: 394ms	remaining: 1m 2s
9:	learn: 0.6174874	total: 429ms	remaining: 1m 1s
10:	learn: 0.6108450	total: 464ms	remaining: 1m
11:	learn: 0.6047642	total: 501ms	remaining: 59.9s
12:	learn: 0.5984479	total: 542ms	remaining: 59.8s
13:	learn: 0.5923367	total: 580ms	remaining: 59.3s
14:	learn: 0.5863175	total: 622ms	remaining: 59.4s
15:	learn: 0.5806799	total: 657ms	remaining: 58.8s
16:	learn: 0.5749685	total: 692ms	remaining: 58.2s
17:	learn: 0.5694939	total: 728ms	remaining: 57.8s
18:	learn: 0.5638495	total: 767ms	remaining: 57.6s
19:	learn: 0.5586786	total: 808ms	remaining: 57.7

<catboost.core.CatBoostClassifier at 0x134178f38b0>

In [9]:
print(f'Concordance Index: {concordance_index(y_test, model.predict_proba(X_test)[:, 1])}')

Concordance Index: 0.7844363422975359


In [10]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96     11658
           1       0.61      0.15      0.25      1070

    accuracy                           0.92     12728
   macro avg       0.77      0.57      0.60     12728
weighted avg       0.90      0.92      0.90     12728
