In [1]:
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index
from sklearn.metrics import classification_report

import optuna
from optuna.samplers import TPESampler
from params_manager import save_params

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from params_manager import INTERNAL_PATH

seed = 142

## Load features

In [4]:
t = pd.read_csv(INTERNAL_PATH / 'train.csv')
t = t[t['outlier'] == 0].drop('outlier', axis=1)

X = t.drop('target', axis=1)
y = t['target']

cat_col = X.select_dtypes(include=['object']).columns.tolist()
y.shape, X.shape

((63636,), (63636, 167))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=seed, stratify=y)

## Hyperparameter Optimization with Optuna

In [6]:
old_verbosity = optuna.logging.get_verbosity()
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial):
    param = {
        'verbose': False,
        'random_seed': seed,
        'eval_metric': "AUC",
        'iterations': trial.suggest_int("iterations", 1000, 2000),
        'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.01, 0.2),
        'depth': trial.suggest_int("depth", 5, 8),
        'bootstrap_type': trial.suggest_categorical("bootstrap_type", ["Bernoulli", "MVS"]),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1, 10),
        'subsample': trial.suggest_float("subsample", 0.1, 1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 30, log=True),
        'random_strength': trial.suggest_float("random_strength", 0.1, 1.7),
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.1),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 12),
    }
    param['cat_features'] = cat_col 
    param['boosting_type'] = "Plain"
    
    clf = CatBoostClassifier(**param)
    clf.fit(X_train, y_train)
    
    return concordance_index(y_test, clf.predict_proba(X_test)[:,1])

sqlite_db = "sqlite:///cat_sqlite2.db"
study_name = "binary_classification_CatBoost2"
study = optuna.create_study(storage=sqlite_db, study_name=study_name, 
                            sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=seed),
                            direction="maximize", load_if_exists=True)

study.optimize(objective, n_trials=100)

In [7]:
print(f"best optimized roc_auc: {study.best_value:0.5f}")

params = study.best_params
params['cat_features'] = cat_col 
params['boosting_type'] = "Plain"

params

best optimized roc_auc: 0.78754


{'iterations': 1256,
 'colsample_bylevel': 0.13972351147317655,
 'depth': 8,
 'bootstrap_type': 'MVS',
 'l2_leaf_reg': 6.446291993378921,
 'subsample': 0.7419687042057698,
 'scale_pos_weight': 0.4927418580798812,
 'random_strength': 0.9336186575859631,
 'learning_rate': 0.019444305287352793,
 'min_data_in_leaf': 6,
 'cat_features': ['employee_count_nm',
  'most_time_of_day__agg_user',
  'most_season_of_year__agg_user'],
 'boosting_type': 'Plain'}

In [8]:
save_params(params, 'cat2')

In [9]:
model = CatBoostClassifier(**params)
model.fit(X_train, y_train)

0:	learn: 0.6640801	total: 75.5ms	remaining: 1m 34s
1:	learn: 0.6384287	total: 186ms	remaining: 1m 56s
2:	learn: 0.6138602	total: 342ms	remaining: 2m 22s
3:	learn: 0.5902694	total: 438ms	remaining: 2m 17s
4:	learn: 0.5680710	total: 521ms	remaining: 2m 10s
5:	learn: 0.5455490	total: 602ms	remaining: 2m 5s
6:	learn: 0.5249185	total: 696ms	remaining: 2m 4s
7:	learn: 0.5053810	total: 764ms	remaining: 1m 59s
8:	learn: 0.4866572	total: 839ms	remaining: 1m 56s
9:	learn: 0.4697650	total: 933ms	remaining: 1m 56s
10:	learn: 0.4540593	total: 1.02s	remaining: 1m 55s
11:	learn: 0.4385752	total: 1.1s	remaining: 1m 54s
12:	learn: 0.4237690	total: 1.2s	remaining: 1m 54s
13:	learn: 0.4095079	total: 1.27s	remaining: 1m 52s
14:	learn: 0.3967110	total: 1.36s	remaining: 1m 52s
15:	learn: 0.3843178	total: 1.45s	remaining: 1m 52s
16:	learn: 0.3734096	total: 1.54s	remaining: 1m 52s
17:	learn: 0.3627240	total: 1.62s	remaining: 1m 51s
18:	learn: 0.3518809	total: 1.72s	remaining: 1m 51s
19:	learn: 0.3420653	tota

<catboost.core.CatBoostClassifier at 0x21998b87200>

In [10]:
print(f'Concordance Index: {concordance_index(y_test, model.predict_proba(X_test)[:, 1])}')

Concordance Index: 0.7865805519614304


In [11]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     11658
           1       0.84      0.08      0.15      1070

    accuracy                           0.92     12728
   macro avg       0.88      0.54      0.55     12728
weighted avg       0.92      0.92      0.89     12728
