In [1]:
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index
from sklearn.metrics import classification_report

import optuna
from optuna.samplers import TPESampler
from params_manager import save_params

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from params_manager import INTERNAL_PATH

seed = 42

## Load features

In [4]:
t = pd.read_csv(INTERNAL_PATH / 'train.csv')
t = t[t['outlier'] == 0].drop('outlier', axis=1)

X = t.drop('target', axis=1)
y = t['target']

cat_col = X.select_dtypes(include=['object']).columns.tolist()
y.shape, X.shape

((63636,), (63636, 167))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=seed, stratify=y)

## Hyperparameter Optimization with Optuna

In [6]:
def objective(trial):
    param = {
        'verbose': False,
        'random_seed': seed,
        'eval_metric': "AUC",
        'iterations': trial.suggest_int("iterations", 1000, 2000),
        'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.01, 0.2),
        'depth': trial.suggest_int("depth", 5, 8),
        'bootstrap_type': trial.suggest_categorical("bootstrap_type", ["Bernoulli", "MVS"]),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1, 10),
        'subsample': trial.suggest_float("subsample", 0.1, 1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 30, log=True),
        'random_strength': trial.suggest_float("random_strength", 0.1, 1.7),
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.1),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 12),
    }
    param['cat_features'] = cat_col 
    param['boosting_type'] = "Plain"
    
    clf = CatBoostClassifier(**param)
    clf.fit(X_train, y_train)
    
    return concordance_index(y_test, clf.predict_proba(X_test)[:,1])

sqlite_db = "sqlite:///cat_sqlite.db"
study_name = "binary_classification_CatBoost"
study = optuna.create_study(storage=sqlite_db, study_name=study_name, 
                            sampler=TPESampler(n_startup_trials=30, multivariate=True, seed=142),
                            direction="maximize", load_if_exists=True)

study.optimize(objective, n_trials=10)

[I 2024-05-20 20:00:39,559] A new study created in RDB with name: binary_classification_CatBoost
[I 2024-05-20 20:02:29,765] Trial 0 finished with value: 0.7435800372933913 and parameters: {'iterations': 1902, 'colsample_bylevel': 0.11598343179935988, 'depth': 7, 'bootstrap_type': 'Bernoulli', 'l2_leaf_reg': 2.1452883176380775, 'subsample': 0.7942951965535842, 'scale_pos_weight': 1.1771482485008467, 'random_strength': 0.7164515694158969, 'learning_rate': 0.08052790330550415, 'min_data_in_leaf': 10}. Best is trial 0 with value: 0.7435800372933913.
[I 2024-05-20 20:03:26,041] Trial 1 finished with value: 0.7535815123544379 and parameters: {'iterations': 1162, 'colsample_bylevel': 0.03187153304782919, 'depth': 6, 'bootstrap_type': 'MVS', 'l2_leaf_reg': 6.533894992945944, 'subsample': 0.4816035095503931, 'scale_pos_weight': 23.588556440638424, 'random_strength': 1.4665351469715433, 'learning_rate': 0.07967307480418344, 'min_data_in_leaf': 11}. Best is trial 1 with value: 0.7535815123544379

In [7]:
print(f"best optimized roc_auc: {study.best_value:0.5f}")

params = study.best_params
params['cat_features'] = cat_col 
params['boosting_type'] = "Plain"

params

best optimized roc_auc: 0.77541


{'iterations': 1582,
 'colsample_bylevel': 0.0539994952777691,
 'depth': 7,
 'bootstrap_type': 'MVS',
 'l2_leaf_reg': 5.520007203112511,
 'subsample': 0.5881291485538114,
 'scale_pos_weight': 0.10780172805153221,
 'random_strength': 0.17471128604587313,
 'learning_rate': 0.08394542561551735,
 'min_data_in_leaf': 9,
 'cat_features': ['employee_count_nm',
  'most_time_of_day__agg_user',
  'most_season_of_year__agg_user'],
 'boosting_type': 'Plain'}

In [8]:
save_params(params, 'cat')

In [9]:
model = CatBoostClassifier(**params)
model.fit(X_train, y_train)

0:	learn: 0.5216372	total: 52.7ms	remaining: 1m 23s
1:	learn: 0.3928998	total: 105ms	remaining: 1m 23s
2:	learn: 0.3024597	total: 152ms	remaining: 1m 19s
3:	learn: 0.2374300	total: 203ms	remaining: 1m 20s
4:	learn: 0.1907279	total: 249ms	remaining: 1m 18s
5:	learn: 0.1551324	total: 303ms	remaining: 1m 19s
6:	learn: 0.1307722	total: 349ms	remaining: 1m 18s
7:	learn: 0.1129227	total: 398ms	remaining: 1m 18s
8:	learn: 0.0987300	total: 446ms	remaining: 1m 17s
9:	learn: 0.0878455	total: 495ms	remaining: 1m 17s
10:	learn: 0.0799375	total: 557ms	remaining: 1m 19s
11:	learn: 0.0740976	total: 613ms	remaining: 1m 20s
12:	learn: 0.0693632	total: 665ms	remaining: 1m 20s
13:	learn: 0.0656100	total: 710ms	remaining: 1m 19s
14:	learn: 0.0628370	total: 754ms	remaining: 1m 18s
15:	learn: 0.0604669	total: 801ms	remaining: 1m 18s
16:	learn: 0.0585434	total: 853ms	remaining: 1m 18s
17:	learn: 0.0570386	total: 909ms	remaining: 1m 18s
18:	learn: 0.0557540	total: 956ms	remaining: 1m 18s
19:	learn: 0.0547580	

<catboost.core.CatBoostClassifier at 0x1b8497ab110>

In [10]:
print(f'Concordance Index: {concordance_index(y_test, model.predict_proba(X_test)[:, 1])}')

Concordance Index: 0.7723326647458807


In [11]:
print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     11658
           1       0.82      0.05      0.09      1070

    accuracy                           0.92     12728
   macro avg       0.87      0.52      0.52     12728
weighted avg       0.91      0.92      0.88     12728
