In [2]:
import numpy as np
import optuna
import pandas as pd
import pickle
from optuna.integration import CatBoostPruningCallback

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides warnings emitted by the
# libraries used during the hyper-parameter search below; consider a narrower
# filter (by category or module) if those diagnostics are needed.
warnings.filterwarnings("ignore")

In [13]:
def cross_val_score_catboost(
    catboost_config,
    X,
    Y,
    cat_cols,
    pruning_callback,
    n_splits = 10,
    use_folds = 10,
    average = 'macro',
    seed = 42,
):
    """Average F1 of a CatBoost classifier over the leading folds of a stratified split.

    Parameters
    ----------
    catboost_config : dict
        Keyword arguments forwarded verbatim to ``CatBoostClassifier``.
    X, Y
        Features and target; both are indexed positionally via ``.iloc``.
    cat_cols
        Passed as ``cat_features`` to every ``Pool``.
    pruning_callback
        Callback handed to ``fit``; its ``check_pruned()`` is invoked after each
        fold. The same instance is reused for every fold.
    n_splits : int
        Number of folds produced by the shuffled ``StratifiedKFold``.
    use_folds : int
        Evaluation stops once this many folds have been scored (at least one
        fold is always scored).
    average : str
        ``average`` argument of ``f1_score``.
    seed : int
        ``random_state`` of the splitter.

    Returns
    -------
    The ``np.mean`` of the per-fold F1 scores.

    Notes
    -----
    The held-out fold serves both as ``eval_set`` during fitting and as the
    data on which the F1 score is computed.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    fold_scores = []

    # Folds are counted from 1 so the stop condition reads "scored enough folds".
    for fold_no, (train_idx, valid_idx) in enumerate(splitter.split(X, Y), start=1):
        classifier = CatBoostClassifier(**catboost_config)

        y_valid = Y.iloc[valid_idx]
        train_pool = Pool(X.iloc[train_idx], Y.iloc[train_idx], cat_features=cat_cols)
        valid_pool = Pool(X.iloc[valid_idx], y_valid, cat_features=cat_cols)

        classifier.fit(
            train_pool,
            eval_set=valid_pool,
            verbose=False,
            callbacks=[pruning_callback],
        )

        # Let the callback act on a pruning decision taken during this fit
        # (presumably by raising, which aborts the remaining folds — confirm
        # against the installed optuna integration).
        pruning_callback.check_pruned()

        predicted = np.squeeze(classifier.predict(valid_pool))
        fold_scores.append(f1_score(y_valid, predicted, average=average))

        if fold_no >= use_folds:
            break

    return np.mean(fold_scores)

In [14]:
def load_data(data_path='processed_train.csv', cat_features_path='categorical_features.pickle'):
    """Load the training table and the list of categorical feature names.

    Parameters
    ----------
    data_path : str or path-like
        CSV file containing the feature columns together with the ``'ID'`` and
        ``'Статус'`` columns. Defaults to ``'processed_train.csv'``.
    cat_features_path : str or path-like
        Pickle file holding the categorical feature specification. Defaults to
        ``'categorical_features.pickle'``.

    Returns
    -------
    (X, Y, cat_cols)
        ``X`` is the table without the ``'ID'`` and ``'Статус'`` columns,
        ``Y`` is the ``'Статус'`` column, and ``cat_cols`` is whatever object
        was stored in the pickle file.

    Raises
    ------
    FileNotFoundError
        If either file does not exist.
    KeyError
        If the CSV lacks the ``'ID'`` or ``'Статус'`` column.
    """
    data = pd.read_csv(data_path)
    # 'Статус' ("Status") is the target; 'ID' is dropped from the features.
    X, Y = data.drop(columns=['ID', 'Статус']), data['Статус']
    # SECURITY: pickle.load can execute arbitrary code — only point
    # cat_features_path at files produced by a trusted pipeline.
    with open(cat_features_path, 'rb') as f:
        cat_cols = pickle.load(f)
    return X, Y, cat_cols


In [15]:

def objective(trial: optuna.Trial, X, Y, cat_cols) -> float:
    """Optuna objective: cross-validated macro F1 for one sampled CatBoost configuration.

    Only ``depth`` (integer in [3, 7]) and ``learning_rate`` (float in
    [0.01, 0.15]) are sampled from ``trial``; every other setting is fixed.
    The score comes from ``cross_val_score_catboost`` restricted to 5 folds.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used both for sampling and for the pruning callback.
    X, Y, cat_cols
        Forwarded unchanged to ``cross_val_score_catboost``.

    Returns
    -------
    float
        The value returned by ``cross_val_score_catboost``.
    """
    # Same metric string drives CatBoost's eval metric and the pruning callback.
    metric_name = 'TotalF1:average=Macro'

    # Sampling order (depth first, then learning_rate) is kept as-is.
    sampled_depth = trial.suggest_int("depth", 3, 7)
    sampled_lr = trial.suggest_float("learning_rate", 0.01, 0.15)

    # Earlier experiments also searched over rsm, objective, l2_leaf_reg
    # ("reg_lambda"), boosting_type and bootstrap_type (with
    # bagging_temperature / subsample); here rsm is pinned to 0.5 and the
    # others are simply not set.
    catboost_config = dict(
        use_best_model=True,
        early_stopping_rounds=300,
        eval_metric=metric_name,
        random_seed=14121995,
        max_ctr_complexity=0,
        iterations=1500,
        rsm=0.5,
        depth=sampled_depth,
        learning_rate=sampled_lr,
    )

    pruner_hook = CatBoostPruningCallback(trial, metric_name)
    return cross_val_score_catboost(
        catboost_config, X, Y, cat_cols, pruner_hook, use_folds = 5
    )


In [16]:
# Read the training data once, up front. Calling load_data() inside the
# objective lambda would re-read the CSV and the pickle on every single trial.
X, Y, cat_cols = load_data()

# Maximise the cross-validated score; a median pruner (20 start-up trials,
# 50 warm-up steps) is attached to the study.
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_startup_trials = 20, n_warmup_steps=50),
    direction="maximize",
)
study.optimize(lambda trial: objective(trial, X, Y, cat_cols), n_trials=500)

[32m[I 2022-09-17 03:11:22,369][0m A new study created in memory with name: no-name-24b466e0-019d-4871-8aa8-2a26657d44b6[0m
[32m[I 2022-09-17 03:13:00,534][0m Trial 0 finished with value: 0.7843979556551525 and parameters: {'depth': 6, 'learning_rate': 0.09641663725977909}. Best is trial 0 with value: 0.7843979556551525.[0m
[32m[I 2022-09-17 03:15:48,691][0m Trial 1 finished with value: 0.7925352014639795 and parameters: {'depth': 7, 'learning_rate': 0.13907458506242387}. Best is trial 1 with value: 0.7925352014639795.[0m
[32m[I 2022-09-17 03:18:36,922][0m Trial 2 finished with value: 0.7845759615683551 and parameters: {'depth': 6, 'learning_rate': 0.03830989456160043}. Best is trial 1 with value: 0.7925352014639795.[0m
[32m[I 2022-09-17 03:20:18,077][0m Trial 3 finished with value: 0.786569056250689 and parameters: {'depth': 4, 'learning_rate': 0.08074281932281033}. Best is trial 1 with value: 0.7925352014639795.[0m
[32m[I 2022-09-17 03:23:04,525][0m Trial 4 finished 

In [18]:
# Display the parameter set of the best trial found by the study.
study.best_params

{'depth': 7, 'learning_rate': 0.13504974389679972}

In [19]:
# Export every trial of the study as a DataFrame for inspection.
experiments = study.trials_dataframe()

In [20]:
# Keep only trials whose state is not 'PRUNED'.
# NOTE(review): this rebinds `experiments`, so the unfiltered frame is no
# longer available without re-running the previous cell.
experiments = experiments.loc[experiments['state'] != 'PRUNED']

In [24]:
# Columns shown in the summary view: objective value plus the two tuned parameters.
cols = ['value', 'params_depth',  'params_learning_rate']

In [28]:
# Sort by score, best first, and show the last 30 rows — i.e. the 30
# lowest-scoring non-pruned trials.
experiments[cols].sort_values('value', ascending = False).tail(30)

Unnamed: 0,value,params_depth,params_learning_rate
206,0.789666,7,0.136399
110,0.789648,7,0.121349
132,0.789434,7,0.140359
66,0.789341,7,0.138087
133,0.789165,7,0.120276
34,0.789064,7,0.102151
292,0.788975,7,0.115553
54,0.788892,7,0.13068
53,0.788651,7,0.124579
146,0.788631,7,0.125025


In [71]:
# NOTE(review): the recorded output of this cell lists parameters (iterations,
# rsm, reg_lambda, boosting_type, bootstrap_type, subsample) that the objective
# defined in this notebook does not sample, and its execution count is out of
# sequence — presumably it belongs to an earlier search space. Re-run after
# "Restart & Run All" to refresh it.
study.best_params

{'iterations': 500,
 'rsm': 0.4,
 'depth': 7,
 'reg_lambda': 25.3399242267677,
 'boosting_type': 'Ordered',
 'bootstrap_type': 'Bernoulli',
 'subsample': 0.7178700936638458}