# CatBoost Optuna

## Load data

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [3]:
cat_cols = [x for x in train.columns if x.startswith('cat')]
cat_cols

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18']

In [4]:
from categorical_transform import CategoricalTransform,IntegerCategoricalTransform
ct = IntegerCategoricalTransform(cat_cols)
x_train = ct.fit_transform(train)
x_test = ct.transform(test)

In [5]:
x_train = train.drop(columns=['id','target'])
y_train = train['target']

In [6]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True)

# Optuna optimization

In [7]:
from sklearn.metrics import roc_auc_score
import optuna
import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

def objective(trial):
    params = {'iterations':10000,
              'depth': trial.suggest_int("depth", 4, 16),
              'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 0.0001, 25, log=True),
              'bagging_temperature': trial.suggest_float("bagging_temperature", 0, 10),
              'auto_class_weights':trial.suggest_categorical('auto_class_weights', [None,'Balanced','SqrtBalanced']),
              'grow_policy': 'Lossguide',
              'early_stopping_rounds':200,
              'eval_metric':'AUC',
              'bootstrap_type':'Bayesian',
              'use_best_model':True,
              'task_type':'GPU', 
              'cat_features':cat_cols,
              'verbose':False,
              'border_count':254              
             }
    #'grow_policy': trial.suggest_categorical('grow_policy',['SymmetricTree','Depthwise','Lossguide']),              
    #if params['grow_policy'] in ['Depthwise','Lossguide']:
    #    params['min_data_in_leaf'] = trial.suggest_int("min_data_in_leaf", 1, 5000, log=True)
    #if params['grow_policy'] in ['Lossguide']:
    #    params['max_leaves'] = trial.suggest_int("max_leaves", 1, 64)
    
    cbc = CatBoostClassifier(**params)
    kf = KFold(n_splits=5, shuffle=True)
    roc_test = []
    for train_index, test_index in kf.split(x_train):
        x_train_fold, x_test_fold = x_train.loc[train_index], x_train.loc[test_index]
        y_train_fold, y_test_fold = y_train.loc[train_index], y_train.loc[test_index]
        cbc.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold, y_test_fold))    
        proba = cbc.predict_proba(x_test_fold)[:,1]
        roc_test.append(roc_auc_score(y_test_fold, proba))
    return np.mean(roc_test)

In [8]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=5*60*60)
print(study.best_trial)

[32m[I 2021-03-28 14:38:22,323][0m A new study created in memory with name: no-name-60d31029-b42f-4829-a976-f406ed136f56[0m
[32m[I 2021-03-28 15:13:49,449][0m Trial 0 finished with value: 0.8915953788292288 and parameters: {'depth': 5, 'l2_leaf_reg': 0.48417598394214867, 'bagging_temperature': 4.4242383805599275, 'auto_class_weights': None}. Best is trial 0 with value: 0.8915953788292288.[0m
[32m[I 2021-03-28 15:52:31,828][0m Trial 1 finished with value: 0.8936609956494618 and parameters: {'depth': 14, 'l2_leaf_reg': 4.319785912176077, 'bagging_temperature': 5.909967489974926, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8936609956494618.[0m
[32m[I 2021-03-28 16:31:23,616][0m Trial 2 finished with value: 0.8941469743841314 and parameters: {'depth': 15, 'l2_leaf_reg': 0.0046887635296032305, 'bagging_temperature': 8.287477579452695, 'auto_class_weights': 'Balanced'}. Best is trial 2 with value: 0.8941469743841314.[0m
[32m[I 2021-03-28 17:00:57,521][0

FrozenTrial(number=2, values=[0.8941469743841314], datetime_start=datetime.datetime(2021, 3, 28, 15, 52, 31, 829255), datetime_complete=datetime.datetime(2021, 3, 28, 16, 31, 23, 616020), params={'depth': 15, 'l2_leaf_reg': 0.0046887635296032305, 'bagging_temperature': 8.287477579452695, 'auto_class_weights': 'Balanced'}, distributions={'depth': IntUniformDistribution(high=16, low=4, step=1), 'l2_leaf_reg': LogUniformDistribution(high=25, low=0.0001), 'bagging_temperature': UniformDistribution(high=10, low=0), 'auto_class_weights': CategoricalDistribution(choices=(None, 'Balanced', 'SqrtBalanced'))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=2, state=TrialState.COMPLETE, value=None)


In [9]:
study.best_params

{'depth': 15,
 'l2_leaf_reg': 0.0046887635296032305,
 'bagging_temperature': 8.287477579452695,
 'auto_class_weights': 'Balanced'}

In [10]:
study.best_value

0.8941469743841314

In [11]:
len(study.trials)

9

In [12]:
from optuna.visualization import plot_optimization_history, plot_param_importances
plot_optimization_history(study)

In [13]:
plot_param_importances(study)