# Example of Catboost using HyperOpt as hyperparameter tuning

Example taken from https://github.com/catboost/tutorials/blob/master/python_tutorial.ipynb

In [1]:
'''
always enable this when using catboost
'''
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier, Pool, metrics, cv

import hyperopt

import pandas as pd
import numpy as np
from numpy.random import RandomState

import warnings
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [3]:
df = pd.read_csv('../data/titanic.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
'''
train test split
'''
X = df.drop('Survived', axis='columns')
y = df.Survived
X.fillna(-999, inplace=True)
y.fillna(0, inplace=True)
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.75,
                                                                random_state=42)

In [5]:
'''
list categorical features (catboost needs the indexes instead col names)
'''
#cat_features_indices = np.where((X.dtypes != float) & (X.dtypes != int))[0] #we dont use this because some categorical features are encoded in int
cat_features_indices = np.where(X.dtypes != float)[0]
cat_features_indices

array([ 0,  1,  2,  3,  4,  6,  7,  8, 10, 11])

In [6]:
train_pool = Pool(X_train, y_train, cat_features=cat_features_indices)
validation_pool = Pool(X_validation, y_validation, cat_features=cat_features_indices)

In [7]:
params = {'custom_loss': [metrics.Accuracy()], 
          'random_seed': 42, 
          'logging_level': 'Silent'}

model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validation_pool)

<catboost.core.CatBoostClassifier at 0x7f0aa8f928b0>

***

## Hyperparameter tuning

In [8]:
'''
set hyperopt objective, please pay attention to params and params_space
'''
def hyperopt_objective(params_space):
    '''
    whithin this function where we define our configuration/parameters
    '''
    params = {
        'l2_leaf_reg': int(params_space['l2_leaf_reg']),
        'learning_rate': params_space['learning_rate'],
        'iterations': 500,
        'eval_metric':  metrics.Accuracy(),
        'random_seed': 42,
        'verbose': False, 
        'loss_function': metrics.Logloss()}
 
    train_pool = Pool(X_train, y_train, cat_features=cat_features_indices)
    validation_pool = Pool(X_validation, y_validation, cat_features=cat_features_indices)
    
    model = CatBoostClassifier(**params)
    model.fit(train_pool, eval_set=validation_pool)
    
    '''
    here we use cv because purpose of tuning is cross validation
    
    cv reference https://catboost.ai/en/docs/concepts/python-reference_cv
    
    we can also add nfold as arguments for kfold cv
    and also have plot=True
    '''
    #cv_data = cv(train_pool, nfold=3, model.get_params(), logging_level='Silent', plot=True) #if we want to have nfold cv and plot
    cv_data = cv(train_pool, model.get_params(), logging_level='Silent')
    
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])
    
    return 1 - best_accuracy #as hyperopt minimises

### Hyperparameter tuning process

In [10]:
'''
set params space for hyperparamters
'''
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1)}

'''
trials for tuning
'''
trials = hyperopt.Trials()

'''
retrieve the best parameters
'''
hyperopt_params = {
    'fn': hyperopt_objective,
    'space': params_space,
    'algo': hyperopt.tpe.suggest,
    'max_evals': 50,
    'trials': trials}
best_parameters = hyperopt.fmin(**hyperopt_params)

100%|████████| 50/50 [12:24<00:00, 14.89s/trial, best loss: 0.18959121635174248]


In [11]:
print(f'best parameters: {best_parameters}')

best parameters: {'l2_leaf_reg': 1.0, 'learning_rate': 0.22152507179801384}


### Re-train using best parameters

In [59]:
'''
re-train model and do cross validation using best parameters get from hyperparamters trials
'''
train_pool = Pool(X_train, y_train, cat_features=cat_features_indices)

best_hyperparamters_params = {
        'l2_leaf_reg': int(best_parameters['l2_leaf_reg']),
        'learning_rate': best_parameters['learning_rate'],
        'iterations': 500,
        'eval_metric':  metrics.Accuracy(),
        'random_seed': 42,
        'verbose': False,
        'loss_function': metrics.Logloss()}

model_best_parameters = CatBoostClassifier(**best_hyperparamters_params)
model_best_parameters.fit(train_pool, eval_set=validation_pool)

cv_data = cv(train_pool, model_best_parameters.get_params(), nfold=3, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]

bestTest = 0.8048780488
bestIteration = 15

Training on fold [1/3]

bestTest = 0.8287461774
bestIteration = 49

Training on fold [2/3]

bestTest = 0.8067484663
bestIteration = 49



In [62]:
print(f'Precise validation accuracy score: {np.max(cv_data["test-Accuracy-mean"]):.3f}')

Precise validation accuracy score: 0.810
