# Example of Catboost usage

Adapted from https://github.com/catboost/tutorials/blob/master/python_tutorial.ipynb 

In [1]:
'''
always enable this extension everytime working with catboost
'''
!jupyter nbextension enable --py widgetsnbextension 

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier, Pool, metrics, cv

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

%matplotlib ipympl

In [3]:
df = pd.read_csv('../data/titanic.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
'''
null value stats
'''
null_value_stats = df.isnull().sum(axis=0)
null_value_stats

Unnamed: 0        0
PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [5]:
null_value_stats[null_value_stats != 0]

Survived     418
Age          263
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [6]:
'''
train test split
'''
X = df.drop('Survived', axis='columns')
y = df.Survived
X.fillna(-999, inplace=True)
y.fillna(0, inplace=True)
X_train, X_validation, y_train, y_validation = train_test_split(X, y, 
                                                                train_size=0.75,
                                                                random_state=42)

In [7]:
'''
list categorical features (catboost needs the indexes instead col names)
'''
#cat_features_indices = np.where((X.dtypes != float) & (X.dtypes != int))[0] #we dont use this line because sometimes categorical function is encoded in int
cat_features_indices = np.where(X.dtypes != float)[0]
cat_features_indices

array([ 0,  1,  2,  3,  4,  6,  7,  8, 10, 11])

***

### CatBoost basics

In [9]:
params = {'custom_loss': [metrics.Accuracy()], 
          'random_seed': 42, 
          'logging_level': 'Silent'}
model = CatBoostClassifier(**params)

In [10]:
'''
Important: Pool is a function in catboost that used as dataset processing and can also work as wrapper to simplify the code
https://catboost.ai/en/docs/concepts/python-reference_pool
'''
train_pool = Pool(X_train, y_train, 
                  cat_features=cat_features_indices)
validation_pool = Pool(X_validation, y_validation,
                       cat_features=cat_features_indices)
model.fit(train_pool,
          eval_set=validation_pool, 
          plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f85f8878ca0>

In [11]:
model.best_score_

{'learn': {'Accuracy': 0.9653414882772681, 'Logloss': 0.15794144834360274},
 'validation': {'Accuracy': 0.8109756097560976, 'Logloss': 0.4474072552809534}}

In [12]:
model.best_iteration_

371

In [13]:
'''
model crossvalidation
'''
cv_params = model.get_params()
cv_params.update({
    'loss_function': metrics.Logloss()
})

'''
Important: Pool is a function in catboost that used as dataset processing and can also work as wrapper to simplify the code
https://catboost.ai/en/docs/concepts/python-reference_pool
'''
cv_pool = Pool(X, y, cat_features=cat_features_indices)
cv_data = cv(cv_pool, cv_params, nfold=3, shuffle=True, return_models=True, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [14]:
'''
print scores
'''
print(f"best validation score: {np.max(cv_data[0]['test-Accuracy-mean']):.3f} \
 with standard deviation: {np.max(cv_data[0]['test-Accuracy-std'][np.argmax(cv_data[0]['test-Accuracy-mean'])]):.3f} \
 on step: {np.argmax(cv_data[0]['test-Accuracy-mean'])}")

best validation score: 0.811  with standard deviation: 0.009  on step: 171


***

### Model applying

In [15]:
predictions = model.predict(X_validation)
predictions_probs = model.predict_proba(X_validation)
print(predictions[:10])
print(predictions_probs[:10])

[0. 0. 0. 0. 1. 0. 1. 1. 0. 0.]
[[0.97965556 0.02034444]
 [0.69622263 0.30377737]
 [0.89913763 0.10086237]
 [0.87501596 0.12498404]
 [0.4502423  0.5497577 ]
 [0.58115197 0.41884803]
 [0.4502423  0.5497577 ]
 [0.33616123 0.66383877]
 [0.69314912 0.30685088]
 [0.73590357 0.26409643]]


***

### CatBoost features

In [16]:
params = {'iterations': 100, 
          'learning_rate': 0.1, 
          'eval_metric': metrics.Accuracy(),
          'logging_level': 'Silent', 
          'use_best_model': False}

'''
Important: Pool is a function in catboost that used as dataset processing and can also work as wrapper to simplify the code
https://catboost.ai/en/docs/concepts/python-reference_pool
'''
train_pool = Pool(X_train, y_train, cat_features=cat_features_indices)
validation_pool = Pool(X_validation, y_validation, cat_features=cat_features_indices)

#### Use random seed on fit function

In [17]:
model_without_seed = CatBoostClassifier(**params)
model_without_seed.fit(train_pool, eval_set=validation_pool)
print(f'model without seed best score: {model_without_seed.best_score_["learn"]["Accuracy"]:.3f} with steps: {model_without_seed.best_iteration_}')

model without seed best score: 0.846 with steps: 80


In [18]:
random_seed_params = params.copy()
random_seed_params.update({'random_seed': 42})

model_with_seed = CatBoostClassifier(**random_seed_params)
model_with_seed.fit(train_pool, eval_set=validation_pool)
print(f'model with seed 42 best score: {model_with_seed.best_score_["learn"]["Accuracy"]:.3f} with steps: {model_with_seed.best_iteration_}')

model with seed 42 best score: 0.844 with steps: 86


#### Train model and compare between simple model and best model

In [19]:
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validation_pool)
print(f'Simple model validation: {accuracy_score(y_validation, model.predict(X_validation)):.3f}')

Simple model validation: 0.802


In [20]:
best_model_params = params.copy()
best_model_params.update({'use_best_model': True})

best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validation_pool)
print(f'Best model validation: {accuracy_score(y_validation, best_model.predict(X_validation)):.3f}')

Best model validation: 0.808


#### Train model and compare between simple model and early stoping model

In [21]:
%time
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validation_pool)
print(f'Simple model validation: {accuracy_score(y_validation, model.predict(X_validation)):.3f}')

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10 µs
Simple model validation: 0.802


In [22]:
%time
early_stoping_params = params.copy()
early_stoping_params.update({
    'od_type': 'Iter',
    'od_wait': 40})

early_stoping_model = CatBoostClassifier(**early_stoping_params)
early_stoping_model.fit(train_pool, eval_set=validation_pool)
print(f'Early stoping model validation: {accuracy_score(y_validation, early_stoping_model.predict(X_validation)):.3f}')

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10.3 µs
Early stoping model validation: 0.802


#### Snapshot support

In [23]:
'''
Catboost supports snapshots. 
You can use it for recovering training after an interruption or for starting training with previous results.
'''
model_with_snapshot = CatBoostClassifier(**params)
model_with_snapshot.fit(train_pool, 
                        eval_set=validation_pool, 
                        save_snapshot=True)

<catboost.core.CatBoostClassifier at 0x7f85f7290760>

***

### Feature importances

In [24]:
model = CatBoostClassifier(**params)
model.fit(train_pool, 
          eval_set=validation_pool)

feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns

for score, col_name in sorted(zip(feature_importances, feature_names), reverse=True):
    print(f'{col_name}: {score}')

Sex: 35.68326654804635
Unnamed: 0: 18.662296111508656
Pclass: 9.077638681491615
Age: 7.822145566688731
Cabin: 6.2542593000164315
Ticket: 6.102122076839929
Parch: 4.81928912498079
Fare: 4.339803117116453
Embarked: 4.215511956697817
SibSp: 3.023667516613263
PassengerId: 0.0
Name: 0.0


***

### Eval metrics

In [25]:
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validation_pool)

'''
We are using two metrices, please be aware that there is tab
to switch between two
'''
eval_metrics = model.eval_metrics(validation_pool, [metrics.AUC(), metrics.Accuracy()], plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [26]:
print(eval_metrics['AUC'][:6])
print(eval_metrics['Accuracy'][:6])

[0.7874613566102928, 0.7864384433533369, 0.7949399890889253, 0.7960992907801419, 0.799827241316603, 0.797372249499909]
[0.774390243902439, 0.774390243902439, 0.774390243902439, 0.774390243902439, 0.7469512195121951, 0.7439024390243902]


***

### Learning processes comparison

In [27]:
depth1_params = params.copy()
depth1_params.update({
    'depth': 1,
    'train_dir': 'model_depth1/', 
    'logging_level': 'Silent'})
model_depth1 = CatBoostClassifier(**depth1_params).fit(train_pool, eval_set=validation_pool)
depth5_params = params.copy()
depth5_params.update({
    'depth': 5,
    'train_dir': 'model_depth5/', 
    'logging_level': 'Silent'})
model_depth5 = CatBoostClassifier(**depth5_params).fit(train_pool, eval_set=validation_pool)

In [28]:
from catboost import MetricVisualizer
widget = MetricVisualizer(['model_depth1', 'model_depth5'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

***

### Model saving

In [30]:
'''
saving model
'''
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
model.save_model('./catboost_model.dump')
'''
load model
'''
model = CatBoostClassifier()
model.load_model('./catboost_model.dump');