In [1]:
import pandas as pd 
import numpy as np 

# Classifiers
from catboost import CatBoostClassifier

# Model selection
from sklearn.model_selection import StratifiedKFold, PredefinedSplit

# Metrics
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import make_scorer


In [2]:
tr_df = pd.read_csv('../../../data/feature/cb_train.csv')

val_df = pd.read_csv('../../../data/feature/cb_validation.csv').sample(frac=1)
cv_size = val_df.shape[0]//2
full_stack = [tr_df, val_df.iloc[:cv_size, :]]
full_df = pd.concat(full_stack)

ind_list = [-1,]*tr_df.shape[0]+[0,]*cv_size
prd = PredefinedSplit(ind_list)

ts_df = val_df.iloc[cv_size:, :]

In [3]:
model = CatBoostClassifier()

In [4]:
grid = {'learning_rate': [0.01, 0.05, 0.1],
        'depth': [4, 10, 12],
        'l2_leaf_reg': [ 3, 7, 12],
        }

In [None]:
grid_search_result = model.grid_search(grid, 
                                       X=full_df.iloc[:, :-1], 
                                       y=full_df.iloc[:,-1], 
                                       plot=True)

In [4]:
model.fit(tr_df.iloc[:, :-1], tr_df.iloc[:, -1], eval_set=(val_df.iloc[:cv_size, :-1],val_df.iloc[:cv_size, -1]), plot=False)

emaining: 7.24s
545:	learn: 0.5473044	test: 1.1175668	best: 0.9670742 (6)	total: 8.69s	remaining: 7.22s
546:	learn: 0.5471958	test: 1.1177052	best: 0.9670742 (6)	total: 8.7s	remaining: 7.21s
547:	learn: 0.5469984	test: 1.1179372	best: 0.9670742 (6)	total: 8.71s	remaining: 7.19s
548:	learn: 0.5467781	test: 1.1180550	best: 0.9670742 (6)	total: 8.73s	remaining: 7.17s
549:	learn: 0.5466137	test: 1.1181215	best: 0.9670742 (6)	total: 8.75s	remaining: 7.16s
550:	learn: 0.5464633	test: 1.1182070	best: 0.9670742 (6)	total: 8.76s	remaining: 7.14s
551:	learn: 0.5463257	test: 1.1181839	best: 0.9670742 (6)	total: 8.77s	remaining: 7.12s
552:	learn: 0.5462097	test: 1.1182308	best: 0.9670742 (6)	total: 8.79s	remaining: 7.1s
553:	learn: 0.5460329	test: 1.1182196	best: 0.9670742 (6)	total: 8.8s	remaining: 7.09s
554:	learn: 0.5458838	test: 1.1186686	best: 0.9670742 (6)	total: 8.81s	remaining: 7.07s
555:	learn: 0.5456925	test: 1.1184381	best: 0.9670742 (6)	total: 8.83s	remaining: 7.05s
556:	learn: 0.54551

<catboost.core.CatBoostClassifier at 0x25a54e1f1c8>

In [6]:
y_pred = model.predict(ts_df.iloc[:, :-1])
print(f1_score(ts_df.iloc[:, -1], y_pred,average='macro'))

0.24924104432301153
