In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
dataset_prepared = np.load('dataset_train_prepared.npy')

dataset_X = dataset_prepared[:, :273]
dataset_y = dataset_prepared[:, 273:274].ravel()

seed = 52

X_train, X_test, y_train, y_test = train_test_split(dataset_X, dataset_y, test_size=0.1, random_state=seed)

In [3]:
rf_model = RandomForestClassifier(n_estimators=500,
                                  criterion='entropy', 
                                  max_features=34,
                                  n_jobs=-1, 
                                  bootstrap=True,
                                  random_state=seed)
rf_model.fit(X_train, y_train)

rf_y_pred = rf_model.predict(X_test)
print('accuracy_score:', round(accuracy_score(y_test, rf_y_pred), 3))
print('f1_score:', round(f1_score(y_test, rf_y_pred), 3))

accuracy_score: 0.947
f1_score: 0.872


In [4]:
from catboost import CatBoostClassifier, Pool

cb_dataset_train = Pool(data=X_train, label=y_train)
cb_dataset_test = Pool(data=X_test, label=y_test)

cb_model = CatBoostClassifier(iterations=500, learning_rate=0.03, grow_policy='Depthwise', random_seed=seed)

cb_model.fit(
    X=cb_dataset_train,
    eval_set=cb_dataset_test,
    use_best_model=True,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6538160	test: 0.6537683	best: 0.6537683 (0)	total: 289ms	remaining: 2m 24s
1:	learn: 0.6153445	test: 0.6150965	best: 0.6150965 (1)	total: 456ms	remaining: 1m 53s
2:	learn: 0.5751089	test: 0.5745950	best: 0.5745950 (2)	total: 642ms	remaining: 1m 46s
3:	learn: 0.5368275	test: 0.5361785	best: 0.5361785 (3)	total: 837ms	remaining: 1m 43s
4:	learn: 0.5078125	test: 0.5069459	best: 0.5069459 (4)	total: 1.01s	remaining: 1m 39s
5:	learn: 0.4740279	test: 0.4728936	best: 0.4728936 (5)	total: 1.19s	remaining: 1m 38s
6:	learn: 0.4487070	test: 0.4473995	best: 0.4473995 (6)	total: 1.38s	remaining: 1m 36s
7:	learn: 0.4241321	test: 0.4225577	best: 0.4225577 (7)	total: 1.57s	remaining: 1m 36s
8:	learn: 0.4036798	test: 0.4020935	best: 0.4020935 (8)	total: 1.77s	remaining: 1m 36s
9:	learn: 0.3863286	test: 0.3847627	best: 0.3847627 (9)	total: 1.96s	remaining: 1m 36s
10:	learn: 0.3706767	test: 0.3690315	best: 0.3690315 (10)	total: 2.15s	remaining: 1m 35s
11:	learn: 0.3550765	test: 0.3534076	best

<catboost.core.CatBoostClassifier at 0x249d23bdbd0>

In [5]:
y_pred = cb_model.predict(Pool(data=X_test))
print('accuracy_score:', round(accuracy_score(y_test, y_pred), 3))
print('f1_score:', round(f1_score(y_test, y_pred), 3))

accuracy_score: 0.947
f1_score: 0.872


In [6]:
import pickle
# 0.877
pickle.dump(rf_model, open('models/rf_model.pkl', 'wb'), protocol=4)
pickle.dump(cb_model, open('models/cb_model.pkl', 'wb'), protocol=4)