In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [34]:
dataset_prepared = np.load('dataset_train_prepared.npy')
dataset_prepared.shape

dataset_X = dataset_prepared[:, :257]
dataset_y = dataset_prepared[:, 257:258].ravel()

seed = 52

X_train, X_test, y_train, y_test = train_test_split(dataset_X, dataset_y, test_size=0.1, random_state=seed)

In [35]:
rf_model = RandomForestClassifier(n_estimators=500,
                                  criterion='entropy', 
                                  max_features=34,
                                  n_jobs=-1, 
                                  bootstrap=True,
                                  random_state=seed)
rf_model.fit(X_train, y_train)

In [39]:
rf_y_pred = rf_model.predict(X_test)
print('accuracy_score:', round(accuracy_score(y_test, rf_y_pred), 3))

accuracy_score: 0.947


In [37]:
from catboost import CatBoostClassifier, Pool

cb_dataset_train = Pool(data=X_train, label=y_train)
cb_dataset_test = Pool(data=X_test, label=y_test)

cb_model = CatBoostClassifier(iterations=500, learning_rate=0.03, grow_policy='Depthwise', random_seed=seed)

cb_model.fit(
    X=cb_dataset_train,
    eval_set=cb_dataset_test,
    use_best_model=True,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6450438	test: 0.6448289	best: 0.6448289 (0)	total: 1.31s	remaining: 10m 54s
1:	learn: 0.6044951	test: 0.6040421	best: 0.6040421 (1)	total: 2.4s	remaining: 9m 57s
2:	learn: 0.5627054	test: 0.5619021	best: 0.5619021 (2)	total: 3.5s	remaining: 9m 40s
3:	learn: 0.5246858	test: 0.5235714	best: 0.5235714 (3)	total: 4.64s	remaining: 9m 35s
4:	learn: 0.4950466	test: 0.4936826	best: 0.4936826 (4)	total: 5.8s	remaining: 9m 34s
5:	learn: 0.4701777	test: 0.4685753	best: 0.4685753 (5)	total: 6.87s	remaining: 9m 25s
6:	learn: 0.4412751	test: 0.4394938	best: 0.4394938 (6)	total: 7.95s	remaining: 9m 20s
7:	learn: 0.4164450	test: 0.4145637	best: 0.4145637 (7)	total: 9.07s	remaining: 9m 18s
8:	learn: 0.3931812	test: 0.3911824	best: 0.3911824 (8)	total: 10.2s	remaining: 9m 18s
9:	learn: 0.3758723	test: 0.3738491	best: 0.3738491 (9)	total: 11.4s	remaining: 9m 19s
10:	learn: 0.3556310	test: 0.3535816	best: 0.3535816 (10)	total: 12.6s	remaining: 9m 19s
11:	learn: 0.3374542	test: 0.3353538	best: 

<catboost.core.CatBoostClassifier at 0x2769019ca10>

In [40]:
y_pred = cb_model.predict(Pool(data=X_test))
print('accuracy_score:', round(accuracy_score(y_test, y_pred), 3))

accuracy_score: 0.948


: 

In [28]:
import pickle
# 0.872
pickle.dump(rf_model, open('models/rf_model.pkl', 'wb'), protocol=4)
pickle.dump(cb_model, open('models/cb_model.pkl', 'wb'), protocol=4)