In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Raise pandas' display limits so frames of up to 500 rows / 500 columns are
# rendered in full instead of being truncated.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
# Read the prepared training array from disk, then show its dimensions as the
# cell output.
dataset_prepared = np.load(file='dataset_train_prepared.npy')
np.shape(dataset_prepared)

In [None]:
# Split the prepared array column-wise: the first 257 columns are used as the
# feature matrix, column 257 is flattened into the 1-D target vector.
dataset_X = dataset_prepared[:, :257]
dataset_y = np.ravel(dataset_prepared[:, 257:258])

# A single seed drives both the train/test split and the forest.
seed = 52

# Hold out 10% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_X,
    dataset_y,
    random_state=seed,
    test_size=0.1,
)

# Forest hyper-parameters, collected in one place.
rf_params = dict(
    n_estimators=500,
    criterion='entropy',
    max_features=34,
    bootstrap=True,
    n_jobs=-1,
    random_state=seed,
)
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Class predictions for both splits (train first, then test); later cells
# reuse them.
rf_y_pred_train, rf_y_pred_test = (
    rf_model.predict(features) for features in (X_train, X_test)
)

# F1 of the forest alone on the held-out rows.
rf_test_f1 = f1_score(y_test, rf_y_pred_test)
print(rf_test_f1)


In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import cross_val_predict

# Stack CatBoost on top of the random forest: the forest's class predictions
# are handed to CatBoost as the per-row `baseline` (the starting value that
# boosting then corrects).
#
# The training baseline has to come from the same distribution as the one used
# at evaluation / inference time, i.e. from a forest that has NOT seen the row
# it is predicting. `rf_model.predict(X_train)` is in-sample and therefore far
# more accurate than the forest is on unseen rows, so the booster would be
# trained on offsets it never meets again. Out-of-fold predictions fix that:
# each training row is predicted by a clone of `rf_model` fitted on the other
# folds. `rf_model` itself is left untouched (cross_val_predict clones it).
rf_y_pred_train_oof = cross_val_predict(
    rf_model, X_train, y_train, cv=5, method='predict'
)

cb_dataset_train = Pool(data=X_train, label=y_train, baseline=rf_y_pred_train_oof)
cb_dataset_test = Pool(data=X_test, label=y_test, baseline=rf_y_pred_test)

cb_model = CatBoostClassifier(iterations=400, learning_rate=0.03, grow_policy='Depthwise', random_seed=seed)

# NOTE(review): the eval set is the same held-out split the F1 below is
# reported on, and use_best_model=True selects the iteration count on it, so
# the printed score is optimistic. Consider a separate validation split.
cb_model.fit(
    X=cb_dataset_train,
    eval_set=cb_dataset_test,
    use_best_model=True,
    plot=True
)

# Predict the way an inference caller would: features plus the forest's
# prediction as baseline, no labels.
y_pred = cb_model.predict(Pool(data=X_test, baseline=rf_y_pred_test))

# F1 of the stacked forest + CatBoost model on the held-out rows.
print(f1_score(y_test, y_pred))

In [None]:
import pickle
from pathlib import Path

# 0.873 (original author's note; presumably the score of the models saved
# here. TODO confirm which metric / model it refers to)

# Persist both fitted models with pickle protocol 4.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError, and use context managers so each file handle is flushed
# and closed even if pickling raises.
models_dir = Path('models')
models_dir.mkdir(parents=True, exist_ok=True)

with open(models_dir / 'rf_model.pkl', 'wb') as rf_file:
    pickle.dump(rf_model, rf_file, protocol=4)

with open(models_dir / 'cb_model.pkl', 'wb') as cb_file:
    pickle.dump(cb_model, cb_file, protocol=4)