In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from utils import find_optimal_hyperparameters, fit_and_evaluate, load_model_from_numpy
from dataloader import load_and_split_data

In [2]:
# Load the preprocessed dataset and produce the train/test partitions used by
# every later cell. test_size=0.2 and random_state=0 are forwarded to the loader
# so the split is repeatable.
# NOTE(review): the two string arguments are positional; presumably the target
# column and one of its class labels — confirm against dataloader.
split = load_and_split_data(
    "data/preprocessed_dataset.csv",
    "increase_stock",
    "low_bike_demand",
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split

In [3]:
# Baseline: a random forest with library-default hyperparameters; only the seed
# is fixed. Its metrics are the reference point for the tuned model.
model = RandomForestClassifier(random_state=0)

# verbose=True makes the helper print the metrics shown below the cell.
baseline_scores = fit_and_evaluate(
    model, X_train, y_train, X_test, y_test, verbose=True
)
acc, precision, recall, f1, roc_auc, cm = baseline_scores

Accuracy: 0.866
Precision: 0.653
Recall: 0.552
F1: 0.598
ROC AUC: 0.914
Confusion Matrix: 
[[245  17]
 [ 26  32]]


In [4]:
# Candidate values for each random-forest hyperparameter explored by the search.
param_space = {
    'bootstrap': [True, False],
    'max_depth': [20, 40, 80, 100, None],
    'max_features': [None, 'sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200],
}

# Fresh, unfitted estimator dedicated to the search. It is passed below instead
# of `model`, so this cell no longer depends on the fitted baseline left behind
# by the previous cell.
rf = RandomForestClassifier(random_state=0)

# Search over param_space with cv=5 and recall as the selection metric, using
# all available cores (n_jobs=-1). The helper also writes the winning
# parameters to output/best_params/rf_best.npy.
# NOTE(review): the search strategy (exhaustive vs. randomized) is decided
# inside utils.find_optimal_hyperparameters — confirm there.
best_params = find_optimal_hyperparameters(
    rf,
    param_space,
    X_train,
    y_train,
    cv=5,
    scoring="recall",
    n_jobs=-1,
    save_dir="output/best_params",
    save_file="rf_best.npy",
)

Best parameters found:  {'bootstrap': True, 'max_depth': 20, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Saving best parameters to 'output/best_params/rf_best.npy'


In [5]:
# Rebuild the classifier from the parameters saved by the search cell.
# NOTE(review): presumably load_model_from_numpy instantiates the given class
# with the stored parameters plus extra_parms — confirm in utils.
model = load_model_from_numpy(
    RandomForestClassifier,
    "output/best_params/rf_best.npy",
    extra_parms={"random_state": 0},
)

# Same train/test partitions as the baseline, so the printed metrics are
# directly comparable.
tuned_scores = fit_and_evaluate(
    model, X_train, y_train, X_test, y_test, verbose=True
)
acc, precision, recall, f1, roc_auc, cm = tuned_scores

Accuracy: 0.875
Precision: 0.673
Recall: 0.603
F1: 0.636
ROC AUC: 0.905
Confusion Matrix: 
[[245  17]
 [ 23  35]]
