In [None]:
from sklearn.ensemble import RandomForestClassifier
from nb_utils import load_and_split_data, find_optimal_hyperparameters, load_model_from_json, fit_and_evaluate

### Load and split the dataset

In [2]:
X_train, X_test, y_train, y_test = load_and_split_data("data/training_data_preprocessed.csv",
                                                       target_column='increase_stock',
                                                       class_zero='low_bike_demand',
                                                       test_size=0.2,
                                                       random_state=0)

### Load, fit and evaluate the initial model

In [3]:
model = RandomForestClassifier(random_state=0)
results = fit_and_evaluate(model,
                           X_train,
                           y_train,
                           X_test,
                           y_test,
                           verbose=True)

Evaluating RandomForestClassifier
Accuracy: 0.8750
Precision: 0.6875
Recall: 0.5690
F1: 0.6226
ROC AUC: 0.9148
Confusion Matrix: 
[[247  15]
 [ 25  33]]



### Find optimal hyperparameters

In [None]:
param_space = {'bootstrap': [True, False],
               'max_depth': [20, 40, 80, 100, None],
               'max_features': [None, 'sqrt'],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [50, 100, 200]
               }

best_params = find_optimal_hyperparameters(RandomForestClassifier,
                                           param_space,
                                           X_train,
                                           y_train,
                                           cv=5,
                                           scoring='accuracy',
                                           save_dir="output/best_params",
                                           save_file="rf_best_params.json",
                                           extra_args={"random_state": 0})

Best parameters found:  {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Saving best parameters to 'output/best_params/rf_best.json'


### Use optimal hyperparameters to train and evaluate

In [None]:
opt_model = load_model_from_json(RandomForestClassifier, "output/best_params/rf_best_params.json")

opt_results = fit_and_evaluate(model,
                               X_train,
                               y_train,
                               X_test,
                               y_test,
                               verbose=True)

Evaluating RandomForestClassifier
Accuracy: 0.8750
Precision: 0.6875
Recall: 0.5690
F1: 0.6226
ROC AUC: 0.9148
Confusion Matrix: 
[[247  15]
 [ 25  33]]

