In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [50]:
df = pd.read_csv("data/preprocessed_dataset.csv")
df['increase_stock'] = np.where(df['increase_stock'] == 'low_bike_demand', 0, 1)

df.head(5)

Unnamed: 0,hour_of_day,day_of_week,month,weekday,summertime,temp,dew,humidity,snowdepth,windspeed,cloudcover,visibility,increase_stock,day,rain
0,5,5,1,0,0,-7.2,-15.0,53.68,0,16.3,31.6,16.0,0,0,0
1,21,4,1,1,0,-1.3,-12.8,40.97,0,23.9,85.7,16.0,0,0,0
2,21,3,8,1,1,26.9,21.8,73.39,0,0.0,81.1,16.0,0,0,0
3,1,6,1,0,0,3.1,-4.0,59.74,0,19.2,0.0,16.0,0,0,0
4,17,0,3,1,0,11.7,-11.4,18.71,0,10.5,44.6,16.0,0,1,0


In [51]:
X = df.copy()
y = X.pop("increase_stock")

print(X.shape)
print(y.shape)

(1600, 14)
(1600,)


In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [53]:
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)


threshold = 0.5
y_pred_proba = model.predict_proba(X_test)
y_pred = (y_pred_proba[:, 1] >= threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred) 
recall = recall_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Random Forest \n")
print(f"Accuracy: {round(accuracy, 4)}")
print(f'Precision: {round(precision, 4)}')
print(f'Recall: {round(recall, 4)}')
print(f'Confusion Matrix: \n{conf_matrix}')

Random Forest 

Accuracy: 0.8656
Precision: 0.6471
Recall: 0.569
Confusion Matrix: 
[[244  18]
 [ 25  33]]


In [None]:
param_space = {'bootstrap': [True, False],
               'max_depth': [20, 40, 80, 100, None],
               'max_features': [None, 'sqrt'],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [50, 100, 200]}

rf = RandomForestClassifier(random_state=0)

rf_grid_search = GridSearchCV(estimator = rf,
                               param_grid = param_space,
                               cv = 5,
                               verbose=2,
                               scoring="recall")

# Fit the random search model
rf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total ti

In [77]:
print(rf_grid_search.best_params_)

{'bootstrap': True, 'max_depth': 20, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}


In [88]:
model = RandomForestClassifier(max_depth=20,
                               max_features = None,
                               min_samples_leaf = 4,
                               min_samples_split = 10,
                               n_estimators = 100,
                               random_state=0)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

recall = recall_score(y_test, y_pred)

print(recall)

0.7894736842105263


In [None]:

k = 10  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=0)

# Collect performance metrics for each fold
fold_accuracies = []

for train_index, test_index in kf.split(X_train):
    
    model = RandomForestClassifier(max_depth=20,
                               max_features = None,
                               min_samples_leaf = 4,
                               min_samples_split = 10,
                               n_estimators = 100,
                               random_state=0)

    # Split data into train and test for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy for this fold
    recall = recall_score(y_test, y_pred)
    fold_accuracies.append(recall)

# Average performance across all folds
print(f"K-Fold Cross-Validation Results ({k} folds):")
print(f"Recall for each fold: {fold_accuracies}")
print(f"Mean Recall: {np.mean(fold_accuracies):.4f}")
print(f"Standard Deviation of Recall: {np.std(fold_accuracies):.4f}")

K-Fold Cross-Validation Results (10 folds):
Recall for each fold: [np.float64(0.5833333333333334), np.float64(0.6111111111111112), np.float64(0.6538461538461539), np.float64(0.625), np.float64(0.6363636363636364), np.float64(0.6428571428571429), np.float64(0.6), np.float64(0.6842105263157895), np.float64(0.7333333333333333), np.float64(0.7894736842105263)]
Mean Recall: 0.6560
Standard Deviation of Recall: 0.0605
