In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier

### Loading the dataset

In [21]:
# Load the training data from CSV.
train_data = pd.read_csv('data/training_data_fall2024.csv')

# Binarise the target in place: 'low_bike_demand' -> 0, every other label -> 1
# (the only other label expected here is 'high_bike_demand').
train_data['increase_stock'] = np.where(
    train_data['increase_stock'] == 'low_bike_demand',
    0,
    1,
)

# `pop` removes the target from the copy only, so X holds the features and
# `train_data` keeps its (now numeric) label column.
X = train_data.copy()
y = X.pop('increase_stock')

# Columns passed to CatBoost as categorical features.
cat_features = [
    'hour_of_day',
    'day_of_week',
    'month',
    'holiday',
    'weekday',
    'summertime',
]

In [22]:
# Hold out 30% of the rows as the final test set (fixed seed for repeatability).
# NOTE: the test labels are stored under the name `y_valid`; they pair with `X_test`.
X_train, X_test, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### Defining CatBoost function

In [23]:
def catboost(X, y, test_size=0.2, cat_features=None):
    """Fit a CatBoost classifier on a train/validation split and print its metrics.

    The data is split with a fixed seed (``random_state=0``), a
    ``CatBoostClassifier`` with default hyper-parameters (``random_seed=0``) is
    fitted on the training part, and accuracy, precision, recall and the
    confusion matrix of the validation part are printed.

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` and then to the model.
    y : target values aligned with ``X``. Precision and recall use
        scikit-learn's defaults, i.e. a binary target whose positive class is 1.
    test_size : forwarded to ``train_test_split``; share of rows used for validation.
    cat_features : forwarded to ``CatBoostClassifier``; columns to treat as
        categorical, or ``None``.

    Returns
    -------
    The fitted ``CatBoostClassifier``.
    """
    # Split the data into train and validation parts
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=0)

    # Build and fit the model (library defaults for iterations / learning rate / depth)
    model = CatBoostClassifier(
        random_seed=0,
        cat_features=cat_features,
    )
    model.fit(X_train, y_train, verbose=0)

    # Compute predictions
    y_pred = model.predict(X_valid)

    # Compute metrics (positive class = label 1)
    accuracy = accuracy_score(y_valid, y_pred)
    # Precision: of the samples predicted as 1, the share that really are 1
    precision = precision_score(y_valid, y_pred)
    # Recall: of the samples that really are 1, the share predicted as 1
    recall = recall_score(y_valid, y_pred)
    # Rows are true classes, columns are predicted classes
    conf_matrix = confusion_matrix(y_valid, y_pred)

    # The header previously read "Logistic Regression Model", which mislabelled
    # the report; it now matches the header used by the evaluation cells.
    print("CatBoost Classifier Model \n")
    print(f"Accuracy: {round(accuracy, 4)}")
    print(f"Precision: {round(precision, 4)}")
    print(f"Recall: {round(recall, 4)}")
    print(f"Confusion Matrix: \n{conf_matrix}")

    return model


In [24]:
# Baseline run: the helper re-splits the training portion 70/30 internally, so the
# hold-out `X_test` is not touched here. The returned model is not stored; as
# the cell's last expression it is only echoed.
catboost(X_train, y_train, test_size=0.3, cat_features=cat_features)

Logistic Regression Model 

Accuracy: 0.881
Precision: 0.75
Recall: 0.6429
Confusion Matrix: 
[[251  15]
 [ 25  45]]


<catboost.core.CatBoostClassifier at 0x26bf7a8bd10>

### Tune the model

In [None]:
# Exhaustive search over 3 x 3 x 3 = 27 hyper-parameter combinations, each
# scored by recall with 2-fold cross-validation on the training split.
# Expect this cell to be slow.
model = CatBoostClassifier(cat_features=cat_features, random_seed=0)

param_grid = dict(
    iterations=[500, 1000, 1500],
    learning_rate=[0.01, 0.05, 0.1],
    depth=[6, 8, 10],
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='recall',
    cv=2,
)

# Extra keyword arguments to `fit` are forwarded to the estimator's own `fit`.
grid_search.fit(X_train, y_train, verbose=0)

In [None]:
# Show the parameter combination with the best cross-validated recall.
# Requires the grid-search cell above to have been run in this kernel.
print(grid_search.best_params_)

In [25]:
# Hyper-parameters are hard-coded so this cell does not need the grid search to
# have run. They presumably mirror an earlier `grid_search.best_params_` result
# (TODO confirm); to use the live result instead, pass
# grid_search.best_params_['iterations'] / ['learning_rate'] / ['depth'].
tuned_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=8,
    cat_features=cat_features,
    random_seed=0,
)

# Fit on the full training split (the hold-out test rows stay unseen).
tuned_model.fit(X_train, y_train, verbose=0)


<catboost.core.CatBoostClassifier at 0x26bf7b62d10>

### Save the optimal model

In [26]:
# Persist the fitted model to disk.
# NOTE(review): assumes the `models/` directory already exists — confirm.
tuned_model.save_model('models/catboost_model.cbm')

## Final Evaluation

### Load the model

In [27]:
# Restore the model from the file written by the save cell, so the final
# evaluation uses the persisted artefact rather than the in-memory object.
catboost_model = CatBoostClassifier()
catboost_model.load_model('models/catboost_model.cbm')

<catboost.core.CatBoostClassifier at 0x26bf6b9dd90>

### Compute predictions and metrics

In [28]:
# Predict on the hold-out features; `y_valid` holds the labels that pair with
# `X_test` (both come from the top-level train_test_split cell).
y_pred = catboost_model.predict(X_test)

# Compute metrics (positive class = label 1, i.e. high_bike_demand)
accuracy = accuracy_score(y_valid, y_pred)
# Precision: of the samples predicted as 1, the share that really are 1
precision = precision_score(y_valid, y_pred)
# Recall: of the samples that really are 1, the share predicted as 1
recall = recall_score(y_valid, y_pred)
# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_valid, y_pred)

# One print call, newline-separated, producing the same report text.
print(
    "CatBoost Classifier Model \n",
    f"Accuracy: {round(accuracy, 4)}",
    f"Precision: {round(precision, 4)}",
    f"Recall: {round(recall, 4)}",
    f"Confusion Matrix: \n{conf_matrix}",
    sep="\n",
)

CatBoost Classifier Model 

Accuracy: 0.8938
Precision: 0.7089
Recall: 0.6667
Confusion Matrix: 
[[373  23]
 [ 28  56]]


## Loading pre-processed-2 dataset

In [2]:
# Load the second pre-processed dataset (semicolon-separated).
# NOTE: this rebinds `train_data`, `X` and `y` from the earlier sections.
train_data = pd.read_csv('data/preprocessed_data_2.csv', sep=';')

# No label conversion is applied here — presumably `increase_stock` is already
# numeric (0/1) in this file; TODO confirm.
X = train_data.copy()
y = X.pop('increase_stock')

In [6]:
# Hold out 30% of the rows as the final test set (fixed seed for repeatability).
# NOTE: the test labels are stored under the name `y_valid`; they pair with `X_test`.
X_train, X_test, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [11]:
# Fit on the training portion (re-split 70/30 inside the helper) and keep the
# returned model. No `cat_features` are passed, so the helper's default (None) is used.
catboost_model = catboost(X_train, y_train, test_size=0.3)

Logistic Regression Model 

Accuracy: 0.8899
Precision: 0.7705
Recall: 0.6714
Confusion Matrix: 
[[252  14]
 [ 23  47]]


In [12]:
# Predict on the hold-out features; `y_valid` holds the labels that pair with
# `X_test` (both come from the preceding train_test_split cell).
y_pred = catboost_model.predict(X_test)

# Compute metrics (positive class = label 1)
accuracy = accuracy_score(y_valid, y_pred)
# Precision: of the samples predicted as 1, the share that really are 1
precision = precision_score(y_valid, y_pred)
# Recall: of the samples that really are 1, the share predicted as 1
recall = recall_score(y_valid, y_pred)
# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_valid, y_pred)

# One print call, newline-separated, producing the same report text.
print(
    "CatBoost Classifier Model \n",
    f"Accuracy: {round(accuracy, 4)}",
    f"Precision: {round(precision, 4)}",
    f"Recall: {round(recall, 4)}",
    f"Confusion Matrix: \n{conf_matrix}",
    sep="\n",
)

CatBoost Classifier Model 

Accuracy: 0.8979
Precision: 0.7273
Recall: 0.6667
Confusion Matrix: 
[[375  21]
 [ 28  56]]


## Loading pre-processed dataset

In [17]:
# Load the pre-processed dataset; this rebinds `train_data`, `X` and `y`.
train_data = pd.read_csv('data/preprocessed_dataset.csv')

# Binarise the target in place: 'low_bike_demand' -> 0, every other label -> 1
# (the only other label expected here is 'high_bike_demand').
train_data['increase_stock'] = np.where(
    train_data['increase_stock'] == 'low_bike_demand',
    0,
    1,
)

# `pop` removes the target from the copy only, so X holds the features and
# `train_data` keeps its (now numeric) label column.
X = train_data.copy()
y = X.pop('increase_stock')

In [18]:
# Hold out 30% of the rows as the final test set (fixed seed for repeatability).
# NOTE: the test labels are stored under the name `y_valid`; they pair with `X_test`.
X_train, X_test, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [19]:
# Fit on the training portion (re-split 70/30 inside the helper) and keep the
# returned model. No `cat_features` are passed, so the helper's default (None) is used.
catboost_model = catboost(X_train, y_train, test_size=0.3)

Logistic Regression Model 

Accuracy: 0.8839
Precision: 0.7627
Recall: 0.6429
Confusion Matrix: 
[[252  14]
 [ 25  45]]


In [20]:
# Predict on the hold-out features; `y_valid` holds the labels that pair with
# `X_test` (both come from the preceding train_test_split cell).
y_pred = catboost_model.predict(X_test)

# Compute metrics (positive class = label 1, i.e. high_bike_demand)
accuracy = accuracy_score(y_valid, y_pred)
# Precision: of the samples predicted as 1, the share that really are 1
precision = precision_score(y_valid, y_pred)
# Recall: of the samples that really are 1, the share predicted as 1
recall = recall_score(y_valid, y_pred)
# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_valid, y_pred)

# One print call, newline-separated, producing the same report text.
print(
    "CatBoost Classifier Model \n",
    f"Accuracy: {round(accuracy, 4)}",
    f"Precision: {round(precision, 4)}",
    f"Recall: {round(recall, 4)}",
    f"Confusion Matrix: \n{conf_matrix}",
    sep="\n",
)

CatBoost Classifier Model 

Accuracy: 0.8812
Precision: 0.6957
Recall: 0.5714
Confusion Matrix: 
[[375  21]
 [ 36  48]]
