## Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn.linear_model as skl_lm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

### Loading the dataset

In [2]:
train_data = pd.read_csv('data/training_data_fall2024.csv')

# Transform label into 0 (low_bike_demand) and 1 (high_bike_demand)
train_data['increase_stock'] = np.where(train_data['increase_stock'] == 'low_bike_demand', 0, 1)

X = train_data.copy()
y = X.pop('increase_stock')

cat_features = ['hour_of_day', 'day_of_week', 'month', 'holiday', 'weekday', 'summertime']

### Defining Logistic Regression function

In [3]:
# Logistic Regression model with one-hot encoding
def logistic_regression(X, y, test_size=0.2, cat_features=None):
    # One-hot encode categorical features
    X = pd.get_dummies(X, columns=cat_features)

    # Split the data into test (0.8) and train (0.2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)

    # Load and fit the model
    logr_model = skl_lm.LogisticRegression(max_iter=5000)
    logr_model.fit(X_train, y_train)

    # Compute predictions
    y_pred = logr_model.predict(X_test)

    # Compute the metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred) # Proportion of 'low_bike_demand' (0) that were correctly predicted
    recall = recall_score(y_test, y_pred) # Proportion of 'high_bike_demand' (1) that were correctly predicted
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"Logistic Regression Model \n")
    print(f"Accuracy: {round(accuracy, 4)}")
    print(f'Precision: {round(precision, 4)}')
    print(f'Recall: {round(recall, 4)}')
    print(f'Confusion Matrix: \n{conf_matrix}')

In [4]:
logistic_regression(X, y, test_size=0.3, cat_features=cat_features)

Logistic Regression Model 

Accuracy: 0.8771
Precision: 0.6866
Recall: 0.5476
Confusion Matrix: 
[[375  21]
 [ 38  46]]


### Tuning the model

In [5]:
def grid_search(X, y, test_size=0.2, cat_features=None):
    # One-hot encode categorical features
    X = pd.get_dummies(X, columns=cat_features)

    # Split the data into test (0.8) and train (0.2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)

    # Define the hyperparameter grid
    param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
        'penalty': ['l1', 'l2'],       # Regularization type
        'solver': ['liblinear', 'saga'] # Solver options compatible with L1/L2
    }

    # Load the model
    logr_model = skl_lm.LogisticRegression()

    # Perform grid search
    grid_search = GridSearchCV(logr_model, param_grid, cv=5, scoring='precision', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Compute predictions
    y_pred = grid_search.predict(X_test)

    # Compute the metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred) # Proportion of 'low_bike_demand' (0) that were correctly predicted
    recall = recall_score(y_test, y_pred) # Proportion of 'high_bike_demand' (1) that were correctly predicted
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"Logistic Regression Model \n")
    print(f"Accuracy: {round(accuracy, 4)}")
    print(f'Precision: {round(precision, 4)}')
    print(f'Recall: {round(recall, 4)}')
    print(f'Confusion Matrix: \n{conf_matrix}')

In [6]:
grid_search(X, y, test_size=0.3, cat_features=cat_features)

Logistic Regression Model 

Accuracy: 0.8812
Precision: 0.6957
Recall: 0.5714
Confusion Matrix: 
[[375  21]
 [ 36  48]]


## Trying pre-processed dataset

In [7]:
train_data = pd.read_csv('data/preprocessed_dataset.csv')

# Transform label into 0 (low_bike_demand) and 1 (high_bike_demand)
train_data['increase_stock'] = np.where(train_data['increase_stock'] == 'low_bike_demand', 0, 1)

X = train_data.copy()
y = X.pop('increase_stock')

cat_features = ['hour_of_day', 'day_of_week', 'month', 'weekday', 'summertime', 'snowdepth', 'day', 'rain']

In [8]:
logistic_regression(X, y, test_size=0.3, cat_features=cat_features)

Logistic Regression Model 

Accuracy: 0.8792
Precision: 0.6757
Recall: 0.5952
Confusion Matrix: 
[[372  24]
 [ 34  50]]


### Tuning the model

In [9]:
grid_search(X, y, test_size=0.3, cat_features=cat_features)

Logistic Regression Model 

Accuracy: 0.8833
Precision: 0.6892
Recall: 0.6071
Confusion Matrix: 
[[373  23]
 [ 33  51]]
