## Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
import sklearn.linear_model as skl_lm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from utils import find_optimal_hyperparameters

### Loading the dataset

In [2]:
# Read the preprocessed dataset from disk
train_data = pd.read_csv('data/preprocessed_dataset.csv')

# Binarise the label: 'low_bike_demand' becomes 0, every other value becomes 1
train_data['increase_stock'] = np.where(
    train_data['increase_stock'].ne('low_bike_demand'), 1, 0
)

# Separate the target column from the feature columns (train_data itself is left intact)
y = train_data['increase_stock'].copy()
X = train_data.drop(columns='increase_stock')

### Split data into training and test sets

In [3]:
# Hold out 20% of the rows as the test set; the remaining 80% form the training set.
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Optional visual check that both splits contain a similar mix of labels (uncomment to run)
#plt.scatter(np.arange(1, len(y_train) + 1), y_train, marker='o', label='Train')
#plt.scatter(np.arange(1, len(y_test) + 1), y_test, marker='x', label='Test')
#plt.legend()
#plt.show()

### Defining Logistic Regression function

We focus on recall for the high_bike_demand class because misclassifying high_bike_demand as low_bike_demand (a false negative) is more costly than the reverse. Predicting low_bike_demand when demand is actually high means there are not enough bikes available for all users, whereas overestimating demand simply leads to surplus bikes, which is less disruptive for users.

In [4]:
# Logistic Regression model with one-hot encoding
def logistic_regression(X, y, valid_size=0.2, cat_features=None, random_state=0, max_iter=5000):
    """Fit a logistic regression on a train/validation split and print validation metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is one-hot encoded with ``pd.get_dummies`` before splitting.
    y : array-like
        Binary target aligned with ``X``. Precision and recall are computed for
        the positive class (label 1).
    valid_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``; with the default,
        20% of the rows are held out for validation and 80% are used for fitting.
    cat_features : list of str or None, default None
        Columns to one-hot encode. ``None`` lets ``pd.get_dummies`` pick the
        columns itself (those with object, string or category dtype).
    random_state : int, default 0
        Seed for the train/validation split.
    max_iter : int, default 5000
        Maximum number of solver iterations for ``LogisticRegression``.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The model fitted on the training part of the split.
    """
    # One-hot encode categorical features
    X_encoded = pd.get_dummies(X, columns=cat_features)

    # Split into training rows (1 - valid_size) and validation rows (valid_size)
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_encoded, y, test_size=valid_size, random_state=random_state
    )

    # Create and fit the model
    logr_model = skl_lm.LogisticRegression(max_iter=max_iter)
    logr_model.fit(X_train, y_train)

    # Predict on the held-out validation rows
    y_pred = logr_model.predict(X_valid)

    # Compute the metrics (precision/recall use the default positive label, 1)
    accuracy = accuracy_score(y_valid, y_pred)
    # Share of samples predicted as class 1 that really are class 1
    precision = precision_score(y_valid, y_pred)
    # Share of actual class 1 samples that were predicted as class 1
    recall = recall_score(y_valid, y_pred)
    # Rows are true labels, columns are predicted labels
    conf_matrix = confusion_matrix(y_valid, y_pred)

    print("Logistic Regression Model \n")
    print(f"Accuracy: {round(accuracy, 4)}")
    print(f'Precision: {round(precision, 4)}')
    print(f'Recall: {round(recall, 4)}')
    print(f'Confusion Matrix: \n{conf_matrix}')

    return logr_model

In [5]:
# Baseline fit: the function holds out valid_size=0.2 of the rows it receives and prints validation metrics.
# NOTE(review): the full X, y are passed here rather than X_train, y_train. The function's internal split
# uses random_state=0 and a 20% hold-out, the same settings as the earlier train/test split, so the
# validation rows presumably coincide with X_test — confirm this is intended.
logistic_regression(X, y, valid_size=0.2)

Logistic Regression Model 

Accuracy: 0.8688
Precision: 0.66
Recall: 0.569
Confusion Matrix: 
[[245  17]
 [ 25  33]]


In [6]:
# Define the hyperparameter grid
param_grid = {
    # Inverse of regularization strength: smaller C means stronger regularization
    'C': [0.01, 0.1, 1, 10, 100],
    # Regularization type
    'penalty': ['l1', 'l2'],
    # Solvers that support both the L1 and the L2 penalty
    'solver': ['liblinear', 'saga'],
}

In [7]:
# Search the grid using the training split only; save_dir/save_file tell the helper where to
# store the selected parameters.
find_optimal_hyperparameters(
    skl_lm.LogisticRegression,
    param_grid,
    X_train,
    y_train,
    save_dir='output/best_params',
    save_file='logreg_best_params.json',
)

Best parameters found:  {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Saving best parameters to 'output/best_params/logreg_best_params.json'


{'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}