In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Feature Engineering

#### Load the data

In [2]:
# Read the raw training set from CSV into `data`.
data = pd.read_csv(filepath_or_buffer='data/training_data_fall2024.csv')

#### Process the data

In [3]:
# Feature-engineering step for the frame held in `data`.
# All work happens on the working copy `processed`; `data` is rebound only on
# the last line. If any step fails (for example when this cell is re-run and
# the drop below raises KeyError because the columns are already gone), `data`
# is left exactly as it was instead of half-transformed.

# Drop the columns that are not needed. `drop` returns a new frame, which also
# serves as the working copy for the steps below.
processed = data.drop(columns=['snow', 'holiday'])

# Convert the increase_stock column to a binary column
# (1 for 'high_bike_demand', 0 for every other value).
processed['increase_stock'] = (processed['increase_stock'] == 'high_bike_demand').astype(int)

# Add day_afternoon_night (0 (8-14h), 1 (15-19h), 2 (20-7h)) column.
# The labels are strings, so the one-hot step below names the resulting
# columns day_afternoon_night_0 / _1 / _2. This must run before hour_of_day
# is normalized, because the thresholds are expressed in raw hours.
hour = processed['hour_of_day']
is_afternoon = ((15 <= hour) & (hour <= 19)).to_numpy()
is_night = ((20 <= hour) | (hour <= 7)).to_numpy()
processed['day_afternoon_night'] = np.select([is_afternoon, is_night], ['1', '2'], default='0')

# Define the categorical (non-binary) and numerical features
cat_features = ['day_of_week', 'month', 'day_afternoon_night']
num_features = ['hour_of_day', 'temp', 'dew', 'humidity', 'precip', 'snowdepth', 'windspeed', 'cloudcover', 'visibility']

# Normalize the numerical features (z-score: subtract the mean, divide by the
# sample standard deviation).
for feature in num_features:
    column = processed[feature]
    scale = column.std()
    # A constant column has zero spread, and a single-row frame has an
    # undefined one; dividing by either would fill the column with NaN.
    # Scale by 1 instead so such a column is only centred.
    if scale == 0 or pd.isna(scale):
        scale = 1.0
    processed[feature] = (column - column.mean()) / scale

# One-hot encode the categorical features (which are not already binary) and
# publish the finished frame under the name the following cells use.
data = pd.get_dummies(processed, columns=cat_features)

In [4]:
# Persist the processed frame as a semicolon-separated CSV, leaving out the
# row index; readers of this file must pass sep=';' as well.
data.to_csv(
    path_or_buf='data/preprocessed_data_2.csv',
    index=False,
    sep=';',
)