**Importing Libraries**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import datetime as datetime

**Preprocessing**

In [2]:
from os import listdir

weather_df = pd.read_csv('datasets/weather_hourly_helsinki.csv', header=2)
bike_df = pd.DataFrame(columns=['Departure', 'Return', 'Departure station id', 'Departure station name', 'Return station id', 'Return station name', 'Covered distance (m)', 'Duration (sec.)'])

print('Loading datasets...')
for dataset in listdir('datasets'):
  if dataset == 'weather_hourly_helsinki.csv' or dataset == 'full_bike_data.csv' or dataset.endswith('.gz'):
    continue
  temp_df = pd.read_csv('datasets/' + dataset, low_memory=False)
  bike_df = pd.concat([bike_df, temp_df])

bike_df['Departure'] = pd.to_datetime(bike_df['Departure'], format='mixed')
bike_df['Return'] = pd.to_datetime(bike_df['Return'], format='mixed')
bike_df.to_csv('datasets/full_bike_data.csv')

for station in bike_df['Departure station name'].unique():
  print(f'Aggregating over {station}')
  columns = ['Departure', 'Departure station name', 'Departure station id']
  temp_station = bike_df.loc[bike_df['Departure station name'] == station, columns]
  temp_station['trip'] = 1

  temp_station = temp_station.resample('h', on='Departure').trip.sum()

  print('Writing aggregated dataframe to .csv...')
  try:
    name = station.replace('/', '-')
    temp_station.to_csv('datasets/' + name + '_hourly_aggregate.csv', mode='x')
  except Exception as e:
    print(f'Failed to write {station} due to {e}')

for station in bike_df['Return station name'].unique():
  print(f'Aggregating over {station}')
  columns = ['Return', 'Return station name', 'Return station id']
  temp_station = bike_df.loc[bike_df['Return station name'] == station, columns]
  temp_station['trip'] = 1

  temp_station = temp_station.resample('h', on='Return').trip.sum()

  print('Writing aggregated dataframe to .csv...')
  try:
    name = station.replace('/', '-')
    temp_station.to_csv('datasets/' + name + '_return_hourly_aggregate.csv', mode='x')
  except Exception as e:
    print(f'Failed to write {station} due to {e}')

Aggregating over Sammonpuistikko
Writing aggregated dataframe to .csv...
Aggregating over Albertinkatu
Writing aggregated dataframe to .csv...
Aggregating over Arabiankatu
Writing aggregated dataframe to .csv...
Aggregating over Piispanportti
Writing aggregated dataframe to .csv...
Aggregating over Marian sairaala
Writing aggregated dataframe to .csv...
Aggregating over Perämiehenkatu
Writing aggregated dataframe to .csv...
Aggregating over Töölönlahdenkatu
Writing aggregated dataframe to .csv...
Aggregating over Koskelantie
Writing aggregated dataframe to .csv...
Aggregating over Päijänteentie
Writing aggregated dataframe to .csv...
Aggregating over Lintulahdenkatu
Writing aggregated dataframe to .csv...
Aggregating over Kansallismuseo
Writing aggregated dataframe to .csv...
Aggregating over Laulurastaantie
Writing aggregated dataframe to .csv...
Aggregating over Kaivopuisto
Writing aggregated dataframe to .csv...
Aggregating over Jämeräntaival
Writing aggregated dataframe to .csv...


**Model #1**

In [6]:
station = 'Kamppi (M)'

data = pd.read_csv('datasets/' + station + '_hourly_aggregate.csv')
data['Departure'] = pd.to_datetime(data['Departure'], format='mixed')

weather_df = pd.read_csv('datasets/weather_hourly_helsinki.csv', header=2)
weather_df['time'] = pd.to_datetime(weather_df['time'], format='mixed')

data = pd.merge(weather_df, data, how='inner', left_on='time', right_on='Departure')
data = data.drop(['time'], axis=1)
data.set_index(data['temperature_2m (°C)'], inplace=True)
data.set_index(data['rain (mm)'], inplace=True)
data.set_index(data['Departure'], inplace=True)

data['temperature_2m (°C)'] = pd.to_numeric(data['temperature_2m (°C)'], errors='coerce')
data['rain (mm)'] = pd.to_numeric(data['rain (mm)'], errors='coerce')
data['trip'] = pd.to_numeric(data['trip'], errors='coerce')

data = data.dropna(axis=1)

mod = sm.tsa.statespace.SARIMAX(data['trip'], order=(1, 1, 1), seasonal_order=(0, 1, 0, 24), freq='h')
res = mod.fit(disp=False)
print(res.forecast('2024-06-15 14:00:00')[-1])

0.08274263164029039


  print(res.forecast('2024-06-15 14:00:00')[-1])


In [7]:
station = 'Kamppi (M)'

data = pd.read_csv('datasets/' + station + '_return_hourly_aggregate.csv')
data['Return'] = pd.to_datetime(data['Return'], format='mixed')

weather_df = pd.read_csv('datasets/weather_hourly_helsinki.csv', header=2)
weather_df['time'] = pd.to_datetime(weather_df['time'], format='mixed')

data = pd.merge(weather_df, data, how='inner', left_on='time', right_on='Return')
data = data.drop(['time'], axis=1)
data.set_index(data['temperature_2m (°C)'], inplace=True)
data.set_index(data['rain (mm)'], inplace=True)
data.set_index(data['Return'], inplace=True)

data['temperature_2m (°C)'] = pd.to_numeric(data['temperature_2m (°C)'], errors='coerce')
data['rain (mm)'] = pd.to_numeric(data['rain (mm)'], errors='coerce')
data['trip'] = pd.to_numeric(data['trip'], errors='coerce')

data = data.dropna(axis=1)

mod = sm.tsa.statespace.SARIMAX(data['trip'], order=(1, 1, 1), seasonal_order=(0, 1, 0, 24), freq='h')
res = mod.fit(disp=False)
print(res.forecast('2024-06-15 14:00:00')[-1])

  self._init_dates(dates, freq)


0.07520370154649547


  print(res.forecast('2024-06-15 14:00:00')[-1])


In [2]:
# ChatGPT-generated stuff that might be useful
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

bike_df = pd.read_csv('datasets/full_bike_data.csv', low_memory=False)
bike_df['Departure'] = pd.to_datetime(bike_df['Departure'], format='mixed')
bike_df['Return'] = pd.to_datetime(bike_df['Return'], format='mixed')

# Extract useful features from the datetime fields
bike_df['Departure_Hour'] = bike_df['Departure'].dt.hour
bike_df['Departure_Weekday'] = bike_df['Departure'].dt.weekday
bike_df['Departure_Month'] = bike_df['Departure'].dt.month

le_departure_station = LabelEncoder()
bike_df['Departure_station_encoded'] = le_departure_station.fit_transform(bike_df['Departure station name'])

le_return_station = LabelEncoder()
bike_df['Return_station_encoded'] = le_return_station.fit_transform(bike_df['Return station name'])

# Drop columns that are not necessary for modeling purposes (e.g., identifiers or redundant information)
bike_df_cleaned = bike_df.drop(columns=['Departure', 'Return', 'Departure station name', 'Return station name', 'Departure station id', 'Return station id'])

# Fill in any missing values if present in the dataset
bike_df_cleaned.fillna(0, inplace=True)

# Display cleaned data to verify the pre-processing
bike_df_cleaned.head()

station_data = bike_df[bike_df['Departure station name'] == station]

# Group by Departure Hour, Weekday, Month to see hourly demand patterns
station_hourly_demand = station_data.groupby(['Departure_Hour', 'Departure_Weekday', 'Departure_Month']).size().reset_index(name='Demand')

X = station_hourly_demand[['Departure_Hour', 'Departure_Weekday', 'Departure_Month']]
y = station_hourly_demand['Demand']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gbr_model.fit(X_train, y_train)

y_pred_gbr = gbr_model.predict(X_test)

mse_gbr = mean_squared_error(y_test, y_pred_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

print(mse_gbr, r2_gbr)

Unnamed: 0.1,Unnamed: 0,Covered distance (m),Duration (sec.),trip,Departure_Hour,Departure_Weekday,Departure_Month,Departure_station_encoded,Return_station_encoded
0,0,702.0,308.0,0.0,23,4,9,143,392
1,1,1711.0,528.0,0.0,23,4,9,377,17
2,2,618.0,212.0,0.0,23,4,9,110,22
3,3,1053.0,303.0,0.0,23,4,9,413,337
4,4,1010.0,288.0,0.0,23,4,9,19,259
