In [11]:
import pandas as pd
import glob

## The next three cells load the monthly bike-share ridership CSVs and build, for each year, a DataFrame with the number of trip records per day (stored in the `total_riders` column)

In [12]:
# Load 2022 daily ride counts
#
# Every CSV in the 2022 folder is read, each row is bucketed by the calendar
# day of its 'Start Time', and the rows are counted per day.
# sorted() makes the processing order deterministic (glob order is arbitrary).
files_2022 = sorted(glob.glob("datasets/bikeshare-ridership-2022/*.csv"))
if not files_2022:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in datasets/bikeshare-ridership-2022/")

all_daily_2022 = []
for file in files_2022:
    print("Processing rides (2022):", file)
    # Only 'Start Time' is needed for the daily count, so skip the other columns.
    df = pd.read_csv(file, encoding="cp1252", usecols=['Start Time'])
    # normalize() truncates each timestamp to midnight, i.e. one key per calendar day.
    df['date'] = pd.to_datetime(df['Start Time']).dt.normalize()
    daily = df.groupby('date').size().reset_index(name='total_riders')
    all_daily_2022.append(daily)

# The same date can appear in more than one file; sum the partial counts so the
# result has exactly one row per date (duplicates would break any per-day join).
final_daily_counts_2022 = (
    pd.concat(all_daily_2022, ignore_index=True)
    .groupby('date', as_index=False)['total_riders']
    .sum()
    .sort_values('date')
    .reset_index(drop=True)
)
final_daily_counts_2022.head()

Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-01.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-03.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-02.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-06.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-12.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-07.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-11.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-05.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-04.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 2022-10.csv
Processing rides (2022): datasets/bikeshare-ridership-2022/Bike share ridership 

Unnamed: 0,date,total_riders
0,2022-01-01,2851
1,2022-01-02,1135
2,2022-01-03,2157
3,2022-01-04,3371
4,2022-01-05,2870


In [13]:
# Load 2023 daily ride counts
#
# Every CSV in the 2023 folder is read, each row is bucketed by the calendar
# day of its 'Start Time', and the rows are counted per day.
# sorted() makes the processing order deterministic (glob order is arbitrary).
files_2023 = sorted(glob.glob("datasets/bikeshare-ridership-2023/*.csv"))
if not files_2023:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in datasets/bikeshare-ridership-2023/")

all_daily_2023 = []
for file in files_2023:
    print("Processing rides (2023):", file)
    # Only 'Start Time' is needed for the daily count, so skip the other columns.
    df = pd.read_csv(file, encoding="cp1252", usecols=['Start Time'])
    # normalize() truncates each timestamp to midnight, i.e. one key per calendar day.
    df['date'] = pd.to_datetime(df['Start Time']).dt.normalize()
    daily = df.groupby('date').size().reset_index(name='total_riders')
    all_daily_2023.append(daily)

# The same date can appear in more than one file; sum the partial counts so the
# result has exactly one row per date (duplicates would break any per-day join).
final_daily_counts_2023 = (
    pd.concat(all_daily_2023, ignore_index=True)
    .groupby('date', as_index=False)['total_riders']
    .sum()
    .sort_values('date')
    .reset_index(drop=True)
)
final_daily_counts_2023.head()

Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-08.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-09.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-02.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-03.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-01.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-10.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-04.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-05.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-11.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 2023-07.csv
Processing rides (2023): datasets/bikeshare-ridership-2023/Bike share ridership 

Unnamed: 0,date,total_riders
0,2023-01-01,3875
1,2023-01-02,5705
2,2023-01-03,5484
3,2023-01-04,3566
4,2023-01-05,6458


In [14]:
# Load 2024 daily ride counts
#
# Every CSV in the 2024 folder is read, each row is bucketed by the calendar
# day of its 'Start Time', and the rows are counted per day.
# sorted() makes the processing order deterministic (glob order is arbitrary).
files_2024 = sorted(glob.glob("datasets/bikeshare-ridership-2024/*.csv"))
if not files_2024:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in datasets/bikeshare-ridership-2024/")

all_daily_2024 = []
for file in files_2024:
    print("Processing rides (2024):", file)
    # Only 'Start Time' is needed for the daily count, so skip the other columns.
    df = pd.read_csv(file, encoding="cp1252", usecols=['Start Time'])
    # normalize() truncates each timestamp to midnight, i.e. one key per calendar day.
    df['date'] = pd.to_datetime(df['Start Time']).dt.normalize()
    daily = df.groupby('date').size().reset_index(name='total_riders')
    all_daily_2024.append(daily)

# The same date can appear in more than one file; sum the partial counts so the
# result has exactly one row per date (duplicates would break any per-day join).
final_daily_counts_2024 = (
    pd.concat(all_daily_2024, ignore_index=True)
    .groupby('date', as_index=False)['total_riders']
    .sum()
    .sort_values('date')
    .reset_index(drop=True)
)
final_daily_counts_2024.head()

Processing rides (2024): datasets/bikeshare-ridership-2024/Bike share ridership 2024-02.csv
Processing rides (2024): datasets/bikeshare-ridership-2024/Bike share ridership 2024-03.csv
Processing rides (2024): datasets/bikeshare-ridership-2024/Bike share ridership 2024-01.csv
Processing rides (2024): datasets/bikeshare-ridership-2024/Bike share ridership 2024-04.csv
Processing rides (2024): datasets/bikeshare-ridership-2024/Bike share ridership 2024-05.csv
Processing rides (2024): datasets/bikeshare-ridership-2024/Bike share ridership 2024-07.csv
Processing rides (2024): datasets/bikeshare-ridership-2024/Bike share ridership 2024-06.csv
Processing rides (2024): datasets/bikeshare-ridership-2024/Bike share ridership 2024-08.csv
Processing rides (2024): datasets/bikeshare-ridership-2024/Bike share ridership 2024-09.csv


Unnamed: 0,date,total_riders
0,2024-01-01,3157
1,2024-01-02,6622
2,2024-01-03,7906
3,2024-01-04,7366
4,2024-01-05,7419


The next cell loads the daily weather data for every day from January 1st, 2022 to September 30th, 2024 (using the temperature datasets found on Environment Canada's website) and keeps only the inputs we need: mean temperature (°C), total precipitation (mm), and snow on the ground (cm).

In [15]:
# Read the three yearly weather files and reduce each one, in place, to a
# normalized 'date' column plus the three weather measurements used later.
cols = ['date', 'Mean Temp (°C)', 'Snow on Grnd (cm)', 'Total Precip (mm)']

weather_2022 = pd.read_csv('datasets/2022_daily_weather.csv', parse_dates=['Date/Time'])
weather_2023 = pd.read_csv('datasets/2023_daily_weather.csv', parse_dates=['Date/Time'])
weather_2024 = pd.read_csv('datasets/2024_daily_weather.csv', parse_dates=['Date/Time'])

for wdf in (weather_2022, weather_2023, weather_2024):
    # Midnight-truncated timestamp, one value per calendar day.
    wdf['date'] = wdf['Date/Time'].dt.normalize()
    # Remove every column that is not in the keep-list; the surviving columns
    # stay in their original file order, with 'date' last.
    wdf.drop(
        columns=[name for name in wdf.columns if name not in cols],
        inplace=True,
    )

weather_2022.head()

Unnamed: 0,Mean Temp (°C),Total Precip (mm),Snow on Grnd (cm),date
0,1.5,2.4,,2022-01-01
1,-6.3,2.0,3.0,2022-01-02
2,-8.4,0.0,3.0,2022-01-03
3,-1.2,0.0,3.0,2022-01-04
4,0.2,0.3,3.0,2022-01-05


Now we merge the daily ride counts with the weather data, add calendar and weather flag columns, and store everything in a single DataFrame.

In [None]:
# Merge rides with weather, add calendar/threshold flags, and save
weather_cols = ['Mean Temp (°C)', 'Snow on Grnd (cm)', 'Total Precip (mm)']


def _attach_weather(rides, weather):
    """Return the daily ride counts with the weather columns joined on 'date'.

    Assigning the weather columns directly would align the two frames on their
    row index (row position), so a missing or duplicated ride date would attach
    the wrong day's weather to every later row. A left merge on 'date' keeps
    every ride row and leaves NaN where no weather row exists for that date.

    Parameters
    ----------
    rides : DataFrame with at least 'date' and 'total_riders' columns.
    weather : DataFrame with a 'date' column and the columns in `weather_cols`.

    Raises
    ------
    pandas.errors.MergeError
        If `weather` has more than one row for the same date.
    """
    # Selecting only date/total_riders makes the cell safe to re-run: weather
    # columns added by a previous run are discarded instead of being suffixed.
    return rides[['date', 'total_riders']].merge(
        weather[['date'] + weather_cols],
        on='date',
        how='left',
        validate='many_to_one',
    )


final_daily_counts_2022 = _attach_weather(final_daily_counts_2022, weather_2022)
final_daily_counts_2023 = _attach_weather(final_daily_counts_2023, weather_2023)
final_daily_counts_2024 = _attach_weather(final_daily_counts_2024, weather_2024)

combined = pd.concat([
    final_daily_counts_2022,
    final_daily_counts_2023,
    final_daily_counts_2024
], ignore_index=True)

# Fill missing weather with zeros before flags
# NOTE(review): 0 is a natural default for missing precipitation/snow, but a
# missing mean temperature also becomes 0 °C, which is a real reading — confirm
# this is intended or impute the temperature differently.
combined[weather_cols] = combined[weather_cols].fillna(0)

# Calendar features (dayofweek: Monday=0 ... Sunday=6, so >= 5 is Sat/Sun).
combined['month'] = combined['date'].dt.month
combined['day_of_week'] = combined['date'].dt.dayofweek
combined['is_weekend'] = (combined['day_of_week'] >= 5).astype(int)

# Weather threshold flags, stored as 0/1 integers.
combined['rain_flag'] = (combined['Total Precip (mm)'] > 0).astype(int)
combined['snow_flag'] = (combined['Snow on Grnd (cm)'] > 0).astype(int)
combined['heat_flag'] = (combined['Mean Temp (°C)'] > 25).astype(int)
combined['freeze_flag'] = (combined['Mean Temp (°C)'] < 0).astype(int)

combined = combined.sort_values('date').reset_index(drop=True)
combined.to_csv('datasets/combined_cleaned.csv', index=False, encoding='utf-8')

combined.head()

Unnamed: 0,date,total_riders,Mean Temp (°C),Snow on Grnd (cm),Total Precip (mm),month,day_of_week,is_weekend,rain_flag,snow_flag,heat_flag,freeze_flag
0,2022-01-01,2851,1.5,0.0,2.4,1,5,1,1,0,0,0
1,2022-01-02,1135,-6.3,3.0,2.0,1,6,1,1,1,0,1
2,2022-01-03,2157,-8.4,3.0,0.0,1,0,0,0,1,0,1
3,2022-01-04,3371,-1.2,3.0,0.0,1,1,0,0,1,0,1
4,2022-01-05,2870,0.2,3.0,0.3,1,2,0,1,1,0,0
