In [391]:
import os
import pandas as pd
import numpy as np
import time
import urllib.request
import glob

## STM Data Manipulation

In [392]:
# Load the two STM exports and stack them into one pandas dataframe.

# Columns at positions 0 and 2-7 (date, dir, id_voy and the four clock columns)
# are read as text so their original formatting is kept; position 1 (ligne) is
# left to pandas' type inference.
STM_DTYPES = {0: str, 2: str, 3: str, 4: str, 5: str, 6: str, 7: str}

STM_1_df = pd.read_csv('../Data/Transit data/STM_Data_2021_2022.csv', dtype=STM_DTYPES)
STM_2_df = pd.read_csv('../Data/Transit data/STM_Data_2023.csv', dtype=STM_DTYPES)

# Drop rows that are empty in every column from BOTH files (previously only the
# first file was cleaned). A fully empty row would otherwise survive as a NaN
# record and break the later integer conversion of the start hour.
STM_1_df = STM_1_df.dropna(how='all')
STM_2_df = STM_2_df.dropna(how='all')

STM_df = pd.concat([STM_1_df, STM_2_df], ignore_index=True)

In [393]:
# Remove exact duplicate rows (in case records were copied over twice) and
# report which rows were discarded.

# Mark every repeat of an earlier row; the first occurrence is the one kept,
# which matches the default behaviour of drop_duplicates().
is_duplicate = STM_df.duplicated()
removed_rows = STM_df[is_duplicate]

# Renumber the surviving rows 0..n-1. A later cell copies the dep/arr columns
# from STM_df into the master dataframe, whose index is rebuilt by merges;
# leaving gaps in this index would attach times to the wrong trips.
STM_df = STM_df[~is_duplicate].reset_index(drop=True)

print(removed_rows)

Empty DataFrame
Columns: [date, ligne, dir, id_voy, dep_pl, dep_rl, arr_pl, arr_rl]
Index: []


In [394]:
# Dates: rewrite YYYY/MM/DD as YYYY-MM-DD, then parse to datetime.
# errors='coerce' turns unparseable values into NaT instead of raising.
try:
    dashed_dates = STM_df['date'].str.replace('/', '-')
    STM_df['date'] = dashed_dates
    STM_df['date'] = pd.to_datetime(STM_df['date'], errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Date: {e}")

# Quick visual check of the parsed dates
print(STM_df['date'].head())

# Times: keep only the first five characters, i.e. HH:MM:SS -> HH:MM
try:
    STM_df[['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']] = (
        STM_df[['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']].apply(lambda col: col.str[:5])
    )
except Exception as e:
    print(f"An error occurred while slicing the Time: {e}")

# Quick visual check of the truncated times
print(STM_df[['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']].head())

0   2021-10-05
1   2021-10-05
2   2021-10-05
3   2021-10-05
4   2021-10-05
Name: date, dtype: datetime64[ns]
  dep_pl dep_rl arr_pl arr_rl
0  00:10  00:10  00:45  00:44
1  00:42  00:42  01:17  01:14
2  01:14  01:13  01:49  01:44
3  05:06  05:06  05:41  05:39
4  05:39  05:41  06:14  06:13


In [395]:
# Change time format to minutes after midnight (disabled here: the conversion is done later, when the master data is built)

#def time_to_minutes(time): 
    #time = time.split(':')
    #return int(time[0]) * 60 + int(time[1])

#try:
    #STM_df[['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']] = STM_df[['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']].astype(str).map(time_to_minutes)
#except Exception as e:
    #print(f"An error occurred while converting the Time: {e}")

# Verify that the time format is correct

#print(STM_df[['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']].head())


In [396]:
# Save the cleaned STM table; the dataframe index is not written to the file.
stm_clean_path = '../Data/Transit data/STM_Data.csv'
STM_df.to_csv(stm_clean_path, index=False)

## Weather Data Manipulation

1) Daily data for snow

In [397]:
# Set the start and end years for the data
#yi = 2021
#yf = 2023 + 1 

In [398]:
# Retrieve daily weather data from the Government of Canada website and store in Weather Data folder
#
#station = 30165 # Select station ID (Montreal-Trudeau Airport is 30165)

#for year in range(yi, yf):

#        url = 'https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&stationID=' + str(station) + '&Year=' + str(year) + '&Month=1&Day=14&timeframe=2&submit=Download+Data'
#        filename = os.path.join('../Data/Weather Data', 'daily_montreal_weather_' + str(year) + '.csv')
#        try: 
#            urllib.request.urlretrieve(url, filename)
#        except Exception as e:
#            print(f"An error occurred while downloading data for year {year}: {e}")
#        time.sleep(5)

In [399]:
# Merge all per-year daily weather CSV files in the Weather Data folder into
# one dataframe.

# glob returns paths in an arbitrary, OS-dependent order; sorting makes the
# row order of the merged frame (and of the CSV written from it) reproducible.
# The trailing "_*" in the pattern does not match the merged output file
# daily_montreal_weather.csv saved to the same folder, so it is never re-read.
all_dailyw = sorted(glob.glob(os.path.join('../Data/Weather Data', "daily_montreal_weather_*.csv")))

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not all_dailyw:
    raise FileNotFoundError("No daily_montreal_weather_*.csv files found in '../Data/Weather Data'")

df_all_dailyw = (pd.read_csv(f) for f in all_dailyw)
df_dailyw = pd.concat(df_all_dailyw, ignore_index=True)

In [400]:
# Discard every column that contains no data at all
df_dailyw = df_dailyw.dropna(axis='columns', how='all')

In [401]:
# Give the columns of interest the short names used alongside the STM data
daily_renames = {'Date/Time': 'date', 'Snow on Grnd (cm)': 'snow'}
df_dailyw = df_dailyw.rename(columns=daily_renames)

In [402]:
# Parse the daily weather dates (unparseable values become NaT), then keep
# only the days that also occur in the STM data.

try:
    df_dailyw['date'] = pd.to_datetime(df_dailyw['date'], errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Date: {e}")

in_stm_period = df_dailyw['date'].isin(STM_df['date'])
df_dailyw = df_dailyw.loc[in_stm_period]

In [403]:
# Keep only the date and snow columns, in that order.

# Select by name instead of by position: which columns survive the earlier
# "drop all-empty columns" step depends on the downloaded data, so positional
# indices such as cols[4] / cols[-5] can silently point at a different column.
cols = ['date', 'snow']

# Explicit copy: later cells assign new columns to this frame.
df_dailyw = df_dailyw[cols].copy()

In [404]:
# Missing snow-on-ground readings are treated as 0
snow_depth = df_dailyw['snow'].fillna(0)
df_dailyw['snow'] = snow_depth

# Flag column: 'Y' when there is snow on the ground (> 0), otherwise 'N'
has_snow = snow_depth.gt(0)
df_dailyw['snow_yn'] = np.where(has_snow, 'Y', 'N')


In [405]:
# Save the reduced daily weather table; the dataframe index is not written.
daily_weather_path = '../Data/Weather Data/daily_montreal_weather.csv'
df_dailyw.to_csv(daily_weather_path, index=False)

2) Hourly data for precipitation

In [406]:
# Set the start and end years for the data
#yi = 2021
#yf = 2023 + 1 

In [407]:
# Retrieve hourly weather data from the Government of Canada website and store in Weather Data folder

#station = 30165 # Select station ID (Montreal-Trudeau Airport is 30165)

#for year in range(yi, yf):
   
    #if year == 2021:
        #mi = 10
        #mf = 12 + 1
    #elif year == 2023:
        #mi = 1
        #mf = 9 + 1
    #else:
        #mi = 1
        #mf = 12 + 1
    #for month in range(mi, mf):
        #if month < 10:
            #month = '0' + str(month)
        #else:
            #month = str(month)
        #url = 'https://climate.weather.gc.ca/climate_data/bulk_data_e.html?format=csv&time=UTC&stationID=' + str(station) + '&Year=' + str(year) + '&Month=' + str(month) + '&Day=14&timeframe=1&submit=Download+Data'
        #filename = os.path.join('../Data/Weather Data', 'hourly_montreal_weather_' + str(year) + '_' + str(month) + '.csv')
        #try: 
            #urllib.request.urlretrieve(url, filename)
        #except Exception as e:
            #print(f"An error occurred while downloading data for year {year} and month {month}: {e}")
        #time.sleep(5)
        

In [408]:
# Merge all per-month hourly weather CSV files in the Weather Data folder
# into one dataframe.

# glob returns paths in an arbitrary, OS-dependent order; sorting makes the
# row order of the merged frame (and of the CSV written from it) reproducible.
# The trailing "_*" in the pattern does not match the merged output file
# hourly_montreal_weather.csv saved to the same folder, so it is never re-read.
all_hourlyw = sorted(glob.glob(os.path.join('../Data/Weather Data', "hourly_montreal_weather_*.csv")))

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not all_hourlyw:
    raise FileNotFoundError("No hourly_montreal_weather_*.csv files found in '../Data/Weather Data'")

df_all_hourlyw = (pd.read_csv(f) for f in all_hourlyw)
df_hourlyw = pd.concat(df_all_hourlyw, ignore_index=True)

In [409]:
# Discard every column that contains no data at all
df_hourlyw = df_hourlyw.dropna(axis='columns', how='all')

In [410]:
# Give the columns of interest the short names used alongside the STM data
hourly_renames = {
    'Date/Time (UTC)': 'date',
    'Time (UTC)': 'time',
    'Temp (°C)': 'temp',
    'Precip. Amount (mm)': 'precip',
}
df_hourlyw = df_hourlyw.rename(columns=hourly_renames)

In [411]:
# Parse the UTC timestamps, convert them to Montreal local time (the time zone
# conversion accounts for daylight saving time), then split the local
# timestamp into an HH:MM 'time' column and a calendar 'date' column.

try:
    df_hourlyw['date'] = pd.to_datetime(df_hourlyw['date'], errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Date: {e}")

local_stamp = df_hourlyw['date'].dt.tz_localize('UTC').dt.tz_convert('America/Montreal')
df_hourlyw['time'] = local_stamp.dt.strftime('%H:%M')
df_hourlyw['date'] = local_stamp.dt.date

In [412]:
# Turn the local calendar dates back into datetimes (unparseable values become
# NaT), then keep only the days that also occur in the STM data.

try:
    df_hourlyw['date'] = pd.to_datetime(df_hourlyw['date'], errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Date: {e}")

in_stm_period = df_hourlyw['date'].isin(STM_df['date'])
df_hourlyw = df_hourlyw.loc[in_stm_period]

In [413]:
# Keep only the date, time, temp and precip columns, in that order.

# Select by name instead of by position: which columns survive the earlier
# "drop all-empty columns" step depends on the downloaded data, so positional
# indices such as cols[8] / cols[12] can silently point at a different column.
cols = ['date', 'time', 'temp', 'precip']

# Explicit copy so the reduced frame is independent of the filtered one.
df_hourlyw = df_hourlyw[cols].copy()


In [414]:
# Save the reduced hourly weather table; the dataframe index is not written.
hourly_weather_path = '../Data/Weather Data/hourly_montreal_weather.csv'
df_hourlyw.to_csv(hourly_weather_path, index=False)

## Master data

In [415]:
# Start the master dataframe from three STM columns, renamed to English:
# date stays 'date', ligne becomes 'line', dep_rl becomes 'start_time'.

master_source_cols = ['date', 'ligne', 'dep_rl']
english_names = {'ligne': 'line', 'dep_rl': 'start_time'}
df_master = STM_df[master_source_cols].rename(columns=english_names)

In [416]:
# Add the actual temperature at the start of the trip (temp_start) to the master
# data, using the hourly weather row that matches the trip's date and start hour.
# The weather time marks the END of the hour it describes, so a trip starting
# at HH:MM is matched with the observation stamped HH+1:00.

start_hour = df_master['start_time'].str.slice(stop=2).astype(int)
obs_hour = start_hour + 1

# An observation hour of 24 or more belongs to a following calendar day:
# a 23:xx start must look up 00:00 on the next date. Previously this produced
# the key "24:00", which never matches a weather row.
df_master['weather_date'] = df_master['date'] + pd.to_timedelta(obs_hour // 24, unit='D')
df_master['hour'] = (obs_hour % 24).astype(str).str.zfill(2) + ':00'

# Keep one weather row per local (date, time). When clocks go back in autumn the
# same local hour occurs twice in the UTC-converted data; duplicate keys would
# make the left merge duplicate master rows. The first occurrence is kept.
hourly_lookup = (
    df_hourlyw
    .drop_duplicates(subset=['date', 'time'])
    .rename(columns={'date': 'weather_date', 'time': 'weather_time', 'temp': 'temp_start'})
)

# validate='many_to_one' makes pandas raise if the weather keys are not unique,
# so the master row count is guaranteed to be unchanged by this merge.
df_master = pd.merge(
    df_master,
    hourly_lookup,
    how='left',
    left_on=['weather_date', 'hour'],
    right_on=['weather_date', 'weather_time'],
    validate='many_to_one',
)

# Remove the helper columns that were only needed for the lookup
df_master = df_master.drop(columns=['hour', 'weather_time', 'weather_date'])


In [417]:
# Attach the daily snow information by date, then split the hourly
# precipitation into a rain amount and a snow amount:
#   snow_amt = precip when snow_yn is 'Y', otherwise 0
#   rain_amt = precip when snow_yn is 'N', otherwise 0

df_master = df_master.merge(df_dailyw, how='left', on='date')
df_master = df_master.rename(columns={'precip': 'rain_amt'})

# Capture the precipitation values before the rain_amt column is overwritten
hourly_precip = df_master['rain_amt']
is_snow_day = df_master['snow_yn'] == 'Y'
is_no_snow_day = df_master['snow_yn'] == 'N'

df_master['snow_amt'] = np.where(is_snow_day, hourly_precip, 0)
df_master['rain_amt'] = np.where(is_no_snow_day, hourly_precip, 0)

# The daily helper columns are no longer needed
df_master = df_master.drop(columns=['snow_yn', 'snow'])


In [418]:
# Bring the planned/actual departure and arrival times into the master data so
# they can be converted to minutes after midnight and used to compute delays.
# start_time is converted as well (redundant with dep_rl but easier to work with).

planned_actual_cols = ['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']

# The master rows were built from STM_df in order and the left merges keep that
# order, but they also rebuild the index. Copy by POSITION rather than by index
# label so the times stay attached to the right trips even if STM_df's index
# has gaps. If the row counts differ, a merge has duplicated rows and
# no alignment can be trusted, so stop instead of silently mismatching.
if len(df_master) != len(STM_df):
    raise ValueError(
        f"df_master has {len(df_master)} rows but STM_df has {len(STM_df)}; "
        "a merge changed the number of rows, so STM times cannot be aligned"
    )

for col in planned_actual_cols:
    df_master[col] = STM_df[col].to_numpy()

# Make sure every clock column is text truncated to HH:MM
clock_cols = ['start_time'] + planned_actual_cols
df_master[clock_cols] = df_master[clock_cols].astype(str).apply(lambda x: x.str.slice(stop=5))

def time_to_minutes(time):
    """Convert a clock string to minutes after midnight.

    Parameters
    ----------
    time : str or missing value
        Text of the form ``HH:MM``; a trailing ``:SS`` part is accepted and
        ignored.

    Returns
    -------
    int or float
        ``hours * 60 + minutes`` as an int, or ``np.nan`` when the value is
        missing, is not a string, contains no ``:``, or has non-numeric parts.
    """
    # Anything that is not text (None, NaN, numbers) cannot be a clock string
    if not isinstance(time, str) or ':' not in time:
        return np.nan

    # Only the first two fields are used, so "HH:MM:SS" no longer fails to unpack
    hours, minutes = time.split(':')[:2]
    try:
        return int(hours) * 60 + int(minutes)
    except ValueError:
        # Malformed digits are treated like a missing value, not a fatal error
        return np.nan
# Convert every clock column from HH:MM text to minutes after midnight
try:
    df_master[['start_time','dep_pl','dep_rl','arr_pl', 'arr_rl']] = df_master[['start_time','dep_pl','dep_rl','arr_pl', 'arr_rl']].astype(str).map(time_to_minutes)
except Exception as e:
    print(f"An error occurred while converting the Time: {e}")

# Correct actual times that fall on the other side of midnight from the plan:
#   planned after 23:00 (> 1380) and actual before 01:00 (< 60): the vehicle ran
#     late past midnight, so the actual time is shifted forward by 24h (+1440);
#   planned before 01:00 (< 60) and actual after 23:00 (> 1380): the vehicle ran
#     early, before midnight, so the actual time is shifted back by 24h (-1440).
# Without the second case an early run shows up as a delay of almost +24h.
for planned, actual in (('dep_pl', 'dep_rl'), ('arr_pl', 'arr_rl')):
    late_past_midnight = (df_master[planned] > 1380) & (df_master[actual] < 60)
    early_before_midnight = (df_master[planned] < 60) & (df_master[actual] > 1380)
    df_master[actual] = np.where(
        late_past_midnight,
        df_master[actual] + 1440,
        np.where(early_before_midnight, df_master[actual] - 1440, df_master[actual]),
    )


In [419]:
# Delay = actual minus planned, in minutes, at departure and at arrival.
# The four source columns are dropped once the delays are stored.
clock_inputs = ['arr_pl', 'arr_rl', 'dep_pl', 'dep_rl']
df_master = df_master.assign(
    delay_start=df_master['dep_rl'] - df_master['dep_pl'],
    delay_end=df_master['arr_rl'] - df_master['arr_pl'],
).drop(columns=clock_inputs)


In [420]:
# Sanity check: no delay should exceed 24h (1440 minutes) in absolute value.
# Prints the offending rows for each delay column (an empty frame means none).

for delay_col in ('delay_start', 'delay_end'):
    beyond_a_day = df_master[delay_col].abs() > 1440
    print(df_master[beyond_a_day])



Empty DataFrame
Columns: [date, line, start_time, temp_start, rain_amt, snow_amt, delay_start, delay_end]
Index: []
Empty DataFrame
Columns: [date, line, start_time, temp_start, rain_amt, snow_amt, delay_start, delay_end]
Index: []


In [421]:
# Reorganize columns: date, line and delay_start first, the remaining
# columns afterwards in their existing order.

# Name the leading columns explicitly instead of picking them by position
# (cols[6] only pointed at delay_start for one particular column layout).
leading_cols = ['date', 'line', 'delay_start']
cols = leading_cols + [col for col in df_master.columns if col not in leading_cols]
df_master = df_master[cols]



In [422]:
# Order the rows by line first, then by date within each line
sort_keys = ['line', 'date']
df_master = df_master.sort_values(by=sort_keys)

In [423]:
# List the distinct line numbers in the master data and in the STM source data
for frame, line_column in ((df_master, 'line'), (STM_df, 'ligne')):
    print(frame[line_column].unique())
# Note: line 193 is absent - it seems the STM did not provide data for it.

[ 67.  80. 100. 121. 139. 439. 460. 467. 480.]
[100. 121. 139. 439. 460. 467. 480.  67.  80.]


In [424]:
# Save the final master table; the dataframe index is not written to the file.
master_path = '../Data/master_data.csv'
df_master.to_csv(master_path, index=False)