In [140]:
import os
import pandas as pd
import numpy as np
import time
import urllib.request
import glob
from datetime import datetime, timedelta

## STM Data Manipulation

In [141]:
# Merge STM files and save as a pandas dataframe

STM_1_df = pd.read_csv('../Data/Transit data/STM_Data_2021_2022.csv', dtype={0: str, 2: str, 3: str, 4: str, 5: str, 6: str, 7: str})
STM_1_df = STM_1_df.dropna(how='all')
STM_2_df = pd.read_csv('../Data/Transit data/STM_Data_2023.csv', dtype={0: str, 2: str, 3: str, 4: str, 5: str, 6: str, 7: str})

STM_df = pd.concat([STM_1_df, STM_2_df], ignore_index=True)

In [142]:
# Remove duplicate rows in case the index was copied over

original_df = STM_df.copy()
STM_df = STM_df.drop_duplicates()

removed_rows = original_df[~original_df.index.isin(STM_df.index)]
print(removed_rows)

Empty DataFrame
Columns: [date, ligne, dir, id_voy, dep_pl, dep_rl, arr_pl, arr_rl]
Index: []


In [143]:
# Change data format from YYYY/MM/DD to YYYY-MM-DD
try:
    STM_df['date'] = STM_df['date'].str.replace('/', '-')
    STM_df['date'] = pd.to_datetime(STM_df['date'],errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Date: {e}")

# Verify that the date format is correct
print(STM_df['date'].head())

# Change time format from HH:MM:SS to YYYY-MM-DD HH:MM:SS asigning same date from the date column

try:
    for col in ['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']:
        STM_df[col] = pd.to_datetime(STM_df[col], format='%H:%M:%S').dt.time
        STM_df[col] = STM_df[col].astype(str)
        STM_df[col] = STM_df['date'].astype(str) + ' ' + STM_df[col]
        STM_df[col] = pd.to_datetime(STM_df[col], errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Time: {e}")

#if the time in pl column is after 23:00 and in rl column is before 1:00, add 1 day to the date in rl column
#if the time in pl column is before 1:00 and in rl column is after 23:00, substract 1 day to the date in rl column
#this is to account for arrivals that were scheduled for late night but actually occurred early morning
#and for arrivals that were scheduled for early morning but actually occurred late the previous night

def adjust_dates(row):
    if row['dep_pl'].hour >= 23 and row['dep_rl'].hour < 1:
        row['dep_rl'] += pd.Timedelta(days=1)
    elif row['dep_pl'].hour < 1 and row['dep_rl'].hour >= 23:
        row['dep_rl'] -= pd.Timedelta(days=1)

    if row['arr_pl'].hour >= 23 and row['arr_rl'].hour < 1:
        row['arr_rl'] += pd.Timedelta(days=1)
    elif row['arr_pl'].hour < 1 and row['arr_rl'].hour >= 23:
        row['arr_rl'] -= pd.Timedelta(days=1)
    
    return row

STM_df = STM_df.apply(adjust_dates, axis=1)






0   2021-10-05
1   2021-10-05
2   2021-10-05
3   2021-10-05
4   2021-10-05
Name: date, dtype: datetime64[ns]


In [144]:
STM_df.head()

Unnamed: 0,date,ligne,dir,id_voy,dep_pl,dep_rl,arr_pl,arr_rl
0,2021-10-05,100.0,Est,35081296,2021-10-05 00:10:00,2021-10-05 00:10:07,2021-10-05 00:45:00,2021-10-05 00:44:54
1,2021-10-05,100.0,Est,35080889,2021-10-05 00:42:00,2021-10-05 00:42:20,2021-10-05 01:17:00,2021-10-05 01:14:05
2,2021-10-05,100.0,Est,35083075,2021-10-05 01:14:00,2021-10-05 01:13:51,2021-10-05 01:49:00,2021-10-05 01:44:43
3,2021-10-05,100.0,Est,35080237,2021-10-05 05:06:00,2021-10-05 05:06:21,2021-10-05 05:41:00,2021-10-05 05:39:21
4,2021-10-05,100.0,Est,35081071,2021-10-05 05:39:00,2021-10-05 05:41:07,2021-10-05 06:14:00,2021-10-05 06:13:06


In [145]:
#reset the index before adding the new 'id' column
STM_df = STM_df.reset_index(drop=True)

#add an index with the name 'id' for which the numbers start with a period
STM_df['id'] = STM_df.index + 1
STM_df['id'] = STM_df['id'].apply(lambda x: '.' + str(x))

# Verify that the index is correct
print(STM_df['id'].head())

#move index to the first column
cols = list(STM_df.columns)
cols = [cols[-1]] + cols[:-1]
STM_df = STM_df[cols]

STM_df.head()


0    .1
1    .2
2    .3
3    .4
4    .5
Name: id, dtype: object


Unnamed: 0,id,date,ligne,dir,id_voy,dep_pl,dep_rl,arr_pl,arr_rl
0,0.1,2021-10-05,100.0,Est,35081296,2021-10-05 00:10:00,2021-10-05 00:10:07,2021-10-05 00:45:00,2021-10-05 00:44:54
1,0.2,2021-10-05,100.0,Est,35080889,2021-10-05 00:42:00,2021-10-05 00:42:20,2021-10-05 01:17:00,2021-10-05 01:14:05
2,0.3,2021-10-05,100.0,Est,35083075,2021-10-05 01:14:00,2021-10-05 01:13:51,2021-10-05 01:49:00,2021-10-05 01:44:43
3,0.4,2021-10-05,100.0,Est,35080237,2021-10-05 05:06:00,2021-10-05 05:06:21,2021-10-05 05:41:00,2021-10-05 05:39:21
4,0.5,2021-10-05,100.0,Est,35081071,2021-10-05 05:39:00,2021-10-05 05:41:07,2021-10-05 06:14:00,2021-10-05 06:13:06


<div style="background-color:rgba(255, 165, 0, 0.10); padding:10px;">
Enable this cell to store the STM data as a csv file
</div>

In [319]:
STM_df.to_csv('../Data/Transit data/STM_Data.csv', index=False)

<div style="background-color:rgba(255, 165, 0, 0.10); padding:10px;">
</div>

<br><br><br><br><br>

## Weather Data Manipulation

1) Daily data for snow

<div style="background-color:rgba(255, 165, 0, 0.10); padding:10px;">
This code was used to retrieve all weather data from the Government of Canada website
</div>

2) Hourly data for precipitation

## Master data

In [343]:
from datetime import datetime, timedelta

In [344]:
# Create a new dataframe for the master data with columns from the stm data

df_master = STM_df[['id','date', 'ligne', 'dep_rl',]]
df_hourlyw = pd.read_csv('../Data/Weather Data/hourly_montreal_weather.csv',  dtype={0: str, 2: str, 3: str, 4: str, 5: str, 6: str, 7: str})
df_dailyw = pd.read_csv('../Data/Weather Data/daily_montreal_weather.csv',  dtype={0: str, 2: str, 3: str, 4: str, 5: str, 6: str, 7: str})

df_master = df_master.rename(columns={'dep_rl': 'start_time', 'ligne': 'line'})
df_hourlyw['date'] = pd.to_datetime(df_hourlyw['date'], errors='coerce')
df_dailyw['date'] = pd.to_datetime(df_dailyw['date'], errors='coerce')

#cut the date from the start_time column and keep only the time as HH:MM

df_master['start_time'] = df_master['start_time'].astype(str)
df_master['start_time'] = df_master['start_time'].str.slice(start=11, stop=16)
df_master['start_time'] = pd.to_datetime(df_master['start_time'], format='%H:%M', errors='coerce').dt.time


df_master.head()

Unnamed: 0,id,date,line,start_time
0,0.1,2021-10-05,100.0,00:10:00
1,0.2,2021-10-05,100.0,00:42:00
2,0.3,2021-10-05,100.0,01:13:00
3,0.4,2021-10-05,100.0,05:06:00
4,0.5,2021-10-05,100.0,05:41:00


In [345]:
#find nan values in the start_time column
df_master[df_master['start_time'].isnull()]
df_master = df_master.dropna(subset=['start_time'])


In [346]:
# Add a actual temperature at start column (temp_start) to the master data, using the hourly weather data which matches the date and hour of the start time
# Must create an hour columm in the master data to match the time column in the weather data (HH:MM)
# Must match the start hour with the next hour in the weather data since the weather time is the end of the hour

df_master['hour'] = df_master['start_time'].astype(str)
df_master['hour'] = df_master['hour'].str.slice(start=0, stop=2)
df_master['hour'] = df_master['hour'].astype(int)
df_master['hour'] = df_master['hour'] + 1
df_master['hour'] = df_master['hour'].astype(str)
df_master['hour'] = df_master['hour'].str.pad(width=2, side='left', fillchar='0')
df_master['hour'] = df_master['hour'] + ':00'

df_master = pd.merge(df_master, df_hourlyw, how='left', left_on=['date', 'hour'], right_on=['date', 'time'])
df_master = df_master.rename(columns={'time': 'weather_time', 'temp': 'temp_start'})
df_master = df_master.drop(columns=['hour', 'weather_time'])

df_master.head()

Unnamed: 0,id,date,line,start_time,temp_start,precip
0,0.1,2021-10-05,100.0,00:10:00,15.1,0.0
1,0.2,2021-10-05,100.0,00:42:00,15.1,0.0
2,0.3,2021-10-05,100.0,01:13:00,13.7,0.0
3,0.4,2021-10-05,100.0,05:06:00,11.8,0.0
4,0.5,2021-10-05,100.0,05:41:00,11.8,0.0


In [347]:
# Add rain and snow amount columns to the master data, using the daily weather data which matches the date
# For the snow_amt column, if the snow_yn column is Y, then take the value of the precip column, otherwise set to 0
# For the rain_amt column, if the snow_yn column is N, then take the value of the precip column, otherwise set to 0

df_master = pd.merge(df_master, df_dailyw, how='left', left_on=['date'], right_on=['date'])
df_master = df_master.rename(columns={'precip': 'rain_amt', 'snow': 'total_snow'})
df_master['snow_amt'] = np.where(df_master['snow_yn'] == 'Y', df_master['rain_amt'], 0)
df_master['rain_amt'] = np.where(df_master['snow_yn'] == 'N', df_master['rain_amt'], 0)
df_master = df_master.drop(columns=['snow_yn',])

df_master.head()


Unnamed: 0,id,date,line,start_time,temp_start,rain_amt,total_snow,snow_amt
0,0.1,2021-10-05,100.0,00:10:00,15.1,0.0,0.0,0
1,0.2,2021-10-05,100.0,00:42:00,15.1,0.0,0.0,0
2,0.3,2021-10-05,100.0,01:13:00,13.7,0.0,0.0,0
3,0.4,2021-10-05,100.0,05:06:00,11.8,0.0,0.0,0
4,0.5,2021-10-05,100.0,05:41:00,11.8,0.0,0.0,0


In [348]:
#import dep_pl, dep_rl, arr_pl, arr_rl columns from the stm data to the master data
# cut the seconds off from the time columns

df_master = pd.merge(df_master, STM_df[['id', 'dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']], how='left', left_on=['id'], right_on=['id'])

for col in ['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']:
    df_master[col] = df_master[col].astype(str)
    df_master[col] = df_master[col].str.slice(start=0, stop=16)
    df_master[col] = df_master[col] + ':00'
    df_master[col] = pd.to_datetime(df_master[col], format='%Y-%m-%d %H:%M:%S', errors='coerce')

#find the delays in minutes between the planned and real departure times in minutes
df_master['delay_start'] = (df_master['dep_rl'] - df_master['dep_pl']).dt.total_seconds() / 60
df_master['delay_start'] = df_master['delay_start'].astype(int)

df_master['delay_end'] = (df_master['arr_rl'] - df_master['arr_pl']).dt.total_seconds() / 60
df_master['delay_end'] = df_master['delay_end'].astype(int)

df_master = df_master.drop(columns=['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl'])

df_master.head()


Unnamed: 0,id,date,line,start_time,temp_start,rain_amt,total_snow,snow_amt,delay_start,delay_end
0,0.1,2021-10-05,100.0,00:10:00,15.1,0.0,0.0,0,0,-1
1,0.2,2021-10-05,100.0,00:42:00,15.1,0.0,0.0,0,0,-3
2,0.3,2021-10-05,100.0,01:13:00,13.7,0.0,0.0,0,-1,-5
3,0.4,2021-10-05,100.0,05:06:00,11.8,0.0,0.0,0,0,-2
4,0.5,2021-10-05,100.0,05:41:00,11.8,0.0,0.0,0,2,-1


In [349]:
#reorganize columns

cols = df_master.columns.tolist()
id_col = cols[0]
date_col = cols[1]
line_col = cols[2]
delay_start_col = cols[9]
cols = [col for col in cols if col not in {id_col,date_col, line_col, delay_start_col}]

cols = [id_col,date_col, line_col, delay_start_col] + cols
df_master = df_master[cols]

df_master.head()


Unnamed: 0,id,date,line,delay_end,start_time,temp_start,rain_amt,total_snow,snow_amt,delay_start
0,0.1,2021-10-05,100.0,-1,00:10:00,15.1,0.0,0.0,0,0
1,0.2,2021-10-05,100.0,-3,00:42:00,15.1,0.0,0.0,0,0
2,0.3,2021-10-05,100.0,-5,01:13:00,13.7,0.0,0.0,0,-1
3,0.4,2021-10-05,100.0,-2,05:06:00,11.8,0.0,0.0,0,0
4,0.5,2021-10-05,100.0,-1,05:41:00,11.8,0.0,0.0,0,2


In [350]:
# verify that there are no delays whose absolute value is greater than 10h
# print the whole list and whole row instead of truncating
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

print(df_master.loc[abs(df_master['delay_start']) > 600])
print(df_master.loc[abs(df_master['delay_end']) > 600])

             id       date   line  delay_end start_time temp_start rain_amt  \
14637    .14638 2021-10-20  139.0          2   07:50:00        6.9      0.0   
39737    .39719 2021-11-15  460.0         -2   07:45:00        3.0      0.2   
167836  .167821 2022-03-20  139.0         -1   17:11:00        4.2        0   
211850  .211835 2022-05-03   67.0          0   22:35:00       14.2      0.0   
271315  .271300 2022-07-06  460.0          7   06:16:00       16.8      0.0   
314365  .314350 2022-08-24   80.0         -2   23:08:00        NaN      NaN   
361995  .361980 2022-10-15  121.0         -1   21:42:00       12.8      0.0   
538933  .538897 2023-04-08   80.0         -4   23:50:00        NaN      NaN   

        total_snow snow_amt  delay_start  
14637          0.0        0         -834  
39737          0.0        0         -706  
167836         2.0      0.8         1018  
211850         0.0        0         1326  
271315         0.0        0         -629  
314365         0.0        0   

In [351]:
# show unique line values
print(df_master['line'].unique())
print(STM_df['ligne'].unique())
#seems like the STM did not give me data for the 193 line ._.

[100. 121. 139. 439. 460. 467. 480.  67.  80.]
[100. 121. 139. 439. 460. 467. 480.  67.  80.]


In [352]:
df_master.head()

Unnamed: 0,id,date,line,delay_end,start_time,temp_start,rain_amt,total_snow,snow_amt,delay_start
0,0.1,2021-10-05,100.0,-1,00:10:00,15.1,0.0,0.0,0,0
1,0.2,2021-10-05,100.0,-3,00:42:00,15.1,0.0,0.0,0,0
2,0.3,2021-10-05,100.0,-5,01:13:00,13.7,0.0,0.0,0,-1
3,0.4,2021-10-05,100.0,-2,05:06:00,11.8,0.0,0.0,0,0
4,0.5,2021-10-05,100.0,-1,05:41:00,11.8,0.0,0.0,0,2


In [353]:
df_master.to_csv('../Data/master_data.csv', index=False)