# **Data Wrangling**

> The purpose of this script is to modify the bus data provided by the STM and the historical weather data obtained from the Government of Canada site.  
> Running this script is not necessary as the modified data is available upon download/cloning of the repository.

In [None]:
import os
import pandas as pd
import numpy as np
import time
import urllib.request
import glob
from datetime import datetime, timedelta

## STM Data

> This data was provided by the STM under the terms of the *Act respecting access to documents held by public bodies and the protection of personal information (R.S.Q., chapter A-2.1)*.

In [None]:
# Merge STM files
# Columns 0 and 2-7 are read as strings so their values are kept exactly as written in the files.
stm_dtypes = {position: str for position in (0, 2, 3, 4, 5, 6, 7)}

STM_1_df = pd.read_csv('../Data/Transit data/STM_Data_2021_2022.csv', dtype=stm_dtypes)
STM_2_df = pd.read_csv('../Data/Transit data/STM_Data_2023.csv', dtype=stm_dtypes)
STM_1_df = STM_1_df.dropna(how='all')
STM_df = pd.concat([STM_1_df, STM_2_df], ignore_index=True)

# Remove duplicate rows
# A single boolean mask identifies every repeat of an earlier row (the first occurrence is kept).
# This avoids copying the whole dataset just to work out which rows were dropped.
is_duplicate = STM_df.duplicated()
removed_rows = STM_df[is_duplicate]
STM_df = STM_df[~is_duplicate]

STM_df.head()

In [None]:
# Date and time formatting
# The four schedule columns hold HH:MM:SS strings; each one is combined with the service date
# to build a full timestamp, which is what the delay calculation needs later on.

# Service date: swap the YYYY/MM/DD separators for dashes and parse (unparseable values become NaT)
try:
    STM_df['date'] = pd.to_datetime(STM_df['date'].str.replace('/', '-'), errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Date: {e}")

# Schedule columns: HH:MM:SS -> YYYY-MM-DD HH:MM:SS
try:
    for col in ['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']:
        # Strict parse of the clock time, rendered back as text so it can be glued to the date
        clock_text = pd.to_datetime(STM_df[col], format='%H:%M:%S').dt.time.astype(str)
        stamp_text = STM_df['date'].astype(str) + ' ' + clock_text
        STM_df[col] = pd.to_datetime(stamp_text, errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Time: {e}")

# Some datapoints have a trip that was scheduled for a day but actually occurred the next day, this is compensated for as follows:

#   If the time in pl column is after 23:00 and in rl column is before 1:00, add 1 day to the date in rl column
#   -> This is to account for arrivals that were scheduled for late night but actually occurred early morning
#   If the time in pl column is before 1:00 and in rl column is after 23:00, subtract 1 day from the date in rl column
#   -> This is to account for arrivals that were scheduled for early morning but actually occurred late the previous night

def adjust_dates(row: pd.Series) -> pd.Series:
    """Shift actual timestamps that fall on the other side of midnight from their plan.

    For both the departure pair (dep_pl / dep_rl) and the arrival pair
    (arr_pl / arr_rl):

    * planned hour >= 23 and actual hour < 1: one day is added to the actual time;
    * planned hour < 1 and actual hour >= 23: one day is subtracted from the actual time.

    Args:
        row: A row exposing 'dep_pl', 'dep_rl', 'arr_pl' and 'arr_rl' as
            timestamp-like values with an ``hour`` attribute.

    Returns:
        A copy of ``row`` with the corrected 'dep_rl' / 'arr_rl' values. The
        input row is left untouched, because ``DataFrame.apply`` does not
        support functions that mutate the object they are given.
    """
    one_day = pd.Timedelta(days=1)
    adjusted = row.copy()

    for planned_col, actual_col in (('dep_pl', 'dep_rl'), ('arr_pl', 'arr_rl')):
        planned_hour = adjusted[planned_col].hour
        actual_hour = adjusted[actual_col].hour

        if planned_hour >= 23 and actual_hour < 1:
            adjusted[actual_col] = adjusted[actual_col] + one_day
        elif planned_hour < 1 and actual_hour >= 23:
            adjusted[actual_col] = adjusted[actual_col] - one_day

    return adjusted

# Apply the midnight correction to every row
STM_df = STM_df.apply(adjust_dates, axis=1)

# Drop the seconds so the timestamps are kept at minute (HH:MM) precision.
# The four columns hold datetimes at this point, so string slicing (`.str`) cannot be used on them:
# it raises, and cutting at 5 characters would also discard the date that later steps rely on.
# Flooring to the minute keeps the full YYYY-MM-DD HH:MM:00 timestamp.
try:
    time_cols = ['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']
    STM_df[time_cols] = STM_df[time_cols].apply(
        lambda col: pd.to_datetime(col, errors='coerce').dt.floor('min')
    )
except (AttributeError, TypeError, ValueError) as e:
    print(f"An error occurred while slicing the Time: {e}")

STM_df.head()

In [None]:
# Indexing

# Start from a clean 0..n-1 index so the generated ids are consecutive
STM_df = STM_df.reset_index(drop=True)

# Build the 'id' column: the 1-based row number prefixed with a period ('.1', '.2', ...)
STM_df['id'] = [f'.{row_number}' for row_number in STM_df.index + 1]

# The new column was appended last; rotate it to the front
column_order = STM_df.columns.tolist()
STM_df = STM_df[column_order[-1:] + column_order[:-1]]

STM_df.head()


In [None]:
# Uncomment to save the dataframe as a csv file
# STM_df.to_csv('../Data/Transit data/STM_Data.csv', index=False)

## Weather Data

>The weather data is retrieved in the following cells from the Environment Canada website. \
>More information at *https://climate.weather.gc.ca/historical_data/search_historic_data_e.html* 

In [None]:
# Change the language of the following cell to Python to enable the extraction of the data from the website

1. Daily data for snow

In [None]:
# Load the daily weather file for formatting.
# Snow on ground only exists in the daily data, so the daily file decides whether snow was present (Y/N);
# that flag is attached to the hourly-based master data further down.

daily_dtypes = {position: str for position in (0, 2, 3, 4, 5, 6, 7)}
df_dailyw = (
    pd.read_csv('../Data/Weather Data/daily_montreal_weather.csv', dtype=daily_dtypes)
    .dropna(axis=1, how='all')  # discard columns that are empty for every day
    .rename(columns={'Date/Time': 'date', 'Snow on Grnd (cm)': 'snow'})
)

In [None]:
# Date and time formatting
# Parse the date column (unparseable values become NaT), then keep only days present in the STM data
try:
    df_dailyw['date'] = pd.to_datetime(df_dailyw['date'], errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Date: {e}")

day_in_stm = df_dailyw['date'].isin(STM_df['date'])
df_dailyw = df_dailyw[day_in_stm]

In [None]:
# Keep only the date and snow columns, in that order.
# Selecting by name does not depend on how many columns the source file happens to have,
# and .copy() gives an independent frame so the assignments below do not write into a filtered view.
df_dailyw = df_dailyw[['date', 'snow']].copy()

# Missing snow-on-ground readings are treated as no snow
df_dailyw['snow'] = df_dailyw['snow'].fillna(0)

# snow_yn is 'Y' when there is snow on the ground (> 0) and 'N' otherwise
df_dailyw['snow_yn'] = np.where(df_dailyw['snow'] > 0, 'Y', 'N')

In [None]:
#Uncomment to save the dataframe as a csv file
#df_dailyw.to_csv('../Data/Weather Data/daily_montreal_weather.csv', index=False)

2. Hourly data for precipitation

In [None]:
# Load the hourly weather file for formatting.
# Its timestamps are in UTC; they are converted to local time in the next cell.
hourly_dtypes = {position: str for position in (0, 2, 3, 4, 5, 6, 7)}
hourly_renames = {
    'Date/Time (UTC)': 'date',
    'Time (UTC)': 'time',
    'Temp (°C)': 'temp',
    'Precip. Amount (mm)': 'precip',
}
df_hourlyw = (
    pd.read_csv('../Data/Weather Data/hourly_montreal_weather.csv', dtype=hourly_dtypes)
    .dropna(axis=1, how='all')  # discard columns that are empty for every hour
    .rename(columns=hourly_renames)
)

In [None]:
# Parse the UTC timestamps and convert them to Montreal local time (daylight saving time included),
# then split the result into a calendar `date` column and an HH:MM `time` column.
try:
    df_hourlyw['date'] = pd.to_datetime(df_hourlyw['date'], errors='coerce')
except (ValueError, TypeError) as e:
    print(f"An error occurred while converting the Date: {e}")

local_time = df_hourlyw['date'].dt.tz_localize('UTC').dt.tz_convert('America/Montreal')
df_hourlyw['time'] = local_time.dt.strftime('%H:%M')
# Drop the timezone and the time of day: what is left is the local calendar date at midnight
df_hourlyw['date'] = local_time.dt.tz_localize(None).dt.normalize()

# Remove rows with dates that are not in the STM data
df_hourlyw = df_hourlyw[df_hourlyw['date'].isin(STM_df['date'])]

# Keep only the date, time, temp and precip columns, in that order.
# Selecting by name does not depend on the column layout of the source file.
df_hourlyw = df_hourlyw[['date', 'time', 'temp', 'precip']]

In [None]:
#Uncomment to save the dataframe as a csv file
#df_hourlyw.to_csv('../Data/Weather Data/hourly_montreal_weather.csv', index=False)

## Master data

> The master data used for this project includes the following:
> - Bus data
>   - Date*
>   - Line
>   - Start time
>   - Start delay
>   - End time*
> - Weather data
>   - Temperature at start 
>   - Rain amount
>   - Snow amount
>   - Total snow*
>
> These variables will be used to predict the end delay.  
> *: These variables are included in the master data for later use, but they are not used as predictors

In [None]:
from datetime import datetime, timedelta

1. Bus data (date, line, start_time, end_time)

In [None]:
# Reload the prepared STM and weather files, then seed the master data with the bus columns
STM_df = pd.read_csv(
    '../Data/Transit data/STM_Data.csv',
    dtype={position: str for position in (0, 2, 3, 4, 5, 6, 7, 8)},
)
weather_dtypes = {position: str for position in (0, 2, 3, 4, 5, 6, 7)}
df_hourlyw = pd.read_csv('../Data/Weather Data/hourly_montreal_weather.csv', dtype=weather_dtypes)
df_dailyw = pd.read_csv('../Data/Weather Data/daily_montreal_weather.csv', dtype=weather_dtypes)

# Bus columns, renamed to the master-data vocabulary
df_master = STM_df[['id', 'date', 'ligne', 'dep_rl', 'arr_rl']].rename(
    columns={'dep_rl': 'start_time', 'arr_rl': 'end_time', 'ligne': 'line'}
)

# Weather dates as datetimes (unparseable values become NaT)
for weather_df in (df_hourlyw, df_dailyw):
    weather_df['date'] = pd.to_datetime(weather_df['date'], errors='coerce')

# Characters 11-15 of 'YYYY-MM-DD HH:MM:SS' are the HH:MM part; keep it as a time-of-day value
for col in ['start_time', 'end_time']:
    hh_mm = df_master[col].astype(str).str.slice(start=11, stop=16)
    df_master[col] = pd.to_datetime(hh_mm, format='%H:%M', errors='coerce').dt.time

# Rows without a usable start time are removed
df_master = df_master.dropna(subset=['start_time'])

df_master.head()

2. Temperature at start (temp_start)

In [None]:
# Add the temp_start column to the master data from the hourly weather data.
# The start hour is matched with the NEXT hour of the weather data, since the weather time marks the end of the hour.

df_master['date'] = pd.to_datetime(df_master['date'])
df_hourlyw['date'] = pd.to_datetime(df_hourlyw['date'])

# Rename the time column in hourly data to hour
df_hourlyw = df_hourlyw.rename(columns={'time': 'hour'})

# Build the timestamp of the weather observation to look up: service date + (start hour + 1).
# Doing the +1 on a real timestamp lets a 23:xx start roll over to 00:00 of the following day
# instead of producing the non-existent hour '24:00', which would never find a weather row.
start_hour = df_master['start_time'].astype(str).str.slice(start=0, stop=2).astype(int)
observation_time = df_master['date'] + pd.to_timedelta(start_hour + 1, unit='h')
df_master['weather_date'] = observation_time.dt.normalize()
df_master['hour'] = observation_time.dt.strftime('%H:%M')

# One weather row per (date, hour): when clocks are set back in the fall the same local hour occurs twice,
# and a repeated key would duplicate the matching bus rows in the left merge below.
weather_lookup = (
    df_hourlyw.rename(columns={'date': 'weather_date'})
    .drop_duplicates(subset=['weather_date', 'hour'], keep='first')
)

df_master = pd.merge(df_master, weather_lookup, how='left', on=['weather_date', 'hour'])
df_master = df_master.rename(columns={'temp': 'temp_start'})
df_master = df_master.drop(columns=['weather_date', 'hour'])

df_master.head()

3. Precipitation (snow_amt, rain_amt)

In [None]:
# Attach the daily snow information (matched on date) and split the precipitation amount in two:
#   snow_amt = precipitation on days flagged snow_yn == 'Y', otherwise 0
#   rain_amt = precipitation on days flagged snow_yn == 'N', otherwise 0

df_master = df_master.merge(df_dailyw, how='left', on='date').rename(
    columns={'precip': 'rain_amt', 'snow': 'total_snow'}
)

precip_amount = df_master['rain_amt']  # still the raw precipitation at this point
snow_flag = df_master['snow_yn']
df_master['snow_amt'] = np.where(snow_flag == 'Y', precip_amount, 0)
df_master['rain_amt'] = np.where(snow_flag == 'N', precip_amount, 0)

# The Y/N flag was only needed for the split
df_master = df_master.drop(columns=['snow_yn'])

df_master.head()

4. Delays (delay_start, delay_end)

In [None]:
# Bring the planned/actual departure and arrival timestamps from the STM data into the master data (matched on id)
time_cols = ['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']
df_master = pd.merge(df_master, STM_df[['id'] + time_cols], how='left', on='id')

# Cut the seconds off: keep 'YYYY-MM-DD HH:MM', append ':00', then parse (unparseable values become NaT)
for col in time_cols:
    df_master[col] = df_master[col].astype(str).str.slice(start=0, stop=16) + ':00'
    df_master[col] = pd.to_datetime(df_master[col], format='%Y-%m-%d %H:%M:%S', errors='coerce')

# Delay in minutes = actual time - planned time.
# The nullable Int64 dtype is used so that a row with a missing timestamp yields <NA>
# instead of making the integer conversion raise on NaN.
for delay_col, actual_col, planned_col in (
    ('delay_start', 'dep_rl', 'dep_pl'),
    ('delay_end', 'arr_rl', 'arr_pl'),
):
    delay_minutes = (df_master[actual_col] - df_master[planned_col]).dt.total_seconds() / 60
    df_master[delay_col] = delay_minutes.round().astype('Int64')

df_master = df_master.drop(columns=time_cols)

df_master.head()


In [None]:
# Sanity check: list any row whose start or end delay exceeds 10 hours (600 minutes) in absolute value

# Show complete frames when printing
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

for delay_col in ('delay_start', 'delay_end'):
    print(df_master.loc[df_master[delay_col].abs() > 600])

5. Organization

In [None]:
# Put id, date, line and delay_start first and keep the remaining columns in their current order.
# The leading columns are picked by name, so the result does not depend on where delay_start
# happens to sit after the previous merges.
leading_cols = ['id', 'date', 'line', 'delay_start']
remaining_cols = [col for col in df_master.columns if col not in leading_cols]
df_master = df_master[leading_cols + remaining_cols]

df_master.head()


In [None]:
# Print the distinct bus lines found in the master data, then those in the STM data, for comparison
for frame, line_column in ((df_master, 'line'), (STM_df, 'ligne')):
    print(frame[line_column].unique())

In [None]:
# Note: it seems the STM did not provide data for line 193.

In [None]:
#Uncomment to save the dataframe as a csv file
#df_master.to_csv('../Data/master_data.csv', index=False)