<div style="display: flex; justify-content: space-between;">
<a style="flex: 1; text-align: left;" href="./2_5_Model_building.ipynb">← Previous: 2.5 Model Building</a>
<a style="flex: 1; text-align: right;" href="./3_1_2_Weather.ipynb">Next: 3.1.2 Weather →</a>
</div>

### 3.1.1 Bus
---


In [1]:
import pandas as pd

In [2]:
# Merge STM files
# Columns 0 and 2-7 (addressed by position) are read as text in both exports.
_text_columns = dict.fromkeys((0, 2, 3, 4, 5, 6, 7), str)

# Only the 2021-2022 export has its completely empty rows dropped.
STM_1_df = pd.read_csv(
    '../Data/Transit data/STM_Data_2021_2022.csv',
    dtype=_text_columns,
).dropna(how='all')
STM_2_df = pd.read_csv(
    '../Data/Transit data/STM_Data_2023.csv',
    dtype=_text_columns,
)

# Stack the two exports under a fresh 0..n-1 index
STM_df = pd.concat([STM_1_df, STM_2_df], ignore_index=True)

# Remove duplicate rows, keeping the discarded ones available for inspection
original_df = STM_df.copy()
STM_df = STM_df.drop_duplicates()
# Whatever index labels did not survive de-duplication are the removed rows
removed_rows = original_df.drop(index=STM_df.index)

STM_df.head()

Unnamed: 0,date,ligne,dir,id_voy,dep_pl,dep_rl,arr_pl,arr_rl
0,2021/10/05,100.0,Est,35081296,00:10:00,00:10:07,00:45:00,00:44:54
1,2021/10/05,100.0,Est,35080889,00:42:00,00:42:20,01:17:00,01:14:05
2,2021/10/05,100.0,Est,35083075,01:14:00,01:13:51,01:49:00,01:44:43
3,2021/10/05,100.0,Est,35080237,05:06:00,05:06:21,05:41:00,05:39:21
4,2021/10/05,100.0,Est,35081071,05:39:00,05:41:07,06:14:00,06:13:06


In [3]:
# Date and time formatting
# The date column arrives as 'YYYY/MM/DD' text and the four time columns as 'HH:MM:SS' text.
# Both are converted to datetime values (each time placed on its row's date) so delays can be found later.

# Date: 'YYYY/MM/DD' -> 'YYYY-MM-DD' -> datetime; values that cannot be parsed become NaT
try:
    STM_df['date'] = pd.to_datetime(STM_df['date'].str.replace('/', '-', regex=False), errors='coerce')
except Exception as e:
    print(f"An error occurred while converting the Date: {e}")

# Times: 'HH:MM:SS' -> 'YYYY-MM-DD HH:MM:SS' on the row's date
try:
    # Works whether the date conversion above succeeded or left the column as text
    service_day = pd.to_datetime(STM_df['date'], errors='coerce')
    timestamp_columns = {}
    for col in ['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']:
        # errors='coerce' (as used for the date) turns a malformed time into NaT for that row only,
        # instead of raising and abandoning the conversion of this and every remaining column.
        clock = pd.to_datetime(STM_df[col], format='%H:%M:%S', errors='coerce')
        unparsed = int((clock.isna() & STM_df[col].notna()).sum())
        if unparsed:
            print(f"{col}: {unparsed} value(s) did not match HH:MM:SS and were set to NaT")
        # `clock` carries a placeholder date; subtracting its own midnight leaves the time of day,
        # which is then added to the row's date.
        timestamp_columns[col] = service_day + (clock - clock.dt.normalize())
    # Assign only once all four columns converted, so a failure cannot leave a mix of text and datetimes
    for col, stamps in timestamp_columns.items():
        STM_df[col] = stamps
except Exception as e:
    print(f"An error occurred while converting the Time: {e}")

# Some datapoints have a trip that was scheduled for a day but actually occurred the next day, this is compensated for as follows:
#   If the time in pl column is after 23:00 and in rl column is before 1:00, add 1 day to the date in rl column
#   -> This is to account for arrivals that were scheduled for late night but actually occurred early morning
#   If the time in pl column is before 1:00 and in rl column is after 23:00, subtract 1 day from the date in rl column
#   -> This is to account for arrivals that were scheduled for early morning but actually occurred late the previous night

def adjust_dates(row):
    """Move an 'rl' timestamp by one day when it lies across midnight from its 'pl' counterpart.

    For each pair (dep_pl, dep_rl) and (arr_pl, arr_rl):
      * 'pl' hour is 23 and 'rl' hour is 0  -> one day is added to 'rl';
      * 'pl' hour is 0 and 'rl' hour is 23  -> one day is subtracted from 'rl';
      * otherwise 'rl' is left as is (this includes NaT, whose hour compares False).

    Parameters
    ----------
    row : mapping with a ``copy()`` method (e.g. a pandas Series)
        Must hold timestamp-like values under 'dep_pl', 'dep_rl', 'arr_pl' and 'arr_rl'.

    Returns
    -------
    A copy of ``row`` with 'dep_rl' / 'arr_rl' shifted where needed. The input is not modified,
    which keeps the function safe to use with ``DataFrame.apply``.
    """
    adjusted = row.copy()
    one_day = pd.Timedelta(days=1)

    for planned, actual in (('dep_pl', 'dep_rl'), ('arr_pl', 'arr_rl')):
        planned_hour = row[planned].hour
        actual_hour = row[actual].hour
        if planned_hour >= 23 and actual_hour < 1:
            # Planned late at night, observed after midnight
            adjusted[actual] = row[actual] + one_day
        elif planned_hour < 1 and actual_hour >= 23:
            # Planned after midnight, observed late the previous night
            adjusted[actual] = row[actual] - one_day

    return adjusted

# Vectorised equivalent of applying adjust_dates to every row: the same midnight rule is evaluated
# on whole columns at once, which avoids a Python-level call per row and also works on an empty frame.
for planned, actual in (('dep_pl', 'dep_rl'), ('arr_pl', 'arr_rl')):
    planned_hour = STM_df[planned].dt.hour
    actual_hour = STM_df[actual].dt.hour
    # Planned at 23:xx, observed at 00:xx -> observation falls on the following day
    rolled_forward = (planned_hour >= 23) & (actual_hour < 1)
    # Planned at 00:xx, observed at 23:xx -> observation falls on the previous day
    rolled_back = (planned_hour < 1) & (actual_hour >= 23)
    # +1, -1 or 0 days per row; rows with a missing time compare False in both masks and stay NaT
    day_shift = pd.to_timedelta(rolled_forward.astype(int) - rolled_back.astype(int), unit='D')
    STM_df[actual] = STM_df[actual] + day_shift

# NOTE(review): the '%H:%M' formatting below keeps only hour and minute, so the day shift applied
# above is not retained in the stored columns - confirm whether later delay calculations need it.

# Slice the time to only keep HH:MM
try:
    # Format every column first and assign afterwards, so a failure cannot leave the frame half-formatted
    hhmm_columns = {
        col: STM_df[col].dt.strftime('%H:%M')
        for col in ['dep_pl', 'dep_rl', 'arr_pl', 'arr_rl']
    }
    for col, formatted in hhmm_columns.items():
        STM_df[col] = formatted
except Exception as e:
    print(f"An error occurred while formatting the Time: {e}")

STM_df.head()

Unnamed: 0,date,ligne,dir,id_voy,dep_pl,dep_rl,arr_pl,arr_rl
0,2021-10-05,100.0,Est,35081296,00:10,00:10,00:45,00:44
1,2021-10-05,100.0,Est,35080889,00:42,00:42,01:17,01:14
2,2021-10-05,100.0,Est,35083075,01:14,01:13,01:49,01:44
3,2021-10-05,100.0,Est,35080237,05:06,05:06,05:41,05:39
4,2021-10-05,100.0,Est,35081071,05:39,05:41,06:14,06:13


In [4]:
# Indexing

# Reset the index so the ids below are consecutive even though duplicate rows were dropped earlier
STM_df = STM_df.reset_index(drop=True)

# Add an 'id' column: the 1-based row number as text with a leading period ('.1', '.2', ...)
# NOTE(review): '.1' and '.10' are distinct only as text; presumably anything reading this column
# back from a file loads it as a string - confirm.
STM_df['id'] = [f'.{row_number}' for row_number in range(1, len(STM_df) + 1)]

# Move 'id' to the first column. It is selected by name rather than by "last position", so
# re-running this cell (when 'id' is already in front) no longer moves a different column forward.
STM_df = STM_df[['id'] + [col for col in STM_df.columns if col != 'id']]

STM_df.head()


Unnamed: 0,id,date,ligne,dir,id_voy,dep_pl,dep_rl,arr_pl,arr_rl
0,0.1,2021-10-05,100.0,Est,35081296,00:10,00:10,00:45,00:44
1,0.2,2021-10-05,100.0,Est,35080889,00:42,00:42,01:17,01:14
2,0.3,2021-10-05,100.0,Est,35083075,01:14,01:13,01:49,01:44
3,0.4,2021-10-05,100.0,Est,35080237,05:06,05:06,05:41,05:39
4,0.5,2021-10-05,100.0,Est,35081071,05:39,05:41,06:14,06:13


In [None]:
# Uncomment to save the dataframe as a csv file
# STM_df.to_csv('../Data/Transit data/STM_Data.csv', index=False)

<div style="display: flex; justify-content: space-between;">
<a style="flex: 1; text-align: left;" href="./2_5_Model_building.ipynb">← Previous: 2.5 Model Building</a>
<span style="flex: 1; text-align: center;">3.1.1 Bus</span>
<a style="flex: 1; text-align: right;" href="./3_1_2_Weather.ipynb">Next: 3.1.2 Weather →</a>
</div>
