In [2]:
import pandas as pd
from tqdm import tqdm

In [3]:
mrt_master_df = pd.read_csv('Data/Master Sets/TrainStationMaster.csv')

In [4]:
mrt_master_df.head()

Unnamed: 0.1,Unnamed: 0,YEAR_MONTH,DAY_TYPE,TIME_PER_HOUR,PT_TYPE,PT_CODE,TOTAL_TAP_IN_VOLUME,TOTAL_TAP_OUT_VOLUME
0,0,2024-03,WEEKDAY,11,TRAIN,NS7,21943,15130
1,1,2024-03,WEEKENDS/HOLIDAY,11,TRAIN,NS7,13062,14512
2,2,2024-03,WEEKDAY,16,TRAIN,SW4,2124,3233
3,3,2024-03,WEEKENDS/HOLIDAY,16,TRAIN,SW4,1532,1736
4,4,2024-03,WEEKENDS/HOLIDAY,10,TRAIN,CC5,1886,2659


In [51]:
station_network_list = mrt_master_df['PT_CODE'].unique()
station_network_list.sort()
time_interval = mrt_master_df['YEAR_MONTH'].unique()
time_interval.sort()
hour_interval = mrt_master_df['TIME_PER_HOUR'].unique()
hour_interval.sort()

In [52]:
def compute_time_series(col: str):

    if col not in mrt_master_df:
        raise ValueError('Column does not exist')
    temp_dict = {}

    for station in tqdm(station_network_list):
        for time in time_interval:
            temp_df = mrt_master_df[(mrt_master_df['PT_CODE'] == station) & (mrt_master_df['YEAR_MONTH'] == time)]
            val = temp_df[col].mean() # Takes mean of all rows
            if station not in temp_dict:
                temp_dict[station] = {}
            temp_dict[station][time] = val
    return temp_dict

def compute_time_series_with_hour(col: str):

    if col not in mrt_master_df:
        raise ValueError('Column does not exist')
    temp_dict = {}

    for station in tqdm(station_network_list):
        for time in time_interval:
            for hour in hour_interval:
                temp_df = mrt_master_df[(mrt_master_df['PT_CODE'] == station) & (mrt_master_df['YEAR_MONTH'] == time) & (mrt_master_df['TIME_PER_HOUR'] == hour)]
                val = temp_df[col].mean() # Takes mean of all rows
                if station not in temp_dict:
                    temp_dict[station] = {}
                temp_dict[station][f'{time} H{hour}:00'] = val
    return temp_dict

In [53]:
ts_matrix = pd.DataFrame(compute_time_series('TOTAL_TAP_IN_VOLUME'))
ts_matrix_hourly = pd.DataFrame(compute_time_series_with_hour('TOTAL_TAP_IN_VOLUME'))

  0%|          | 0/182 [00:00<?, ?it/s]

100%|██████████| 182/182 [00:15<00:00, 11.83it/s]
100%|██████████| 182/182 [05:03<00:00,  1.67s/it]


In [54]:
# Shortcut solution to set all NA values to 0
ts_matrix.fillna(0, inplace=True)
ts_matrix_hourly.fillna(0, inplace=True)

# TODO Imputation for NA values

In [55]:
ts_matrix.to_csv('Data\Master Sets\StationTimeSeries_TapInVolume.csv')
ts_matrix_hourly.to_csv('Data\Master Sets\StationTimeSeries_TapInVolume_Hourly.csv')