In [81]:
import os
from pathlib import Path
import re
from tqdm import tqdm

import numpy as np
import pandas as pd
import gtfs_kit as gk

import logging

In [51]:
logging.basicConfig()
logger = logging.getLogger('graph_extraction')
logger.setLevel(logging.INFO)

# Generate date intervals

In [3]:
ORIGIN_DATA_DIR = Path('./sample_data/gtfs')
TARGET_DATA_DIR = Path('./sample_data/gtfs_reduced')
all_gtfs_files = [ORIGIN_DATA_DIR.joinpath(e) for e in os.listdir(ORIGIN_DATA_DIR) if Path(e).suffix == '.zip']
all_gtfs_files

[PosixPath('sample_data/gtfs/ov-gtfs-20200626-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip'),
 PosixPath('sample_data/gtfs/ov-gtfs-20210518-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip'),
 PosixPath('sample_data/gtfs/ov-gtfs-20190423-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip'),
 PosixPath('sample_data/gtfs/ov-gtfs-20191021-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip'),
 PosixPath('sample_data/gtfs/ov-gtfs-20190902-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip'),
 PosixPath('sample_data/gtfs/ov-gtfs-20201228-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip'),
 PosixPath('sample_data/gtfs/ov-gtfs-20201105-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip'),
 PosixPath('sample_data/gtfs/ov-gtfs-20200124-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip'),
 PosixPath('sample_data/gtfs/ov-gtfs-20200602-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip'),
 PosixPath('sample_data/gtfs/ov-gtfs-20210610-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RN

In [4]:
dates = [re.findall(r'\d{8}',str(path))[0] for path in all_gtfs_files]
series_dates = pd.to_datetime(pd.Series(dates)).dt.date
df_dates = pd.DataFrame({'GTFS_File': all_gtfs_files}, index=pd.DatetimeIndex(series_dates))
df_dates

Unnamed: 0,GTFS_File
2020-06-26,sample_data/gtfs/ov-gtfs-20200626-filtered-by-...
2021-05-18,sample_data/gtfs/ov-gtfs-20210518-filtered-by-...
2019-04-23,sample_data/gtfs/ov-gtfs-20190423-filtered-by-...
2019-10-21,sample_data/gtfs/ov-gtfs-20191021-filtered-by-...
2019-09-02,sample_data/gtfs/ov-gtfs-20190902-filtered-by-...
...,...
2021-05-03,sample_data/gtfs/ov-gtfs-20210503-filtered-by-...
2020-05-22,sample_data/gtfs/ov-gtfs-20200522-filtered-by-...
2020-02-25,sample_data/gtfs/ov-gtfs-20200225-filtered-by-...
2020-12-22,sample_data/gtfs/ov-gtfs-20201222-filtered-by-...


In [5]:
df_dates = df_dates.resample('D').ffill()
df_dates

Unnamed: 0,GTFS_File
2019-01-05,sample_data/gtfs/ov-gtfs-20190105-filtered-by-...
2019-01-06,sample_data/gtfs/ov-gtfs-20190105-filtered-by-...
2019-01-07,sample_data/gtfs/ov-gtfs-20190105-filtered-by-...
2019-01-08,sample_data/gtfs/ov-gtfs-20190108-filtered-by-...
2019-01-09,sample_data/gtfs/ov-gtfs-20190108-filtered-by-...
...,...
2021-12-26,sample_data/gtfs/ov-gtfs-20211223-filtered-by-...
2021-12-27,sample_data/gtfs/ov-gtfs-20211223-filtered-by-...
2021-12-28,sample_data/gtfs/ov-gtfs-20211228-filtered-by-...
2021-12-29,sample_data/gtfs/ov-gtfs-20211228-filtered-by-...


In [6]:
df_dates['Day'] = pd.Series(pd.Series(df_dates.index).dt.day_name().values, index=df_dates.index)
df_dates

Unnamed: 0,GTFS_File,Day
2019-01-05,sample_data/gtfs/ov-gtfs-20190105-filtered-by-...,Saturday
2019-01-06,sample_data/gtfs/ov-gtfs-20190105-filtered-by-...,Sunday
2019-01-07,sample_data/gtfs/ov-gtfs-20190105-filtered-by-...,Monday
2019-01-08,sample_data/gtfs/ov-gtfs-20190108-filtered-by-...,Tuesday
2019-01-09,sample_data/gtfs/ov-gtfs-20190108-filtered-by-...,Wednesday
...,...,...
2021-12-26,sample_data/gtfs/ov-gtfs-20211223-filtered-by-...,Sunday
2021-12-27,sample_data/gtfs/ov-gtfs-20211223-filtered-by-...,Monday
2021-12-28,sample_data/gtfs/ov-gtfs-20211228-filtered-by-...,Tuesday
2021-12-29,sample_data/gtfs/ov-gtfs-20211228-filtered-by-...,Wednesday


In [7]:
mondays = df_dates[df_dates['Day']=='Monday']
mondays

# Extracting GTFS for a specific date

In [97]:
curr_path = None

for entry in tqdm(mondays.iterrows()):
    # Check if gtfs file needs to be read in
    if curr_path != entry[1][0]:
        logger.info(f"reading in file: {entry[1][0]}")
        curr_path = entry[1][0]
        feed = gk.read_feed(curr_path, dist_units='km')

    # Extract current date
    d = entry[0].date()
    curr_date = d.isoformat().replace('-','')

    # Make sure date is in available dates
    try:
        assert curr_date in feed.get_dates()
    except:
        logger.warning(f"could not find date {curr_date} in {feed.get_dates()}")
        continue

    restricted_feed = feed.restrict_to_dates([curr_date])
    reduced_file_path = TARGET_DATA_DIR.joinpath(f'filtered-ov-gtfs-{curr_date}.zip')
    restricted_feed.write(reduced_file_path)

0it [00:00, ?it/s]INFO:graph_extraction:reading in file: sample_data/gtfs/ov-gtfs-20190105-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip
1it [00:06,  6.09s/it]INFO:graph_extraction:reading in file: sample_data/gtfs/ov-gtfs-20190110-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip
2it [00:11,  5.90s/it]INFO:graph_extraction:reading in file: sample_data/gtfs/ov-gtfs-20190115-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip
3it [00:17,  5.81s/it]INFO:graph_extraction:reading in file: sample_data/gtfs/ov-gtfs-20190128-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip
4it [00:23,  5.94s/it]INFO:graph_extraction:reading in file: sample_data/gtfs/ov-gtfs-20190204-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip
5it [00:30,  6.15s/it]INFO:graph_extraction:reading in file: sample_data/gtfs/ov-gtfs-20190206-filtered-by-GVB_IFF:GVB_IFF:NS_IFF:NSI_IFF:RNET.zip
6it [00:37,  6.47s/it]INFO:graph_extraction:reading in file: sample_data/gtfs/ov-gtfs-20190217-filtered-by-GVB_IFF:GVB_IFF