### This script combines data from GTFS files from multiple dates. It outputs combined shapes, routes, and trips files. It also outputs an aggregated "full trip stop schedule" file, which joins columns from several of the GTFS input files.

In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict

In [2]:
# root directory for all the gtfs data files from different dates
# (one sub-folder per feed date, named "<date>_gtfs"; keep the trailing slash,
# because file paths are built from this value by plain string concatenation)
gtfs_folder = "/Users/anne/Documents/projects/buses350/gtfs-realtime/data/source/"

In [3]:
# GTFS tables loaded for every feed date (each one is read from "<name>.txt")
file_types = [
    'stops',
    'stop_times',
    'trips',
    'shapes',
    'routes',
]

# two-level lookup: date_file_dict[date][file_type] -> pandas dataframe;
# the defaultdict creates the inner per-date dictionary on first access
date_file_dict = defaultdict(dict)

In [4]:
# specify set of dates to combine
# (for a set of files labeled with a certain date,
# those data apply from that date up until the day before
# the next date for which data files exist)
#
# key:   feed date as used in the folder name (YYYY_MM_DD)
# value: (first day, last day) the feed applies to, as YYYY-MM-DD strings
#
# The end dates previously carried the year 2010, which put every end date
# years before its start date. The last feed has no successor, so its end
# date is the last day covered by this export (same day as in out_suffix).
date_start_end_dict = {'2018_08_15': ('2018-08-15', '2018-09-23'),
                       '2018_09_24': ('2018-09-24', '2018-09-24')}
dates = list(date_start_end_dict)

# make sure to change out_suffix below to correspond to this date range!

In [5]:
# load every GTFS table for every feed date into date_file_dict, tagging each
# row with the date range during which that feed applies
for date in dates:
    # the validity range is the same for all tables of one feed: parse it once
    valid_from = datetime.datetime.strptime(date_start_end_dict[date][0], "%Y-%m-%d")
    valid_to = datetime.datetime.strptime(date_start_end_dict[date][1], "%Y-%m-%d")
    for f in file_types:
        table = pd.read_csv(f"{gtfs_folder}{date}_gtfs/{f}.txt")
        table['start_gtfs_date'] = valid_from
        table['end_gtfs_date'] = valid_to
        date_file_dict[date][f] = table

In [6]:
# get the desired set of columns for the new aggregated "full trip stop schedule" file
def trip_stop_schedule(gtfs_stops, gtfs_stop_times, gtfs_trips, gtfs_routes):
    """Join stop, trip and route details onto the rows of ``gtfs_stop_times``.

    Every join is a left join starting from ``gtfs_stop_times``, so stop_times
    rows are kept even when no matching stop, trip or route exists (the added
    columns are then missing values).

    None of the input dataframes is modified.

    Parameters
    ----------
    gtfs_stops : pandas.DataFrame
        Needs a 'stop_id' column. 'start_gtfs_date' / 'end_gtfs_date' columns,
        if present, are left out of the join so they do not clash with the
        same-named columns of ``gtfs_stop_times``.
    gtfs_stop_times : pandas.DataFrame
        Needs 'trip_id', 'stop_id', 'stop_sequence' and 'arrival_time'.
    gtfs_trips : pandas.DataFrame
        Needs 'trip_id', 'route_id', 'direction_id', 'trip_headsign', 'shape_id'.
    gtfs_routes : pandas.DataFrame
        Needs 'route_id', 'route_short_name', 'route_desc'.

    Returns
    -------
    pandas.DataFrame
        All columns of ``gtfs_stop_times``, followed by the remaining stop
        columns, 'trip_start_time', the selected trip columns and the selected
        route columns.
    """
    # Work on a trimmed copy instead of dropping in place: the caller's stops
    # table stays intact, and calling this function twice on the same frame
    # no longer fails with a KeyError.
    stops = gtfs_stops.drop(columns=['start_gtfs_date', 'end_gtfs_date'], errors='ignore')
    trip_stops_w_names = gtfs_stop_times.merge(stops, how='left', on='stop_id')

    # trip_start_time = arrival time at the stop with stop_sequence == 1
    # (the latest one if a trip has several such rows). Trips without a
    # stop_sequence == 1 row get a missing trip_start_time.
    # NOTE(review): GTFS only requires stop_sequence to increase along a trip,
    # not to start at 1 -- confirm this holds for the feeds being combined.
    is_first_stop = trip_stops_w_names['stop_sequence'] == 1
    trip_arrival_time = trip_stops_w_names.loc[is_first_stop]\
                        .groupby('trip_id')['arrival_time']\
                        .max()\
                        .rename('trip_start_time')\
                        .reset_index()
    trip_stops_w_names = trip_stops_w_names.merge(trip_arrival_time, how='left', on='trip_id')

    # attach route / direction / headsign / shape of the trip each row belongs to
    trip_columns = ['trip_id', 'route_id', 'direction_id', 'trip_headsign', 'shape_id']
    trip_stops_w_name_route = trip_stops_w_names.merge(gtfs_trips[trip_columns], how='left', on='trip_id')

    # attach the route's public name and description
    route_columns = ['route_id', 'route_short_name', 'route_desc']
    trip_stops_w_name_route = trip_stops_w_name_route.merge(gtfs_routes[route_columns], how='left', on='route_id')

    return trip_stops_w_name_route

In [7]:
# build the aggregated "full trip stop schedule" table for each feed date and
# store it next to that date's raw tables, registered as a new file type
file_types.append('aggregated')
for date in dates:
    tables = date_file_dict[date]
    tables['aggregated'] = trip_stop_schedule(
        tables['stops'],
        tables['stop_times'],
        tables['trips'],
        tables['routes'],
    )

In [8]:
# stack the data from all dates for each file type
# (including the new aggregated file type), in the order given by `dates`
#
# pd.concat replaces the former DataFrame.append chain: DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0. As before, the result is
# a new dataframe and the original row indexes are kept (not renumbered).
file_dict = {}
for f in file_types:
    file_dict[f] = pd.concat([date_file_dict[date][f] for date in dates])

In [9]:
# sanity check: show which columns ended up in the combined schedule table
# (kept as a bare expression so the notebook displays it as the cell output)
file_dict['aggregated'].columns

Index(['trip_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_sequence',
       'stop_headsign', 'pickup_type', 'drop_off_type', 'shape_dist_traveled',
       'start_gtfs_date', 'end_gtfs_date', 'stop_code', 'stop_name',
       'stop_desc', 'stop_lat', 'stop_lon', 'zone_id', 'stop_url',
       'location_type', 'parent_station', 'stop_timezone', 'trip_start_time',
       'route_id', 'direction_id', 'trip_headsign', 'shape_id',
       'route_short_name', 'route_desc'],
      dtype='object')

In [10]:
# suffix for the output files specifying the date range they contain
# (first feed date, underscore, last feed date);
# this is maintained by hand: it could be derived from date_start_end_dict,
# but that would require parsing and sorting the dates, so it must be
# updated whenever the set of dates above changes
out_suffix = "2018-08-15_2018-09-24"

In [11]:
# write output csv files (into the current working directory, without the
# dataframe index); the stops and stop_times tables are only exported as part
# of the aggregated schedule
outputs = (
    ('aggregated', f"gtfs_{out_suffix}.csv"),
    ('shapes', f"gtfs_shapes_{out_suffix}.csv"),
    ('routes', f"gtfs_routes_{out_suffix}.csv"),
    ('trips', f"gtfs_trips_{out_suffix}.csv"),
)
for table_name, csv_name in outputs:
    file_dict[table_name].to_csv(csv_name, index=False)