In [1]:
%pip install shapely pyproj

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import requests
import zipfile
import io
import os
from shapely.geometry import Point, LineString
from shapely.ops import transform
import pyproj

# Configuration
GTFS_URL = "https://rrgtfsfeeds.s3.amazonaws.com/gtfs_subway.zip"
DATA_DIR = "../data/static"
os.makedirs(DATA_DIR, exist_ok=True)

## 1. Download and Extract GTFS Data

In [3]:
print(f"Downloading GTFS data from {GTFS_URL}...")
r = requests.get(GTFS_URL)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall(DATA_DIR)
print("Download and extraction complete.")

Downloading GTFS data from https://rrgtfsfeeds.s3.amazonaws.com/gtfs_subway.zip...
Download and extraction complete.


## 2. Load Shapes and Stops

In [4]:
# Load necessary files
shapes_df = pd.read_csv(os.path.join(DATA_DIR, "shapes.txt"))
stops_df = pd.read_csv(os.path.join(DATA_DIR, "stops.txt"))
trips_df = pd.read_csv(os.path.join(DATA_DIR, "trips.txt"))
routes_df = pd.read_csv(os.path.join(DATA_DIR, "routes.txt"))
calendar_df = pd.read_csv(os.path.join(DATA_DIR, "calendar.txt"))
calendar_dates_df = pd.read_csv(os.path.join(DATA_DIR, "calendar_dates.txt"))
stop_times_df = pd.read_csv(os.path.join(DATA_DIR, "stop_times.txt"), dtype={'stop_id': str})

files_dict = {
    "shapes": shapes_df,
    "stops": stops_df,
    "trips": trips_df,
    "routes": routes_df,
    "calendar": calendar_df,
    "calendar_dates": calendar_dates_df,
    "stop_times": stop_times_df
}

for name, df in files_dict.items():
    print(f"Shape of {name}: {df.shape}")

Shape of shapes: (149834, 4)
Shape of stops: (1488, 6)
Shape of trips: (20304, 6)
Shape of routes: (29, 10)
Shape of calendar: (3, 10)
Shape of calendar_dates: (8, 3)
Shape of stop_times: (562597, 5)


In [5]:
# Identify longest A line shapes

# filter trips for the A line 
a_trips = trips_df[trips_df['route_id']=='A']

# get unique shape_ids for the A line
a_shape_ids = a_trips['shape_id'].unique()

# filter shapes to only include A-line shapes
a_shapes = shapes_df[shapes_df['shape_id'].isin(a_shape_ids)]

# find the shape_id with the most points.  A proxy for the longest most detailed path
# we group by shape_id and count the rows
shape_counts = a_shapes.groupby('shape_id').count()['shape_pt_sequence']

longest_shape_id = shape_counts.idxmax()

print(f"Selected canonical shape id: {longest_shape_id} with {shape_counts.max()} points")

# extract the points for this specific shape and sort them by sequence
main_shape_points = a_shapes[a_shapes['shape_id']==longest_shape_id].sort_values('shape_pt_sequence')

main_shape_points.head()

Selected canonical shape id: A..N09R with 1133 points


Unnamed: 0,shape_id,shape_pt_sequence,shape_pt_lat,shape_pt_lon
46232,A..N09R,0,40.603995,-73.755405
46233,A..N09R,1,40.603729,-73.755898
46234,A..N09R,2,40.603715,-73.755925
46235,A..N09R,3,40.603701,-73.755953
46236,A..N09R,4,40.603687,-73.75598


In [6]:
# create geometry and calculate distance

# create a linestring from the shapes point
line_coords = list(zip(main_shape_points.shape_pt_lon, main_shape_points.shape_pt_lat))
line_geom = LineString(line_coords)

# define projection: WGS84 (lat/lon) -> UTM Zone 18N (NYC, meters)
# EPSG:4326 is standard Lat/Lon
# EPSG:32618 is UTM Zone 18N (Meters)
wgs84 = pyproj.CRS('EPSG:4326')
utm = pyproj.CRS('EPSG:32618')
project = pyproj.Transformer.from_crs(wgs84, utm, always_xy=True).transform

# project the line to meters
line_geom_meters = transform(project, line_geom)

print(f"Total length of A line (longest path): {line_geom_meters.length / 1000:.2f} km")
print(f"Total length in miles: {line_geom_meters.length * 0.000621371:.2f} miles")

Total length of A line (longest path): 51.82 km
Total length in miles: 32.20 miles


In [7]:
# map stations to linear distance

# filter stops that are prevalent to the A line using stop_id pattern 'A..'
# we keep both parent stations location type 1 and child stops locattion type 0
a_stops = stops_df[stops_df['stop_id'].str.startswith('A')].copy()

# Ensure coordinates are numerc and drop invalids
a_stops['stop_lat'] = pd.to_numeric(a_stops['stop_lat'], errors='coerce')
a_stops['stop_lon'] = pd.to_numeric(a_stops['stop_lon'], errors='coerce')
a_stops = a_stops.dropna(subset=['stop_lat','stop_lon'])

# function to calculate distance along the line
def get_distance_along_line(row, line_geometry, projector):
    try:
        stop_point = Point(row['stop_lon'], row['stop_lat'])
        if stop_point.is_empty:
            return None
        #project point to meters
        stop_point_meters = transform(projector, stop_point)
        # find distance along the line
        return line_geometry.project(stop_point_meters)
    except Exception:
        return None
    
# apply this function
a_stops['distance_from_start_mi'] = a_stops.apply(
    lambda row: get_distance_along_line(row, line_geom_meters, project),axis=1
)

# convert to miles
a_stops['distance_from_start_mi'] = a_stops['distance_from_start_mi'] * 0.000621371

# sort and select columns
cols_to_keep = ['stop_id', 'stop_name', 'distance_from_start_mi', 'location_type', 'parent_station']

# handle missing columns if they don't exist
if 'parent_station' not in a_stops.columns:
    a_stops['parent_station'] = None
if 'location_type' not in a_stops.columns:
    a_stops['location_type'] = 0

# fill nans
a_stops['parent_station'] = a_stops['parent_station'].fillna('')
a_stops['location_type'] = a_stops['location_type'].fillna(0).astype(int)

# drop stations that couldn't be projected
a_stops = a_stops.dropna(subset=['distance_from_start_mi'])

a_stops_sorted = a_stops.sort_values('distance_from_start_mi')[cols_to_keep]

output_path = "../data/a_line_station_distances.csv"
a_stops_sorted.to_csv(output_path, index=False)

print(f"Station distance map save to {output_path}")
display(a_stops_sorted.head(10))

Station distance map save to ../data/a_line_station_distances.csv


Unnamed: 0,stop_id,stop_name,distance_from_start_mi,location_type,parent_station
689,A65S,Ozone Park-Lefferts Blvd,9.466588,0,A65
688,A65N,Ozone Park-Lefferts Blvd,9.466588,0,A65
687,A65,Ozone Park-Lefferts Blvd,9.466588,1,
681,A63,104 St,9.481178,1,
682,A63N,104 St,9.481178,0,A63
686,A64S,111 St,9.481178,0,A64
684,A64,111 St,9.481178,1,
685,A64N,111 St,9.481178,0,A64
683,A63S,104 St,9.481178,0,A63
680,A61S,Rockaway Blvd,9.686007,0,A61


In [8]:
# define schedule logic 

def get_scheduled_departures(target_date_str, route_id="A", direction_id=0):
    """
    Returns a DataFrame of scheduled terminal departures for a specific date.
    """
    target_date = pd.to_datetime(target_date_str)
    day_name = target_date.day_name().lower()
    date_int = int(target_date.strftime('%Y%m%d'))

    #1 find service ids for this day of week
    active_services = calendar_df[calendar_df[day_name]==1]['service_id'].tolist()

    # 2 handle exceptions (calendar_dates.txt)
    # exception type 1 = add service, 2 = remove service
    exceptions = calendar_dates_df[calendar_dates_df['date']==date_int]

    added_services = exceptions[exceptions['exception_type']==1]['service_id'].tolist()
    removed_services = exceptions[exceptions['exception_type']==2]['service_id'].tolist()

    final_services = set(active_services) - set(removed_services) | set(added_services)

    # filter trips
    # we want trips for our route, direction and active service ids
    daily_trips = trips_df[
        (trips_df['route_id']==route_id) &
        (trips_df['direction_id']==direction_id) &
        (trips_df['service_id'].isin(final_services))
    ]

    if daily_trips.empty:
        return pd.DataFrame()
    
    #4 get terminal departures (stop sequence 1)
    # we join with stop times to find the time the train leaves the first station
    trip_ids = daily_trips['trip_id'].unique()

    terminal_departures = stop_times_df[
        (stop_times_df['trip_id'].isin(trip_ids))&
        (stop_times_df['stop_sequence']==1)
    ].copy()

    # 5 parse times, handle 25:00 format
    def parse_gtfs_time(time_str):
        h, m, s =map(int, time_str.split(':'))
        return h * 3600 + m * 60 + s

    terminal_departures['departure_seconds'] = terminal_departures['departure_time'].apply(parse_gtfs_time)

    # sort by time
    terminal_departures = terminal_departures.sort_values('departure_seconds')

    # calculate headway (time since last train)
    terminal_departures['scheduled_headway_sec'] = terminal_departures['departure_seconds'].diff()
    terminal_departures['scheduled_headway_min'] = terminal_departures['scheduled_headway_sec'] / 60.0

    return terminal_departures[['trip_id', 'departure_time', 'departure_seconds', 'scheduled_headway_min']]

# test with a sample date( e.g. weekday)
test_date = "2025-06-01"
schedule_df = get_scheduled_departures(test_date)

print(f"Scheduled departures for {test_date}: {len(schedule_df)}")
display(schedule_df.head(10))

Scheduled departures for 2025-06-01: 136


Unnamed: 0,trip_id,departure_time,departure_seconds,scheduled_headway_min
219328,BFA25GEN-A055-Sunday-00_004000_A..N43R,00:40:00,2400,
219394,BFA25GEN-A055-Sunday-00_006000_A..N43R,01:00:00,3600,20.0
219468,BFA25GEN-A055-Sunday-00_008000_A..N43R,01:20:00,4800,20.0
219542,BFA25GEN-A055-Sunday-00_010000_A..N43R,01:40:00,6000,20.0
219616,BFA25GEN-A055-Sunday-00_012000_A..N43R,02:00:00,7200,20.0
219690,BFA25GEN-A055-Sunday-00_014000_A..N43R,02:20:00,8400,20.0
219698,BFA25GEN-A055-Sunday-00_014200_A..N09R,02:22:00,8520,2.0
219823,BFA25GEN-A055-Sunday-00_016000_A..N43R,02:40:00,9600,18.0
219831,BFA25GEN-A055-Sunday-00_016200_A..N09R,02:42:00,9720,2.0
219956,BFA25GEN-A055-Sunday-00_018000_A..N43R,03:00:00,10800,18.0


In [9]:
# generate target headways for the full experiment period

# 1 define date range for the experiment
start_date = "2025-01-01"
end_date = "2025-06-30"

print(f"Generating target schedule from {start_date} to {end_date}")

# 2 create range of dates
date_range = pd.date_range(start=start_date, end=end_date)

all_scheduled_departures = []

# 3 loop through each day and get the schedule
for single_date in date_range:
    date_str = single_date.strftime('%Y-%m-%d')

    # Iterate over both directions: 0 (Northbound) and 1 (Southbound)
    for direction_id in [0, 1]:
        # call function from the previous cell
        daily_df = get_scheduled_departures(date_str, direction_id=direction_id)

        if not daily_df.empty:
            # add a column for the date so we can distinguish them later
            daily_df['service_date'] = date_str
            # add direction_id to distinguish N/S
            daily_df['direction_id'] = direction_id

            all_scheduled_departures.append(daily_df)

# 4 combine all days into one big df
full_schedule_df = pd.concat(all_scheduled_departures, ignore_index=True)

# 5 save to csv
output_file = "../data/target_terminal_headways.csv"
full_schedule_df.to_csv(output_file, index=False)

print(f"Success! Generated {len(full_schedule_df)} scheduled departures.")
print(f"Saved to {output_file}")

# show samples to verify
display(full_schedule_df.head())
display(full_schedule_df.tail())
# Check counts by direction
print(full_schedule_df['direction_id'].value_counts())

Generating target schedule from 2025-01-01 to 2025-06-30
Success! Generated 63477 scheduled departures.
Saved to ../data/target_terminal_headways.csv


Unnamed: 0,trip_id,departure_time,departure_seconds,scheduled_headway_min,service_date,direction_id
0,BFA25GEN-A087-Weekday-00_003950_A..N43R,00:39:30,2370,,2025-01-01,0
1,BFA25GEN-A087-Weekday-00_005950_A..N43R,00:59:30,3570,20.0,2025-01-01,0
2,BFA25GEN-A087-Weekday-00_007950_A..N43R,01:19:30,4770,20.0,2025-01-01,0
3,BFA25GEN-A087-Weekday-00_009950_A..N43R,01:39:30,5970,20.0,2025-01-01,0
4,BFA25GEN-A087-Weekday-00_011950_A..N43R,01:59:30,7170,20.0,2025-01-01,0


Unnamed: 0,trip_id,departure_time,departure_seconds,scheduled_headway_min,service_date,direction_id
63472,BFA25GEN-A087-Weekday-00_138300_A..S74R,23:03:00,82980,10.0,2025-06-30,1
63473,BFA25GEN-A087-Weekday-00_139600_A..S05R,23:16:00,83760,13.0,2025-06-30,1
63474,BFA25GEN-A087-Weekday-00_140900_A..S74R,23:29:00,84540,13.0,2025-06-30,1
63475,BFA25GEN-A087-Weekday-00_142200_A..S05R,23:42:00,85320,13.0,2025-06-30,1
63476,BFA25GEN-A087-Weekday-00_143450_A..S74R,23:54:30,86070,12.5,2025-06-30,1


direction_id
1    31997
0    31480
Name: count, dtype: int64
