In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Paths
ARRIVALS_FILE = "../data/nyc_subway_a_line_arrivals_2025.csv"
STATIONS_FILE = "../data/a_line_station_distances.csv"
OUTPUT_FILE = "../data/observed_terminal_headways.csv"

## 1. Load Data

In [2]:
# Load arrivals data
arrivals = pd.read_csv(ARRIVALS_FILE)
arrivals['arrival_time'] = pd.to_datetime(arrivals['arrival_time'])

print(f"Loaded {len(arrivals):,} arrival records")
print(f"Date range: {arrivals['arrival_time'].min()} to {arrivals['arrival_time'].max()}")
print(f"Directions: {arrivals['direction'].unique()}")
arrivals.head()

Loaded 2,100,859 arrival records
Date range: 2025-06-06 00:00:00+00:00 to 2025-12-06 11:41:44+00:00
Directions: ['S' 'N']


Unnamed: 0,trip_uid,route_id,direction,stop_id,stop_name,stop_lat,stop_lon,arrival_time
0,1749151110_A..S57R,A,S,A38S,Fulton St,40.710197,-74.007691,2025-06-06 00:00:00+00:00
1,1749153120_A..N55R,A,N,H06N,Beach 67 St,40.590927,-73.796924,2025-06-06 00:00:02+00:00
2,1749149220_A..S58R,A,S,H03S,Howard Beach-JFK Airport,40.660476,-73.830301,2025-06-06 00:00:07+00:00
3,1749153300_A..S57R,A,S,A06S,181 St,40.851695,-73.937969,2025-06-06 00:00:17+00:00
4,1749147750_A..S58R,A,S,H11S,Far Rockaway-Mott Av,40.603995,-73.755405,2025-06-06 00:00:23+00:00


In [3]:
# Load station distances to identify terminals
stations = pd.read_csv(STATIONS_FILE)
stations_main = stations[stations['location_type'] == 1][['stop_id', 'stop_name', 'distance_from_start_mi']]

print("Terminal stations (by distance):")
print("\nNear start (0 mi) - Northbound origin:")
print(stations_main.nsmallest(5, 'distance_from_start_mi'))
print("\nNear end (32 mi) - Southbound origin:")
print(stations_main.nlargest(5, 'distance_from_start_mi'))

Terminal stations (by distance):

Near start (0 mi) - Northbound origin:
    stop_id             stop_name  distance_from_start_mi
195     H11  Far Rockaway-Mott Av                0.000000
192     H10           Beach 25 St                0.416841
189     H09           Beach 36 St                0.902272
186     H08           Beach 44 St                1.360883
183     H07           Beach 60 St                2.022885

Near end (32 mi) - Southbound origin:
   stop_id      stop_name  distance_from_start_mi
0      A02  Inwood-207 St               32.199699
3      A03     Dyckman St               31.774254
6      A05         190 St               31.188895
9      A06         181 St               30.641013
12     A07         175 St               30.330511


## 2. Identify Terminal Stops per Direction

For each trip, we need the **first station** (where it originated).
- Northbound trains start at the southern end (low distance)
- Southbound trains start at the northern end (high distance)

In [4]:
# Add distance info to arrivals
# Strip N/S suffix to match station IDs
arrivals['stop_id_base'] = arrivals['stop_id'].str.rstrip('NS')

# Merge with station distances
arrivals_with_dist = arrivals.merge(
    stations_main[['stop_id', 'distance_from_start_mi']], 
    left_on='stop_id_base', 
    right_on='stop_id',
    how='left',
    suffixes=('', '_station')
)

print(f"Merged: {len(arrivals_with_dist):,} records")
print(f"Records with distance: {arrivals_with_dist['distance_from_start_mi'].notna().sum():,}")
arrivals_with_dist.head()

Merged: 2,100,859 records
Records with distance: 2,097,004


Unnamed: 0,trip_uid,route_id,direction,stop_id,stop_name,stop_lat,stop_lon,arrival_time,stop_id_base,stop_id_station,distance_from_start_mi
0,1749151110_A..S57R,A,S,A38S,Fulton St,40.710197,-74.007691,2025-06-06 00:00:00+00:00,A38,A38,19.795857
1,1749153120_A..N55R,A,N,H06N,Beach 67 St,40.590927,-73.796924,2025-06-06 00:00:02+00:00,H06,H06,2.478227
2,1749149220_A..S58R,A,S,H03S,Howard Beach-JFK Airport,40.660476,-73.830301,2025-06-06 00:00:07+00:00,H03,H03,8.007467
3,1749153300_A..S57R,A,S,A06S,181 St,40.851695,-73.937969,2025-06-06 00:00:17+00:00,A06,A06,30.641013
4,1749147750_A..S58R,A,S,H11S,Far Rockaway-Mott Av,40.603995,-73.755405,2025-06-06 00:00:23+00:00,H11,H11,0.0


In [5]:
# For each trip, find the FIRST stop (terminal departure)
# Group by trip_uid and get the earliest arrival time

terminal_arrivals = arrivals_with_dist.loc[
    arrivals_with_dist.groupby('trip_uid')['arrival_time'].idxmin()
].copy()

print(f"Found {len(terminal_arrivals):,} unique trips")
print(f"\nBy direction:")
print(terminal_arrivals['direction'].value_counts())

terminal_arrivals.head(10)

Found 62,912 unique trips

By direction:
direction
S    31723
N    31189
Name: count, dtype: int64


Unnamed: 0,trip_uid,route_id,direction,stop_id,stop_name,stop_lat,stop_lon,arrival_time,stop_id_base,stop_id_station,distance_from_start_mi
4,1749147750_A..S58R,A,S,H11S,Far Rockaway-Mott Av,40.603995,-73.755405,2025-06-06 00:00:23+00:00,H11,H11,0.0
25,1749148320_A..N55R,A,N,A09N,168 St,40.840719,-73.939561,2025-06-06 00:02:47+00:00,A09,A09,29.845248
13,1749148500_A..S58R,A,S,H10S,Beach 25 St,40.600066,-73.761353,2025-06-06 00:01:27+00:00,H10,H10,0.416841
21,1749149160_A..N54R,A,N,A03N,Dyckman St,40.865491,-73.927271,2025-06-06 00:02:05+00:00,A03,A03,31.774254
2,1749149220_A..S58R,A,S,H03S,Howard Beach-JFK Airport,40.660476,-73.830301,2025-06-06 00:00:07+00:00,H03,H03,8.007467
18,1749149460_A..N55R,A,N,A24N,59 St-Columbus Circle,40.768296,-73.981736,2025-06-06 00:01:52+00:00,A24,A24,24.311297
6,1749149550_A..S57R,A,S,A59S,80 St,40.679371,-73.858992,2025-06-06 00:00:27+00:00,A59,A59,10.484866
5,1749149940_A..S58R,A,S,A57S,Grant Av,40.677044,-73.86505,2025-06-06 00:00:25+00:00,A57,A57,10.853113
7,1749150120_A..N54R,A,N,A15N,125 St,40.811109,-73.952343,2025-06-06 00:00:30+00:00,A15,A15,27.662716
15,1749150240_A..N55R,A,N,A42N,Hoyt-Schermerhorn Sts,40.688484,-73.985001,2025-06-06 00:01:47+00:00,A42,A42,17.593208


In [6]:
# Verify terminal stations make sense
print("=== Northbound Terminal Stations (should be low distance ~0) ===")
north_terminals = terminal_arrivals[terminal_arrivals['direction'] == 'N']
print(north_terminals.groupby('stop_name')['distance_from_start_mi'].agg(['count', 'mean']).sort_values('count', ascending=False).head(10))

print("\n=== Southbound Terminal Stations (should be high distance ~32) ===")
south_terminals = terminal_arrivals[terminal_arrivals['direction'] == 'S']
print(south_terminals.groupby('stop_name')['distance_from_start_mi'].agg(['count', 'mean']).sort_values('count', ascending=False).head(10))

=== Northbound Terminal Stations (should be low distance ~0) ===
                            count       mean
stop_name                                   
Ozone Park-Lefferts Blvd    15154   9.466588
Far Rockaway-Mott Av        14482   0.000000
Rockaway Park-Beach 116 St    888   3.158321
168 St                        117  29.845248
Euclid Av                      44  11.249591
Shepherd Av                    35  11.711652
Jay St-MetroTech               35  17.950742
Beach 25 St                    29   0.416841
Rockaway Av                    22  13.640900
Ralph Av                       21  14.106337

=== Southbound Terminal Stations (should be high distance ~32) ===
                            count       mean
stop_name                                   
Inwood-207 St               23973  32.199699
Euclid Av                    3365  11.249591
168 St                       2885  29.845248
Dyckman St                    625  31.774254
Utica Av                      264  14.629842
Rockaway Par

## 3. Compute Observed Terminal Headways

For each direction, sort by arrival time and compute headway as time since previous train.

In [7]:
def compute_headways(df, direction):
    """
    Compute headways for a given direction.
    Headway = time since previous train departed from terminal.
    """
    # Filter by direction and sort by time
    dir_df = df[df['direction'] == direction].copy()
    dir_df = dir_df.sort_values('arrival_time').reset_index(drop=True)
    
    # Compute headway in minutes
    dir_df['prev_arrival'] = dir_df['arrival_time'].shift(1)
    dir_df['headway_sec'] = (dir_df['arrival_time'] - dir_df['prev_arrival']).dt.total_seconds()
    dir_df['headway_min'] = dir_df['headway_sec'] / 60.0
    
    # Direction ID (0=N, 1=S to match existing convention)
    dir_df['direction_id'] = 0 if direction == 'N' else 1
    
    return dir_df

# Compute headways for each direction
north_headways = compute_headways(terminal_arrivals, 'N')
south_headways = compute_headways(terminal_arrivals, 'S')

print(f"Northbound: {len(north_headways):,} terminal departures")
print(f"Southbound: {len(south_headways):,} terminal departures")

Northbound: 31,189 terminal departures
Southbound: 31,723 terminal departures


In [8]:
# Combine both directions
observed_headways = pd.concat([north_headways, south_headways], ignore_index=True)
observed_headways = observed_headways.sort_values('arrival_time').reset_index(drop=True)

# Rename for consistency with existing pipeline
observed_headways = observed_headways.rename(columns={
    'arrival_time': 'departure_time',
    'headway_min': 'observed_headway_min'
})

# Add service date
observed_headways['service_date'] = observed_headways['departure_time'].dt.date

print(f"\nTotal records: {len(observed_headways):,}")
observed_headways.head(10)


Total records: 62,912


Unnamed: 0,trip_uid,route_id,direction,stop_id,stop_name,stop_lat,stop_lon,departure_time,stop_id_base,stop_id_station,distance_from_start_mi,prev_arrival,headway_sec,observed_headway_min,direction_id,service_date
0,1749151110_A..S57R,A,S,A38S,Fulton St,40.710197,-74.007691,2025-06-06 00:00:00+00:00,A38,A38,19.795857,NaT,,,1,2025-06-06
1,1749153120_A..N55R,A,N,H06N,Beach 67 St,40.590927,-73.796924,2025-06-06 00:00:02+00:00,H06,H06,2.478227,NaT,,,0,2025-06-06
2,1749149220_A..S58R,A,S,H03S,Howard Beach-JFK Airport,40.660476,-73.830301,2025-06-06 00:00:07+00:00,H03,H03,8.007467,2025-06-06 00:00:00+00:00,7.0,0.116667,1,2025-06-06
3,1749153300_A..S57R,A,S,A06S,181 St,40.851695,-73.937969,2025-06-06 00:00:17+00:00,A06,A06,30.641013,2025-06-06 00:00:07+00:00,10.0,0.166667,1,2025-06-06
4,1749147750_A..S58R,A,S,H11S,Far Rockaway-Mott Av,40.603995,-73.755405,2025-06-06 00:00:23+00:00,H11,H11,0.0,2025-06-06 00:00:17+00:00,6.0,0.1,1,2025-06-06
5,1749149940_A..S58R,A,S,A57S,Grant Av,40.677044,-73.86505,2025-06-06 00:00:25+00:00,A57,A57,10.853113,2025-06-06 00:00:23+00:00,2.0,0.033333,1,2025-06-06
6,1749149550_A..S57R,A,S,A59S,80 St,40.679371,-73.858992,2025-06-06 00:00:27+00:00,A59,A59,10.484866,2025-06-06 00:00:25+00:00,2.0,0.033333,1,2025-06-06
7,1749150120_A..N54R,A,N,A15N,125 St,40.811109,-73.952343,2025-06-06 00:00:30+00:00,A15,A15,27.662716,2025-06-06 00:00:02+00:00,28.0,0.466667,0,2025-06-06
8,1749151500_A..S58R,A,S,A36S,Chambers St,40.714111,-74.008585,2025-06-06 00:00:47+00:00,A36,A36,20.153914,2025-06-06 00:00:27+00:00,20.0,0.333333,1,2025-06-06
9,1749152880_A..N54R,A,N,A57N,Grant Av,40.677044,-73.86505,2025-06-06 00:00:57+00:00,A57,A57,10.853113,2025-06-06 00:00:30+00:00,27.0,0.45,0,2025-06-06


## 4. Validate Headway Distribution

In [9]:
# Check headway statistics
print("=== Observed Terminal Headway Statistics (minutes) ===")
print("\nNorthbound:")
print(observed_headways[observed_headways['direction_id']==0]['observed_headway_min'].describe())

print("\nSouthbound:")
print(observed_headways[observed_headways['direction_id']==1]['observed_headway_min'].describe())

=== Observed Terminal Headway Statistics (minutes) ===

Northbound:
count    31188.000000
mean         8.468072
std          5.590964
min          0.000000
25%          3.583333
50%          8.333333
75%         12.000000
max         78.733333
Name: observed_headway_min, dtype: float64

Southbound:
count    31722.000000
mean         8.325762
std          4.527249
min          0.000000
25%          5.383333
50%          7.916667
75%         10.466667
max         65.216667
Name: observed_headway_min, dtype: float64


In [10]:
# Check for outliers (gaps > 60 min might indicate service gaps or data issues)
large_gaps = observed_headways[observed_headways['observed_headway_min'] > 60]
print(f"\nLarge gaps (>60 min): {len(large_gaps)}")
if len(large_gaps) > 0:
    print(large_gaps[['departure_time', 'direction', 'stop_name', 'observed_headway_min']].head(20))


Large gaps (>60 min): 2
                 departure_time direction                 stop_name  \
51038 2025-11-02 06:02:01+00:00         N  Ozone Park-Lefferts Blvd   
51039 2025-11-02 06:03:30+00:00         S                   High St   

       observed_headway_min  
51038             78.733333  
51039             65.216667  


In [11]:
# Cap extreme headways (overnight gaps) to reasonable maximum
MAX_HEADWAY_MIN = 30.0  # Same as model's MAX_HEADWAY

before_cap = observed_headways['observed_headway_min'].copy()
observed_headways['observed_headway_min'] = observed_headways['observed_headway_min'].clip(upper=MAX_HEADWAY_MIN)

capped_count = (before_cap > MAX_HEADWAY_MIN).sum()
print(f"Capped {capped_count:,} headways > {MAX_HEADWAY_MIN} min")

Capped 52 headways > 30.0 min


## 5. Compare with GTFS Schedule

Let's compare observed vs scheduled headways to see the difference.

In [12]:
# Load the old GTFS-based schedule for comparison
gtfs_schedule = pd.read_csv("../data/target_terminal_headways.csv")
print(f"GTFS schedule: {len(gtfs_schedule):,} records")
print(gtfs_schedule.columns.tolist())
gtfs_schedule.head()

GTFS schedule: 64,521 records
['trip_id', 'departure_time', 'departure_seconds', 'scheduled_headway_min', 'service_date', 'direction_id']


Unnamed: 0,trip_id,departure_time,departure_seconds,scheduled_headway_min,service_date,direction_id
0,BFA25GEN-A087-Weekday-00_003950_A..N43R,00:39:30,2370,,2025-06-06,0
1,BFA25GEN-A087-Weekday-00_005950_A..N43R,00:59:30,3570,20.0,2025-06-06,0
2,BFA25GEN-A087-Weekday-00_007950_A..N43R,01:19:30,4770,20.0,2025-06-06,0
3,BFA25GEN-A087-Weekday-00_009950_A..N43R,01:39:30,5970,20.0,2025-06-06,0
4,BFA25GEN-A087-Weekday-00_011950_A..N43R,01:59:30,7170,20.0,2025-06-06,0


In [13]:
# Compare distributions
print("=== Scheduled (GTFS) Headway Stats ===")
print(gtfs_schedule['scheduled_headway_min'].describe())

print("\n=== Observed (Actual) Headway Stats ===")
print(observed_headways['observed_headway_min'].dropna().describe())

=== Scheduled (GTFS) Headway Stats ===
count    64153.000000
mean         8.500553
std          5.402830
min          0.000000
25%          4.500000
50%          8.000000
75%         11.500000
max         63.500000
Name: scheduled_headway_min, dtype: float64

=== Observed (Actual) Headway Stats ===
count    62910.000000
mean         8.390558
std          5.047027
min          0.000000
25%          4.616667
50%          8.033333
75%         11.133333
max         30.000000
Name: observed_headway_min, dtype: float64


## 6. Save Observed Terminal Headways

In [14]:
# Select columns needed for the pipeline
output_df = observed_headways[[
    'trip_uid',
    'departure_time',
    'direction',
    'direction_id',
    'stop_name',
    'observed_headway_min',
    'service_date'
]].copy()

# Drop the first row of each direction (NaN headway)
output_df = output_df.dropna(subset=['observed_headway_min'])

print(f"Final dataset: {len(output_df):,} records")
print(f"\nBy direction:")
print(output_df['direction'].value_counts())

output_df.head(10)

Final dataset: 62,910 records

By direction:
direction
S    31722
N    31188
Name: count, dtype: int64


Unnamed: 0,trip_uid,departure_time,direction,direction_id,stop_name,observed_headway_min,service_date
2,1749149220_A..S58R,2025-06-06 00:00:07+00:00,S,1,Howard Beach-JFK Airport,0.116667,2025-06-06
3,1749153300_A..S57R,2025-06-06 00:00:17+00:00,S,1,181 St,0.166667,2025-06-06
4,1749147750_A..S58R,2025-06-06 00:00:23+00:00,S,1,Far Rockaway-Mott Av,0.1,2025-06-06
5,1749149940_A..S58R,2025-06-06 00:00:25+00:00,S,1,Grant Av,0.033333,2025-06-06
6,1749149550_A..S57R,2025-06-06 00:00:27+00:00,S,1,80 St,0.033333,2025-06-06
7,1749150120_A..N54R,2025-06-06 00:00:30+00:00,N,0,125 St,0.466667,2025-06-06
8,1749151500_A..S58R,2025-06-06 00:00:47+00:00,S,1,Chambers St,0.333333,2025-06-06
9,1749152880_A..N54R,2025-06-06 00:00:57+00:00,N,0,Grant Av,0.45,2025-06-06
10,1749150750_A..S58R,2025-06-06 00:01:07+00:00,S,1,High St,0.333333,2025-06-06
11,1749151080_A..N54R,2025-06-06 00:01:17+00:00,N,0,34 St-Penn Station,0.333333,2025-06-06


In [15]:
# Save to CSV
output_df.to_csv(OUTPUT_FILE, index=False)
print(f"Saved observed terminal headways to {OUTPUT_FILE}")

Saved observed terminal headways to ../data/observed_terminal_headways.csv


## 7. Next Steps

The `3b_distance_binning.ipynb` notebook needs to be updated to use:
- `observed_terminal_headways.csv` instead of `target_terminal_headways.csv`
- Column `observed_headway_min` instead of `scheduled_headway_min`

This will create `schedule_matrix_distance.npy` with **actual** terminal headways instead of planned ones.

In [16]:
print("Summary:")
print("========")
print(f"Created: {OUTPUT_FILE}")
print(f"Records: {len(output_df):,}")
print(f"Date range: {output_df['departure_time'].min()} to {output_df['departure_time'].max()}")
print(f"\nHeadway stats (minutes):")
print(f"  Mean: {output_df['observed_headway_min'].mean():.2f}")
print(f"  Std:  {output_df['observed_headway_min'].std():.2f}")
print(f"  Min:  {output_df['observed_headway_min'].min():.2f}")
print(f"  Max:  {output_df['observed_headway_min'].max():.2f}")

Summary:
Created: ../data/observed_terminal_headways.csv
Records: 62,910
Date range: 2025-06-06 00:00:07+00:00 to 2025-12-06 09:49:50+00:00

Headway stats (minutes):
  Mean: 8.39
  Std:  5.05
  Min:  0.00
  Max:  30.00
