In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---------------------------------------------------------
# Temporal grid
TIME_BIN_SIZE_MIN = 1  # bin width in minutes (Master Plan: strict 1-minute bins)
FFILL_LIMIT = 30  # carry the last arrival forward for at most 30 minutes

# Input / output locations (relative paths, all under one data directory)
DATA_DIR = "../data"
STATION_MAP_FILE = DATA_DIR + "/a_line_station_distances.csv"
SCHEDULE_FILE = DATA_DIR + "/target_terminal_headways.csv"
REALTIME_ARRIVALS_FILE = DATA_DIR + "/nyc_subway_a_line_arrivals_2025.csv"
OUTPUT_MATRIX_FILE = DATA_DIR + "/headway_matrix_full.npy"
OUTPUT_SCHEDULE_FILE = DATA_DIR + "/schedule_matrix_full.npy"


In [None]:
# 1. Load Station Map and Create Sequence Mapping
print("Loading Station Map...")
station_map = pd.read_csv(STATION_MAP_FILE)

# Lookup table stop_id -> sequence_id (0..N).
# The comment in the original notebook states the station map was already
# sorted by distance in Notebook 2; that ordering is not re-checked here.
stop_to_seq = dict(zip(station_map['stop_id'], station_map['sequence_id']))

print(f"Loaded {len(station_map)} stations.")
print(f"Sample Mapping: {list(stop_to_seq.items())[:5]}")

# 2. Load Real-time Arrivals
print("Loading Real-time Arrivals...")
arrivals_df = pd.read_csv(REALTIME_ARRIVALS_FILE)
arrivals_df['arrival_time'] = pd.to_datetime(arrivals_df['arrival_time'])

# Keep only arrivals at stops that exist in the station map
arrivals_df = arrivals_df[arrivals_df['stop_id'].isin(stop_to_seq.keys())].copy()

# Attach the along-the-line position of every arrival
arrivals_df['sequence_id'] = arrivals_df['stop_id'].map(stop_to_seq)

# Encode direction as an array index (N -> 0, S -> 1), per the Master Plan
# requirement to pivot the data to (Time, Station, Direction).
direction_map = {'N': 0, 'S': 1}
arrivals_df['direction_idx'] = arrivals_df['direction'].map(direction_map)

# Any direction other than N/S maps to NaN. Left in place, that turns the
# column into floats (0.0 / 1.0), which no longer lines up cleanly with the
# integer [0, 1] direction grid used when the tensor is reindexed, and the
# affected rows would be dropped silently during the pivot. Drop them here,
# say so, and keep the column as plain integers.
unknown_direction = arrivals_df['direction_idx'].isna()
if unknown_direction.any():
    print(f"Dropping {int(unknown_direction.sum())} arrivals with an unrecognized direction.")
    arrivals_df = arrivals_df[~unknown_direction].copy()
arrivals_df['direction_idx'] = arrivals_df['direction_idx'].astype(int)

print(f"Loaded {len(arrivals_df)} arrivals.")
display(arrivals_df.head())


Loading Datasets...
Loaded 156 stations (Total).
Loaded 63477 scheduled departures.
Loaded 2100859 real-time arrival records.


Unnamed: 0,stop_id,stop_name,distance_from_start_mi,location_type,parent_station
0,A65S,Ozone Park-Lefferts Blvd,9.466588,0,A65
1,A65N,Ozone Park-Lefferts Blvd,9.466588,0,A65
2,A65,Ozone Park-Lefferts Blvd,9.466588,1,
3,A63,104 St,9.481178,1,
4,A63N,104 St,9.481178,0,A63


Unnamed: 0,trip_uid,route_id,direction,stop_id,stop_name,stop_lat,stop_lon,arrival_time
0,1749151110_A..S57R,A,S,A38S,Fulton St,40.710197,-74.007691,2025-06-06 00:00:00+00:00
1,1749153120_A..N55R,A,N,H06N,Beach 67 St,40.590927,-73.796924,2025-06-06 00:00:02+00:00
2,1749149220_A..S58R,A,S,H03S,Howard Beach-JFK Airport,40.660476,-73.830301,2025-06-06 00:00:07+00:00
3,1749153300_A..S57R,A,S,A06S,181 St,40.851695,-73.937969,2025-06-06 00:00:17+00:00
4,1749147750_A..S58R,A,S,H11S,Far Rockaway-Mott Av,40.603995,-73.755405,2025-06-06 00:00:23+00:00


In [None]:
# 3. Spatiotemporal Tensor Construction (The Physics Layer)

print("Constructing Spatiotemporal Tensor...")

if arrivals_df.empty:
    raise ValueError("No arrivals left after filtering; cannot build the headway tensor.")

bin_freq = f'{TIME_BIN_SIZE_MIN}min'

# Last arrival per (time bin, station, direction).
# The timestamps are binned *before* aggregating, for two reasons:
#   * pivot_table(index='arrival_time', values='arrival_time') uses the same
#     column as both group key and value, which pandas rejects;
#   * pivoting on raw second-resolution timestamps would build one row per
#     distinct timestamp before being collapsed to the minute grid.
# Taking the max arrival inside a bin is the "last arrival in the bin".
time_bin = arrivals_df['arrival_time'].dt.floor(bin_freq).rename('time_bin')
pivot_df = (
    arrivals_df
    .groupby([time_bin, 'sequence_id', 'direction_idx'])['arrival_time']
    .max()
    .unstack(['sequence_id', 'direction_idx'])
    .sort_index(axis=1)
    .rename_axis(index='arrival_time')
)

# Dense time grid: insert the bins in which no train arrived anywhere (all NaT)
resampled_df = pivot_df.asfreq(bin_freq)

# Forward Fill (Physics: Headway grows linearly)
# We fill the *Arrival Time* forward.
# If a train arrived at 12:00, the cell at 12:05 will contain "12:00".
# ffill's limit counts rows, so convert the limit from minutes to bins; with
# 1-minute bins this is exactly FFILL_LIMIT.
ffill_rows = max(1, FFILL_LIMIT // TIME_BIN_SIZE_MIN)
filled_df = resampled_df.ffill(limit=ffill_rows)

# Headway = (bin start time) - (last known arrival time), per column.
# This produces the sawtooth: it resets when a train arrives and then grows
# by one bin width per row.
current_time = filled_df.index.to_series()
headway_df = filled_df.apply(lambda col: current_time - col)

# Convert timedeltas to minutes (float)
headway_df = headway_df / pd.Timedelta(minutes=1)

# Cells with no arrival inside the forward-fill window become 0
# (System Closed / No Data).
# NOTE(review): 0 is also the value right after an arrival, so "no data" and
# "train just arrived" are indistinguishable downstream - confirm intended.
headway_df = headway_df.fillna(0)

# An arrival inside its own bin is later than the bin start, giving a small
# negative headway for that row; clamp those to 0.
headway_df = headway_df.clip(lower=0)

print("Headway Matrix Constructed.")
print(f"Shape: {headway_df.shape}")
display(headway_df.iloc[:10, :5]) # Show first 10 mins, first 5 columns


Line Length: 22.73 miles
Bin Width: 0.36 miles


Unnamed: 0,stop_id,stop_name,distance_from_start_mi,distance_bin
0,A65S,Ozone Park-Lefferts Blvd,9.466588,0
1,A65N,Ozone Park-Lefferts Blvd,9.466588,0
2,A65,Ozone Park-Lefferts Blvd,9.466588,0
3,A63,104 St,9.481178,0
4,A63N,104 St,9.481178,0
5,A64S,111 St,9.481178,0
6,A64,111 St,9.481178,0
7,A64N,111 St,9.481178,0
8,A63S,104 St,9.481178,0
9,A61S,Rockaway Blvd,9.686007,0


In [None]:
# 4. Reshape and Normalize

# Complete (station, direction) column grid, station-major, so that every
# station/direction pair has a column even if it never appeared in the data.
all_directions = [0, 1]
all_sequences = sorted(station_map['sequence_id'].unique())
full_index = pd.MultiIndex.from_product(
    [all_sequences, all_directions],
    names=['sequence_id', 'direction_idx'],
)

# Columns missing from the observed data are created and filled with 0
headway_df_full = headway_df.reindex(columns=full_index, fill_value=0)

# Flat array of shape (Time, Stations * Directions)
matrix_flat = headway_df_full.to_numpy()

# Unfold the column axis and append a trailing channel axis:
# (Time, Stations, Directions, 1)
num_time = len(headway_df_full)
num_stations, num_directions = len(all_sequences), len(all_directions)
matrix_reshaped = matrix_flat.reshape((num_time, num_stations, num_directions, 1))

print(f"Final Matrix Shape: {matrix_reshaped.shape}")

# Scale by the forward-fill horizon (30 mins), which bounds the headway values,
# then clamp into [0, 1] just in case.
MAX_HEADWAY = FFILL_LIMIT
matrix_norm = (matrix_reshaped / MAX_HEADWAY).clip(0, 1)

print(f"Matrix Normalized. Max Value: {matrix_norm.max()}")

# Persist the normalized tensor
np.save(OUTPUT_MATRIX_FILE, matrix_norm)
print(f"Saved Headway Matrix to {OUTPUT_MATRIX_FILE}")


Time bins created.


In [None]:
# 5. Process Terminal Schedule

print("Processing Terminal Schedule...")
schedule_df = pd.read_csv(SCHEDULE_FILE)
schedule_df['departure_time'] = pd.to_datetime(schedule_df['departure_time'])
# NOTE(review): the real-time index appears to be timezone-aware (UTC offsets
# in the arrivals file). If departure_time parses as timezone-naive, the
# alignment to headway_df.index below will not match - confirm both share a tz.

# Pivot
# Index: Departure Time
# Columns: Direction
# Values: Scheduled Headway (The gap associated with this train)
sched_pivot = schedule_df.pivot_table(
    index='departure_time',
    columns='direction_idx',
    values='scheduled_headway_min',
    aggfunc='mean' # Should be unique per train
)

# Each pivot row only has a value for the direction(s) departing at that exact
# time; the other direction is NaN. Upsampling with .resample().ffill() is a
# label-based reindex that copies the most recent *row* - NaNs included - so a
# direction's latest headway would be lost whenever an opposite-direction
# departure follows it before the next grid point. Carry every direction's
# value forward first, so each row holds the current headway of both
# directions.
sched_step = sched_pivot.ffill()

# Resample to 1-min and FFill
# This creates a step function: "The scheduled headway is X minutes"
sched_resampled = sched_step.resample(f'{TIME_BIN_SIZE_MIN}min').ffill()

# Reindex to match the main matrix time index
# We use the same time range as the real-time data; times before the first
# scheduled departure have no value and become 0.
sched_aligned = sched_resampled.reindex(headway_df.index).ffill().fillna(0)

# Ensure both directions 0 and 1 exist (a missing one becomes all zeros) and
# fix the column order to [0, 1].
sched_aligned = sched_aligned.reindex(columns=[0, 1], fill_value=0)

# Convert to Numpy
# Shape: (Time, 2)
sched_matrix = sched_aligned.values

# Reshape to (Time, 2, 1)
sched_matrix = sched_matrix.reshape(sched_matrix.shape[0], 2, 1)

# Normalize by the same MAX_HEADWAY (30) as the real-time matrix so the two
# are on a common scale; scheduled headways are usually 5-20 mins.
sched_norm = sched_matrix / MAX_HEADWAY
sched_norm = np.clip(sched_norm, 0, 1)

print(f"Schedule Matrix Shape: {sched_norm.shape}")

np.save(OUTPUT_SCHEDULE_FILE, sched_norm)
print(f"Saved Schedule Matrix to {OUTPUT_SCHEDULE_FILE}")


Calculating headways...
Calculated headways. Dropped 199580 records (NaNs, >20min, or <0.5min).
