In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---------------------------------------------------------
# Temporal grid: width of one time bin, in minutes (Master Plan: strict 1-minute bins).
TIME_BIN_SIZE_MIN = 1
# Cap on forward-filling the last observed arrival (intended as 30 minutes).
FFILL_LIMIT = 30

# Input / output locations; every file lives under the same data directory.
DATA_DIR = "../data"
STATION_MAP_FILE = f"{DATA_DIR}/a_line_station_distances.csv"
SCHEDULE_FILE = f"{DATA_DIR}/target_terminal_headways.csv"
REALTIME_ARRIVALS_FILE = f"{DATA_DIR}/nyc_subway_a_line_arrivals_2025.csv"
OUTPUT_MATRIX_FILE = f"{DATA_DIR}/headway_matrix_full.npy"
OUTPUT_SCHEDULE_FILE = f"{DATA_DIR}/schedule_matrix_full.npy"


In [2]:
# 1. Load Station Map and Create Sequence Mapping
print("Loading Station Map...")
station_map = pd.read_csv(STATION_MAP_FILE)

# Lookup stop_id -> sequence_id (0..N).
# The station_map is expected to be already sorted by distance (done in Notebook 2).
# Note: len(station_map) counts rows of the map (one per stop_id); several stop_ids
# may share one sequence_id, so the number of distinct stations can be smaller.
stop_to_seq = dict(zip(station_map['stop_id'], station_map['sequence_id']))

print(f"Loaded {len(station_map)} stations.")
print(f"Sample Mapping: {list(stop_to_seq.items())[:5]}")

# 2. Load Real-time Arrivals
print("Loading Real-time Arrivals...")
arrivals_df = pd.read_csv(REALTIME_ARRIVALS_FILE)
# utc=True guarantees a tz-aware UTC column even if the CSV carries no offset (or mixed
# offsets). The schedule cell localizes its timestamps to UTC and reindexes onto this
# time axis, so the two must agree on timezone-awareness.
arrivals_df['arrival_time'] = pd.to_datetime(arrivals_df['arrival_time'], utc=True)

# Keep only arrivals at stops that exist in the station map (.copy() so the
# column assignments below do not write into a view of the raw frame).
arrivals_df = arrivals_df[arrivals_df['stop_id'].isin(stop_to_seq.keys())].copy()

# Map stop_id to sequence_id
arrivals_df['sequence_id'] = arrivals_df['stop_id'].map(stop_to_seq)

# Encode direction as an array index for the (Time, Station, Direction) pivot: N -> 0, S -> 1
direction_map = {'N': 0, 'S': 1}
arrivals_df['direction_idx'] = arrivals_df['direction'].map(direction_map)

# .map() silently yields NaN for any direction outside direction_map, which would also
# turn the index column into floats. Surface such rows explicitly and drop them.
unmapped_direction = arrivals_df['direction_idx'].isna()
if unmapped_direction.any():
    unknown_values = arrivals_df.loc[unmapped_direction, 'direction'].unique().tolist()
    print(f"WARNING: dropping {int(unmapped_direction.sum())} arrivals with unrecognized direction values: {unknown_values}")
    arrivals_df = arrivals_df[~unmapped_direction].copy()
arrivals_df['direction_idx'] = arrivals_df['direction_idx'].astype(int)

print(f"Loaded {len(arrivals_df)} arrivals.")
display(arrivals_df.head())


Loading Station Map...
Loaded 198 stations.
Sample Mapping: [('A02', 0), ('A02N', 0), ('A02S', 0), ('A03', 1), ('A03N', 1)]
Loading Real-time Arrivals...
Loaded 2097004 arrivals.


Unnamed: 0,trip_uid,route_id,direction,stop_id,stop_name,stop_lat,stop_lon,arrival_time,sequence_id,direction_idx
0,1749151110_A..S57R,A,S,A38S,Fulton St,40.710197,-74.007691,2025-06-06 00:00:00+00:00,28,1
1,1749153120_A..N55R,A,N,H06N,Beach 67 St,40.590927,-73.796924,2025-06-06 00:00:02+00:00,60,0
2,1749149220_A..S58R,A,S,H03S,Howard Beach-JFK Airport,40.660476,-73.830301,2025-06-06 00:00:07+00:00,54,1
3,1749153300_A..S57R,A,S,A06S,181 St,40.851695,-73.937969,2025-06-06 00:00:17+00:00,3,1
4,1749147750_A..S58R,A,S,H11S,Far Rockaway-Mott Av,40.603995,-73.755405,2025-06-06 00:00:23+00:00,65,1


In [3]:
# 3. Spatiotemporal Tensor Construction (REMEDIATED)
#
# STRATEGY:
# 1. Compute headways (keep NaN for missing data)
# 2. Apply spatial imputation on NaN values
# 3. THEN fill remaining NaN with 0 (system closed)
#
# The value stored per (time bin, station, direction) is the elapsed time, in minutes,
# between the bin label and the most recent arrival carried into that bin.
# NOTE(review): after step 3 a 0 can mean either "an arrival fell inside this bin"
# (negative difference clipped to 0) or "no data at all" (NaN filled with 0) -
# downstream consumers cannot tell the two apart.

import time
t0 = time.time()

print("Constructing Spatiotemporal Tensor...")

# Defensive: if column labels are duplicated, keep only the first occurrence of each
if len(arrivals_df.columns) != len(set(arrivals_df.columns)):
    arrivals_df = arrivals_df.loc[:, ~arrivals_df.columns.duplicated()]

# Copy of the timestamp to use as the pivot *value* (arrival_time itself becomes the index)
arrivals_df['arrival_ts'] = arrivals_df['arrival_time']

# Pivot to (Time, Station, Direction): each cell holds the last arrival timestamp
pivot_df = arrivals_df.pivot_table(
    index='arrival_time',
    columns=['sequence_id', 'direction_idx'],
    values='arrival_ts',
    aggfunc='last'
)

# Resample onto the regular time grid, keeping the last arrival seen in each bin
print(f"  Resampling... ({time.time()-t0:.1f}s)")
resampled_df = pivot_df.resample(f'{TIME_BIN_SIZE_MIN}min').last()
print(f"  Grid shape: {resampled_df.shape}")

# ========================================================================
# STEP 1: Temporal ffill on arrival times (original approach)
# ========================================================================
print(f"  Temporal ffill on arrivals... ({time.time()-t0:.1f}s)")
# FFILL_LIMIT is expressed in minutes, but ffill(limit=...) counts grid rows (bins).
# Convert so the cap stays at FFILL_LIMIT minutes if TIME_BIN_SIZE_MIN is ever changed
# (identical to the previous behaviour for 1-minute bins).
ffill_limit_bins = max(1, int(FFILL_LIMIT // TIME_BIN_SIZE_MIN))
filled_df = resampled_df.ffill(limit=ffill_limit_bins)

# ========================================================================
# STEP 2: Calculate headways - KEEP NaN (don't fill with 0 yet!)
# ========================================================================
print(f"  Calculating headways... ({time.time()-t0:.1f}s)")
current_time = filled_df.index.to_series()
# bin label minus last known arrival, column by column (aligned on the time index)
headway_df = filled_df.apply(lambda col: current_time - col)
# Timedelta -> float minutes
headway_df = headway_df / pd.Timedelta(minutes=1)
# An arrival later than its bin label gives a negative difference: clip it to 0,
# but KEEP NaN so the spatial imputation below can see the gaps
headway_df = headway_df.clip(lower=0)

print(f"  Headway calculation done ({time.time()-t0:.1f}s)")
print(f"  NaN count before spatial imputation: {headway_df.isna().sum().sum():,}")

# ========================================================================
# STEP 3: SPATIAL IMPUTATION on numeric headways
# Fill NaN from the nearest station in sequence_id order, per direction
# ========================================================================
print(f"  Applying spatial imputation... ({time.time()-t0:.1f}s)")

# Columns are (sequence_id, direction_idx) tuples; split by direction, ordered by sequence_id
col_names = headway_df.columns.tolist()
north_cols = sorted([c for c in col_names if c[1] == 0], key=lambda x: x[0])
south_cols = sorted([c for c in col_names if c[1] == 1], key=lambda x: x[0])

# Spatial fill: bfill first (take the value from the next-higher sequence_id),
# then ffill (take it from the next-lower one) along the station axis (axis=1)
north_headways = headway_df[north_cols].bfill(axis=1).ffill(axis=1)
south_headways = headway_df[south_cols].bfill(axis=1).ffill(axis=1)

# Merge back and restore the (sequence_id, direction_idx) column order
headway_df = pd.concat([north_headways, south_headways], axis=1).sort_index(axis=1)

print(f"  NaN count after spatial imputation: {headway_df.isna().sum().sum():,}")

# ========================================================================
# STEP 4: Fill remaining NaN with 0 (system closed periods)
# ========================================================================
headway_df = headway_df.fillna(0)
# Cap at FFILL_LIMIT minutes (values can slightly exceed it before clipping)
headway_df = headway_df.clip(lower=0, upper=FFILL_LIMIT)

print(f"\n✅ Done in {time.time()-t0:.1f}s")
print(f"Shape: {headway_df.shape}")

# Validate sparsity: share of exact zeros in the grid
zero_fraction = (headway_df == 0).sum().sum() / headway_df.size
print(f"\n📊 Grid Density: {zero_fraction:.2%} zeros")
if zero_fraction < 0.01:
    print("   ✅ PASS: <1% zeros")
elif zero_fraction < 0.05:
    print("   ⚠️ WARNING: 1-5% zeros")
elif zero_fraction < 0.20:
    print("   ⚠️ Moderate: 5-20% zeros (mostly overnight closures)")
else:
    print("   ❌ FAIL: >20% zeros")

display(headway_df.iloc[:5, :4])

Constructing Spatiotemporal Tensor...
  Resampling... (5.0s)
  Grid shape: (264222, 131)
  Temporal ffill on arrivals... (5.7s)
  Calculating headways... (6.1s)
  Headway calculation done (6.5s)
  NaN count before spatial imputation: 10,312,978
  Applying spatial imputation... (6.6s)
  NaN count after spatial imputation: 3,930

✅ Done in 7.0s
Shape: (264222, 131)

📊 Grid Density: 9.38% zeros
   ⚠️ Moderate: 5-20% zeros (mostly overnight closures)


sequence_id,0,0,1,1
direction_idx,0,1,0,1
arrival_time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2025-06-06 00:00:00+00:00,0.0,0.0,0.0,0.0
2025-06-06 00:01:00+00:00,0.5,0.716667,0.5,0.716667
2025-06-06 00:02:00+00:00,0.0,0.0,0.0,1.716667
2025-06-06 00:03:00+00:00,0.0,0.8,0.916667,0.0
2025-06-06 00:04:00+00:00,0.416667,1.8,1.916667,0.383333


In [4]:
# 4. Reshape and Normalize
#
# Turn the flat (Time, Station x Direction) frame into a dense
# (Time, Station, Direction, 1) array and persist it in raw minutes.

# Canonical axes: every sequence_id from the station map, and both directions
all_directions = [0, 1]
all_sequences = sorted(station_map['sequence_id'].unique())
num_stations, num_directions = len(all_sequences), len(all_directions)

# Station-major / direction-minor column order: this is what makes the plain
# reshape below land each column in the right (station, direction) slot
full_index = pd.MultiIndex.from_product(
    [all_sequences, all_directions],
    names=['sequence_id', 'direction_idx'],
)

# Any (station, direction) pair absent from the data becomes an all-zero column
headway_df_full = headway_df.reindex(columns=full_index, fill_value=0)

# Flat array, shape (Time, Stations * Directions)
matrix_flat = headway_df_full.to_numpy()
num_time = matrix_flat.shape[0]

# Unfold the column axis and append a trailing singleton axis -> (Time, Stations, Directions, 1)
matrix_reshaped = matrix_flat.reshape((num_time, num_stations, num_directions, 1))

print(f"Final Matrix Shape: {matrix_reshaped.shape}")

# No normalization here: values are saved as raw minutes so that scaling
# (e.g. RobustScaling) can be applied downstream.
print(f"Matrix Constructed (Raw Minutes). Max Value: {matrix_reshaped.max()}")

# Save Raw Matrix
np.save(OUTPUT_MATRIX_FILE, matrix_reshaped)
print(f"Saved Headway Matrix to {OUTPUT_MATRIX_FILE}")


Final Matrix Shape: (264222, 66, 2, 1)
Matrix Constructed (Raw Minutes). Max Value: 30.0
Saved Headway Matrix to ../data/headway_matrix_full.npy


In [5]:
# 5. Process Terminal Schedule
#
# Builds a (Time, 2, 1) array holding, for every bin of the real-time grid and each
# direction, the scheduled headway (minutes) of the most recent scheduled departure.

print("Processing Terminal Schedule...")
schedule_df = pd.read_csv(SCHEDULE_FILE)

# Rename direction_id to direction_idx to match arrivals_df convention
if 'direction_id' in schedule_df.columns:
    schedule_df = schedule_df.rename(columns={'direction_id': 'direction_idx'})

# Construct full datetime from service_date and departure_seconds.
# Adding seconds as a timedelta handles GTFS times > 24:00:00 correctly
# (e.g. 25:00:00 becomes next day 01:00:00); a "25:30:00" string could not be
# parsed by pd.to_datetime directly.
schedule_df['service_date_dt'] = pd.to_datetime(schedule_df['service_date'])
schedule_df['departure_dt'] = schedule_df['service_date_dt'] + pd.to_timedelta(schedule_df['departure_seconds'], unit='s')

# Make the timestamps tz-aware (UTC) so they can be aligned with the tz-aware
# real-time index; naive and aware timestamps do not match on reindex().
# NOTE(review): this labels the schedule clock times as UTC without converting them.
# If service_date/departure_seconds are agency-local times, the schedule is shifted
# by the UTC offset relative to the arrivals - confirm against the data source.
schedule_df['departure_dt'] = schedule_df['departure_dt'].dt.tz_localize('UTC')

# Pivot
# Index: Departure Time (Full Datetime)
# Columns: Direction
# Values: Scheduled Headway (The gap associated with this train)
sched_pivot = schedule_df.pivot_table(
    index='departure_dt',
    columns='direction_idx',
    values='scheduled_headway_min',
    aggfunc='mean' # Should be unique per train
)

# Carry each direction's last scheduled headway forward across the rows that belong
# to the *other* direction BEFORE resampling. Resampler.ffill() is a
# reindex(method='ffill'): it copies the nearest earlier row as-is, NaNs included.
# Without this step, whenever the last departure before a grid minute belonged to the
# other direction, this direction's most recent headway was lost and later replaced
# by a stale value.
sched_step = sched_pivot.ffill()

# Resample to the grid resolution and FFill
# This creates a step function: "The scheduled headway is X minutes"
sched_resampled = sched_step.resample(f'{TIME_BIN_SIZE_MIN}min').ffill()

# Reindex to match the main matrix time index (same time range as the real-time data);
# bins before the first scheduled departure have no value and become 0
sched_aligned = sched_resampled.reindex(headway_df.index).ffill().fillna(0)

# Ensure both directions 0 and 1 exist
for d in [0, 1]:
    if d not in sched_aligned.columns:
        sched_aligned[d] = 0

# Fix the column order to [direction 0, direction 1]
sched_aligned = sched_aligned[[0, 1]]

# Convert to Numpy
# Shape: (Time, 2)
sched_matrix = sched_aligned.values

# Reshape to (Time, 2, 1)
sched_matrix = sched_matrix.reshape(sched_matrix.shape[0], 2, 1)

# No normalization here: raw minutes are saved so scaling can happen downstream.

print(f"Schedule Matrix Shape: {sched_matrix.shape}")

np.save(OUTPUT_SCHEDULE_FILE, sched_matrix)
print(f"Saved Schedule Matrix to {OUTPUT_SCHEDULE_FILE}")

Processing Terminal Schedule...
Schedule Matrix Shape: (264222, 2, 1)
Saved Schedule Matrix to ../data/schedule_matrix_full.npy


In [7]:
# --- REMEDIATION VALIDATION CHECK ---
# Verify spatial imputation quality and data statistics.
# Reads matrix_reshaped (Time, Stations, Directions, 1) and sched_matrix (Time, 2, 1)
# produced by the earlier cells; prints a report and changes nothing on disk.

print("=" * 60)
print("REMEDIATION PHASE 2: DATA QUALITY VALIDATION")
print("=" * 60)

# 1. Zero Fraction (Grid Density)
# Note: ~10% zeros is EXPECTED due to overnight closures (subway runs ~5am-1am)
zero_fraction = (matrix_reshaped == 0).sum() / matrix_reshaped.size
print(f"\n1️⃣ Grid Density:")
print(f"   Zero fraction: {zero_fraction:.2%}")
print(f"   Target: <15% (allows for overnight closures)")
if zero_fraction < 0.15:
    print("   ✅ PASS - Zeros represent overnight closures")
elif zero_fraction < 0.25:
    print("   ⚠️ WARNING: Higher than expected zeros")
else:
    print("   ❌ FAIL - Review imputation logic")

# 2. Value Distribution (should be realistic headways)
non_zero_headways = matrix_reshaped[matrix_reshaped > 0]
print(f"\n2️⃣ Headway Distribution (non-zero values):")
print(f"   Count: {len(non_zero_headways):,}")
if non_zero_headways.size:
    # One percentile call for all three quartiles
    q25, q50, q75 = np.percentile(non_zero_headways, [25, 50, 75])
    print(f"   Mean:  {non_zero_headways.mean():.1f} min")
    print(f"   Std:   {non_zero_headways.std():.1f} min")
    print(f"   Min:   {non_zero_headways.min():.1f} min")
    print(f"   25%:   {q25:.1f} min")
    print(f"   50%:   {q50:.1f} min")
    print(f"   75%:   {q75:.1f} min")
    print(f"   Max:   {non_zero_headways.max():.1f} min")
else:
    # .min()/.max() raise on an empty selection; report instead of crashing
    print("   (no non-zero headways - nothing to summarize)")

# 3. Sawtooth Check (headways should cycle 0 -> peak -> 0)
sample_station = 10                      # index along the station axis
sample_start, sample_stop = 1000, 1500   # rows (time bins) inspected
# direction index 0 is the one mapped from 'N' when the arrivals were loaded
sample_headways_n = matrix_reshaped[sample_start:sample_stop, sample_station, 0, 0]
# Window length comes from the slice actually obtained, so the report stays correct
# if the matrix is shorter than sample_stop or the bin size changes
sample_minutes = len(sample_headways_n) * TIME_BIN_SIZE_MIN
print(f"\n3️⃣ Sawtooth Pattern Check (station {sample_station}, Northbound):")
# Count flips of the "headway < 1 min" flag; each arrival contributes two flips
# (entering and leaving the <1 state), hence the division by 2
zero_crossings = np.sum(np.diff(sample_headways_n < 1) != 0) // 2
print(f"   Train arrivals in {sample_minutes} min sample: {zero_crossings}")
print(f"   Implied headway: {sample_minutes / max(zero_crossings, 1):.1f} min")
if zero_crossings > 20:
    print("   ✅ PASS - Realistic train frequency detected")
else:
    print("   ⚠️ WARNING - Low train frequency, check data")

# 4. Schedule Data Check
non_zero_sched = sched_matrix[sched_matrix > 0]
print(f"\n4️⃣ Schedule Matrix:")
if len(non_zero_sched) > 0:
    print(f"   ✅ Non-zero entries: {len(non_zero_sched):,}")
    print(f"   Mean scheduled headway: {non_zero_sched.mean():.1f} min")
    print(f"   Min: {non_zero_sched.min():.1f}, Max: {non_zero_sched.max():.1f}")
else:
    print("   ❌ CRITICAL ERROR: Schedule matrix is all zeros!")

# 5. Shape Verification
num_stations_actual = matrix_reshaped.shape[1]
print(f"\n5️⃣ Final Shapes:")
print(f"   Headway matrix: {matrix_reshaped.shape}")
print(f"   Schedule matrix: {sched_matrix.shape}")
print(f"   Stations: {num_stations_actual} (A line has multiple branches)")

# Overall verdict: grid density within target AND a non-empty schedule
print("\n" + "=" * 60)
if zero_fraction < 0.15 and len(non_zero_sched) > 0:
    print("🎉 REMEDIATION PHASE 2 COMPLETE - Data ready for training")
else:
    print("⚠️ REMEDIATION INCOMPLETE - Review issues above")
print("=" * 60)

REMEDIATION PHASE 2: DATA QUALITY VALIDATION

1️⃣ Grid Density:
   Zero fraction: 10.06%
   Target: <15% (allows for overnight closures)
   ✅ PASS - Zeros represent overnight closures

2️⃣ Headway Distribution (non-zero values):
   Count: 31,366,906
   Mean:  7.4 min
   Std:   6.2 min
   Min:   0.0 min
   25%:   2.6 min
   50%:   5.8 min
   75%:   10.6 min
   Max:   30.0 min

3️⃣ Sawtooth Pattern Check (station 10, Northbound):
   Train arrivals in 500 min sample: 54
   Implied headway: 9.3 min
   ✅ PASS - Realistic train frequency detected

4️⃣ Schedule Matrix:
   ✅ Non-zero entries: 528,354
   Mean scheduled headway: 7.7 min
   Min: 0.5, Max: 63.5

5️⃣ Final Shapes:
   Headway matrix: (264222, 66, 2, 1)
   Schedule matrix: (264222, 2, 1)
   Stations: 66 (A line has multiple branches)

🎉 REMEDIATION PHASE 2 COMPLETE - Data ready for training
