In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration: constants shared by the cells below.

# Grid resolution
TIME_BIN_SIZE_MIN = 1 # Width of one time bin in minutes (strict 1-minute bins per Master Plan)
# NOTE: despite its name, this value is only used as the upper clip bound (in
# minutes) of the headway matrix; the temporal forward-fill in the tensor
# construction cell uses its own hard-coded limit of 5 bins.
FFILL_LIMIT = 30 # Maximum headway value kept in the matrix, in minutes

# File paths (relative to the notebook's working directory)
STATION_MAP_FILE = "../data/a_line_station_distances.csv"  # input: stop_id -> sequence_id table
SCHEDULE_FILE = "../data/target_terminal_headways.csv"  # input: scheduled terminal headways
REALTIME_ARRIVALS_FILE = "../data/nyc_subway_a_line_arrivals_2025.csv"  # input: observed arrivals
OUTPUT_MATRIX_FILE = "../data/headway_matrix_full.npy"  # output: headway tensor (raw minutes)
OUTPUT_SCHEDULE_FILE = "../data/schedule_matrix_full.npy"  # output: schedule tensor (raw minutes)


In [2]:
# 1. Station map -> stop_id / sequence_id lookup
print("Loading Station Map...")
station_map = pd.read_csv(STATION_MAP_FILE)

# Lookup from stop_id to its position along the line (sequence_id, 0..N).
# The station map arrives already ordered by distance (done in Notebook 2).
stop_to_seq = {
    stop_id: seq_id
    for stop_id, seq_id in zip(station_map['stop_id'], station_map['sequence_id'])
}

print(f"Loaded {len(station_map)} stations.")
print(f"Sample Mapping: {list(stop_to_seq.items())[:5]}")

# 2. Real-time arrivals
print("Loading Real-time Arrivals...")

# Direction letters become array indices (N -> 0, S -> 1) so the data can later
# be pivoted to (Time, Station, Direction) as the master plan requires.
direction_map = {'N': 0, 'S': 1}

arrivals_df = (
    pd.read_csv(REALTIME_ARRIVALS_FILE)
    # Parse the timestamp strings into datetimes
    .assign(arrival_time=lambda frame: pd.to_datetime(frame['arrival_time']))
    # Keep only stops that exist in the station map
    .loc[lambda frame: frame['stop_id'].isin(stop_to_seq.keys())]
    # Attach the two integer coordinates used for array indexing
    .assign(
        sequence_id=lambda frame: frame['stop_id'].map(stop_to_seq),
        direction_idx=lambda frame: frame['direction'].map(direction_map),
    )
)

print(f"Loaded {len(arrivals_df)} arrivals.")
display(arrivals_df.head())


Loading Station Map...
Loaded 198 stations.
Sample Mapping: [('A02', 0), ('A02N', 0), ('A02S', 0), ('A03', 1), ('A03N', 1)]
Loading Real-time Arrivals...
Loaded 2097004 arrivals.


Unnamed: 0,trip_uid,route_id,direction,stop_id,stop_name,stop_lat,stop_lon,arrival_time,sequence_id,direction_idx
0,1749151110_A..S57R,A,S,A38S,Fulton St,40.710197,-74.007691,2025-06-06 00:00:00+00:00,28,1
1,1749153120_A..N55R,A,N,H06N,Beach 67 St,40.590927,-73.796924,2025-06-06 00:00:02+00:00,60,0
2,1749149220_A..S58R,A,S,H03S,Howard Beach-JFK Airport,40.660476,-73.830301,2025-06-06 00:00:07+00:00,54,1
3,1749153300_A..S57R,A,S,A06S,181 St,40.851695,-73.937969,2025-06-06 00:00:17+00:00,3,1
4,1749147750_A..S58R,A,S,H11S,Far Rockaway-Mott Av,40.603995,-73.755405,2025-06-06 00:00:23+00:00,65,1


In [None]:
# 3. Spatiotemporal Tensor Construction (REMEDIATED)
#
# Gaps are meant to be filled spatially (from the nearest station that has data
# in the same time bin) rather than by a long temporal forward-fill, following
# the paper's view of headway as a field that propagates along the track.
# This part of the cell builds the (time bin x station x direction) grid of
# last-arrival timestamps that the imputation operates on.

print("Constructing Spatiotemporal Tensor (SPATIAL IMPUTATION)...")

# Repeated column labels would break the pivot: keep the first occurrence only
if arrivals_df.columns.duplicated().any():
    print("Warning: Duplicate columns found. Handling...")
    arrivals_df = arrivals_df.loc[:, ~arrivals_df.columns.duplicated()]

# 'arrival_time' is consumed as the pivot index, so the pivot values are read
# from a separate copy of the same column.
arrivals_df['arrival_ts'] = arrivals_df['arrival_time']

# Rows: exact arrival instants. Columns: (sequence_id, direction_idx).
pivot_df = pd.pivot_table(
    arrivals_df,
    values='arrival_ts',
    index='arrival_time',
    columns=['sequence_id', 'direction_idx'],
    aggfunc='last',
)

# Collapse to a regular grid, keeping the last arrival seen in each bin
bin_rule = f'{TIME_BIN_SIZE_MIN}min'
resampled_df = pivot_df.resample(bin_rule).last()

# ========================================================================
# VECTORIZED SPATIAL IMPUTATION
# Nearest-station fill along axis=1 (stations), built from ffill/bfill plus
# a distance comparison so that it works on MultiIndex columns and on
# datetime values (pandas' interpolate(method='nearest') supports neither).
# ========================================================================

def spatial_impute_vectorized(df, direction):
    """
    Fill every missing cell with the value of the nearest station in the same row.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are time bins; columns are 2-tuples ``(sequence_id, direction_idx)``.
        ``sequence_id`` must be numeric: it is used as the coordinate along the
        line when measuring which neighbour is nearest.
    direction : hashable
        Value of the second column level to extract.

    Returns
    -------
    pandas.DataFrame
        Only the columns of ``direction``, ordered by ``sequence_id``, with the
        same index and dtypes as the input. A missing cell takes the value of
        the closest valid station in the same row; on an exact tie the
        lower-sequence (left) neighbour wins. Rows with no valid value at all
        stay missing. An empty DataFrame is returned when ``direction`` has no
        columns.

    Notes
    -----
    The previous implementation called
    ``interpolate(method='nearest', axis=1)``; pandas only supports linear
    interpolation on MultiIndex-labelled axes and on datetime values, so that
    call could not run on this data.
    """
    # Columns of the requested direction, ordered along the line
    seq_ids = sorted({c[0] for c in df.columns if c[1] == direction})
    cols = [(seq, direction) for seq in seq_ids]

    if not cols:
        return pd.DataFrame()

    sub_df = df[cols]

    valid = sub_df.notna().to_numpy()
    n_cols = valid.shape[1]
    positions = np.arange(n_cols)
    coords = np.asarray(seq_ids, dtype=float)

    # Column position of the closest valid cell at-or-before (left) and
    # at-or-after (right) every cell; -1 / n_cols mark "none on that side".
    left_pos = np.maximum.accumulate(np.where(valid, positions, -1), axis=1)
    right_pos = np.minimum.accumulate(
        np.where(valid, positions, n_cols)[:, ::-1], axis=1
    )[:, ::-1]

    # Distance to each candidate in sequence_id units (inf when it does not exist)
    left_dist = np.where(
        left_pos >= 0, coords - coords[np.clip(left_pos, 0, None)], np.inf
    )
    right_dist = np.where(
        right_pos < n_cols, coords[np.clip(right_pos, None, n_cols - 1)] - coords, np.inf
    )

    # ffill / bfill along the stations give the left / right candidate values;
    # keep whichever is nearer. Valid cells have left distance 0 and therefore
    # keep their own value.
    take_left = left_dist <= right_dist
    imputed = sub_df.ffill(axis=1).where(take_left, sub_df.bfill(axis=1))

    return imputed

# Impute each direction on its own so values never cross between directions
print("  Applying vectorized spatial imputation for Northbound...")
north_imputed = spatial_impute_vectorized(resampled_df, direction=0)
north_nan_count = north_imputed.isna().sum().sum()
print(f"    Northbound shape: {north_imputed.shape}, NaN remaining: {north_nan_count}")

print("  Applying vectorized spatial imputation for Southbound...")
south_imputed = spatial_impute_vectorized(resampled_df, direction=1)
south_nan_count = south_imputed.isna().sum().sum()
print(f"    Southbound shape: {south_imputed.shape}, NaN remaining: {south_nan_count}")

# Put both directions back side by side, in standard column order
filled_df = pd.concat([north_imputed, south_imputed], axis=1).sort_index(axis=1)

# Short temporal carry-over (at most 5 bins). Meant for system-closed gaps;
# the small limit avoids daytime "phantom dwells".
filled_df = filled_df.ffill(limit=5)

# Headway = bin timestamp minus last known arrival timestamp, in minutes.
# Cells that are still empty become 0 (System Closed / No Data) and the result
# is clipped to [0, FFILL_LIMIT].
# NOTE(review): if bins are labelled by their left edge (pandas' default for
# minute frequencies), an arrival that falls inside a bin is later than the
# bin label, so the difference is <= 0 and ends up as 0 after the clip. Only
# values carried forward by the ffill above would then be positive - confirm
# this is the intended headway definition.
current_time = filled_df.index.to_series()
headway_df = (
    (filled_df.apply(lambda col: current_time - col) / pd.Timedelta(minutes=1))
    .fillna(0)
    .clip(lower=0, upper=FFILL_LIMIT)
)

print("\n‚úÖ Headway Matrix Constructed with Spatial Imputation.")
print(f"Shape: {headway_df.shape}")

# Grid density: share of exactly-zero cells - THIS IS THE KEY METRIC
zero_fraction = (headway_df == 0).sum().sum() / headway_df.size
if zero_fraction < 0.01:
    density_verdict = "   ‚úÖ PASS: <1% zeros (dense headway field achieved)"
elif zero_fraction < 0.05:
    density_verdict = "   ‚ö†Ô∏è WARNING: 1-5% zeros (acceptable but not ideal)"
else:
    density_verdict = "   ‚ùå FAIL: >5% zeros (sparse grid, imputation may have failed)"

print("\nüìä Grid Density Check:")
print(f"   Zero fraction: {zero_fraction:.2%}")
print(density_verdict)

# Preview: first 10 time bins, first 5 (station, direction) columns
display(headway_df.iloc[:10, :5])

Constructing Spatiotemporal Tensor...
Headway Matrix Constructed.
Shape: (264222, 131)


sequence_id,0,0,1,1,2
direction_idx,0,1,0,1,0
arrival_time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-06-06 00:00:00+00:00,0.0,0.0,0.0,0.0,0.0
2025-06-06 00:01:00+00:00,0.0,0.0,0.0,0.0,0.0
2025-06-06 00:02:00+00:00,0.0,0.0,0.0,0.0,0.0
2025-06-06 00:03:00+00:00,0.0,0.8,0.916667,0.0,0.0
2025-06-06 00:04:00+00:00,0.416667,1.8,1.916667,0.383333,0.0
2025-06-06 00:05:00+00:00,1.416667,2.8,2.916667,1.383333,0.0
2025-06-06 00:06:00+00:00,2.416667,3.8,3.916667,2.383333,0.0
2025-06-06 00:07:00+00:00,3.416667,4.8,4.916667,3.383333,0.0
2025-06-06 00:08:00+00:00,4.416667,5.8,5.916667,4.383333,0.0
2025-06-06 00:09:00+00:00,5.416667,6.8,6.916667,5.383333,0.966667


In [4]:
# 4. Reshape and Normalize

# Every (station, direction) pair must own a column, including pairs that never
# appeared in the arrivals data.
all_sequences = sorted(station_map['sequence_id'].unique())
all_directions = [0, 1]
full_index = pd.MultiIndex.from_product(
    [all_sequences, all_directions],
    names=['sequence_id', 'direction_idx'],
)

# Columns absent from headway_df are created and filled with 0
headway_df_full = headway_df.reindex(columns=full_index, fill_value=0)

# Flat array of shape (Time, Stations * Directions)
matrix_flat = headway_df_full.to_numpy()

# Unfold the column axis into (Stations, Directions) and append a channel axis:
# final shape (Time, Stations, Directions, 1)
num_time = matrix_flat.shape[0]
num_stations, num_directions = len(all_sequences), len(all_directions)
target_shape = (num_time, num_stations, num_directions, 1)
matrix_reshaped = matrix_flat.reshape(target_shape)

print(f"Final Matrix Shape: {matrix_reshaped.shape}")

# No normalization here: values stay in raw minutes so a RobustScaler can be
# applied downstream (the former division by FFILL_LIMIT was removed).
print(f"Matrix Constructed (Raw Minutes). Max Value: {matrix_reshaped.max()}")

# Persist the raw matrix
np.save(OUTPUT_MATRIX_FILE, matrix_reshaped)
print(f"Saved Headway Matrix to {OUTPUT_MATRIX_FILE}")


Final Matrix Shape: (264222, 66, 2, 1)
Matrix Constructed (Raw Minutes). Max Value: 30.0
Saved Headway Matrix to ../data/headway_matrix_full.npy


In [5]:
# 5. Process Terminal Schedule
#
# Builds a (Time, Direction, 1) array holding, for every time bin of the headway
# matrix, the scheduled headway (minutes) currently in force for each direction.

print("Processing Terminal Schedule...")
schedule_df = pd.read_csv(SCHEDULE_FILE)

# Rename direction_id to direction_idx to match arrivals_df convention
if 'direction_id' in schedule_df.columns:
    schedule_df = schedule_df.rename(columns={'direction_id': 'direction_idx'})

# Construct full datetime from service_date and departure_seconds.
# Adding the seconds as a timedelta handles GTFS times > 24:00:00 correctly
# (e.g. 25:00:00 becomes next day 01:00:00), which pd.to_datetime cannot parse
# from a "25:30:00" string.
schedule_df['service_date_dt'] = pd.to_datetime(schedule_df['service_date'])
schedule_df['departure_dt'] = (
    schedule_df['service_date_dt']
    + pd.to_timedelta(schedule_df['departure_seconds'], unit='s')
)

# Make the timestamps tz-aware so they can be matched against the tz-aware
# index of headway_df (naive vs aware timestamps never match in reindex()).
# NOTE(review): this labels the schedule times as UTC without converting them.
# If departure_seconds is expressed in the agency's local time, the schedule is
# shifted by the UTC offset relative to the arrivals - confirm upstream.
schedule_df['departure_dt'] = schedule_df['departure_dt'].dt.tz_localize('UTC')

# Pivot
# Index: Departure Time (Full Datetime)
# Columns: Direction
# Values: Scheduled Headway (The gap associated with this train)
sched_pivot = schedule_df.pivot_table(
    index='departure_dt',
    columns='direction_idx',
    values='scheduled_headway_min',
    aggfunc='mean' # Should be unique per train
)

# Resample to 1-min as a step function: "The scheduled headway is X minutes".
# Each pivot row normally holds a value for ONE direction only, and the
# resampler's ffill copies whole rows (NaNs included). Without the column-wise
# ffill first, a departure followed within the same bin by a departure in the
# other direction would never reach the grid, leaving a stale headway in place.
sched_resampled = sched_pivot.ffill().resample(f'{TIME_BIN_SIZE_MIN}min').ffill()

# Reindex to match the main matrix time index (same time range as the
# real-time data); bins before the first scheduled departure become 0.
sched_aligned = sched_resampled.reindex(headway_df.index).ffill().fillna(0)

# Ensure both directions 0 and 1 exist (missing ones are all-zero), in that order
sched_aligned = sched_aligned.reindex(columns=[0, 1], fill_value=0)

# Convert to Numpy, shape (Time, 2), then add a channel axis -> (Time, 2, 1)
sched_matrix = sched_aligned.values
sched_matrix = sched_matrix.reshape(sched_matrix.shape[0], 2, 1)

# No normalization: values are saved in raw minutes.
print(f"Schedule Matrix Shape: {sched_matrix.shape}")

np.save(OUTPUT_SCHEDULE_FILE, sched_matrix)
print(f"Saved Schedule Matrix to {OUTPUT_SCHEDULE_FILE}")

Processing Terminal Schedule...
Schedule Matrix Shape: (264222, 2, 1)
Saved Schedule Matrix to ../data/schedule_matrix_full.npy


In [None]:
# --- REMEDIATION VALIDATION CHECK ---
# Sanity checks on the headway and schedule matrices built above: grid density,
# value distribution, sawtooth shape, schedule coverage and array shapes.

print("=" * 60)
print("REMEDIATION PHASE 2: DATA QUALITY VALIDATION")
print("=" * 60)

# 1. Zero Fraction (Grid Density)
zero_fraction = (matrix_reshaped == 0).sum() / matrix_reshaped.size
print(f"\n1Ô∏è‚É£ Grid Density:")
print(f"   Zero fraction: {zero_fraction:.2%}")
print(f"   Target: <1% (dense headway field)")
if zero_fraction < 0.01:
    print("   ‚úÖ PASS")
else:
    print("   ‚ùå FAIL - Consider reviewing imputation logic")

# 2. Value Distribution (should be realistic headways)
non_zero_headways = matrix_reshaped[matrix_reshaped > 0]
print(f"\n2Ô∏è‚É£ Headway Distribution (non-zero values):")
print(f"   Count: {len(non_zero_headways):,}")
if len(non_zero_headways) > 0:
    q25, q50, q75 = np.percentile(non_zero_headways, [25, 50, 75])
    print(f"   Mean:  {non_zero_headways.mean():.1f} min")
    print(f"   Std:   {non_zero_headways.std():.1f} min")
    print(f"   Min:   {non_zero_headways.min():.1f} min")
    print(f"   25%:   {q25:.1f} min")
    print(f"   50%:   {q50:.1f} min")
    print(f"   75%:   {q75:.1f} min")
    print(f"   Max:   {non_zero_headways.max():.1f} min")
else:
    # min/max/percentile raise on an empty selection; report instead of crashing
    # so the remaining checks still run.
    print("   No non-zero headways found - distribution statistics skipped")

# 3. Sawtooth Check (headways should cycle 0 -> peak -> 0)
# Sample one station and look for a periodic pattern. The station index is
# clamped so the check also works on grids with fewer than 11 stations.
sample_station = min(10, matrix_reshaped.shape[1] - 1)
sample_headways_n = matrix_reshaped[1000:1500, sample_station, 0, 0]  # up to 500 mins of Northbound
sample_minutes = len(sample_headways_n)  # shorter than 500 when the matrix has < 1500 bins
print(f"\n3Ô∏è‚É£ Sawtooth Pattern Check (station {sample_station}, Northbound):")
# Each arrival makes the "headway < 1" flag flip on and off again, i.e. two
# transitions, hence the division by 2.
zero_crossings = np.sum(np.diff(sample_headways_n < 1) != 0) // 2
print(f"   Train arrivals in {sample_minutes} min sample: {zero_crossings}")
print(f"   Implied headway: {sample_minutes / max(zero_crossings, 1):.1f} min")
if zero_crossings > 20:  # Expect ~50 trains in 500 mins at ~10 min headway
    print("   ‚úÖ PASS - Realistic train frequency detected")
else:
    print("   ‚ö†Ô∏è WARNING - Low train frequency, check data")

# 4. Schedule Data Check
non_zero_sched = sched_matrix[sched_matrix > 0]
print(f"\n4Ô∏è‚É£ Schedule Matrix:")
if len(non_zero_sched) > 0:
    print(f"   ‚úÖ Non-zero entries: {len(non_zero_sched):,}")
    print(f"   Mean scheduled headway: {non_zero_sched.mean():.1f} min")
    print(f"   Min: {non_zero_sched.min():.1f}, Max: {non_zero_sched.max():.1f}")
else:
    print("   ‚ùå CRITICAL ERROR: Schedule matrix is all zeros!")

# 5. Shape Verification
# The expected station count is read from the station map instead of being
# hard-coded, so the message cannot drift away from the data.
expected_stations = station_map['sequence_id'].nunique()
print(f"\n5Ô∏è‚É£ Final Shapes:")
print(f"   Headway matrix: {matrix_reshaped.shape} (expected: [T, {expected_stations}, 2, 1])")
print(f"   Schedule matrix: {sched_matrix.shape} (expected: [T, 2, 1])")

print("\n" + "=" * 60)
if zero_fraction < 0.01 and len(non_zero_sched) > 0:
    print("üéâ REMEDIATION PHASE 2 COMPLETE - Data ready for training")
else:
    print("‚ö†Ô∏è REMEDIATION INCOMPLETE - Review issues above")
print("=" * 60)


--- Validation Check ---
Sample Headway Values (should show minutes like 5.0, 12.0, etc, NOT 0.1, 0.4):
[5.16666667 6.16666667 7.16666667 8.16666667 9.16666667]

Sample Schedule Values:
[20. 20. 20. 20. 20.]

Max Headway Value in Validation: 30.0
Max Schedule Value in Validation: 63.5

‚úÖ Schedule Check: Found 528354 non-zero entries.
Sample non-zero values: [18. 18. 18. 18. 18.]

‚úÖ SUCCESS: Data appears to be un-normalized (Raw Minutes).
