# Model Training & Experimentation Framework

This notebook implements the "Experiment Factory" for the Headway Prediction model. 
It is designed to support the ablation analysis defined in the project abstract, allowing us to vary:
1.  **Lookback Window ($L$):** 30, 45, 60 minutes.
2.  **Input Features:** With or without Terminal Headways ($T$).
3.  **Prediction Horizon:** Recursive prediction up to 60 minutes.

We start by importing the necessary libraries, including TensorFlow/Keras for the Deep Learning components.

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seeds for reproducibility: fix the NumPy and TensorFlow
# global seeds to the same constant before any data or model code runs.
np.random.seed(42)
tf.random.set_seed(42)


In [None]:
# 1. Configuration class 
# We define an `ExperimentConfig` class to encapsulate all hyperparameters. This makes it easy to switch between different experimental setups (e.g., changing the lookback window or enabling/disabling terminal headways) without rewriting code. 

class ExperimentConfig:
    """Hyperparameters for one headway-prediction experiment.

    Window lengths are given in minutes and converted to whole time bins
    with floor division, so a length that is not a multiple of
    ``time_bin_size_min`` is truncated to the bin count that fits.

    Attributes:
        lookback_mins: Length of the history window, in minutes.
        forecast_mins: Length of the forecast window, in minutes.
        time_bin_size_min: Width of one time bin, in minutes.
        use_terminal_headway: Whether terminal headways are an input feature.
        batch_size: Mini-batch size used for training.
        epochs: Maximum number of training epochs.
        learning_rate: Optimizer learning rate.
        lookback_bins: ``lookback_mins // time_bin_size_min``.
        forecast_bins: ``forecast_mins // time_bin_size_min``.

    Raises:
        ValueError: If ``time_bin_size_min`` is not positive.
    """

    def __init__(
        self,
        lookback_mins=30, # Paper: 30 minutes
        forecast_mins=15, # Paper: 15 minutes (Single Step)
        time_bin_size_min=5, # Reverted to 5 minutes for stability
        use_terminal_headway=True,
        batch_size=32,    # Paper: 32
        epochs=32,
        learning_rate=0.001 # Paper: 0.001
    ):
        # A zero bin width would fail in the divisions below and a negative
        # one would yield negative bin counts, so reject both up front.
        if time_bin_size_min <= 0:
            raise ValueError(
                f"time_bin_size_min must be positive, got {time_bin_size_min}"
            )

        self.lookback_mins = lookback_mins
        self.forecast_mins = forecast_mins
        self.time_bin_size_min = time_bin_size_min
        self.use_terminal_headway = use_terminal_headway
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate

        # calculated properties
        self.lookback_bins = lookback_mins // time_bin_size_min
        self.forecast_bins = forecast_mins // time_bin_size_min

    def __repr__(self):
        """Return a short summary: lookback, forecast and terminal-headway flag."""
        return (f"{type(self).__name__}(L={self.lookback_mins}m, "
                f"F={self.forecast_mins}m, "
                f"Use_T={self.use_terminal_headway})")

# Create the baseline configuration (exp-A1): 30-minute lookback,
# 15-minute forecast, terminal headways enabled. All other settings keep
# the ExperimentConfig defaults.
config = ExperimentConfig(
    lookback_mins=30, # Baseline from Table 1
    forecast_mins=15, # Baseline from Table 1
    use_terminal_headway=True
)

# Echo the active setup, including the bin counts the config derives from
# the minute values and the time-bin size.
print(f"Active Configuration {config}")
print(f"Lookback Bins: {config.lookback_bins}")
print(f"Forecast Bins: {config.forecast_bins}")

## 2. Data Loading & Preparation

We load the preprocessed matrix and schedule data. We then use the `create_dataset` function (adapted from the EDA notebook) to generate the tensors based on the active `config`.

In [None]:
# File paths and time origin used by load_and_process_data.
# Headway matrix, read with np.load.
MATRIX_PATH = "../data/headway_matrix_bidirectional.npy"
# Schedule CSV; the loader reads its 'service_date', 'departure_seconds'
# and 'scheduled_headway_min' columns.
SCHEDULE_PATH = "../data/target_terminal_headways.csv"
# Timestamp assigned to the first matrix time bin; schedule rows earlier
# than this are dropped during loading.
GLOBAL_START_TIME = "2025-06-06 00:00:00"

def create_dataset(matrix, schedule_aligned, config):
    """
    Creates sliding-window (X, T, Y) tensors from the matrix and aligned schedule.

    One sample is produced for every anchor bin ``i`` that has a full
    lookback window behind it and a full forecast window ahead of it,
    i.e. for every ``i`` in the inclusive range [L, Total - F]:

    X: Historical Headways, ``matrix[i-L:i]``              (Batch, Lookback, Space, Dir, 1)
    T: Future Terminal Headways, ``schedule_aligned[i:i+F]`` (Batch, Forecast, Dir, 1)
    Y: Future Headways, ``matrix[i:i+F]``                  (Batch, Forecast, Space, Dir, 1)

    Args:
        matrix: Array indexed by time bin on its first axis
            (assumed (Time, Space, Dir) -- confirm against the data file).
        schedule_aligned: Array on the same time index as ``matrix``
            (assumed (Time, Dir)).
        config: Object exposing ``lookback_bins`` (L) and ``forecast_bins`` (F).

    Returns:
        Tuple ``(X, T, Y)`` of numpy arrays with a trailing channel axis of
        size 1. When the input is too short for a single window, the arrays
        have a batch size of 0 but keep their remaining dimensions.

    Raises:
        ValueError: If ``schedule_aligned`` has fewer time bins than ``matrix``.
    """
    matrix = np.asarray(matrix)
    schedule_aligned = np.asarray(schedule_aligned)

    lookback = config.lookback_bins
    forecast = config.forecast_bins
    total_bins = len(matrix)

    # A shorter schedule would yield truncated (ragged) T windows near the end.
    if len(schedule_aligned) < total_bins:
        raise ValueError(
            f"schedule_aligned has {len(schedule_aligned)} time bins but "
            f"matrix has {total_bins}; they must share the same time index"
        )

    # The last valid anchor is Total - F: its forecast window ends exactly on
    # the final bin, hence the +1 to make the upper bound inclusive.
    anchors = range(lookback, total_bins - forecast + 1)

    if len(anchors) == 0:
        # Not enough data for even one window: return correctly shaped empty
        # tensors so downstream shape lookups still work.
        X = np.empty((0, lookback) + matrix.shape[1:], dtype=matrix.dtype)
        Y = np.empty((0, forecast) + matrix.shape[1:], dtype=matrix.dtype)
        T = np.empty(
            (0, forecast) + schedule_aligned.shape[1:],
            dtype=schedule_aligned.dtype,
        )
    else:
        # History window (i-L to i), target window (i to i+F) and the
        # schedule over the same future span as the target.
        X = np.stack([matrix[i - lookback:i] for i in anchors])
        Y = np.stack([matrix[i:i + forecast] for i in anchors])
        T = np.stack([schedule_aligned[i:i + forecast] for i in anchors])

    # Add the trailing channel dimension.
    return X[..., np.newaxis], T[..., np.newaxis], Y[..., np.newaxis]

def load_and_process_data(config):
    """
    Load the headway matrix and terminal schedule and build (X, T, Y) tensors.

    Steps:
      1. Load the matrix from MATRIX_PATH and scale it to [0, 1] by clipping
         to 20 minutes and dividing by 20.
      2. Load the schedule CSV from SCHEDULE_PATH, build a datetime index
         from 'service_date' + 'departure_seconds', drop duplicate
         timestamps and rows before GLOBAL_START_TIME, and scale
         'scheduled_headway_min' the same way as the matrix.
      3. Forward-fill the schedule onto the matrix's time bins (one bin per
         ``config.time_bin_size_min`` minutes, starting at GLOBAL_START_TIME).
      4. Duplicate the 1D schedule for both directions and call
         ``create_dataset``.

    Args:
        config: Experiment configuration; ``time_bin_size_min``,
            ``lookback_bins`` and ``forecast_bins`` are read here or by
            ``create_dataset``.

    Returns:
        Tuple ``(X, T, Y)`` as returned by ``create_dataset``.

    Raises:
        ValueError: If the aligned schedule still contains missing values
            (for example, when no schedule rows remain after filtering).
    """
    print("Loading data...")

    # 1. Load Matrix (Bidirectional)
    # Shape per the data pipeline: (Time, Space, 2)
    matrix = np.load(MATRIX_PATH)

    # 2. Normalize to [0, 1] (MinMax Scaling)
    # We assume max headway is around 20 mins for normalization stability.
    # Real values can be higher, but we clip to avoid outliers.
    MAX_HEADWAY_MIN = 20.0
    matrix = np.clip(matrix, 0, MAX_HEADWAY_MIN) / MAX_HEADWAY_MIN

    # 3. Load and align schedule
    schedule_df = pd.read_csv(SCHEDULE_PATH)

    # Convert 'service_date' + 'departure_seconds' to an absolute datetime so
    # the schedule can be mapped onto the matrix time bins.
    schedule_df['datetime'] = pd.to_datetime(schedule_df['service_date']) + \
                              pd.to_timedelta(schedule_df['departure_seconds'], unit='s')

    # Set index and sort
    schedule_df = schedule_df.set_index('datetime').sort_index()

    # Filter duplicates (if any)
    schedule_df = schedule_df[~schedule_df.index.duplicated(keep='first')]

    # Filter to start time
    schedule_df = schedule_df[schedule_df.index >= GLOBAL_START_TIME]

    # From here on work with the headway column as a standalone Series.
    # Writing columns back into the filtered frame can trigger pandas'
    # SettingWithCopyWarning and is not needed for the result.
    headway = schedule_df['scheduled_headway_min'].bfill()

    # Normalize the schedule with the same scale as the matrix.
    headway = headway.clip(lower=0, upper=MAX_HEADWAY_MIN) / MAX_HEADWAY_MIN

    # The schedule is sparse (individual terminal departures), so each time
    # bin takes the most recent scheduled headway known at that time.
    bin_freq = f"{config.time_bin_size_min}min"
    time_coords = pd.date_range(start=GLOBAL_START_TIME, periods=matrix.shape[0], freq=bin_freq)

    # Resample to the bin width, propagating the last known scheduled headway.
    schedule_resampled = headway.resample(bin_freq).ffill()

    # Align with the matrix time coordinates; the trailing bfill covers bins
    # that precede the first scheduled departure.
    schedule_aligned_1d = schedule_resampled.reindex(time_coords, method='ffill').bfill().values

    # ffill + bfill leave gaps only when there was no usable value at all;
    # fail loudly instead of feeding NaNs into training.
    if pd.isna(schedule_aligned_1d).any():
        raise ValueError(
            "Aligned schedule contains missing values; check that "
            f"{SCHEDULE_PATH} has rows on or after {GLOBAL_START_TIME} with "
            "non-null 'scheduled_headway_min'"
        )

    # Create 2D Schedule (Time, 2)
    # Only one schedule series is available while the matrix has two
    # directions, so the same series is used for both.
    # TODO: Refine schedule processing to distinguish directions if CSV has 'direction_id'
    schedule_aligned = np.stack([schedule_aligned_1d, schedule_aligned_1d], axis=1)

    # 4. Create Tensors
    print(f"Creating tensors with L={config.lookback_bins}, F={config.forecast_bins}...")
    X, T, Y = create_dataset(matrix, schedule_aligned, config)

    return X, T, Y

# Execute data loading with the active configuration; X, T and Y are used
# by the model-building, training and evaluation cells below.
X, T, Y = load_and_process_data(config)

# Report tensor shapes as a quick sanity check before building the model.
print(f"Data Shapes:")
print(f"X (History): {X.shape}")
print(f"T (Schedule): {T.shape}")
print(f"Y (Target):  {Y.shape}")

## 3. Model Architecture (ConvLSTM)

We define the `build_metro_headway_net` function. It constructs a Keras model using `ConvLSTM2D` layers to capture spatiotemporal dependencies.

**Key Features:**
*   **5D Input Handling:** `ConvLSTM2D` expects `(Batch, Time, Rows, Cols, Channels)`. We map distance bins (stations) to rows and travel directions to columns, with a single channel, so each history sample has shape `(Time, Stations, Directions, 1)`.
*   **Dual Input Support:** The model has a secondary input for the schedule ($T$), which is flattened and concatenated with the flattened ConvLSTM features before the dense decoder. Note that the current implementation always includes this branch; `config.use_terminal_headway` is not yet consulted when building the model.

In [None]:
def build_metro_headway_net(
    history_steps=30,   # 'L' in paper
    future_steps=15,    # 'F' in paper
    distance_bins=64,   # 'Nd' in paper
    directions=2,       # 'Ndir' in paper
    filters=32,         # Table 1
    learning_rate=0.001 # Table 1; exposed so experiments can vary it
):
    """
    Build and compile the ConvLSTM headway-prediction network.

    Two stacked ConvLSTM2D layers encode the headway history, the encoding
    is flattened and concatenated with the flattened future terminal
    schedule, and a single dense layer projects the result onto the full
    future grid.

    Args:
        history_steps: Number of input time steps (L). Steps are whatever
            unit the data is binned in; this notebook passes bin counts.
        future_steps: Number of predicted time steps (F).
        distance_bins: Size of the spatial axis (Nd).
        directions: Number of travel directions (Ndir).
        filters: Number of filters in each ConvLSTM2D layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``keras.Model`` taking
        ``[history (Batch, L, Nd, Ndir, 1), terminal (Batch, F, Ndir, 1)]``
        and producing ``(Batch, F, Nd, Ndir, 1)``, trained with MSE loss.
    """
    # --- Input 1: Historical Headways (The "Video" of the metro line) ---
    # Shape: (Batch, L, Nd, Ndir, 1) -> (Time, Rows, Cols, Channels)
    input_history = layers.Input(
        shape=(history_steps, distance_bins, directions, 1),
        name="history_input"
    )

    # --- Input 2: Future Terminal Headways (The "Schedule") ---
    # Shape: (Batch, F, Ndir, 1) -> future scheduled headways at terminals
    input_terminal = layers.Input(
        shape=(future_steps, directions, 1),
        name="terminal_input"
    )

    # --- Encoder Branch: ConvLSTM Layers ---
    # Layer 1: returns the full sequence to feed the next recurrent layer.
    # Kernel size (3, 1) slides over distance but keeps directions separate.
    # Activation is 'tanh' (the default), kept for stability.
    x = layers.ConvLSTM2D(
        filters=filters,
        kernel_size=(3, 1),
        padding='same',
        return_sequences=True,
        activation='tanh',
        name="convlstm_1"
    )(input_history)

    # Layer 2: only the FINAL state (a summary of the whole history window)
    # is needed, so return_sequences=False drops the time axis.
    x = layers.ConvLSTM2D(
        filters=filters,
        kernel_size=(3, 1),
        padding='same',
        return_sequences=False,
        activation='tanh',
        name="convlstm_2"
    )(x)

    # Current shape of x: (Batch, Nd, Ndir, filters)

    # --- Flattening for Merge ---
    # Both branches are flattened so they can be concatenated.
    x_flat = layers.Flatten(name="flatten_history")(x)
    t_flat = layers.Flatten(name="flatten_terminal")(input_terminal)

    # --- Fusion ---
    # Concatenate the ConvLSTM context with the future schedule.
    concat = layers.Concatenate(name="fusion_layer")([x_flat, t_flat])

    # --- Decoder: Dense Layer ---
    # Project to the total output size F * Nd * Ndir.
    # Paper explicitly calls for a "Dense Layer" here.
    output_dim = future_steps * distance_bins * directions

    dense_out = layers.Dense(output_dim, activation='linear', name="dense_projection")(concat)

    # --- Final Reshape ---
    # Reshape back to the grid format: (Batch, F, Nd, Ndir, 1)
    output = layers.Reshape(
        (future_steps, distance_bins, directions, 1),
        name="final_output"
    )(dense_out)

    # --- Compile Model ---
    model = keras.Model(inputs=[input_history, input_terminal], outputs=output)

    # Optimizer and Loss from Section 2.2 / Table 1
    opt = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss='mse', metrics=['mse'])

    return model

# Build the network with its grid dimensions taken from the history tensor
# (axis 2 = distance bins, axis 3 = directions), then print the layer summary.
n_distance_bins, n_directions = X.shape[2:4]

model = build_metro_headway_net(
    history_steps=config.lookback_bins,
    future_steps=config.forecast_bins,
    distance_bins=n_distance_bins,
    directions=n_directions,
    filters=32,
)
model.summary()

In [None]:
# Training
print(f"Training model for {config.epochs} epochs...")

# NOTE: mixed precision (global policy 'mixed_float16') was removed for stability.

# Rebuild the model so training starts from freshly initialised weights.
# Grid dimensions are read from the data (axis 2 = distance bins,
# axis 3 = directions) instead of being hard-coded, so the cell keeps
# working if the matrix resolution changes.
# NOTE(review): config.learning_rate is not forwarded here; the builder
# compiles with its own default learning rate.
model = build_metro_headway_net(
    history_steps=config.lookback_bins,
    future_steps=config.forecast_bins,
    distance_bins=X.shape[2],
    directions=X.shape[3],
    filters=32
)

# Split Data (80/20). Samples are ordered by time bin, so the validation
# set is the most recent 20% of windows.
train_size = int(len(X) * 0.8)
X_train, X_val = X[:train_size], X[train_size:]
T_train, T_val = T[:train_size], T[train_size:]
Y_train, Y_val = Y[:train_size], Y[train_size:]

# Callbacks: stop once validation loss has not improved for 5 epochs and
# roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    x=[X_train, T_train],
    y=Y_train,
    validation_data=([X_val, T_val], Y_val),
    batch_size=config.batch_size,
    epochs=config.epochs,
    callbacks=[early_stopping],
    verbose=1
)

In [None]:
# Plot training vs. validation loss per epoch on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
for metric_key, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    ax.plot(history.history[metric_key], label=curve_label)

ax.set_title('Model Training History')
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE Loss')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Visualize one prediction next to its ground truth.
# Pick a random sample; length-1 slices keep the batch axis so the model
# receives the shapes it was built for.
sample_idx = np.random.randint(0, len(X))
sample = slice(sample_idx, sample_idx + 1)
x_sample, t_sample, y_true = X[sample], T[sample], Y[sample]

# The model needs BOTH inputs: the history and the matching schedule.
y_pred = model.predict([x_sample, t_sample])

# Both tensors are (1, Time, Space, Dir, 1). Each panel shows direction
# index 0 (labelled Northbound) as a space-time heatmap, transposed so that
# time runs along the x-axis and space (stations) along the y-axis.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (axes[0], y_true, "Ground Truth (Northbound)"),
    (axes[1], y_pred, "Prediction (Northbound)"),
)
for ax, grid, title in panels:
    sns.heatmap(grid[0, :, :, 0, 0].T, ax=ax, cmap="viridis", vmin=0, vmax=1)
    ax.set_title(title)
    ax.set_xlabel("Time (Future)")
    ax.invert_yaxis()

# Only the left panel carries the y-axis label; the right one drops its ticks.
axes[0].set_ylabel("Space (Stations)")
axes[1].set_yticks([])

plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# 1. Rebuild the validation split (last 20% of samples) so this cell can run
#    independently of the training cell's variables.
val_split_idx = int(len(X) * 0.8)
X_val, T_val, Y_val = X[val_split_idx:], T[val_split_idx:], Y[val_split_idx:]

print(f"Evaluating on {len(X_val)} validation samples...")

# 2. Predict -- the model takes BOTH inputs [X_val, T_val].
Y_pred_norm = model.predict([X_val, T_val], verbose=1)

# 3. Undo the normalisation: normalized -> minutes (x20) -> seconds (x60).
#    20.0 is the divisor applied when the data was loaded.
SCALING_FACTOR = 20.0
Y_val_sec = Y_val * SCALING_FACTOR * 60
Y_pred_sec = Y_pred_norm * SCALING_FACTOR * 60

# 4. Metrics are computed over every grid cell, so flatten each tensor once
#    and reuse the 1D views.
y_true_flat = Y_val_sec.ravel()
y_pred_flat = Y_pred_sec.ravel()
rmse = np.sqrt(mean_squared_error(y_true_flat, y_pred_flat))
r2 = r2_score(y_true_flat, y_pred_flat)

print("\n--- Experiment Results ---")
print(f"RMSE: {rmse:.2f} seconds")
print(f"R2 Score: {r2:.4f}")