# Model Training & Experimentation Framework

This notebook implements the "Experiment Factory" for the Headway Prediction model. 
It is designed to support the ablation analysis defined in the project abstract, allowing us to vary:
1.  **Lookback Window ($L$):** 30, 45, 60 minutes.
2.  **Input Features:** With or without Terminal Headways ($T$).
3.  **Prediction Horizon:** Recursive prediction up to 60 minutes.

We start by importing the necessary libraries, including TensorFlow/Keras for the Deep Learning components.

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seeds for reproducibility: seeds both the NumPy global RNG and
# TensorFlow's global RNG with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)


In [None]:
# 1. Configuration class 
# We define an `ExperimentConfig` class to encapsulate all hyperparameters. This makes it easy to switch between different experimental setups (e.g., changing the lookback window or enabling/disabling terminal headways) without rewriting code. 

class ExperimentConfig:
    """
    Hyperparameters for a single experiment run.

    Args:
        lookback_mins: length of the history window, in minutes.
        forecast_mins: length of the prediction window, in minutes.
        time_bin_size_min: width of one time bin, in minutes.
        use_terminal_headway: whether the terminal-headway input (T) is enabled.
        batch_size: mini-batch size passed to training.
        epochs: maximum number of training epochs.
        learning_rate: optimizer learning rate.

    Derived attributes:
        lookback_bins: lookback_mins // time_bin_size_min
        forecast_bins: forecast_mins // time_bin_size_min
        (both use floor division, so a window that is not a multiple of the
        bin size is rounded down)
    """

    def __init__(
        self,
        lookback_mins: int = 60,
        forecast_mins: int = 30,
        time_bin_size_min: int = 5,
        use_terminal_headway: bool = True,
        batch_size: int = 32,
        epochs: int = 32,
        learning_rate: float = 0.001,
    ) -> None:
        self.lookback_mins = lookback_mins
        self.forecast_mins = forecast_mins
        self.time_bin_size_min = time_bin_size_min
        self.use_terminal_headway = use_terminal_headway
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate

        # calculated properties (computed once; not refreshed if the
        # minute-based attributes are reassigned afterwards)
        self.lookback_bins = lookback_mins // time_bin_size_min
        self.forecast_bins = forecast_mins // time_bin_size_min

    def __repr__(self) -> str:
        # Use the real class name and close the parenthesis so the printed
        # form is well-formed and matches the class being described.
        return (f"{type(self).__name__}(L={self.lookback_mins}m, "
                f"F={self.forecast_mins}m, "
                f"Use_T={self.use_terminal_headway})")

# Baseline configuration (exp-A1): 30-minute lookback (baseline from the
# abstract), 15-minute single-step target for recursive prediction, with
# terminal headways enabled.
config = ExperimentConfig(lookback_mins=30, forecast_mins=15, use_terminal_headway=True)

# Report the active setup together with the derived bin counts.
for report_line in (
    f"Active Configuration {config}",
    f"Lookback Bins: {config.lookback_bins}",
    f"Forecast Bins: {config.forecast_bins}",
):
    print(report_line)

## 2. Data Loading & Preparation

We load the preprocessed headway matrix and the terminal schedule data. We then use the `load_and_process_data` function (adapted from `create_dataset` in the EDA notebook) to generate the $(X, T, Y)$ tensors based on the active `config`.

In [None]:
# file paths
# Headway matrix; read with np.load in load_and_process_data.
MATRIX_PATH = "../data/headway_matrix_full.npy"
# Terminal schedule CSV; load_and_process_data reads its service_date,
# departure_seconds and scheduled_headway_min columns.
SCHEDULE_PATH = "../data/target_terminal_headways.csv"
# Timestamp assigned to the first matrix row: the time axis is generated from
# this value and schedule rows earlier than it are discarded.
GLOBAL_START_TIME = "2025-06-06 00:00:00"

def load_and_process_data(config):
    """
    Loads the raw data and prepares the (X, T, Y) tensors based on the config.

    Reads the headway matrix from MATRIX_PATH and the terminal schedule from
    SCHEDULE_PATH, divides both by the same fixed scaling factor, aligns the
    schedule onto the matrix's time bins (one bin per matrix row, starting at
    GLOBAL_START_TIME) and cuts sliding windows out of both.

    Args:
        config: object exposing `time_bin_size_min`, `lookback_bins` and
            `forecast_bins` (e.g. an ExperimentConfig).

    Returns:
        Tuple (X, T, Y) of NumPy arrays, N being the number of windows:
            X: (N, lookback_bins, space, 1)  - past rows of the matrix
            T: (N, forecast_bins, 1)         - scheduled headway for the future bins
            Y: (N, forecast_bins, space, 1)  - future rows of the matrix
        where `space` is the number of matrix columns.

    Raises:
        ValueError: if the matrix has fewer than lookback_bins + forecast_bins
            rows, i.e. not even one window can be built.
    """
    print("Loading data...")

    # 1 load matrix and normalize it (fixed constant, not fitted statistics)
    SCALING_FACTOR = 20.0
    matrix = np.load(MATRIX_PATH) / SCALING_FACTOR

    # 2 load the schedule and index it by absolute departure time
    schedule_df = pd.read_csv(SCHEDULE_PATH)
    schedule_df['datetime'] = pd.to_datetime(schedule_df['service_date']) + \
                              pd.to_timedelta(schedule_df['departure_seconds'], unit='s')
    schedule_df = schedule_df.set_index('datetime').sort_index()
    schedule_df = schedule_df[~schedule_df.index.duplicated(keep='first')]
    schedule_df = schedule_df[schedule_df.index >= GLOBAL_START_TIME]

    # fill nans: bfill covers leading/interior gaps, the extra ffill covers
    # trailing gaps that bfill alone would leave as NaN in T
    headway = schedule_df['scheduled_headway_min'].bfill().ffill()

    # normalize schedule with the same factor as the matrix
    headway = headway / SCALING_FACTOR

    # 3 resample onto the bin grid and align with the matrix rows
    bin_freq = f"{config.time_bin_size_min}min"
    time_coords = pd.date_range(start=GLOBAL_START_TIME, periods=matrix.shape[0], freq=bin_freq)
    schedule_resampled = headway.resample(bin_freq).ffill()
    schedule_aligned = schedule_resampled.reindex(time_coords, method='ffill').bfill().values

    # 4. create tensors
    print(f"Generating tensors with L={config.lookback_bins} bins, F={config.forecast_bins} bins...")
    lookback = config.lookback_bins
    horizon = config.forecast_bins

    # `i` is the first future row of a window. The last valid value is
    # len(matrix) - horizon (its target slice ends exactly at the final row),
    # hence the +1 on the exclusive upper bound.
    split_points = range(lookback, len(matrix) - horizon + 1)
    if len(split_points) == 0:
        raise ValueError(
            f"matrix has {len(matrix)} rows, but at least {lookback + horizon} "
            f"are required for L={lookback} and F={horizon} bins"
        )

    # Input X: past L steps / Input T: future F steps / Target Y: future F steps
    X = np.stack([matrix[i - lookback:i, :] for i in split_points])
    T = np.stack([schedule_aligned[i:i + horizon] for i in split_points])
    Y = np.stack([matrix[i:i + horizon, :] for i in split_points])

    X = X[..., np.newaxis]  # (Batch, Time, Space, 1)
    T = T[..., np.newaxis]  # (Batch, Time, 1)
    Y = Y[..., np.newaxis]  # (Batch, Time, Space, 1)

    return X, T, Y

# Build the tensors for the active configuration.
X, T, Y = load_and_process_data(config)

# Summarize the resulting array shapes, one line per tensor.
print("\nData Shapes:")
for tensor_label, tensor in (("X (Context)", X), ("T (Intent)", T), ("Y (Target)", Y)):
    print(f"{tensor_label}: {tensor.shape}")

## 3. Model Architecture (ConvLSTM)

We define the `build_model` function. It constructs a Keras model using `ConvLSTM2D` layers to capture spatiotemporal dependencies.

**Key Features:**
*   **5D Input Handling:** `ConvLSTM2D` expects `(Batch, Time, Rows, Cols, Channels)`. Since our data is 1D space (Stations), we reshape it to `(Time, Stations, 1, 1)` inside the model.
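*   **Dual Input Support (planned):** If `config.use_terminal_headway` is True, the model is meant to create a secondary input branch for the schedule ($T$), process it, and concatenate it with the main traffic flow features. In this first version the $T$ branch is not yet fused into the output — the model is trained on $X$ only; the fusion is implemented in the updated `build_model` later in this notebook.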

In [None]:
def build_model(config, input_shape_x, input_shape_t=None):
    """
    Constructs and compiles the baseline ConvLSTM model (context X only).

    The model encodes the L history steps with one ConvLSTM2D layer, projects
    every step back to a single channel and keeps the last F steps so that the
    output has the same shape as the target Y.

    Args:
        config: object exposing `lookback_bins`, `forecast_bins` and
            `learning_rate` (e.g. an ExperimentConfig).
        input_shape_x: per-sample shape of X, (time, space, 1).
        input_shape_t: accepted for signature compatibility with the
            schedule-fusion variant; this baseline does NOT use T, so the
            value is ignored and the model has a single input.

    Returns:
        A compiled keras.Model mapping (batch, L, space, 1) to
        (batch, F, space, 1), optimized with Adam on an MSE loss.

    Raises:
        ValueError: if forecast_bins is not in [1, lookback_bins]; the
            encoder emits exactly L steps, so it cannot provide more than L
            output steps.
    """
    # Bind plain ints now so the slicing lambda below does not depend on later
    # mutations of `config`.
    lookback_bins = config.lookback_bins
    forecast_bins = config.forecast_bins
    n_space = input_shape_x[1]

    if not 1 <= forecast_bins <= lookback_bins:
        raise ValueError(
            f"baseline model needs 1 <= forecast_bins <= lookback_bins, "
            f"got forecast_bins={forecast_bins}, lookback_bins={lookback_bins}"
        )

    # branch 1 : Spatiotemporal context (X)
    # input shape: (time, space, 1)
    input_x = layers.Input(shape=input_shape_x, name="input_x")

    # reshape to 5D for ConvLSTM2D: (time, rows=space, cols=1, channels=1)
    # we treat the line of stations as a "height" of space with "width" of 1.
    x = layers.Reshape((lookback_bins, n_space, 1, 1))(input_x)

    # Encoder: ConvLSTM layer
    x = layers.ConvLSTM2D(
        filters=32,
        kernel_size=(3, 1),
        padding="same",
        return_sequences=True,
        activation="relu"
    )(x)

    # branch 2: Dispatcher Intent (T) is intentionally absent here. History
    # (L steps) and schedule (F steps) have different time lengths, so fusing
    # them requires an explicit alignment step that this baseline does not
    # implement. Building an unconnected T branch would only create unused
    # layers.

    # Decoder / output
    # map back to 1 channel (headway)
    outputs = layers.TimeDistributed(layers.Conv2D(filters=1, kernel_size=(1, 1)))(x)

    # reshape back to 3d: (time, space, 1)
    outputs = layers.Reshape((lookback_bins, n_space, 1))(outputs)

    # The ConvLSTM emits L steps (same as its input) while the target has F
    # steps: keep only the last F steps so the output matches Y.
    if lookback_bins > forecast_bins:
        outputs = layers.Lambda(lambda z: z[:, -forecast_bins:, :, :])(outputs)

    model = keras.Model(inputs=[input_x], outputs=outputs)

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=config.learning_rate), loss="mse")
    return model

# Instantiate the network. Keras wants the per-sample shape, so the leading
# batch axis (axis 0) of X is stripped first.
sample_shape_x = X.shape[1:]
model = build_model(config, input_shape_x=sample_shape_x)
model.summary()

In [None]:
# Training
print(f"Training model for {config.epochs} epochs...")

# Callback: stop once validation loss has not improved for 5 epochs and roll
# the model back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

# Fit on X -> Y, holding out 20% of the samples for validation.
history = model.fit(
    X,
    Y,
    batch_size=config.batch_size,
    epochs=config.epochs,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Plot the per-epoch training and validation loss curves.
plt.figure(figsize=(10, 6))
for metric_key, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title("Model Training History")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Visualize a prediction for one randomly chosen sample.
sample_idx = np.random.randint(0, len(X))
sample_window = slice(sample_idx, sample_idx + 1)  # keeps the batch axis
x_sample = X[sample_window]
y_true = Y[sample_window]

y_pred = model.predict(x_sample)

# Both arrays are (1, Time, Space, 1). Each is drawn as a space-time heatmap,
# transposed so that time runs along the x-axis and space (stations) along
# the y-axis.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

heatmap_panels = (("Ground Truth", y_true), ("Prediction", y_pred))
for panel_ax, (panel_name, panel_data) in zip(axes, heatmap_panels):
    sns.heatmap(panel_data[0, :, :, 0].T, ax=panel_ax, cmap="viridis")
    panel_ax.set_title(f"{panel_name} (Sample {sample_idx})")
    panel_ax.set_xlabel("Time Step (Future)")
    panel_ax.set_ylabel("Station ID")

plt.tight_layout()
plt.show()


## Updated Model with T (Target Terminal Headways)

In [None]:
def build_model(config, input_shape_x, input_shape_t=None):
    """
    Constructs and compiles the ConvLSTM model with Schedule Fusion.

    With T enabled, the last encoder step (a summary of the history) is
    repeated for each of the F future steps, concatenated with an embedding
    of the scheduled terminal headway and decoded per (step, station).
    Without T, the last F encoder steps are decoded directly.

    Args:
        config: object exposing `lookback_bins`, `forecast_bins`,
            `use_terminal_headway` and `learning_rate`
            (e.g. an ExperimentConfig).
        input_shape_x: per-sample shape of X, (time, space, 1).
        input_shape_t: per-sample shape of T, (forecast_bins, 1). When None,
            or when `config.use_terminal_headway` is False, the model is
            built with the single input X.

    Returns:
        A compiled keras.Model whose output is (batch, F, space, 1),
        optimized with Adam on an MSE loss. Its inputs are [X, T] when the
        schedule branch is active, otherwise [X].

    Raises:
        ValueError: in the X-only fallback, if forecast_bins is not in
            [1, lookback_bins] (the encoder emits only L steps).
    """
    # Plain ints/locals so the lambdas below do not depend on later mutations
    # of `config`.
    lookback_bins = config.lookback_bins
    forecast_bins = config.forecast_bins
    n_space = input_shape_x[1]
    n_filters = 32

    # --- Branch 1: Spatiotemporal Context (X) ---
    # Input: (Time, Space, 1)
    input_x = layers.Input(shape=input_shape_x, name="input_x")

    # Reshape for ConvLSTM: (Time, Space, 1, 1)
    x_reshaped = layers.Reshape((lookback_bins, n_space, 1, 1))(input_x)

    # Encoder: ConvLSTM layer
    # We keep it simple: Map L steps of history to features
    lstm_out = layers.ConvLSTM2D(
        filters=n_filters,
        kernel_size=(3, 1),
        padding="same",
        return_sequences=True,
        activation="relu"
    )(x_reshaped)
    # Output: (Batch, L, Space, 1, 32)

    # Remove the extra width dimension: (Batch, L, Space, 32)
    lstm_out = layers.Reshape((lookback_bins, n_space, n_filters))(lstm_out)

    # --- Branch 2: Dispatcher Intent (T) ---
    if config.use_terminal_headway and input_shape_t is not None:
        # Input: (Time_Future, 1)
        input_t = layers.Input(shape=input_shape_t, name="input_t")

        # 1. Embed the scalar headway: (F, 1) -> (F, 8)
        t_emb = layers.TimeDistributed(layers.Dense(8, activation="relu"))(input_t)

        # 2. Tile across space: (F, 8) -> (F, 1, 8) -> (F, Space, 8)
        t_tiled = layers.Lambda(
            lambda t: tf.tile(t[:, :, tf.newaxis, :], [1, 1, n_space, 1])
        )(t_emb)

        # ALIGNMENT: the encoder outputs L steps (history) while the schedule
        # covers F steps (future). Only the LAST encoder step is used and it
        # is repeated F times to line up with the schedule.

        # Take last step of LSTM: (Batch, Space, 32)
        lstm_last = layers.Lambda(lambda seq: seq[:, -1, :, :])(lstm_out)

        # RepeatVector only accepts 2D (Batch, Features) input, so flatten
        # (Space, 32) first, repeat, then restore the spatial layout.
        lstm_flat = layers.Reshape((n_space * n_filters,))(lstm_last)
        lstm_repeated = layers.RepeatVector(forecast_bins)(lstm_flat)
        # (Batch, F, Space*32) -> (Batch, F, Space, 32)
        lstm_repeated = layers.Reshape((forecast_bins, n_space, n_filters))(lstm_repeated)

        # Concatenate: (Batch, F, Space, 32+8)
        merged = layers.Concatenate(axis=-1)([lstm_repeated, t_tiled])

        # Decoder: Process the merged future context
        x = layers.TimeDistributed(layers.Dense(32, activation="relu"))(merged)

        inputs = [input_x, input_t]
    else:
        # Fallback if no T: decode the encoder output directly. The encoder
        # emits L steps, so keep only the last F to match the target Y.
        if not 1 <= forecast_bins <= lookback_bins:
            raise ValueError(
                f"X-only model needs 1 <= forecast_bins <= lookback_bins, "
                f"got forecast_bins={forecast_bins}, lookback_bins={lookback_bins}"
            )
        x = lstm_out
        if lookback_bins > forecast_bins:
            x = layers.Lambda(lambda seq: seq[:, -forecast_bins:, :, :])(x)
        inputs = [input_x]

    # --- Output Projection ---
    # Map back to 1 channel (Headway)
    outputs = layers.TimeDistributed(layers.Dense(1, activation="linear"))(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=config.learning_rate), loss="mse")
    return model

# Re-build the model, this time also supplying the per-sample shape of T so
# the schedule branch is created. The batch axis is stripped from both shapes.
sample_shape_x = X.shape[1:]
sample_shape_t = T.shape[1:]
model = build_model(config, input_shape_x=sample_shape_x, input_shape_t=sample_shape_t)
model.summary()

In [None]:
# Training
print(f"Training model for {config.epochs} epochs...")

# Callback: stop once validation loss has not improved for 5 epochs and roll
# the model back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

# The fused model takes BOTH inputs: history X and schedule T.
history = model.fit(
    [X, T],
    Y,
    batch_size=config.batch_size,
    epochs=config.epochs,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Plot the per-epoch training and validation loss curves.
plt.figure(figsize=(10, 6))
for metric_key, curve_label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title("Model Training History")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Visualize a prediction for one randomly chosen sample.
sample_idx = np.random.randint(0, len(X))
sample_window = slice(sample_idx, sample_idx + 1)  # keeps the batch axis
x_sample = X[sample_window]
t_sample = T[sample_window]  # schedule slice matching the same sample
y_true = Y[sample_window]

# The fused model needs BOTH inputs.
y_pred = model.predict([x_sample, t_sample])

# Both arrays are (1, Time, Space, 1). Each is drawn as a space-time heatmap,
# transposed so that time runs along the x-axis and space (stations) along
# the y-axis.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

heatmap_panels = (("Ground Truth", y_true), ("Prediction", y_pred))
for panel_ax, (panel_name, panel_data) in zip(axes, heatmap_panels):
    sns.heatmap(panel_data[0, :, :, 0].T, ax=panel_ax, cmap="viridis")
    panel_ax.set_title(f"{panel_name} (Sample {sample_idx})")
    panel_ax.set_xlabel("Time Step (Future)")
    panel_ax.set_ylabel("Station ID")

plt.tight_layout()
plt.show()