In [None]:
# 5_model_optimization_v2.ipynb
# Optimized Spatiotemporal Model Training
# Architecture: ST-ConvNet V2 (Sequential Encoder + Asymmetric Schedule Processing)
# Scaling: RobustScaler (Median/IQR)

import os
import sys
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
import joblib # For saving the scaler

# --- Environment Setup (Colab vs Local) ---
# Resolve PROJECT_ROOT for the current runtime and make it importable.
# Only the import itself is guarded: an ImportError raised while mounting
# Drive should surface as an error instead of being read as "not on Colab".
try:
    from google.colab import drive
except ImportError:
    print("Detected Local Environment.")
    # Assumes notebook is in 'notebooks/' and project root is one level up
    PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
else:
    print("Detected Colab Environment. Mounting Drive...")
    drive.mount('/content/drive')

    # UPDATE THIS PATH to match your Google Drive folder structure
    PROJECT_ROOT = "/content/drive/MyDrive/headway-prediction"

# Add project root to system path so we can import src.config, src.models.
# Guarded so that re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print(f"Project Root: {PROJECT_ROOT}")

# Validate Imports from src (sanity check)
# Attempt the project imports now and report the outcome on stdout. A failure
# is printed rather than raised, so execution continues past this check.
try:
    from src.config import Config
    from src.models.st_covnet_v2 import HeadwayConvLSTM_V2
except ImportError as import_error:
    print(
        f"❌ Failed to import 'src' modules: {import_error}",
        "Please ensure PROJECT_ROOT points to the folder containing 'src/'",
        sep="\n",
    )
else:
    print("✅ Successfully imported 'src' modules.")

# Paths
# The data folder is the 'data' entry directly under the project root.
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
print("Data Directory: " + DATA_DIR)

In [None]:
# --- 1. Data Loading & Scaling ---
from src.data.dataset import SubwayDataGenerator

# Initialize Config & Generator
# Config comes from src.config (imported earlier in this notebook). Its
# DATA_DIR attribute is echoed so the data location in use is visible in the
# cell output.
config = Config()
print(f"Loading data from: {config.DATA_DIR}")

# Load Raw Data
# The generator is built from the config and then asked to load its data.
# The scaling code below reads and overwrites data_gen.headway_data and
# data_gen.schedule_data, so this must run before it.
data_gen = SubwayDataGenerator(config)
data_gen.load_data()
# Now data_gen.headway_data contains raw minutes (e.g., 5.0, 12.0)
# NOTE(review): the unit (minutes) is stated by the original author and is not
# verifiable from this cell -- confirm against SubwayDataGenerator.load_data.

# --- Robust Scaling Strategy ---
# The scaler is fitted ONLY on the training slice to avoid data leakage, then
# applied to the full headway and schedule arrays.
# RobustScaler centers data using the Median and scales using the IQR.
# This makes it resilient to the massive delays (outliers) common in subways.


def _scale_preserving_shape(fitted_scaler, values):
    """Apply *fitted_scaler* element-wise to *values*, keeping its shape.

    scikit-learn scalers expect a 2D (n_samples, n_features) array, so the
    input is flattened to a single column, transformed, and reshaped back to
    the original shape.
    """
    return fitted_scaler.transform(values.reshape(-1, 1)).reshape(values.shape)


total_timesteps = len(data_gen.headway_data)
# NOTE(review): 0.7 is hard-coded here; it should agree with whatever train
# split is used when the training windows are built -- confirm.
train_split_idx = int(total_timesteps * 0.7)  # 70% Train

print(f"\nTotal Timesteps: {total_timesteps}")
print(f"Training Split Index: {train_split_idx}")

# 1. Fit Scaler on Training Data Only
print("Fitting RobustScaler on training subset...")
scaler = RobustScaler()
# Flatten to (Samples, 1) because Scaler expects 2D array
train_subset = data_gen.headway_data[:train_split_idx].reshape(-1, 1)
scaler.fit(train_subset)

# 2. Transform the Entire Dataset (Headways)
print("Transforming Headway data...")
# Shape is preserved; presumably (Time, Stations, Dir, 1) per the original
# notebook comment -- TODO confirm.
data_gen.headway_data = _scale_preserving_shape(scaler, data_gen.headway_data)

# 3. Transform Schedule Data
# Since 'Schedule' is also in minutes, we use the SAME scaler so the units match.
print("Transforming Schedule data...")
data_gen.schedule_data = _scale_preserving_shape(scaler, data_gen.schedule_data)

# 4. Save Scaler for later Inference/Evaluation
scaler_path = os.path.join(PROJECT_ROOT, "models", "robust_scaler.pkl")
# Ensure directory exists (in case 'models' folder is missing in Drive)
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
joblib.dump(scaler, scaler_path)
print(f"✅ Scaler saved to {scaler_path}")

# 5. Validation Stats
# RobustScaler centers on the median, so the median is the statistic expected
# to sit near 0; the mean can legitimately be pulled away from 0 by outliers.
print("\n--- Scaled Data Statistics (should be centered ~0) ---")
print(f"Median: {np.median(data_gen.headway_data):.2f}")
print(f"Mean: {np.mean(data_gen.headway_data):.2f}")
print(f"Std Dev: {np.std(data_gen.headway_data):.2f}")
print(f"Min: {np.min(data_gen.headway_data):.2f}")
print(f"Max: {np.max(data_gen.headway_data):.2f}")