In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from prophet import Prophet
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import os
from datetime import datetime
from scipy import stats
import json
from pathlib import Path

# Fix the RNG state once, up front, so repeated runs are comparable
def set_seeds(seed=42):
    """Seed the PyTorch and NumPy global random number generators.

    Parameters:
    -----------
    seed : int
        Value handed to both ``torch.manual_seed`` and ``np.random.seed``.
    """
    for seeder in (torch.manual_seed, np.random.seed):
        seeder(seed)


set_seeds()

## Data Loading and Preprocessing

In [None]:
def load_data(file_path):
    """
    Read a parquet file and discard rows without weather readings.

    Parameters:
    -----------
    file_path : str
        Location of the parquet file to read

    Returns:
    --------
    pandas.DataFrame
        The file contents minus every row whose 'rain' or 'wind' value is missing
    """
    required_columns = ['rain', 'wind']
    frame = pd.read_parquet(file_path)
    return frame.dropna(subset=required_columns)

def split_data(data, train_years, val_year, test_year):
    """
    Partition the data by calendar year of 'time_bucket'.

    Note that the 'time_bucket' column of the frame passed in is converted
    to datetime in place, so the caller's DataFrame is modified.

    Parameters:
    -----------
    data : pandas.DataFrame
        Frame with a 'time_bucket' column parseable by ``pd.to_datetime``
    train_years : iterable of int
        Years that make up the training set
    val_year : int
        Year used for validation
    test_year : int
        Year used for testing

    Returns:
    --------
    tuple
        (train_data, val_data, test_data) row subsets of ``data``
    """
    data['time_bucket'] = pd.to_datetime(data['time_bucket'])

    # Extract the year once and reuse it for all three masks
    row_year = data['time_bucket'].dt.year
    in_train = row_year.isin(train_years)
    in_val = row_year == val_year
    in_test = row_year == test_year

    train_data, val_data, test_data = data[in_train], data[in_val], data[in_test]

    print(f"Train data size: {len(train_data)}")
    print(f"Validation data size: {len(val_data)}")
    print(f"Test data size: {len(test_data)}")

    return train_data, val_data, test_data

def filter_ride_data(data, ride_name):
    """
    Select the rows that belong to one ride.

    Parameters:
    -----------
    data : pandas.DataFrame
        Frame containing a ``ride_name_<ride_name>`` indicator column
    ride_name : str
        Ride whose rows should be kept

    Returns:
    --------
    pandas.DataFrame
        Independent copy of the rows whose indicator equals True
    """
    indicator = data[f'ride_name_{ride_name}']
    selected_rows = data.loc[indicator == True]
    return selected_rows.copy()

def get_all_rides(data):
    """
    Extract all ride names encoded as ``ride_name_*`` columns.

    Parameters:
    -----------
    data : pandas.DataFrame
        Frame whose columns include one ``ride_name_<ride>`` column per ride

    Returns:
    --------
    list
        Ride names in column order, with the ``ride_name_`` prefix removed
    """
    prefix = 'ride_name_'
    # Slice off only the leading prefix: str.replace() would also delete any
    # later occurrence of the marker inside the ride name itself.
    # Non-string column labels are skipped instead of crashing on startswith().
    return [
        col[len(prefix):]
        for col in data.columns
        if isinstance(col, str) and col.startswith(prefix)
    ]

In [None]:
# Read the processed wait-time dataset (rows without rain/wind readings are dropped by load_data)
data_path = "../data/processed/ep/final_cleaned_processed_wait_times.parquet"
data = load_data(data_path)
print(f"Loaded data with {len(data)} rows")

In [None]:
# Chronological split: 2017-2022 for training, 2023 for validation, 2024 for testing
train_years = list(range(2017, 2023))
val_year = 2023
test_year = 2024

# Partition the full frame by calendar year
train_data, val_data, test_data = split_data(data, train_years, val_year, test_year)

In [None]:
# Get all rides in the dataset
# Names come from the columns of the full (unsplit) frame that start with
# `ride_name_`, so the list does not depend on the train/val/test split.
all_rides = get_all_rides(data)
print(f"Found {len(all_rides)} rides in the dataset:")
# Print a 1-based numbered list of the ride names
for i, ride in enumerate(all_rides):
    print(f"{i+1}. {ride}")

## Time Series Decomposition with Prophet (Modified to exclude holidays)

In [None]:
class BaseTimeSeriesModel:
    """
    Thin wrapper around a Prophet model fitted without holiday effects.

    The model uses weekly and daily seasonality, a conditional
    'operating_season' seasonality that is active only when the
    'operating_month' column is 1 (April-December), and two binary extra
    regressors marking the date windows listed in ``COVID_WINDOWS``.
    """

    # Inclusive (start, end) bounds of each binary regressor, in the order the
    # regressors are registered with Prophet. Kept in one place so that fit()
    # and predict() cannot drift apart.
    COVID_WINDOWS = {
        'during_covid_era': ('2020-04-15', '2020-05-20'),
        'covid_recovery': ('2021-05-21', '2021-08-31'),
    }

    def __init__(self):
        self.model = None      # fitted Prophet instance, set by fit()
        self.forecast = None   # result of the most recent predict() call

    def is_operating_month(self, ds):
        """Return 1 when the timestamp's month is April through December, else 0."""
        return 1 if 4 <= ds.month <= 12 else 0

    def _add_covid_regressors(self, df, overwrite):
        """
        Write the binary COVID regressor columns into ``df`` in place.

        Parameters:
        -----------
        df : pandas.DataFrame
            Frame with a datetime 'ds' column
        overwrite : bool
            When False, a regressor column that already exists is left untouched

        Returns:
        --------
        pandas.DataFrame
            The same ``df`` object, for convenience
        """
        for column, (start, end) in self.COVID_WINDOWS.items():
            if not overwrite and column in df.columns:
                continue
            df[column] = 0
            inside_window = (df['ds'] >= start) & (df['ds'] <= end)
            df.loc[inside_window, column] = 1
        return df

    def prepare_prophet_dataframe(self, data):
        """
        Build the frame Prophet expects from raw ride data.

        Parameters:
        -----------
        data : pandas.DataFrame
            Frame with 'time_bucket' and 'wait_time' columns

        Returns:
        --------
        pandas.DataFrame
            New frame with columns 'ds', 'y' and 'operating_month'
        """
        prophet_df = data[['time_bucket', 'wait_time']].rename(
            columns={'time_bucket': 'ds', 'wait_time': 'y'}
        )
        prophet_df['operating_month'] = prophet_df['ds'].apply(self.is_operating_month)
        return prophet_df

    def fit(self, prophet_df):
        """
        Fit the Prophet model without using holidays.

        The COVID regressor columns are (re)computed and written into
        ``prophet_df`` itself, so the caller's frame gains those columns.

        Parameters:
        -----------
        prophet_df : pandas.DataFrame
            Output of ``prepare_prophet_dataframe``

        Returns:
        --------
        Prophet
            The fitted model (also stored on ``self.model``)
        """
        self.model = Prophet(
            yearly_seasonality=False,
            weekly_seasonality=True,
            daily_seasonality=True,
            interval_width=0.95
        )

        # Custom seasonality that only applies while 'operating_month' is 1
        self.model.add_seasonality(
            name='operating_season',
            period=274,  # 9 months period
            fourier_order=9,
            condition_name='operating_month'
        )

        # COVID periods enter as regressors instead of holidays
        self._add_covid_regressors(prophet_df, overwrite=True)
        for regressor_name in self.COVID_WINDOWS:
            self.model.add_regressor(regressor_name)

        self.model.fit(prophet_df)
        return self.model

    def predict(self, future_df):
        """
        Generate predictions with the fitted model.

        Parameters:
        -----------
        future_df : pandas.DataFrame
            Frame with a datetime 'ds' column. 'operating_month' is always
            recomputed; COVID regressor columns are only derived when the
            caller has not supplied them. The input frame is not modified.

        Returns:
        --------
        pandas.DataFrame
            Forecast returned by the underlying model (also stored on ``self.forecast``)
        """
        future_df = future_df.copy()
        future_df['operating_month'] = future_df['ds'].apply(self.is_operating_month)
        self._add_covid_regressors(future_df, overwrite=False)

        self.forecast = self.model.predict(future_df)
        return self.forecast

    def merge_predictions(self, original_data, forecast_data):
        """
        Attach forecast components to the original rows and compute residuals.

        Parameters:
        -----------
        original_data : pandas.DataFrame
            Frame with 'time_bucket' and 'wait_time' columns
        forecast_data : pandas.DataFrame
            Forecast frame; must contain 'ds' and 'yhat'

        Returns:
        --------
        pandas.DataFrame
            Left-merge of the two frames on time_bucket == ds, plus a
            'residual' column equal to wait_time - yhat
        """
        # Holiday components are deliberately absent: the model has none
        wanted_columns = ['ds', 'trend', 'operating_season', 'weekly', 'daily', 'yhat']
        available_columns = [col for col in wanted_columns if col in forecast_data.columns]

        result = pd.merge(
            original_data.copy(),
            forecast_data[available_columns],
            left_on='time_bucket',
            right_on='ds',
            how='left'
        )

        result['residual'] = result['wait_time'] - result['yhat']
        return result

## Feature Engineering (With holidays for residual model)

In [None]:
class FeatureEngineer:
    """Class to handle feature engineering for the wait time prediction model."""

    # Prophet forecast columns that are forwarded to the network, in output order
    PROPHET_COMPONENTS = ('trend', 'operating_season', 'weekly', 'daily')

    @staticmethod
    def prepare_features(ride_data_df, forecast_df):
        """
        Prepare features for the neural network model.

        The returned columns are, in order: temperature, rain, the three
        holiday flags (as floats), eight cyclical time encodings, and every
        Prophet component from ``PROPHET_COMPONENTS`` present in ``forecast_df``
        (or a single zero column when none is present).

        Parameters:
        -----------
        ride_data_df : pandas.DataFrame
            DataFrame containing ride data; needs 'time_bucket', 'temperature',
            'rain' and the three 'is_*_holiday' columns
        forecast_df : pandas.DataFrame
            DataFrame containing Prophet forecasts keyed by 'ds'

        Returns:
        --------
        numpy.ndarray
            Array of features for the neural network, one row per row of ``ride_data_df``
        """
        # Weather features
        weather_features = ride_data_df[['temperature', 'rain']].values

        # Holiday features (used only in the residual model)
        holiday_features = ride_data_df[
            ['is_german_holiday', 'is_swiss_holiday', 'is_french_holiday']
        ].astype(float).values

        # Time features
        timestamps = pd.to_datetime(ride_data_df['time_bucket'])
        time_features = FeatureEngineer._create_cyclical_time_features(timestamps)

        # Prophet components: align all of them to the ride rows with ONE
        # left merge instead of repeating the same join once per component.
        present_components = [
            name for name in FeatureEngineer.PROPHET_COMPONENTS if name in forecast_df.columns
        ]
        if present_components:
            aligned = pd.merge(
                ride_data_df[['time_bucket']],
                forecast_df[['ds'] + present_components],
                left_on='time_bucket',
                right_on='ds',
                how='left'
            )
            prophet_features = aligned[present_components].values
        else:
            # No component available: keep a placeholder column of zeros
            prophet_features = np.zeros((len(ride_data_df), 1))

        return np.hstack([weather_features, holiday_features, time_features, prophet_features])

    @staticmethod
    def _create_cyclical_time_features(timestamps):
        """
        Create cyclical time features from timestamps.

        Each of hour (period 24), day of week (7), month (12) and day of year
        (365.25) is mapped to a sine/cosine pair.

        Parameters:
        -----------
        timestamps : pandas.Series
            Series of datetime timestamps

        Returns:
        --------
        numpy.ndarray
            Array with 8 columns: hour sin/cos, day-of-week sin/cos,
            month sin/cos, day-of-year sin/cos
        """
        parts_and_periods = (
            (timestamps.dt.hour, 24),
            (timestamps.dt.dayofweek, 7),
            (timestamps.dt.month, 12),
            (timestamps.dt.dayofyear, 365.25),
        )

        encoded_columns = []
        for values, period in parts_and_periods:
            angle = 2 * np.pi * values / period
            encoded_columns.append(np.sin(angle))
            encoded_columns.append(np.cos(angle))

        return np.column_stack(encoded_columns)

## Neural Network Model

In [None]:
class RideDataset(Dataset):
    """PyTorch Dataset for ride wait time data."""

    def __init__(self, features, targets, scaler=None):
        """
        Initialize the dataset.

        Parameters:
        -----------
        features : numpy.ndarray
            Array of features
        targets : numpy.ndarray
            Array of target values
        scaler : StandardScaler, optional
            Already-fitted scaler to apply. When omitted, a new StandardScaler
            is fitted on ``features`` and exposed as ``self.scaler``.
        """
        if scaler is None:
            self.scaler = StandardScaler()
            self.features = self.scaler.fit_transform(features)
        else:
            self.scaler = scaler
            self.features = self.scaler.transform(features)

        self.targets = torch.tensor(targets, dtype=torch.float32).reshape(-1, 1)

        # NaN replacement and tensor conversion are done once here rather than
        # on every __getitem__ call; ``self.features`` / ``self.targets`` keep
        # their original (possibly NaN-containing) contents.
        self._feature_tensor = torch.tensor(np.nan_to_num(self.features), dtype=torch.float32)
        self._target_tensor = torch.nan_to_num(self.targets)

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        """
        Get a sample from the dataset.

        Returns:
        --------
        tuple
            (features, target) float32 tensors with NaNs replaced by 0
        """
        return self._feature_tensor[idx], self._target_tensor[idx]

In [None]:
class ResidualPredictor(nn.Module):
    """Feed-forward network that maps a feature vector to one residual value."""

    def __init__(self, input_dim, hidden_dims=[64, 32], dropout_prob=0.2):
        """
        Build the layer stack.

        Parameters:
        -----------
        input_dim : int
            Number of input features
        hidden_dims : list
            Width of each hidden layer, in order; must contain at least one entry
        dropout_prob : float
            Dropout probability applied after every hidden activation
        """
        super().__init__()

        # Consecutive (in, out) widths: input -> hidden[0] -> hidden[1] -> ...
        widths = [input_dim, *hidden_dims]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout_prob),
            ]

        # Single-unit regression head
        stack.append(nn.Linear(hidden_dims[-1], 1))

        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the prediction."""
        return self.model(x)

In [None]:
class ModelTrainer:
    """Class to handle model training and evaluation."""

    @staticmethod
    def train_nn_model(train_features, train_residuals, val_features, val_residuals, num_epochs=50,
                      hidden_dims=[128, 64], dropout_prob=0.2, 
                      batch_size=512, patience=10, device='cpu'):
        """
        Train the neural network model.

        Training uses MSE loss with AdamW, halves the learning rate when the
        validation loss plateaus, and stops early after ``patience`` epochs
        without improvement. The weights with the lowest validation loss are
        restored before returning.

        Parameters:
        -----------
        train_features : numpy.ndarray
            Training features
        train_residuals : numpy.ndarray
            Training targets (residuals)
        val_features : numpy.ndarray
            Validation features
        val_residuals : numpy.ndarray
            Validation targets (residuals)
        num_epochs : int
            Maximum number of training epochs
        hidden_dims : list
            List of hidden layer dimensions (read only, never modified)
        dropout_prob : float
            Dropout probability
        batch_size : int
            Batch size for training
        patience : int
            Number of epochs to wait for improvement before early stopping
        device : str
            Device to use for training ('cpu' or 'cuda')

        Returns:
        --------
        tuple
            Trained neural network model and the feature scaler fitted on the training features
        """
        # Create datasets and dataloaders
        train_dataset = RideDataset(train_features, train_residuals)

        # BatchNorm1d raises in training mode when a batch holds a single
        # sample. That happens whenever the sample count leaves a remainder of
        # exactly one, so only in that case the trailing batch is dropped.
        n_train = len(train_dataset)
        drop_single_tail = n_train > batch_size and n_train % batch_size == 1
        train_dataloader = DataLoader(
            train_dataset, batch_size=batch_size, shuffle=True, drop_last=drop_single_tail
        )

        # The scaler is fitted on training data only and reused for validation
        feature_scaler = train_dataset.scaler

        val_dataset = RideDataset(val_features, val_residuals, scaler=feature_scaler)
        val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # Initialize model
        input_dim = train_features.shape[1]
        model = ResidualPredictor(input_dim, hidden_dims=hidden_dims, dropout_prob=dropout_prob)
        model.to(device)

        # Loss function and optimizer
        criterion = nn.MSELoss()
        optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

        # Early-stopping bookkeeping
        best_val_loss = float('inf')
        epochs_no_improve = 0
        best_model_state = None

        # Training loop with tqdm progress bar
        pbar = tqdm(range(num_epochs), desc="Training")
        for epoch in pbar:
            # Training phase
            model.train()
            running_loss = 0.0

            for batch_features, batch_residuals in train_dataloader:
                batch_features = batch_features.to(device)
                batch_residuals = batch_residuals.to(device)

                # Forward and backward passes
                optimizer.zero_grad()
                outputs = model(batch_features)
                loss = criterion(outputs, batch_residuals)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

            train_loss = running_loss / len(train_dataloader)

            # Validation phase (skipped when the validation loader has no batches)
            val_loss = float('inf')
            if len(val_dataloader) > 0:
                model.eval()
                val_running_loss = 0.0

                with torch.no_grad():
                    for val_features_batch, val_residuals_batch in val_dataloader:
                        val_features_batch = val_features_batch.to(device)
                        val_residuals_batch = val_residuals_batch.to(device)

                        val_outputs = model(val_features_batch)
                        val_loss_batch = criterion(val_outputs, val_residuals_batch)
                        val_running_loss += val_loss_batch.item()

                val_loss = val_running_loss / len(val_dataloader)
                scheduler.step(val_loss)

                # Early stopping logic
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    epochs_no_improve = 0
                    # state_dict() hands out tensors that share storage with
                    # the live parameters, and dict.copy() is shallow, so each
                    # tensor must be cloned or later optimizer steps would
                    # silently overwrite this "best" snapshot.
                    best_model_state = {
                        name: tensor.detach().clone()
                        for name, tensor in model.state_dict().items()
                    }
                else:
                    epochs_no_improve += 1
                    if epochs_no_improve >= patience:
                        pbar.set_description(f"Early stopping at epoch {epoch+1}")
                        break

            # Update tqdm postfix with loss values
            pbar.set_postfix({'train_loss': f'{train_loss:.4f}', 'val_loss': f'{val_loss:.4f}'})

        # Restore the best weights (covers both early stop and normal completion)
        if best_model_state is not None:
            model.load_state_dict(best_model_state)

        return model, feature_scaler

In [None]:
class WaitTimePredictor:
    """Combines the Prophet baseline with the neural-network residual correction."""

    @staticmethod
    def batch_predict(ride_df, prophet_model, nn_model, feature_scaler, 
                    feature_engineer, device='cpu'):
        """
        Predict wait times for every row of ``ride_df`` in one pass.

        Parameters:
        -----------
        ride_df : pandas.DataFrame
            Ride data with a datetime 'time_bucket' column plus whatever
            columns ``feature_engineer.prepare_features`` reads
        prophet_model : Prophet
            Fitted Prophet model (anything exposing ``predict(df)``)
        nn_model : ResidualPredictor
            Trained residual network
        feature_scaler : StandardScaler
            Scaler fitted during training
        feature_engineer : FeatureEngineer
            Provider of ``prepare_features``
        device : str
            Device the feature tensor is moved to ('cpu' or 'cuda')

        Returns:
        --------
        tuple
            (final_predictions, prophet_predictions, residuals) as numpy
            arrays; final predictions are yhat + residual, floored at 0
        """
        # One Prophet input row per distinct timestamp
        distinct_times = ride_df['time_bucket'].unique()
        timestamps_df = pd.DataFrame({'ds': distinct_times})
        timestamps_df['operating_month'] = timestamps_df['ds'].apply(
            lambda ts: 1 if 4 <= ts.month <= 12 else 0
        )

        # Binary COVID regressors, same windows the model was fitted with
        regressor_windows = (
            ('during_covid_era', '2020-04-15', '2020-05-20'),
            ('covid_recovery', '2021-05-21', '2021-08-31'),
        )
        for column, window_start, window_end in regressor_windows:
            timestamps_df[column] = 0
            inside = (timestamps_df['ds'] >= window_start) & (timestamps_df['ds'] <= window_end)
            timestamps_df.loc[inside, column] = 1

        prophet_forecasts = prophet_model.predict(timestamps_df)

        # Bring the baseline forecast and its components back onto the ride rows
        forecast_columns = ['ds', 'yhat', 'trend', 'operating_season', 'weekly', 'daily']
        ride_data_with_forecast = pd.merge(
            ride_df,
            prophet_forecasts[forecast_columns],
            left_on='time_bucket',
            right_on='ds',
            how='left'
        )

        # Network input: engineered features, scaled like the training data
        raw_features = feature_engineer.prepare_features(ride_data_with_forecast, prophet_forecasts)
        network_input = torch.tensor(
            feature_scaler.transform(raw_features), dtype=torch.float32
        ).to(device)

        nn_model.eval()
        with torch.no_grad():
            network_output = nn_model(network_input)
        residuals = network_output.cpu().numpy().flatten()

        baseline = ride_data_with_forecast['yhat'].values

        # Baseline plus correction; wait times cannot be negative
        final_predictions = np.maximum(0, baseline + residuals)

        return final_predictions, ride_data_with_forecast['yhat'].values, residuals

## Model Evaluation Functions

In [None]:
def evaluate_model(ride_df, actual_values, predictions, baseline_predictions, title=""):
    """
    Evaluate model performance and generate visualizations.

    Prints MAE/RMSE for the combined model and the Prophet-only baseline,
    draws a scatter plot of actual vs. predicted values plus a per-hour MAE
    bar chart, then shows and closes the figure.

    Parameters:
    -----------
    ride_df : pandas.DataFrame
        DataFrame containing ride data; only 'time_bucket' is read
    actual_values : numpy.ndarray
        Array of actual wait times
    predictions : numpy.ndarray
        Array of predicted wait times
    baseline_predictions : numpy.ndarray
        Array of baseline (Prophet only) predictions
    title : str
        Title for the plots

    Returns:
    --------
    tuple
        (metrics, results_df): a dict of plain Python floats with keys 'mae',
        'rmse', 'baseline_mae', 'baseline_rmse' and 'improvement_percentage',
        and a DataFrame with one row per observation holding the values,
        time components and errors
    """
    # Work on plain positional arrays so that pandas inputs cannot be
    # re-aligned by index during the subtractions below
    actual_values = np.asarray(actual_values, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    baseline_predictions = np.asarray(baseline_predictions, dtype=float)

    # Calculate metrics (cast to float so they serialise cleanly to JSON)
    mae = float(np.mean(np.abs(predictions - actual_values)))
    rmse = float(np.sqrt(np.mean(np.square(predictions - actual_values))))
    baseline_mae = float(np.mean(np.abs(baseline_predictions - actual_values)))
    baseline_rmse = float(np.sqrt(np.mean(np.square(baseline_predictions - actual_values))))

    # Print metrics
    print(f"\n{title} MAE: {mae:.2f} minutes (Baseline: {baseline_mae:.2f})")
    print(f"{title} RMSE: {rmse:.2f} minutes (Baseline: {baseline_rmse:.2f})")

    # Create a DataFrame with results for time-based analysis
    results_df = pd.DataFrame({
        'time_bucket': pd.to_datetime(ride_df['time_bucket'].values),
        'actual': actual_values,
        'predicted': predictions,
        'baseline': baseline_predictions,
    })

    # Add time components
    results_df['hour'] = results_df['time_bucket'].dt.hour
    results_df['day_of_week'] = results_df['time_bucket'].dt.dayofweek
    results_df['month'] = results_df['time_bucket'].dt.month

    # Calculate errors
    results_df['error'] = results_df['predicted'] - results_df['actual']
    results_df['abs_error'] = np.abs(results_df['error'])
    results_df['baseline_error'] = results_df['baseline'] - results_df['actual']
    results_df['baseline_abs_error'] = np.abs(results_df['baseline_error'])

    # Visualize results
    fig, axes = plt.subplots(2, 1, figsize=(12, 16))

    # Actual vs Predicted scatter plot
    axes[0].scatter(actual_values, predictions, alpha=0.5, label='Combined Model')
    axes[0].scatter(actual_values, baseline_predictions, alpha=0.3, color='red', label='Prophet Only')
    max_val = max(np.max(actual_values), np.max(predictions), np.max(baseline_predictions))
    axes[0].plot([0, max_val], [0, max_val], 'k--')
    axes[0].set_xlabel('Actual Wait Time (minutes)')
    axes[0].set_ylabel('Predicted Wait Time (minutes)')
    axes[0].set_title(f'{title} - Actual vs Predicted')
    axes[0].legend()

    # Hourly analysis
    hourly_errors = results_df.groupby('hour')[['abs_error', 'baseline_abs_error']].mean()
    hourly_errors = hourly_errors.rename(columns={
        'abs_error': 'Combined Model MAE', 
        'baseline_abs_error': 'Prophet Only MAE'
    })
    hourly_errors.plot(kind='bar', ax=axes[1])
    axes[1].set_xlabel('Hour of Day')
    axes[1].set_ylabel('Mean Absolute Error (minutes)')
    axes[1].set_title(f'{title} - Error Analysis by Hour of Day')

    plt.tight_layout()
    # Render now and release the figure: this function is called twice per
    # ride inside a loop, and figures left open would otherwise pile up in
    # memory until the whole cell finishes.
    plt.show()
    plt.close(fig)

    # A perfect baseline (MAE of 0) leaves no room for relative improvement;
    # report 0 instead of dividing by zero.
    if baseline_mae > 0:
        improvement_percentage = ((baseline_mae - mae) / baseline_mae) * 100
    else:
        improvement_percentage = 0.0

    # Create metrics dictionary
    metrics = {
        "mae": mae,
        "rmse": rmse,
        "baseline_mae": baseline_mae,
        "baseline_rmse": baseline_rmse,
        "improvement_percentage": improvement_percentage
    }

    return metrics, results_df

## Model Storage Functions

In [None]:
def save_model(ride_name, prophet_model, nn_model, feature_scaler, metrics, output_dir="models"):
    """
    Persist everything produced for one ride.

    Four files are written into ``<output_dir>/<ride name with spaces as
    underscores>/``: the pickled Prophet model, the network's state dict,
    the pickled feature scaler and the metrics as JSON.

    Parameters:
    -----------
    ride_name : str
        Name of the ride
    prophet_model : BaseTimeSeriesModel
        Wrapper whose ``.model`` attribute holds the fitted Prophet model
    nn_model : ResidualPredictor
        Trained neural network model
    feature_scaler : StandardScaler
        Feature scaler used during training
    metrics : dict
        JSON-serialisable dictionary of evaluation metrics
    output_dir : str
        Root directory for all rides
    """
    ride_dir = os.path.join(output_dir, ride_name.replace(" ", "_"))
    # makedirs creates the root directory as well when it is missing
    os.makedirs(ride_dir, exist_ok=True)

    def _artifact(filename):
        """Path of one artifact inside the ride directory."""
        return os.path.join(ride_dir, filename)

    def _pickle_to(filename, obj):
        """Pickle ``obj`` into the ride directory under ``filename``."""
        with open(_artifact(filename), "wb") as handle:
            pickle.dump(obj, handle)

    # Same write order as always: Prophet, network, scaler, metrics
    _pickle_to("prophet_model.pkl", prophet_model.model)
    torch.save(nn_model.state_dict(), _artifact("nn_model.pt"))
    _pickle_to("feature_scaler.pkl", feature_scaler)

    with open(_artifact("metrics.json"), "w") as handle:
        json.dump(metrics, handle, indent=4)

    print(f"Models and results saved to {ride_dir}")

def load_model(ride_name, output_dir="models"):
    """
    Load trained models and results.

    Parameters:
    -----------
    ride_name : str
        Name of the ride
    output_dir : str
        Directory to load models and results from

    Returns:
    --------
    tuple
        (prophet_model, nn_model, feature_scaler, metrics), where
        prophet_model is a BaseTimeSeriesModel wrapping the unpickled Prophet
        model and nn_model is on the CPU in eval mode. All four entries are
        None when the ride directory does not exist.
    """
    # Create ride-specific directory path
    ride_dir = os.path.join(output_dir, ride_name.replace(" ", "_"))

    # Check if models exist
    if not os.path.exists(ride_dir):
        return None, None, None, None

    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file. Only load model directories produced by this notebook or by a
    # source you trust.
    with open(os.path.join(ride_dir, "prophet_model.pkl"), "rb") as f:
        prophet_model = pickle.load(f)

    # Initialize BaseTimeSeriesModel and set the loaded model
    prophet_ts = BaseTimeSeriesModel()
    prophet_ts.model = prophet_model

    # Load feature scaler
    with open(os.path.join(ride_dir, "feature_scaler.pkl"), "rb") as f:
        feature_scaler = pickle.load(f)

    # Load metrics
    with open(os.path.join(ride_dir, "metrics.json"), "r") as f:
        metrics = json.load(f)

    # map_location keeps weights saved from a GPU run loadable on a
    # CPU-only machine
    state_dict = torch.load(os.path.join(ride_dir, "nn_model.pt"), map_location="cpu")

    # Rebuild the architecture from the checkpoint instead of assuming one:
    # in ResidualPredictor only the Linear layers carry 2-D weights, and every
    # Linear except the last one is a hidden layer.
    linear_out_widths = [
        tensor.shape[0]
        for name, tensor in state_dict.items()
        if name.endswith("weight") and tensor.dim() == 2
    ]
    hidden_dims = linear_out_widths[:-1] or [128, 64, 32]

    # Input dimension comes from the feature scaler
    input_dim = feature_scaler.n_features_in_
    nn_model = ResidualPredictor(input_dim, hidden_dims=hidden_dims, dropout_prob=0.2)
    nn_model.load_state_dict(state_dict)
    nn_model.eval()

    return prophet_ts, nn_model, feature_scaler, metrics

def get_processed_rides(output_dir="models"):
    """
    List the rides that already have a directory under ``output_dir``.

    Directory names are turned back into ride names by replacing every
    underscore with a space, so a ride whose real name contains an
    underscore is reported with a space in its place.

    Parameters:
    -----------
    output_dir : str
        Directory to check for processed rides

    Returns:
    --------
    list
        Ride names, one per sub-directory; empty when ``output_dir`` does not exist
    """
    if not os.path.exists(output_dir):
        return []

    ride_names = []
    for entry in os.listdir(output_dir):
        # Plain files (e.g. a checkpoint) are not rides
        if os.path.isdir(os.path.join(output_dir, entry)):
            ride_names.append(entry.replace("_", " "))

    return ride_names

## Multi-Ride Processing

In [None]:
def create_checkpoint_file(processed_rides, output_dir="models"):
    """
    Create a checkpoint file with the list of processed rides.

    The JSON is written to a temporary sibling file and then moved over
    ``checkpoint.json`` in one step, so an interrupted run never leaves a
    half-written checkpoint behind.

    Parameters:
    -----------
    processed_rides : iterable of str
        Names of the rides that have been processed so far
    output_dir : str
        Directory that holds ``checkpoint.json``; created when missing
    """
    os.makedirs(output_dir, exist_ok=True)

    checkpoint_path = os.path.join(output_dir, "checkpoint.json")
    temp_path = checkpoint_path + ".tmp"

    with open(temp_path, "w") as f:
        json.dump({"processed_rides": list(processed_rides)}, f, indent=4)

    # os.replace overwrites an existing checkpoint atomically
    os.replace(temp_path, checkpoint_path)

def load_checkpoint_file(output_dir="models"):
    """
    Read the processed-ride list back from ``checkpoint.json``.

    Parameters:
    -----------
    output_dir : str
        Directory that holds ``checkpoint.json``

    Returns:
    --------
    list
        The stored 'processed_rides' list; empty when the file or the key is missing
    """
    checkpoint_path = os.path.join(output_dir, "checkpoint.json")

    # No checkpoint yet: nothing has been processed
    if not os.path.exists(checkpoint_path):
        return []

    with open(checkpoint_path, "r") as handle:
        stored = json.load(handle)
    return stored.get("processed_rides", [])

## Training Pipeline for All Rides

In [None]:
def process_single_ride(ride_name, train_data, val_data, test_data, device='cpu', output_dir="models"):
    """
    Run the full pipeline for one ride.

    Steps: filter the three splits down to the ride, fit the Prophet
    baseline on the training rows, train the residual network on the
    baseline's errors, evaluate the combined model on validation and test
    data, then save models, metrics and per-row results.

    Parameters:
    -----------
    ride_name : str
        Name of the ride to process
    train_data : pandas.DataFrame
        Training data
    val_data : pandas.DataFrame
        Validation data
    test_data : pandas.DataFrame
        Test data
    device : str
        Device to use for training ('cpu' or 'cuda')
    output_dir : str
        Directory to save models and results

    Returns:
    --------
    dict or None
        Validation metrics, test metrics, row counts and a timestamp; None
        when the ride has fewer than 100 training, 50 validation or 50 test rows
    """
    print(f"\n{'='*50}")
    print(f"Processing ride: {ride_name}")
    print(f"{'='*50}")

    # Narrow each split to the rows of this ride
    ride_train_data, ride_val_data, ride_test_data = (
        filter_ride_data(split, ride_name) for split in (train_data, val_data, test_data)
    )
    n_train, n_val, n_test = len(ride_train_data), len(ride_val_data), len(ride_test_data)

    print(f"Training data size: {len(ride_train_data)}")
    print(f"Validation data size: {len(ride_val_data)}")
    print(f"Test data size: {len(ride_test_data)}")

    # Bail out early when any split is too small to be meaningful
    has_enough_data = n_train >= 100 and n_val >= 50 and n_test >= 50
    if not has_enough_data:
        print(f"Skipping {ride_name} due to insufficient data")
        return None

    feature_engineer = FeatureEngineer()

    # --- Prophet baseline -------------------------------------------------
    print("Training Prophet model...")
    prophet_ts = BaseTimeSeriesModel()
    prophet_model = prophet_ts.fit(prophet_ts.prepare_prophet_dataframe(ride_train_data))

    # In-sample forecast, one row per distinct training timestamp
    train_timestamps = pd.DataFrame({'ds': ride_train_data['time_bucket'].unique()})
    train_forecast = prophet_ts.predict(train_timestamps)

    # Residuals of the baseline on the training rows are the network's targets
    train_with_forecast = prophet_ts.merge_predictions(ride_train_data, train_forecast)
    train_features = feature_engineer.prepare_features(ride_train_data, train_forecast)
    train_residuals = train_with_forecast['residual'].values

    # --- Validation inputs --------------------------------------------------
    val_dates = pd.DataFrame({'ds': ride_val_data['time_bucket'].unique()})
    val_dates['operating_month'] = val_dates['ds'].apply(
        lambda ts: 1 if 4 <= ts.month <= 12 else 0
    )
    # Both COVID regressors are switched off for the validation period
    val_dates['during_covid_era'] = 0
    val_dates['covid_recovery'] = 0

    val_forecast = prophet_ts.predict(val_dates)

    forecast_columns = ['ds', 'yhat', 'trend', 'operating_season', 'weekly', 'daily']
    ride_val_data_with_forecast = pd.merge(
        ride_val_data,
        val_forecast[forecast_columns],
        left_on='time_bucket',
        right_on='ds',
        how='left'
    )
    ride_val_data_with_forecast['residual'] = (
        ride_val_data_with_forecast['wait_time'] - ride_val_data_with_forecast['yhat']
    )

    val_features = feature_engineer.prepare_features(ride_val_data_with_forecast, val_forecast)
    val_residuals = ride_val_data_with_forecast['residual'].values

    # --- Residual network ---------------------------------------------------
    print("Training neural network...")
    training_options = dict(
        num_epochs=50,
        hidden_dims=[128, 64, 32],
        dropout_prob=0.2,
        batch_size=512,
        patience=10,
        device=device,
    )
    nn_model, feature_scaler = ModelTrainer.train_nn_model(
        train_features, train_residuals,
        val_features, val_residuals,
        **training_options
    )

    # --- Combined predictions ----------------------------------------------
    print("Evaluating on validation set...")
    val_predictions, val_baseline_predictions, _ = WaitTimePredictor.batch_predict(
        ride_val_data, prophet_model, nn_model, feature_scaler, feature_engineer, device
    )
    val_actuals = ride_val_data['wait_time'].values

    print("Evaluating on test set...")
    test_predictions, test_baseline_predictions, _ = WaitTimePredictor.batch_predict(
        ride_test_data, prophet_model, nn_model, feature_scaler, feature_engineer, device
    )
    test_actuals = ride_test_data['wait_time'].values

    # --- Metrics and plots ---------------------------------------------------
    val_metrics, val_results_df = evaluate_model(
        ride_val_data, val_actuals, val_predictions, val_baseline_predictions,
        title=f"{ride_name} - Validation"
    )
    test_metrics, test_results_df = evaluate_model(
        ride_test_data, test_actuals, test_predictions, test_baseline_predictions,
        title=f"{ride_name} - Test"
    )

    combined_metrics = {
        "validation": val_metrics,
        "test": test_metrics,
        "data_counts": {
            "train": n_train,
            "validation": n_val,
            "test": n_test
        },
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # --- Persistence ---------------------------------------------------------
    save_model(ride_name, prophet_ts, nn_model, feature_scaler, combined_metrics, output_dir)

    # Per-row results go next to the models as CSV
    results_dir = os.path.join(output_dir, ride_name.replace(" ", "_"), "results")
    os.makedirs(results_dir, exist_ok=True)

    csv_outputs = (
        ("validation_results.csv", val_results_df),
        ("test_results.csv", test_results_df),
    )
    for csv_name, frame in csv_outputs:
        frame.to_csv(os.path.join(results_dir, csv_name), index=False)

    return combined_metrics

## Main Execution

In [None]:
def process_all_rides(all_rides, train_data, val_data, test_data, 
                      output_dir="models", device='cpu', resume=True):
    """
    Process all rides, with support for resuming from checkpoints.

    Rides already listed in the checkpoint are not retrained; only their saved
    metrics are reloaded so they still appear in the summary. A failure while
    training one ride, or while reloading the metrics of a skipped ride, is
    reported and does not stop the remaining rides from being processed.
    
    Parameters:
    -----------
    all_rides : list
        List of all ride names to process
    train_data : pandas.DataFrame
        Training data
    val_data : pandas.DataFrame
        Validation data
    test_data : pandas.DataFrame
        Test data
    output_dir : str
        Directory to save models and results
    device : str
        Device to use for training ('cpu' or 'cuda')
    resume : bool
        Whether to resume from a previous run

    Returns:
    --------
    dict
        Mapping of ride name to the metrics returned by process_single_ride
        (or reloaded via load_model for rides that were skipped)
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Get list of already processed rides
    processed_rides = []
    if resume:
        # Copy into a fresh list and tolerate a None/empty checkpoint so the
        # membership test and append below always operate on a real list.
        processed_rides = list(load_checkpoint_file(output_dir) or [])
        if processed_rides:
            print(f"Resuming from checkpoint. {len(processed_rides)} rides already processed.")
    
    # Initialize results dictionary
    all_results = {}
    failed_rides = []
    
    # Process each ride
    for i, ride_name in enumerate(all_rides, start=1):
        if ride_name in processed_rides:
            print(f"Skipping {ride_name} (already processed)")
            # Load metrics for the summary. A broken artefact for one finished
            # ride must not abort the rides that still need training.
            try:
                _, _, _, metrics = load_model(ride_name, output_dir)
            except Exception as e:
                print(f"Could not load saved metrics for {ride_name}: {str(e)}")
                continue
            if metrics:
                all_results[ride_name] = metrics
            continue
            
        try:
            print(f"\nProcessing ride {i}/{len(all_rides)}: {ride_name}")
            ride_metrics = process_single_ride(ride_name, train_data, val_data, test_data, 
                                               device=device, output_dir=output_dir)
            
            if ride_metrics:
                all_results[ride_name] = ride_metrics
                processed_rides.append(ride_name)
                
                # Update checkpoint after each ride
                create_checkpoint_file(processed_rides, output_dir)
                
        except Exception as e:
            # Best effort: keep going, but remember the failure for the recap
            print(f"Error processing {ride_name}: {str(e)}")
            failed_rides.append(ride_name)
            
    if failed_rides:
        print(f"\n{len(failed_rides)} ride(s) failed: {', '.join(str(name) for name in failed_rides)}")
    
    # Generate summary report
    generate_summary_report(all_results, output_dir)
    
    return all_results

In [None]:
def generate_summary_report(all_results, output_dir="models"):
    """
    Generate a summary report of all ride models.

    Writes ``model_summary.csv`` (one row per ride, sorted by test improvement,
    best first) and ``model_improvement_summary.png`` (horizontal bar chart of
    test improvement) into ``output_dir``, and prints overall averages.
    Metrics missing for a ride are stored as NaN and are ignored when the
    averages are computed, so one incomplete ride does not turn every average
    into NaN.
    
    Parameters:
    -----------
    all_results : dict
        Dictionary of results for all rides. Each value is expected to hold
        the optional keys "validation", "test" and "data_counts"; rides whose
        value is empty/None are left out of the report
    output_dir : str
        Directory to save the summary report
    """
    # (column label, metrics key) pairs; their order defines the CSV layout
    metric_columns = [
        ("MAE", "mae"),
        ("RMSE", "rmse"),
        ("Baseline MAE", "baseline_mae"),
        ("Improvement (%)", "improvement_percentage"),
    ]
    split_columns = [("Validation", "validation"), ("Test", "test")]
    column_order = (
        ["Ride Name"]
        + [f"{split_label} {metric_label}"
           for split_label, _ in split_columns
           for metric_label, _ in metric_columns]
        + ["Data Counts"]
    )
    
    # Build one record per ride that actually has metrics
    records = []
    for ride_name, metrics in all_results.items():
        if not metrics:
            continue
            
        record = {"Ride Name": ride_name}
        for split_label, split_key in split_columns:
            split_metrics = metrics.get(split_key) or {}
            for metric_label, metric_key in metric_columns:
                record[f"{split_label} {metric_label}"] = split_metrics.get(metric_key, float('nan'))
        
        counts = metrics.get("data_counts") or {}
        record["Data Counts"] = (
            f"Train: {counts.get('train', 0)}, Val: {counts.get('validation', 0)}, Test: {counts.get('test', 0)}"
        )
        records.append(record)
    
    # Explicit columns keep the layout stable even when there are no records
    summary_df = pd.DataFrame(records, columns=column_order)
    
    # Sort by test improvement
    summary_df = summary_df.sort_values("Test Improvement (%)", ascending=False)
    
    # Make the function usable on its own, not only after process_all_rides
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    
    # Save to CSV
    summary_path = os.path.join(output_dir, "model_summary.csv")
    summary_df.to_csv(summary_path, index=False)
    
    # Print a brief summary
    avg_val_mae = _nan_mean(summary_df["Validation MAE"])
    avg_val_baseline = _nan_mean(summary_df["Validation Baseline MAE"])
    avg_test_mae = _nan_mean(summary_df["Test MAE"])
    avg_test_baseline = _nan_mean(summary_df["Test Baseline MAE"])
    avg_improvement = _nan_mean(summary_df["Test Improvement (%)"])
    
    print("\n" + "="*80)
    print("Model Summary:")
    print(f"Total rides processed: {len(summary_df)}")
    print(f"Average validation MAE: {avg_val_mae:.2f} (baseline: {avg_val_baseline:.2f})")
    print(f"Average test MAE: {avg_test_mae:.2f} (baseline: {avg_test_baseline:.2f})")
    print(f"Average improvement: {avg_improvement:.2f}%")
    print(f"Summary saved to: {summary_path}")
    print("="*80)
    
    # Sort rides by test improvement for the plot (ascending, so the best ride
    # ends up at the top of the horizontal bar chart)
    plot_df = summary_df.sort_values("Test Improvement (%)")
    
    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        ax.barh(plot_df["Ride Name"], plot_df["Test Improvement (%)"])
        ax.set_xlabel("Improvement over Baseline (%)")
        ax.set_ylabel("Ride Name")
        ax.set_title("Model Improvement over Baseline (Prophet) by Ride")
        fig.tight_layout()
        
        # Save the plot
        fig.savefig(os.path.join(output_dir, "model_improvement_summary.png"))
    finally:
        # Always release the figure, even if drawing or saving failed
        plt.close(fig)


def _nan_mean(values):
    """Return the mean of the non-NaN entries of values, or NaN if there are none."""
    as_float = np.asarray(values, dtype=float)
    present = as_float[~np.isnan(as_float)]
    return float(present.mean()) if present.size else float('nan')

## Execute the Training Pipeline for All Rides

In [None]:
# Train on the GPU when torch can see one, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Root directory that receives the models and results
output_dir = "../models/wait_time_prediction"

# Run the pipeline for every ride; resume=True picks up from a checkpoint if available
results = process_all_rides(
    all_rides,
    train_data,
    val_data,
    test_data,
    output_dir=output_dir,
    device=device,
    resume=True,
)

## Load and Analyze Results

In [None]:
# Locate the summary CSV inside the pipeline's output directory
summary_path = os.path.join(output_dir, "model_summary.csv")
if not os.path.exists(summary_path):
    print("Summary file not found. Run the processing pipeline first.")
else:
    summary_df = pd.read_csv(summary_path)
    
    # Show the first ten rows of the saved summary
    print("Top 10 rides by test improvement:")
    display(summary_df.head(10))
    
    # The 15 rides with the lowest test MAE are the ones compared in the chart
    top_rides = summary_df.sort_values("Test MAE").head(15)
    
    # One bar slot per ride; model and baseline bars sit side by side
    x = np.arange(len(top_rides))
    width = 0.35
    
    fig, ax = plt.subplots(figsize=(14, 8))
    ax.bar(x - width/2, top_rides["Test MAE"], width, label="Combined Model")
    ax.bar(x + width/2, top_rides["Test Baseline MAE"], width, label="Prophet Baseline")
    
    ax.set_xlabel("Ride Name")
    ax.set_ylabel("Mean Absolute Error (minutes)")
    ax.set_title("Model Performance Comparison - Top 15 Rides")
    ax.set_xticks(x)
    ax.set_xticklabels(top_rides["Ride Name"], rotation=45, ha="right")
    ax.legend()
    fig.tight_layout()
    
    # Persist the chart next to the summary, then render it in the notebook
    fig.savefig(os.path.join(output_dir, "top_rides_mae_comparison.png"))
    plt.show()

## Example: Load a Specific Model and Make Predictions

In [None]:
def load_and_test_specific_model(ride_name, test_data, output_dir="../models/wait_time_prediction",
                                 recent_points=24 * 30):
    """
    Load a specific model and test it on the test dataset.

    The saved model for ``ride_name`` is loaded, predictions are produced for
    the ride's rows of ``test_data``, the result is passed to evaluate_model,
    and the chronologically last ``recent_points`` rows are plotted (actual
    vs. combined model vs. Prophet baseline).
    
    Parameters:
    -----------
    ride_name : str
        Name of the ride to load
    test_data : pandas.DataFrame
        Test data to use for predictions
    output_dir : str
        Directory where models are saved
    recent_points : int
        Number of most recent rows to plot. The default of 24*30 corresponds
        to roughly 30 days if the data is hourly

    Returns:
    --------
    The metrics returned by evaluate_model, or None when no model or no test
    data exists for the ride
    """
    # Load the model
    prophet_model, nn_model, feature_scaler, _ = load_model(ride_name, output_dir)
    
    if prophet_model is None:
        print(f"No model found for {ride_name}")
        return
    
    # Filter test data for the ride
    ride_test_data = filter_ride_data(test_data, ride_name)
    
    if len(ride_test_data) == 0:
        print(f"No test data found for {ride_name}")
        return
    
    # Initialize feature engineer
    feature_engineer = FeatureEngineer()
    
    # Get device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Make predictions (the third return value is not needed here)
    predictions, baseline_predictions, _ = WaitTimePredictor.batch_predict(
        ride_test_data, 
        prophet_model, 
        nn_model, 
        feature_scaler,
        feature_engineer,
        device
    )
    
    actuals = ride_test_data['wait_time'].values
    
    # Evaluate and visualize
    metrics, _ = evaluate_model(
        ride_test_data, actuals, predictions, baseline_predictions, 
        title=f"{ride_name} - Test Data"
    )
    
    # Pick the most recent rows BY POSITION in the frame that was predicted on.
    # NOTE(review): assumes batch_predict returns one value per input row, in
    # input row order - confirm against WaitTimePredictor.
    # Selecting positions (instead of sorting the frame and masking on index
    # labels) keeps predictions aligned with their rows even when the rows are
    # not chronological or the index contains duplicate labels.
    chronological = np.argsort(ride_test_data['time_bucket'].values, kind='stable')
    first_recent = max(len(chronological) - recent_points, 0)
    recent_positions = chronological[first_recent:]
    
    recent_data = ride_test_data.iloc[recent_positions]
    recent_preds = np.asarray(predictions)[recent_positions]
    recent_baseline = np.asarray(baseline_predictions)[recent_positions]
    
    # Show a time series plot for the selected period
    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(recent_data['time_bucket'], recent_data['wait_time'], 'k-', label='Actual')
    ax.plot(recent_data['time_bucket'], recent_preds, 'b-', label='Combined Model')
    ax.plot(recent_data['time_bucket'], recent_baseline, 'r--', label='Prophet Baseline')
    
    ax.set_xlabel('Date')
    ax.set_ylabel('Wait Time (minutes)')
    ax.set_title(f'Wait Time Predictions for {ride_name} - Most Recent Month')
    ax.legend()
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    plt.show()
    
    return metrics

# You can uncomment and run this after the models have been trained
# Example usage:
# ride_to_test = "silver star"  # Replace with a ride name from your dataset
# load_and_test_specific_model(ride_to_test, test_data)