# Baseline Models for Theme Park Wait Time Prediction


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import json
from datetime import datetime
from tqdm import tqdm

# Define consistent colors for plots
TRAIN_COLOR = 'steelblue'
TRAIN_FILL_COLOR = 'steelblue'
TRAIN_FILL_ALPHA = 0.3
VAL_COLOR = 'coral'
VAL_FILL_COLOR = 'coral'
VAL_FILL_ALPHA = 0.3

## Data Loading and Initial Preprocessing

We'll use the same preprocessing as the Prophet model for consistency.

In [2]:
def load_data(file_path):
    data = pd.read_parquet(file_path)
    return data

def add_features(data):
    data['hour'] = data['timestamp'].dt.hour
    data['minute'] = data['timestamp'].dt.minute
    data['time_key'] = data['hour'] * 60 + data['minute']
    data['day_of_week'] = data['timestamp'].dt.dayofweek
    data['month'] = data['timestamp'].dt.month
    data['time_key'] = data['hour'] * 60 + data['minute']
    return data

def check_for_missing_values(data):
    missing_values = data.isnull().sum()
    if missing_values.any():
        print("Missing values found in the dataset:")
        print(missing_values[missing_values > 0])
    else:
        print("No missing values found in the dataset.")
    return missing_values

def split_data(data, train_years, val_year, test_year):
    data['timestamp'] = pd.to_datetime(data['timestamp'])
    
    train_data = data[data['timestamp'].dt.year.isin(train_years)]
    val_data = data[data['timestamp'].dt.year == val_year]
    test_data = data[data['timestamp'].dt.year == test_year]
    
    print(f"Train data size: {len(train_data)}")
    print(f"Validation data size: {len(val_data)}")
    print(f"Test data size: {len(test_data)}")
    
    return train_data, val_data, test_data

def filter_ride_data(data, ride_name):
    return data[data[f'ride_name_{ride_name}'] == True].copy()

def get_all_rides(data):
    ride_columns = [col for col in data.columns if col.startswith('ride_name_')]
    return [col.replace('ride_name_', '') for col in ride_columns]

def filter_to_operating_hours(ride_data):
    # Determine operating hours from data where wait times > 0
    operating_hours = ride_data[ride_data["wait_time"] > 0].groupby(
        ride_data["timestamp"].dt.date
    )["timestamp"].agg(['min', 'max']).reset_index()
    
    # Extract opening and closing hours
    operating_hours['opening_hour'] = pd.to_datetime(operating_hours['min']).dt.hour
    operating_hours['closing_hour'] = pd.to_datetime(operating_hours['max']).dt.hour
    
    # Set reasonable boundaries for operating hours
    operating_hours['opening_hour'] = operating_hours['opening_hour'].clip(lower=9, upper=11)
    operating_hours['closing_hour'] = operating_hours['closing_hour'].clip(lower=17, upper=21)
    
    # Create date-to-hours mapping
    date_to_hours = {}
    for _, row in operating_hours.iterrows():
        date_to_hours[row['timestamp']] = (row['opening_hour'], row['closing_hour'])
    
    # Filter data to operating hours only
    def is_operating_hour(timestamp):
        date = timestamp.date()
        if date not in date_to_hours:
            return 0
        
        open_hour, close_hour = date_to_hours[date]
        hour = timestamp.hour
        return 1 if open_hour <= hour < close_hour else 0
    
    ride_data['operating_hour'] = ride_data['timestamp'].apply(is_operating_hour)
    ride_data = ride_data[ride_data['operating_hour'] == 1]
    ride_data = ride_data.drop(columns=["operating_hour"])
    
    return ride_data

## Holiday Effects

We'll create the holiday dataframes.

In [3]:
def create_holiday_dataframes(data):
    holiday_dfs = []
    
    # Process country holidays
    for country in ['swiss', 'german', 'french']:
        holiday_col = f"is_{country}_holiday"
        if holiday_col in data.columns:
            country_holidays = data.loc[data[holiday_col] == 1, ["timestamp"]]
            if len(country_holidays) > 0:
                country_holidays["timestamp"] = pd.to_datetime(country_holidays["timestamp"]).dt.date
                country_holidays = country_holidays.drop_duplicates(subset=["timestamp"])
                country_holidays["holiday"] = f"{country}_holiday"
                holiday_dfs.append(country_holidays.reset_index(drop=True))
    # Combine all holidays
    if holiday_dfs:
        all_holidays = pd.concat(holiday_dfs)
        all_holidays["timestamp"] = pd.to_datetime(all_holidays["timestamp"])
        return all_holidays.sort_values(by=["timestamp"]).reset_index(drop=True)
    return None

## Helper Functions for Evaluation and Visualization

In [4]:
def post_process_forecast(forecast, closed_data):
    """Apply corrections to forecasted values."""
    forecast = forecast.copy()
    
    # Set predictions to zero during known closures (if closed_data has such info)
    if 'closed' in closed_data.columns:
        closed_mask = forecast['timestamp'].isin(closed_data.loc[closed_data['closed'] == 1, 'timestamp'])
        forecast.loc[closed_mask, 'wait_time'] = 0
    
    # Correct negative predictions
    if 'wait_time' in forecast.columns:
        negative_mask = forecast['wait_time'] < 0
        forecast.loc[negative_mask, 'wait_time'] = 0
    elif 'yhat' in forecast.columns:
        negative_mask = forecast['yhat'] < 0
        forecast.loc[negative_mask, 'yhat'] = 0
    
    return forecast

def evaluate_model(ride_df, actual_values, predictions, title=""):
    # Calculate metrics
    mae = np.mean(np.abs(predictions - actual_values))
    rmse = np.sqrt(np.mean(np.square(predictions - actual_values)))
    
    # For sMAPE, avoid division by zero
    epsilon = 1e-8
    abs_pct_errors = np.abs(predictions - actual_values) / (np.abs(predictions) + np.abs(actual_values) + epsilon)
    # Only include points where actual values are non-zero
    non_zero_mask = (actual_values > 0) & (predictions > 0)
    smape = np.mean(abs_pct_errors[non_zero_mask]) * 100 if np.any(non_zero_mask) else 0

    # Print metrics
    print(f"\n{title} MAE: {mae:.2f} minutes")
    print(f"{title} RMSE: {rmse:.2f} minutes")
    print(f"{title} sMAPE: {smape:.2f}%")
    
    # Create a DataFrame with results for time-based analysis
    results_df = pd.DataFrame({
        'timestamp': ride_df['timestamp'].values,
        'actual': actual_values,
        'predicted': predictions,
    })
    
    # Add time components
    results_df['hour'] = results_df['timestamp'].dt.hour
    results_df['day_of_week'] = results_df['timestamp'].dt.dayofweek
    results_df['month'] = results_df['timestamp'].dt.month
    
    # Calculate errors
    results_df['error'] = results_df['predicted'] - results_df['actual']
    results_df['abs_error'] = np.abs(results_df['error'])
    results_df['pct_error'] = abs_pct_errors * 100
    
    # Create metrics dictionary
    metrics = {
        "mae": mae,
        "rmse": rmse,
        "smape": smape
    }
    
    return metrics, results_df

## Baseline Model Classes

In [5]:
class MeanBaselineModel:
    """A simple baseline model that predicts the mean value of the training data."""
    def __init__(self):
        self.mean_value = None
        
    def fit(self, train_data):
        """Fit the model by calculating the mean of the training data"""
        self.mean_value = train_data['wait_time'].mean()
        return self
        
    def predict(self, future_df):
        """Predict using the mean value for all future points"""
        predictions = pd.DataFrame({'timestamp': future_df['timestamp']})
        predictions['yhat'] = self.mean_value
        return predictions

class TimeOfDayBaselineModel:
    """A baseline model that uses patterns at different times of day."""
    def __init__(self):
        self.time_of_day_means = None
        self.global_mean = None
        
    def fit(self, train_data):
        """Fit the model by calculating mean values for each time of day"""
        # Calculate mean for each time of day
        self.time_of_day_means = train_data.groupby('time_key')['wait_time'].mean().to_dict()
        self.global_mean = train_data['wait_time'].mean()  # Fallback value
        return self
        
    def predict(self, future_df):
        """Predict using time-of-day pattern"""
        predictions = pd.DataFrame({'timestamp': future_df['timestamp']})
        
        # Extract hour and minute from prediction dates
        predictions['hour'] = predictions['timestamp'].dt.hour
        predictions['minute'] = predictions['timestamp'].dt.minute
        predictions['time_key'] = predictions['hour'] * 60 + predictions['minute']
        
        # Assign predictions based on time of day
        predictions['yhat'] = predictions['time_key'].map(
            lambda x: self.time_of_day_means.get(x, self.global_mean))
        
        return predictions[['timestamp', 'yhat']]

class DayAndTimeBaselineModel:
    """A model that combines day of week patterns with time of day patterns."""
    def __init__(self):
        self.day_time_means = None
        self.time_means = None
        self.global_mean = None
        
    def fit(self, train_data):
        """Fit the model by calculating mean values for each day+time combination"""
        # Calculate mean for each day and time combination
        train_data['day_time_key'] = (train_data['day_of_week'] * 24 * 60 + 
                                      train_data['time_key'])
        
        self.day_time_means = train_data.groupby('day_time_key')['wait_time'].mean().to_dict()
        self.time_means = train_data.groupby('time_key')['wait_time'].mean().to_dict()
        self.global_mean = train_data['wait_time'].mean()
        self.std_dev = train_data['wait_time'].std()
        
        return self
        
    def predict(self, future_df):
        """Predict using day-of-week and time-of-day patterns"""
        predictions = pd.DataFrame({'timestamp': future_df['timestamp']})
        
        # Extract day of week, hour and minute
        predictions['day_of_week'] = predictions['timestamp'].dt.dayofweek
        predictions['hour'] = predictions['timestamp'].dt.hour
        predictions['minute'] = predictions['timestamp'].dt.minute
        predictions['time_key'] = predictions['hour'] * 60 + predictions['minute']
        predictions['day_time_key'] = (predictions['day_of_week'] * 24 * 60 + 
                                      predictions['time_key'])
        
        # First try to find day+time combination
        predictions['yhat'] = predictions['day_time_key'].map(
            lambda x: self.day_time_means.get(x, None))
        
        # If not found, fall back to time of day
        mask = predictions['yhat'].isna()
        predictions.loc[mask, 'yhat'] = predictions.loc[mask, 'time_key'].map(
            lambda x: self.time_means.get(x, self.global_mean))
                
        return predictions[['timestamp', 'yhat']]

class MovingAverageBaselineModel:
    """This model uses the average of recent observations for predictions."""
    def __init__(self, window_size=48):  # Default: 1 day (48 30-min intervals)
        self.window_size = window_size
        self.historical_data = None
        self.global_mean = None
        self.std_dev = None
        
    def fit(self, train_data):
        """Store the training data for later prediction"""
        # Make a copy and reset index to avoid ambiguity issues
        self.historical_data = train_data.copy().reset_index(drop=True)
        self.global_mean = self.historical_data['wait_time'].mean()
        self.std_dev = self.historical_data['wait_time'].std()
        return self
        
    def predict(self, future_df):
        """Predict using moving average of recent observations"""
        # Create a copy of future_df with reset index
        future_copy = future_df.copy().reset_index(drop=True)
        predictions = pd.DataFrame({'timestamp': future_copy['timestamp']})
        
        # Create lists for predictions and confidence intervals
        yhat = []
        
        # Sort historical data by date
        sorted_history = self.historical_data.sort_values('timestamp').reset_index(drop=True)
        
        # For each prediction date, calculate the moving average
        for pred_date in predictions['timestamp']:
            # Find recent observations (before the prediction date)
            recent_data = sorted_history[sorted_history['timestamp'] < pred_date].tail(self.window_size)
            
            if len(recent_data) > 0:
                # Calculate the mean of recent observations
                pred_value = recent_data['wait_time'].mean()
            else:
                # Fallback to global mean if no recent data
                pred_value = self.global_mean
                
            # Store prediction and confidence interval
            yhat.append(pred_value)
        
        # Add predictions to dataframe
        predictions['yhat'] = yhat
        
        return predictions

class SeasonalWeeklyBaselineModel:
    """This model uses data from the same day and time in previous weeks."""
    def __init__(self, num_weeks=4):
        self.num_weeks = num_weeks
        self.training_data = None
        self.global_mean = None
        self.std_dev = None
        
    def fit(self, train_data):
        """Store the training data for later prediction"""
        # Make a copy and reset index to avoid ambiguity issues
        self.training_data = train_data.copy().reset_index(drop=True)
        self.global_mean = self.training_data['wait_time'].mean()
        self.std_dev = self.training_data['wait_time'].std()
        
        # Pre-calculate time features for training data
        self.training_data['day_of_week'] = self.training_data['timestamp'].dt.dayofweek
        self.training_data['hour'] = self.training_data['timestamp'].dt.hour
        self.training_data['minute'] = self.training_data['timestamp'].dt.minute
        self.training_data['time_key'] = self.training_data['hour'] * 60 + self.training_data['minute']
        
        return self
        
    def predict(self, future_df):
        """Predict using the same day of week and time of day from previous weeks"""
        # Create a copy of future_df with reset index
        future_copy = future_df.copy().reset_index(drop=True)
        predictions = pd.DataFrame({'timestamp': future_copy['timestamp']})
        
        # Extract day of week and time for predictions
        predictions['day_of_week'] = predictions['timestamp'].dt.dayofweek
        predictions['hour'] = predictions['timestamp'].dt.hour
        predictions['minute'] = predictions['timestamp'].dt.minute
        predictions['time_key'] = predictions['hour'] * 60 + predictions['minute']
        
        # Predict for each point based on same day/time from previous weeks
        yhat = []
        
        for _, row in predictions.iterrows():
            # Find matching day and time in training data
            matches = self.training_data[
                (self.training_data['day_of_week'] == row['day_of_week']) & 
                (self.training_data['time_key'] == row['time_key'])
            ].copy().reset_index(drop=True)
            
            # Sort by date (descending) and take most recent num_weeks
            if len(matches) > 0:
                matches = matches.sort_values('timestamp', ascending=False).reset_index(drop=True)
                matches = matches.head(self.num_weeks)
                
                # Calculate prediction and confidence interval
                pred = matches['wait_time'].mean()
                yhat.append(pred)
            else:
                # Fallback to global mean if no matches
                yhat.append(self.global_mean)
        
        # Add predictions to dataframe
        predictions['yhat'] = yhat
        
        return predictions[['timestamp', 'yhat']]

class HolidayAwareBaselineModel:
    """A model that uses different patterns for holidays and normal days."""
    def __init__(self):
        self.holiday_day_time_means = None
        self.normal_day_time_means = None
        self.holiday_time_means = None
        self.normal_time_means = None
        self.holiday_mean = None
        self.normal_mean = None
        self.global_mean = None
        
    def fit(self, train_data, holiday_data):
        """Fit separate models for holidays and normal days"""
        # Mark holiday days
        if holiday_data is not None:
            holiday_dates = set(pd.to_datetime(holiday_data['timestamp']).dt.date)
        else:
            holiday_dates = set()
            
        train_data = train_data.copy()
        train_data['is_holiday'] = train_data['timestamp'].dt.date.isin(holiday_dates)
        
        # Create day-time keys if they don't exist yet
        if 'day_time_key' not in train_data.columns:
            train_data['day_time_key'] = (train_data['day_of_week'] * 24 * 60 + 
                                         train_data['time_key'])
        
        # Calculate means for holiday days
        holiday_data_subset = train_data[train_data['is_holiday']]
        self.holiday_day_time_means = holiday_data_subset.groupby('day_time_key')['wait_time'].mean().to_dict()
        self.holiday_time_means = holiday_data_subset.groupby('time_key')['wait_time'].mean().to_dict()
        
        # Calculate means for normal days
        normal_data = train_data[~train_data['is_holiday']]
        self.normal_day_time_means = normal_data.groupby('day_time_key')['wait_time'].mean().to_dict()
        self.normal_time_means = normal_data.groupby('time_key')['wait_time'].mean().to_dict()
        
        # Global means as fallback
        self.holiday_mean = holiday_data_subset['wait_time'].mean() if len(holiday_data_subset) > 0 else train_data['wait_time'].mean()
        self.normal_mean = normal_data['wait_time'].mean() if len(normal_data) > 0 else train_data['wait_time'].mean()
        self.global_mean = train_data['wait_time'].mean()
        self.std_dev = train_data['wait_time'].std()
        
        return self
        
    def predict(self, future_df, holiday_data):
        """Predict using different models for holidays and normal days"""
        predictions = pd.DataFrame({'timestamp': future_df['timestamp']})
        
        # Extract day and time features
        predictions['day_of_week'] = predictions['timestamp'].dt.dayofweek
        predictions['hour'] = predictions['timestamp'].dt.hour
        predictions['minute'] = predictions['timestamp'].dt.minute
        predictions['time_key'] = predictions['hour'] * 60 + predictions['minute']
        predictions['day_time_key'] = (predictions['day_of_week'] * 24 * 60 + 
                                      predictions['time_key'])
        
        # Mark holiday days
        if holiday_data is not None:
            holiday_dates = set(pd.to_datetime(holiday_data['timestamp']).dt.date)
        else:
            holiday_dates = set()
        predictions['is_holiday'] = predictions['timestamp'].dt.date.isin(holiday_dates)
        
        # Initialize predictions
        predictions['yhat'] = np.nan
        
        # Predict for holiday days
        holiday_mask = predictions['is_holiday']
        
        # First try day+time for holidays
        predictions.loc[holiday_mask, 'yhat'] = predictions.loc[holiday_mask, 'day_time_key'].map(
            lambda x: self.holiday_day_time_means.get(x, None))
        
        # Fall back to time of day for holidays
        still_na = holiday_mask & predictions['yhat'].isna()
        predictions.loc[still_na, 'yhat'] = predictions.loc[still_na, 'time_key'].map(
            lambda x: self.holiday_time_means.get(x, self.holiday_mean))
        
        # Predict for normal days
        normal_mask = ~predictions['is_holiday']
        
        # First try day+time for normal days
        predictions.loc[normal_mask, 'yhat'] = predictions.loc[normal_mask, 'day_time_key'].map(
            lambda x: self.normal_day_time_means.get(x, None))
        
        # Fall back to time of day for normal days
        still_na = normal_mask & predictions['yhat'].isna()
        predictions.loc[still_na, 'yhat'] = predictions.loc[still_na, 'time_key'].map(
            lambda x: self.normal_time_means.get(x, self.normal_mean))
        
        # Final fallback to global mean
        still_na = predictions['yhat'].isna()
        predictions.loc[still_na, 'yhat'] = self.global_mean
        
        return predictions[['timestamp', 'yhat']]

class TrueLastWeekModel:
    """A model that uses the exact value from 7 days before in the same dataset."""
    def __init__(self):
        self.train_data = None
        self.std_dev = None
        
    def fit(self, train_data):
        """Store training data"""
        self.train_data = train_data.copy().reset_index(drop=True)
        self.train_data_dict = dict(zip(self.train_data['timestamp'], self.train_data['wait_time']))
        self.std_dev = self.train_data['wait_time'].std()
        self.global_mean = self.train_data['wait_time'].mean()
        return self
        
    def predict(self, future_df, val_data=None):
        """Predict using values from 7 days before in the same dataset"""
        predictions = pd.DataFrame({'timestamp': future_df['timestamp']})
        yhat = []
        
        # Create value lookup dictionaries
        if val_data is not None:
            val_data = val_data.copy().reset_index(drop=True)
            val_data_dict = dict(zip(val_data['timestamp'], val_data['wait_time']))
        else:
            val_data_dict = {}
        
        # For each prediction date
        for date in predictions['timestamp']:
            last_week = date - pd.Timedelta(days=7)
            
            # For validation dates, first check in validation data
            if val_data is not None and date in val_data_dict:
                if last_week in val_data_dict:
                    yhat.append(val_data_dict[last_week])
                elif last_week in self.train_data_dict:
                    yhat.append(self.train_data_dict[last_week])
                else:
                    # If no exact match, use global mean
                    yhat.append(self.global_mean)
            # For training dates, check in training data
            else:
                if last_week in self.train_data_dict:
                    yhat.append(self.train_data_dict[last_week])
                else:
                    # If no exact match, use global mean
                    yhat.append(self.global_mean)
        
        # Add predictions and confidence intervals
        predictions['yhat'] = yhat
        
        return predictions[['timestamp', 'yhat']]

class LastYearModel:
    """A baseline that uses the value from the same day of the week in the previous year."""
    def __init__(self):
        self.historical_data = None
        self.std_dev = None
        
    def fit(self, train_data):
        """Store the training data indexed by timestamp"""
        self.historical_data = train_data.copy().reset_index(drop=True)
        # Create a lookup dictionary for fast access
        self.date_to_value = dict(zip(self.historical_data['timestamp'], self.historical_data['wait_time']))
        self.std_dev = self.historical_data['wait_time'].std()
        self.global_mean = self.historical_data['wait_time'].mean()
        return self
    
    def _get_same_day_previous_year(self, date):
        """Get the same day of the week from the previous year, handling leap years."""
        # Start with the same date last year
        try:
            same_date_last_year = date.replace(year=date.year - 1)
        except ValueError:
            # Handle Feb 29 in leap years - move to Feb 28
            same_date_last_year = date.replace(year=date.year - 1, day=28)
        
        # Calculate the difference in days of the week
        current_weekday = date.weekday()
        last_year_weekday = same_date_last_year.weekday()
        
        # Adjust to get the same day of the week
        days_diff = current_weekday - last_year_weekday
        target_date = same_date_last_year + pd.Timedelta(days=days_diff)
        
        return target_date
        
    def predict(self, future_df):
        """Predict using the same day of the week from the previous year"""
        predictions = pd.DataFrame({'timestamp': future_df['timestamp']})
        yhat = []
        
        for date in predictions['timestamp']:
            # Get the same day of the week from previous year
            target_date = self._get_same_day_previous_year(date)
            
            # If we have data for that exact date, use it
            if target_date in self.date_to_value:
                yhat.append(self.date_to_value[target_date])
            else:
                # Look for closest date within a 7-day window (prefer same weekday)
                closest_date = None
                min_diff = pd.Timedelta(days=7)
                
                for historical_date in self.date_to_value.keys():
                    diff = abs(historical_date - target_date)
                    # Prefer dates with the same weekday
                    if historical_date.weekday() == date.weekday():
                        diff = diff - pd.Timedelta(hours=1)  # Small bias towards same weekday
                    
                    if diff < min_diff:
                        min_diff = diff
                        closest_date = historical_date
                
                if closest_date is not None:
                    yhat.append(self.date_to_value[closest_date])
                else:
                    # Otherwise use global mean
                    yhat.append(self.global_mean)
                
        predictions['yhat'] = yhat
        
        return predictions

## Model Storage and Management Functions

In [6]:
def save_baseline_results(ride_name, all_metrics, output_dir="baseline_models"):
    """Save baseline model results for a ride."""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Create ride-specific directory
    ride_dir = os.path.join(output_dir, ride_name.replace(" ", "_"))
    os.makedirs(ride_dir, exist_ok=True)
    
    # Save metrics
    with open(os.path.join(ride_dir, "baseline_metrics.json"), "w") as f:
        json.dump(all_metrics, f, indent=4)
    
    print(f"Baseline results saved to {ride_dir}")

def load_baseline_results(ride_name, output_dir="baseline_models"):
    """Load baseline model results for a ride."""
    ride_dir = os.path.join(output_dir, ride_name.replace(" ", "_"))
    
    # Check if results exist
    metrics_path = os.path.join(ride_dir, "baseline_metrics.json")
    if not os.path.exists(metrics_path):
        return None
    
    # Load metrics
    with open(metrics_path, "r") as f:
        metrics = json.load(f)
    
    return metrics

def create_checkpoint_file(processed_rides, output_dir="baseline_models"):
    """Create a checkpoint file to track progress."""
    checkpoint_path = os.path.join(output_dir, "checkpoint.json")
    with open(checkpoint_path, "w") as f:
        json.dump({"processed_rides": processed_rides}, f, indent=4)

def load_checkpoint_file(output_dir="baseline_models"):
    """Load checkpoint file to resume processing."""
    checkpoint_path = os.path.join(output_dir, "checkpoint.json")
    if os.path.exists(checkpoint_path):
        with open(checkpoint_path, "r") as f:
            checkpoint = json.load(f)
        return checkpoint.get("processed_rides", [])
    return []

## Single Ride Processing Function

In [7]:
def process_single_ride_baselines(ride_name, train_data, val_data, output_dir="baseline_models"):
    """Process all baseline models for a single ride."""
    print(f"\n{'='*50}")
    print(f"Processing baseline models for ride: {ride_name}")
    print(f"{'='*50}")
    
    # Filter data for the current ride
    ride_train_data = filter_ride_data(train_data, ride_name)
    ride_val_data = filter_ride_data(val_data, ride_name)
    
    print(f"Training data size: {len(ride_train_data)}")
    print(f"Validation data size: {len(ride_val_data)}")
    
    # Skip if not enough data
    if len(ride_train_data) < 100 or len(ride_val_data) < 50:
        print(f"Skipping {ride_name} due to insufficient data")
        return None
    
    # Add features
    ride_train_data = add_features(ride_train_data)
    ride_val_data = add_features(ride_val_data)
    
    # Create holidays dataframe
    holidays_df = create_holiday_dataframes(ride_train_data)
    
    # Create future dataframe for predictions
    future = pd.DataFrame({'timestamp': ride_val_data['timestamp'].unique()})
    future = future.sort_values('timestamp').reset_index(drop=True)
    
    # Initialize results dictionary
    all_metrics = {}
    
    # Define all baseline models
    baseline_models = {
        "Mean Baseline": MeanBaselineModel(),
        "Time-of-Day Baseline": TimeOfDayBaselineModel(),
        "Day+Time Baseline": DayAndTimeBaselineModel(),
        "Moving Average Baseline": MovingAverageBaselineModel(window_size=48),
        "Seasonal Weekly Baseline": SeasonalWeeklyBaselineModel(num_weeks=4),
        "Holiday-Aware Baseline": HolidayAwareBaselineModel(),
        "LastWeek Baseline": TrueLastWeekModel(),
        "LastYear Baseline": LastYearModel()
    }
    
    # Get actual values
    val_actual = ride_val_data['wait_time'].values
    
    # Process each baseline model
    for model_name, model in baseline_models.items():
        try:
            print(f"  Training {model_name}...")
            
            # Fit the model
            if model_name == "Holiday-Aware Baseline":
                model.fit(ride_train_data, holidays_df)
            else:
                model.fit(ride_train_data)
            
            # Make predictions
            if model_name == "Holiday-Aware Baseline":
                forecast = model.predict(future, holidays_df)
            elif model_name == "LastWeek Baseline":
                forecast = model.predict(future, ride_val_data)
            else:
                forecast = model.predict(future)
            
            # Post-process forecast
            forecast = post_process_forecast(forecast, ride_val_data)
            
            # Get predictions
            val_predictions = forecast['yhat'].values
            
            # Evaluate model
            metrics, results_df = evaluate_model(
                ride_val_data, val_actual, val_predictions, 
                title=f"{model_name}"
            )
            
            # Store metrics
            all_metrics[model_name] = metrics
            
        except Exception as e:
            print(f"  Error processing {model_name}: {str(e)}")
            all_metrics[model_name] = {"mae": float('inf'), "rmse": float('inf'), "smape": float('inf')}
    
    # Add metadata
    all_metrics["metadata"] = {
        "ride_name": ride_name,
        "train_data_size": len(ride_train_data),
        "val_data_size": len(ride_val_data),
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    
    # Save results
    save_baseline_results(ride_name, all_metrics, output_dir)
    
    return all_metrics

## All Rides Processing Function

In [8]:
def process_all_rides_baselines(all_rides, train_data, val_data, 
                               output_dir="baseline_models", resume=True):
    """Process baseline models for all rides."""
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Get list of already processed rides
    processed_rides = []
    if resume:
        processed_rides = load_checkpoint_file(output_dir)
        if processed_rides:
            print(f"Resuming from checkpoint. {len(processed_rides)} rides already processed.")
    
    # Initialize results dictionary
    all_results = {}
    
    # Process each ride
    for i, ride_name in enumerate(tqdm(all_rides, desc="Processing rides")):
        if ride_name in processed_rides:
            print(f"Skipping {ride_name} (already processed)")
            # Load metrics for the summary
            metrics = load_baseline_results(ride_name, output_dir)
            if metrics:
                all_results[ride_name] = metrics
            continue
            
        print(f"\nProcessing ride {i+1}/{len(all_rides)}: {ride_name}")
        ride_metrics = process_single_ride_baselines(ride_name, train_data, val_data, 
                                                    output_dir=output_dir)
        
        if ride_metrics:
            all_results[ride_name] = ride_metrics
            processed_rides.append(ride_name)
            
            # Update checkpoint after each ride
            create_checkpoint_file(processed_rides, output_dir)
    
    # Generate summary report
    generate_baseline_summary_report(all_results, output_dir)
    
    return all_results

## Summary Report Generation

In [9]:
def generate_baseline_summary_report(all_results, output_dir="baseline_models"):
    """Generate a comprehensive summary report of all baseline models across all rides."""
    
    # Create lists to store summary data
    summary_data = []
    
    # Extract data from results
    for ride_name, ride_results in all_results.items():
        if not ride_results or "metadata" not in ride_results:
            continue
        
        # Base ride info
        metadata = ride_results["metadata"]
        base_info = {
            "ride_name": ride_name,
            "train_data_size": metadata.get("train_data_size", 0),
            "val_data_size": metadata.get("val_data_size", 0)
        }
        
        # Add metrics for each baseline model
        for model_name, metrics in ride_results.items():
            if model_name == "metadata":
                continue
                
            row = base_info.copy()
            row["model_name"] = model_name
            row["mae"] = metrics.get("mae", float('inf'))
            row["rmse"] = metrics.get("rmse", float('inf'))
            row["smape"] = metrics.get("smape", float('inf'))
            
            summary_data.append(row)
    
    # Create DataFrame
    summary_df = pd.DataFrame(summary_data)
    
    if len(summary_df) == 0:
        print("No results to summarize.")
        return None
    
    # Save detailed summary
    detailed_summary_path = os.path.join(output_dir, "detailed_baseline_summary.csv")
    summary_df.to_csv(detailed_summary_path, index=False)
    
    # Create model comparison summary
    model_summary = summary_df.groupby('model_name').agg({
        'mae': ['mean', 'std', 'median'],
        'rmse': ['mean', 'std', 'median'],
        'smape': ['mean', 'std', 'median']
    }).round(2)
    
    # Flatten column names
    model_summary.columns = ['_'.join(col).strip() for col in model_summary.columns.values]
    model_summary = model_summary.reset_index()
    
    # Save model comparison summary
    model_summary_path = os.path.join(output_dir, "model_comparison_summary.csv")
    model_summary.to_csv(model_summary_path, index=False)
    
    # Create ride-wise best model summary
    ride_best = summary_df.loc[summary_df.groupby('ride_name')['mae'].idxmin()]
    ride_best_summary = ride_best[['ride_name', 'model_name', 'mae', 'rmse', 'smape']].copy()
    ride_best_summary = ride_best_summary.sort_values('mae')
    
    # Save ride-wise best model summary
    ride_best_path = os.path.join(output_dir, "best_model_per_ride.csv")
    ride_best_summary.to_csv(ride_best_path, index=False)
    
    # Print summary statistics
    print("\n" + "="*80)
    print("BASELINE MODELS SUMMARY:")
    print(f"Total rides processed: {summary_df['ride_name'].nunique()}")
    print(f"Total model evaluations: {len(summary_df)}")
    print("\nModel Performance (Average MAE):")
    print(model_summary[['model_name', 'mae_mean']].sort_values('mae_mean'))
    print(f"\nDetailed summary saved to: {detailed_summary_path}")
    print(f"Model comparison saved to: {model_summary_path}")
    print(f"Best models per ride saved to: {ride_best_path}")
    print("="*80)
    
    # Create visualizations
    create_baseline_visualizations(summary_df, model_summary, ride_best_summary, output_dir)
    
    return summary_df, model_summary, ride_best_summary

def create_baseline_visualizations(summary_df, model_summary, ride_best_summary, output_dir):
    """Create visualizations for baseline model results."""
    
    # 1. Model comparison boxplot
    plt.figure(figsize=(15, 8))
    summary_df.boxplot(column='mae', by='model_name', ax=plt.gca())
    plt.title('MAE Distribution by Baseline Model')
    plt.xlabel('Baseline Model')
    plt.ylabel('Mean Absolute Error (minutes)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "model_comparison_boxplot.png"))
    plt.close()
    
    # 2. Average performance bar chart
    plt.figure(figsize=(12, 6))
    model_avg = model_summary.sort_values('mae_mean')
    plt.bar(model_avg['model_name'], model_avg['mae_mean'])
    plt.title('Average MAE by Baseline Model')
    plt.xlabel('Baseline Model')
    plt.ylabel('Average MAE (minutes)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "average_mae_by_model.png"))
    plt.close()
    
    # 3. Best model distribution
    plt.figure(figsize=(12, 6))
    best_model_counts = ride_best_summary['model_name'].value_counts()
    plt.bar(best_model_counts.index, best_model_counts.values)
    plt.title('Frequency of Best Performing Model by Ride')
    plt.xlabel('Baseline Model')
    plt.ylabel('Number of Rides Where Model is Best')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "best_model_frequency.png"))
    plt.close()
    
    # 4. Performance heatmap
    pivot_df = summary_df.pivot(index='ride_name', columns='model_name', values='mae')
    plt.figure(figsize=(15, 20))
    plt.imshow(pivot_df.values, cmap='YlOrRd', aspect='auto')
    plt.colorbar(label='MAE (minutes)')
    plt.xticks(range(len(pivot_df.columns)), pivot_df.columns, rotation=45)
    plt.yticks(range(len(pivot_df.index)), pivot_df.index)
    plt.title('MAE Heatmap: Models vs Rides')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "mae_heatmap.png"))
    plt.close()
    
    print("Visualizations saved to output directory.")

## Main Execution

In [10]:

print("Loading data...")
data = load_data("../data/processed/ep/final_cleaned_processed_wait_times.parquet")
print(f"Loaded data with {len(data)} rows")

check_for_missing_values(data)

data = filter_to_operating_hours(data)

# Define time periods for splitting
train_years, val_year, test_year = list(range(2017, 2023)), 2023, 2024

# Split the data
train_data, val_data, test_data = split_data(data, train_years, val_year, test_year)

# Get all rides in the dataset
all_rides = get_all_rides(data)
print(f"Found {len(all_rides)} rides in the dataset:")
for i, ride in enumerate(all_rides):
    print(f"{i+1}. {ride}")

# Set output directory for baseline models and results
output_dir = "../models/baseline_models/"

# Process all rides with baseline models
print("\n" + "="*60)
print("STARTING BASELINE MODEL PROCESSING FOR ALL RIDES")
print("="*60)

results = process_all_rides_baselines(
    all_rides=all_rides,
    train_data=train_data,
    val_data=val_data,
    output_dir=output_dir,
    resume=True  # Resume from checkpoint if available
)

print("\n" + "="*60)
print("BASELINE MODEL PROCESSING COMPLETED")
print("="*60)

Loading data...
Loaded data with 7834739 rows
No missing values found in the dataset.
Train data size: 297362
Validation data size: 61851
Test data size: 55699
Found 31 rides in the dataset:
1. alpine express enzian
2. arena of football  be part of it
3. arthur
4. atlantica supersplash
5. atlantis adventure
6. baaa express
7. blue fire megacoaster
8. castello dei medici
9. dancing dingie
10. euromir
11. eurosat  cancan coaster
12. eurotower
13. fjordrafting
14. jim button  journey through morrowland
15. josefinas magical imperial journey
16. kolumbusjolle
17. madame freudenreich curiosits
18. matterhornblitz
19. old mac donalds tractor fun
20. pegasus
21. poppy towers
22. poseidon
23. silver star
24. swiss bob run
25. tirol log flume
26. vienna wave swing  glckspilz
27. vindjammer
28. voletarium
29. volo da vinci
30. voltron nevera powered by rimac
31. whale adventures  northern lights

STARTING BASELINE MODEL PROCESSING FOR ALL RIDES


Processing rides:   0%|          | 0/31 [00:00<?, ?it/s]


Processing ride 1/31: alpine express enzian

Processing baseline models for ride: alpine express enzian
Training data size: 10302
Validation data size: 2019
  Training Mean Baseline...

Mean Baseline MAE: 7.87 minutes
Mean Baseline RMSE: 9.64 minutes
Mean Baseline sMAPE: 22.56%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 6.77 minutes
Time-of-Day Baseline RMSE: 8.27 minutes
Time-of-Day Baseline sMAPE: 24.18%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 6.04 minutes
Day+Time Baseline RMSE: 7.48 minutes
Day+Time Baseline sMAPE: 22.39%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 7.04 minutes
Moving Average Baseline RMSE: 8.79 minutes
Moving Average Baseline sMAPE: 22.90%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 7.99 minutes
Seasonal Weekly Baseline RMSE: 10.59 minutes
Seasonal Weekly Baseline sMAPE: 39.46%
  Training Holiday-Aware Baseline...

Holiday-Aware Baseline MAE: 5.86 minutes
Holiday-Aware Baseline R

Processing rides:   3%|▎         | 1/31 [00:05<02:31,  5.04s/it]


LastYear Baseline MAE: 9.06 minutes
LastYear Baseline RMSE: 12.09 minutes
LastYear Baseline sMAPE: 29.08%
Baseline results saved to ../models/baseline_models/alpine_express_enzian

Processing ride 2/31: arena of football  be part of it

Processing baseline models for ride: arena of football  be part of it
Training data size: 9612
Validation data size: 2052
  Training Mean Baseline...

Mean Baseline MAE: 2.57 minutes
Mean Baseline RMSE: 2.73 minutes
Mean Baseline sMAPE: 32.88%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 2.50 minutes
Time-of-Day Baseline RMSE: 2.76 minutes
Time-of-Day Baseline sMAPE: 34.45%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 2.47 minutes
Day+Time Baseline RMSE: 2.74 minutes
Day+Time Baseline sMAPE: 33.24%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 1.90 minutes
Moving Average Baseline RMSE: 2.18 minutes
Moving Average Baseline sMAPE: 19.23%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE

Processing rides:   6%|▋         | 2/31 [00:25<06:49, 14.13s/it]


LastYear Baseline MAE: 4.30 minutes
LastYear Baseline RMSE: 4.82 minutes
LastYear Baseline sMAPE: 3.55%
Baseline results saved to ../models/baseline_models/arena_of_football__be_part_of_it

Processing ride 3/31: arthur

Processing baseline models for ride: arthur
Training data size: 10298
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 11.02 minutes
Mean Baseline RMSE: 14.69 minutes
Mean Baseline sMAPE: 18.44%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 9.12 minutes
Time-of-Day Baseline RMSE: 12.58 minutes
Time-of-Day Baseline sMAPE: 18.38%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 8.60 minutes
Day+Time Baseline RMSE: 12.03 minutes
Day+Time Baseline sMAPE: 17.56%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 13.87 minutes
Moving Average Baseline RMSE: 17.32 minutes
Moving Average Baseline sMAPE: 21.26%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 11.15 minutes
Seasonal Weekly Base

Processing rides:  10%|▉         | 3/31 [00:32<05:01, 10.78s/it]


LastYear Baseline MAE: 10.75 minutes
LastYear Baseline RMSE: 15.56 minutes
LastYear Baseline sMAPE: 19.43%
Baseline results saved to ../models/baseline_models/arthur

Processing ride 4/31: atlantica supersplash

Processing baseline models for ride: atlantica supersplash
Training data size: 9825
Validation data size: 2065
  Training Mean Baseline...

Mean Baseline MAE: 11.95 minutes
Mean Baseline RMSE: 14.23 minutes
Mean Baseline sMAPE: 31.59%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 9.69 minutes
Time-of-Day Baseline RMSE: 12.66 minutes
Time-of-Day Baseline sMAPE: 30.69%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 9.43 minutes
Day+Time Baseline RMSE: 12.49 minutes
Day+Time Baseline sMAPE: 29.63%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 12.87 minutes
Moving Average Baseline RMSE: 14.69 minutes
Moving Average Baseline sMAPE: 30.40%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 13.15 minutes
Seasonal Weekl

Processing rides:  13%|█▎        | 4/31 [00:38<03:56,  8.77s/it]


LastYear Baseline MAE: 10.18 minutes
LastYear Baseline RMSE: 14.52 minutes
LastYear Baseline sMAPE: 31.26%
Baseline results saved to ../models/baseline_models/atlantica_supersplash

Processing ride 5/31: atlantis adventure

Processing baseline models for ride: atlantis adventure
Training data size: 10289
Validation data size: 2066
  Training Mean Baseline...

Mean Baseline MAE: 2.45 minutes
Mean Baseline RMSE: 2.97 minutes
Mean Baseline sMAPE: 53.25%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 2.18 minutes
Time-of-Day Baseline RMSE: 2.81 minutes
Time-of-Day Baseline sMAPE: 44.66%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 2.08 minutes
Day+Time Baseline RMSE: 2.75 minutes
Day+Time Baseline sMAPE: 42.85%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 2.94 minutes
Moving Average Baseline RMSE: 3.18 minutes
Moving Average Baseline sMAPE: 31.30%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 2.17 minutes
Seasonal We

Processing rides:  16%|█▌        | 5/31 [00:43<03:16,  7.57s/it]


LastYear Baseline MAE: 1.98 minutes
LastYear Baseline RMSE: 3.59 minutes
LastYear Baseline sMAPE: 12.06%
Baseline results saved to ../models/baseline_models/atlantis_adventure

Processing ride 6/31: baaa express

Processing baseline models for ride: baaa express
Training data size: 10291
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 3.80 minutes
Mean Baseline RMSE: 6.19 minutes
Mean Baseline sMAPE: 17.08%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 4.40 minutes
Time-of-Day Baseline RMSE: 6.03 minutes
Time-of-Day Baseline sMAPE: 31.81%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 4.26 minutes
Day+Time Baseline RMSE: 5.97 minutes
Day+Time Baseline sMAPE: 31.14%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 4.99 minutes
Moving Average Baseline RMSE: 6.08 minutes
Moving Average Baseline sMAPE: 24.52%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 4.85 minutes
Seasonal Weekly Baseline RMS

Processing rides:  19%|█▉        | 6/31 [00:49<02:53,  6.94s/it]


LastYear Baseline MAE: 6.09 minutes
LastYear Baseline RMSE: 8.74 minutes
LastYear Baseline sMAPE: 24.82%
Baseline results saved to ../models/baseline_models/baaa_express

Processing ride 7/31: blue fire megacoaster

Processing baseline models for ride: blue fire megacoaster
Training data size: 10288
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 13.29 minutes
Mean Baseline RMSE: 17.47 minutes
Mean Baseline sMAPE: 24.98%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 12.39 minutes
Time-of-Day Baseline RMSE: 16.65 minutes
Time-of-Day Baseline sMAPE: 23.17%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 11.76 minutes
Day+Time Baseline RMSE: 15.76 minutes
Day+Time Baseline sMAPE: 22.41%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 13.34 minutes
Moving Average Baseline RMSE: 17.84 minutes
Moving Average Baseline sMAPE: 25.29%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 16.41 minutes
Seasona

Processing rides:  23%|██▎       | 7/31 [00:55<02:42,  6.76s/it]


LastYear Baseline MAE: 11.10 minutes
LastYear Baseline RMSE: 16.56 minutes
LastYear Baseline sMAPE: 21.24%
Baseline results saved to ../models/baseline_models/blue_fire_megacoaster

Processing ride 8/31: castello dei medici

Processing baseline models for ride: castello dei medici
Training data size: 10193
Validation data size: 2066
  Training Mean Baseline...

Mean Baseline MAE: 2.45 minutes
Mean Baseline RMSE: 2.60 minutes
Mean Baseline sMAPE: 45.68%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 2.44 minutes
Time-of-Day Baseline RMSE: 2.61 minutes
Time-of-Day Baseline sMAPE: 45.73%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 2.37 minutes
Day+Time Baseline RMSE: 2.56 minutes
Day+Time Baseline sMAPE: 44.62%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 2.91 minutes
Moving Average Baseline RMSE: 3.83 minutes
Moving Average Baseline sMAPE: 0.56%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 2.55 minutes
Seasonal W

Processing rides:  26%|██▌       | 8/31 [01:02<02:36,  6.80s/it]


LastYear Baseline MAE: 2.70 minutes
LastYear Baseline RMSE: 3.73 minutes
LastYear Baseline sMAPE: 2.41%
Baseline results saved to ../models/baseline_models/castello_dei_medici

Processing ride 9/31: dancing dingie

Processing baseline models for ride: dancing dingie
Training data size: 10292
Validation data size: 2066
  Training Mean Baseline...

Mean Baseline MAE: 2.98 minutes
Mean Baseline RMSE: 3.87 minutes
Mean Baseline sMAPE: 30.73%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 2.80 minutes
Time-of-Day Baseline RMSE: 3.85 minutes
Time-of-Day Baseline sMAPE: 33.68%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 2.75 minutes
Day+Time Baseline RMSE: 3.85 minutes
Day+Time Baseline sMAPE: 35.07%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 2.51 minutes
Moving Average Baseline RMSE: 3.67 minutes
Moving Average Baseline sMAPE: 14.83%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 3.06 minutes
Seasonal Weekly Baseline

Processing rides:  29%|██▉       | 9/31 [01:08<02:23,  6.51s/it]


LastYear Baseline MAE: 3.70 minutes
LastYear Baseline RMSE: 5.81 minutes
LastYear Baseline sMAPE: 14.66%
Baseline results saved to ../models/baseline_models/dancing_dingie

Processing ride 10/31: euromir

Processing baseline models for ride: euromir
Training data size: 10289
Validation data size: 2049
  Training Mean Baseline...

Mean Baseline MAE: 12.15 minutes
Mean Baseline RMSE: 15.21 minutes
Mean Baseline sMAPE: 25.79%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 9.62 minutes
Time-of-Day Baseline RMSE: 13.25 minutes
Time-of-Day Baseline sMAPE: 23.07%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 9.10 minutes
Day+Time Baseline RMSE: 12.64 minutes
Day+Time Baseline sMAPE: 21.97%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 12.08 minutes
Moving Average Baseline RMSE: 14.94 minutes
Moving Average Baseline sMAPE: 24.21%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 11.77 minutes
Seasonal Weekly Baseline RMSE: 16.

Processing rides:  32%|███▏      | 10/31 [01:13<02:07,  6.06s/it]


LastYear Baseline MAE: 8.93 minutes
LastYear Baseline RMSE: 14.01 minutes
LastYear Baseline sMAPE: 22.73%
Baseline results saved to ../models/baseline_models/euromir

Processing ride 11/31: eurosat  cancan coaster

Processing baseline models for ride: eurosat  cancan coaster
Training data size: 8650
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 11.67 minutes
Mean Baseline RMSE: 13.98 minutes
Mean Baseline sMAPE: 23.88%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 9.19 minutes
Time-of-Day Baseline RMSE: 11.98 minutes
Time-of-Day Baseline sMAPE: 19.07%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 8.41 minutes
Day+Time Baseline RMSE: 11.21 minutes
Day+Time Baseline sMAPE: 17.93%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 11.70 minutes
Moving Average Baseline RMSE: 14.11 minutes
Moving Average Baseline sMAPE: 23.78%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 11.23 minutes
Seasonal 

Processing rides:  35%|███▌      | 11/31 [01:18<01:54,  5.74s/it]


LastYear Baseline MAE: 8.60 minutes
LastYear Baseline RMSE: 12.17 minutes
LastYear Baseline sMAPE: 17.16%
Baseline results saved to ../models/baseline_models/eurosat__cancan_coaster

Processing ride 12/31: eurotower

Processing baseline models for ride: eurotower
Training data size: 10308
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 3.61 minutes
Mean Baseline RMSE: 5.34 minutes
Mean Baseline sMAPE: 20.51%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 3.29 minutes
Time-of-Day Baseline RMSE: 5.02 minutes
Time-of-Day Baseline sMAPE: 18.88%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 3.26 minutes
Day+Time Baseline RMSE: 4.98 minutes
Day+Time Baseline sMAPE: 18.66%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 3.84 minutes
Moving Average Baseline RMSE: 5.28 minutes
Moving Average Baseline sMAPE: 21.99%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 3.33 minutes
Seasonal Weekly Baseline RM

Processing rides:  39%|███▊      | 12/31 [01:24<01:49,  5.76s/it]


LastYear Baseline MAE: 3.10 minutes
LastYear Baseline RMSE: 5.78 minutes
LastYear Baseline sMAPE: 14.17%
Baseline results saved to ../models/baseline_models/eurotower

Processing ride 13/31: fjordrafting

Processing baseline models for ride: fjordrafting
Training data size: 9808
Validation data size: 2066
  Training Mean Baseline...

Mean Baseline MAE: 13.55 minutes
Mean Baseline RMSE: 15.77 minutes
Mean Baseline sMAPE: 33.07%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 11.53 minutes
Time-of-Day Baseline RMSE: 14.54 minutes
Time-of-Day Baseline sMAPE: 32.66%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 11.15 minutes
Day+Time Baseline RMSE: 14.21 minutes
Day+Time Baseline sMAPE: 31.75%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 11.92 minutes
Moving Average Baseline RMSE: 16.00 minutes
Moving Average Baseline sMAPE: 34.96%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 11.96 minutes
Seasonal Weekly Baseline RMS

Processing rides:  42%|████▏     | 13/31 [01:30<01:45,  5.89s/it]


LastYear Baseline MAE: 9.68 minutes
LastYear Baseline RMSE: 14.46 minutes
LastYear Baseline sMAPE: 29.66%
Baseline results saved to ../models/baseline_models/fjordrafting

Processing ride 14/31: jim button  journey through morrowland

Processing baseline models for ride: jim button  journey through morrowland
Training data size: 8597
Validation data size: 2065
  Training Mean Baseline...

Mean Baseline MAE: 2.84 minutes
Mean Baseline RMSE: 3.08 minutes
Mean Baseline sMAPE: 34.86%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 2.60 minutes
Time-of-Day Baseline RMSE: 2.98 minutes
Time-of-Day Baseline sMAPE: 32.70%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 2.49 minutes
Day+Time Baseline RMSE: 2.94 minutes
Day+Time Baseline sMAPE: 30.81%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 2.41 minutes
Moving Average Baseline RMSE: 2.88 minutes
Moving Average Baseline sMAPE: 51.60%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline

Processing rides:  45%|████▌     | 14/31 [01:37<01:46,  6.25s/it]


LastYear Baseline MAE: 1.94 minutes
LastYear Baseline RMSE: 3.37 minutes
LastYear Baseline sMAPE: 9.33%
Baseline results saved to ../models/baseline_models/jim_button__journey_through_morrowland

Processing ride 15/31: josefinas magical imperial journey

Processing baseline models for ride: josefinas magical imperial journey
Training data size: 10266
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 4.09 minutes
Mean Baseline RMSE: 5.23 minutes
Mean Baseline sMAPE: 41.62%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 3.59 minutes
Time-of-Day Baseline RMSE: 4.96 minutes
Time-of-Day Baseline sMAPE: 38.17%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 3.45 minutes
Day+Time Baseline RMSE: 4.74 minutes
Day+Time Baseline sMAPE: 39.01%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 4.56 minutes
Moving Average Baseline RMSE: 5.44 minutes
Moving Average Baseline sMAPE: 19.71%
  Training Seasonal Weekly Baseline...

Seasona

Processing rides:  48%|████▊     | 15/31 [01:43<01:40,  6.25s/it]


LastYear Baseline MAE: 4.49 minutes
LastYear Baseline RMSE: 7.00 minutes
LastYear Baseline sMAPE: 27.29%
Baseline results saved to ../models/baseline_models/josefinas_magical_imperial_journey

Processing ride 16/31: kolumbusjolle

Processing baseline models for ride: kolumbusjolle
Training data size: 10300
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 2.53 minutes
Mean Baseline RMSE: 2.63 minutes
Mean Baseline sMAPE: 32.86%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 2.50 minutes
Time-of-Day Baseline RMSE: 2.66 minutes
Time-of-Day Baseline sMAPE: 33.47%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 2.50 minutes
Day+Time Baseline RMSE: 2.67 minutes
Day+Time Baseline sMAPE: 33.62%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 0.70 minutes
Moving Average Baseline RMSE: 1.49 minutes
Moving Average Baseline sMAPE: 3.71%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 1.32 minutes
Seasonal W

Processing rides:  52%|█████▏    | 16/31 [01:49<01:33,  6.24s/it]


LastYear Baseline MAE: 1.80 minutes
LastYear Baseline RMSE: 3.05 minutes
LastYear Baseline sMAPE: 1.98%
Baseline results saved to ../models/baseline_models/kolumbusjolle

Processing ride 17/31: madame freudenreich curiosits

Processing baseline models for ride: madame freudenreich curiosits
Training data size: 8643
Validation data size: 2066
  Training Mean Baseline...

Mean Baseline MAE: 0.51 minutes
Mean Baseline RMSE: 0.67 minutes
Mean Baseline sMAPE: 83.00%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 0.47 minutes
Time-of-Day Baseline RMSE: 0.65 minutes
Time-of-Day Baseline sMAPE: 84.12%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 0.47 minutes
Day+Time Baseline RMSE: 0.67 minutes
Day+Time Baseline sMAPE: 83.81%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 0.16 minutes
Moving Average Baseline RMSE: 0.54 minutes
Moving Average Baseline sMAPE: 95.92%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 0.15 minutes


Processing rides:  55%|█████▍    | 17/31 [01:55<01:22,  5.93s/it]


LastYear Baseline MAE: 0.07 minutes
LastYear Baseline RMSE: 0.60 minutes
LastYear Baseline sMAPE: 0.00%
Baseline results saved to ../models/baseline_models/madame_freudenreich_curiosits

Processing ride 18/31: matterhornblitz

Processing baseline models for ride: matterhornblitz
Training data size: 10299
Validation data size: 2065
  Training Mean Baseline...

Mean Baseline MAE: 12.61 minutes
Mean Baseline RMSE: 15.29 minutes
Mean Baseline sMAPE: 20.42%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 9.03 minutes
Time-of-Day Baseline RMSE: 12.38 minutes
Time-of-Day Baseline sMAPE: 17.04%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 8.63 minutes
Day+Time Baseline RMSE: 11.93 minutes
Day+Time Baseline sMAPE: 16.39%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 12.63 minutes
Moving Average Baseline RMSE: 15.31 minutes
Moving Average Baseline sMAPE: 20.59%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 12.56 minutes
Seas

Processing rides:  58%|█████▊    | 18/31 [02:00<01:15,  5.77s/it]


LastYear Baseline MAE: 7.85 minutes
LastYear Baseline RMSE: 11.94 minutes
LastYear Baseline sMAPE: 16.96%
Baseline results saved to ../models/baseline_models/matterhornblitz

Processing ride 19/31: old mac donalds tractor fun

Processing baseline models for ride: old mac donalds tractor fun
Training data size: 10292
Validation data size: 2065
  Training Mean Baseline...

Mean Baseline MAE: 2.13 minutes
Mean Baseline RMSE: 2.45 minutes
Mean Baseline sMAPE: 52.18%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 1.94 minutes
Time-of-Day Baseline RMSE: 2.41 minutes
Time-of-Day Baseline sMAPE: 48.83%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 1.86 minutes
Day+Time Baseline RMSE: 2.37 minutes
Day+Time Baseline sMAPE: 46.71%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 2.63 minutes
Moving Average Baseline RMSE: 2.77 minutes
Moving Average Baseline sMAPE: 37.12%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 1.64 minutes

Processing rides:  61%|██████▏   | 19/31 [02:07<01:12,  6.04s/it]


LastYear Baseline MAE: 1.93 minutes
LastYear Baseline RMSE: 3.41 minutes
LastYear Baseline sMAPE: 11.03%
Baseline results saved to ../models/baseline_models/old_mac_donalds_tractor_fun

Processing ride 20/31: pegasus

Processing baseline models for ride: pegasus
Training data size: 10299
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 6.87 minutes
Mean Baseline RMSE: 8.49 minutes
Mean Baseline sMAPE: 25.67%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 5.53 minutes
Time-of-Day Baseline RMSE: 7.55 minutes
Time-of-Day Baseline sMAPE: 22.90%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 5.46 minutes
Day+Time Baseline RMSE: 7.48 minutes
Day+Time Baseline sMAPE: 22.95%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 6.72 minutes
Moving Average Baseline RMSE: 8.43 minutes
Moving Average Baseline sMAPE: 26.08%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 6.84 minutes
Seasonal Weekly Baseline RMS

Processing rides:  65%|██████▍   | 20/31 [02:12<01:04,  5.86s/it]


LastYear Baseline MAE: 5.59 minutes
LastYear Baseline RMSE: 8.54 minutes
LastYear Baseline sMAPE: 24.87%
Baseline results saved to ../models/baseline_models/pegasus

Processing ride 21/31: poppy towers

Processing baseline models for ride: poppy towers
Training data size: 10276
Validation data size: 2065
  Training Mean Baseline...

Mean Baseline MAE: 2.46 minutes
Mean Baseline RMSE: 2.87 minutes
Mean Baseline sMAPE: 27.06%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 2.45 minutes
Time-of-Day Baseline RMSE: 2.94 minutes
Time-of-Day Baseline sMAPE: 29.31%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 2.44 minutes
Day+Time Baseline RMSE: 2.94 minutes
Day+Time Baseline sMAPE: 29.65%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 1.16 minutes
Moving Average Baseline RMSE: 2.16 minutes
Moving Average Baseline sMAPE: 6.94%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 1.96 minutes
Seasonal Weekly Baseline RMSE: 2.82 min

Processing rides:  68%|██████▊   | 21/31 [02:17<00:56,  5.67s/it]


LastYear Baseline MAE: 2.16 minutes
LastYear Baseline RMSE: 3.42 minutes
LastYear Baseline sMAPE: 5.57%
Baseline results saved to ../models/baseline_models/poppy_towers

Processing ride 22/31: poseidon

Processing baseline models for ride: poseidon
Training data size: 9832
Validation data size: 2064
  Training Mean Baseline...

Mean Baseline MAE: 15.90 minutes
Mean Baseline RMSE: 18.20 minutes
Mean Baseline sMAPE: 31.77%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 13.56 minutes
Time-of-Day Baseline RMSE: 16.88 minutes
Time-of-Day Baseline sMAPE: 28.46%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 13.11 minutes
Day+Time Baseline RMSE: 16.50 minutes
Day+Time Baseline sMAPE: 27.64%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 15.17 minutes
Moving Average Baseline RMSE: 21.94 minutes
Moving Average Baseline sMAPE: 71.21%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 15.37 minutes
Seasonal Weekly Baseline RMSE: 23.

Processing rides:  71%|███████   | 22/31 [02:23<00:51,  5.70s/it]


LastYear Baseline MAE: 10.45 minutes
LastYear Baseline RMSE: 16.64 minutes
LastYear Baseline sMAPE: 26.24%
Baseline results saved to ../models/baseline_models/poseidon

Processing ride 23/31: silver star

Processing baseline models for ride: silver star
Training data size: 10244
Validation data size: 2064
  Training Mean Baseline...

Mean Baseline MAE: 10.89 minutes
Mean Baseline RMSE: 13.65 minutes
Mean Baseline sMAPE: 25.21%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 9.46 minutes
Time-of-Day Baseline RMSE: 12.32 minutes
Time-of-Day Baseline sMAPE: 21.51%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 8.55 minutes
Day+Time Baseline RMSE: 11.39 minutes
Day+Time Baseline sMAPE: 20.08%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 11.43 minutes
Moving Average Baseline RMSE: 13.69 minutes
Moving Average Baseline sMAPE: 25.81%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 11.31 minutes
Seasonal Weekly Baseline RMSE:

Processing rides:  74%|███████▍  | 23/31 [02:29<00:44,  5.61s/it]


LastYear Baseline MAE: 9.02 minutes
LastYear Baseline RMSE: 13.02 minutes
LastYear Baseline sMAPE: 21.15%
Baseline results saved to ../models/baseline_models/silver_star

Processing ride 24/31: swiss bob run

Processing baseline models for ride: swiss bob run
Training data size: 10293
Validation data size: 2066
  Training Mean Baseline...

Mean Baseline MAE: 12.48 minutes
Mean Baseline RMSE: 14.64 minutes
Mean Baseline sMAPE: 22.39%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 9.64 minutes
Time-of-Day Baseline RMSE: 12.42 minutes
Time-of-Day Baseline sMAPE: 19.77%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 9.32 minutes
Day+Time Baseline RMSE: 12.10 minutes
Day+Time Baseline sMAPE: 19.30%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 12.36 minutes
Moving Average Baseline RMSE: 14.53 minutes
Moving Average Baseline sMAPE: 21.63%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 13.37 minutes
Seasonal Weekly Baseline

Processing rides:  77%|███████▋  | 24/31 [02:34<00:38,  5.57s/it]


LastYear Baseline MAE: 7.60 minutes
LastYear Baseline RMSE: 11.41 minutes
LastYear Baseline sMAPE: 17.15%
Baseline results saved to ../models/baseline_models/swiss_bob_run

Processing ride 25/31: tirol log flume

Processing baseline models for ride: tirol log flume
Training data size: 9849
Validation data size: 2019
  Training Mean Baseline...

Mean Baseline MAE: 8.90 minutes
Mean Baseline RMSE: 11.50 minutes
Mean Baseline sMAPE: 28.99%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 6.89 minutes
Time-of-Day Baseline RMSE: 10.11 minutes
Time-of-Day Baseline sMAPE: 26.88%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 6.51 minutes
Day+Time Baseline RMSE: 9.26 minutes
Day+Time Baseline sMAPE: 25.60%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 11.32 minutes
Moving Average Baseline RMSE: 15.95 minutes
Moving Average Baseline sMAPE: 0.00%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 11.32 minutes
Seasonal Weekly Baseli

Processing rides:  81%|████████  | 25/31 [02:40<00:34,  5.72s/it]


LastYear Baseline MAE: 9.49 minutes
LastYear Baseline RMSE: 13.66 minutes
LastYear Baseline sMAPE: 30.33%
Baseline results saved to ../models/baseline_models/tirol_log_flume

Processing ride 26/31: vienna wave swing  glckspilz

Processing baseline models for ride: vienna wave swing  glckspilz
Training data size: 8416
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 2.20 minutes
Mean Baseline RMSE: 2.57 minutes
Mean Baseline sMAPE: 23.35%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 2.20 minutes
Time-of-Day Baseline RMSE: 2.64 minutes
Time-of-Day Baseline sMAPE: 25.04%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 2.16 minutes
Day+Time Baseline RMSE: 2.62 minutes
Day+Time Baseline sMAPE: 25.09%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 0.91 minutes
Moving Average Baseline RMSE: 1.96 minutes
Moving Average Baseline sMAPE: 4.41%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 1.49 minutes

Processing rides:  84%|████████▍ | 26/31 [02:45<00:27,  5.40s/it]


LastYear Baseline MAE: 2.13 minutes
LastYear Baseline RMSE: 3.38 minutes
LastYear Baseline sMAPE: 6.10%
Baseline results saved to ../models/baseline_models/vienna_wave_swing__glckspilz

Processing ride 27/31: vindjammer

Processing baseline models for ride: vindjammer
Training data size: 10258
Validation data size: 2064
  Training Mean Baseline...

Mean Baseline MAE: 2.82 minutes
Mean Baseline RMSE: 3.50 minutes
Mean Baseline sMAPE: 29.33%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 2.76 minutes
Time-of-Day Baseline RMSE: 3.50 minutes
Time-of-Day Baseline sMAPE: 31.29%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 2.72 minutes
Day+Time Baseline RMSE: 3.43 minutes
Day+Time Baseline sMAPE: 31.83%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 2.47 minutes
Moving Average Baseline RMSE: 3.27 minutes
Moving Average Baseline sMAPE: 21.02%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 2.52 minutes
Seasonal Weekly Baseli

Processing rides:  87%|████████▋ | 27/31 [02:50<00:21,  5.36s/it]


LastYear Baseline MAE: 2.56 minutes
LastYear Baseline RMSE: 3.97 minutes
LastYear Baseline sMAPE: 8.28%
Baseline results saved to ../models/baseline_models/vindjammer

Processing ride 28/31: voletarium

Processing baseline models for ride: voletarium
Training data size: 8430
Validation data size: 2067
  Training Mean Baseline...

Mean Baseline MAE: 14.25 minutes
Mean Baseline RMSE: 16.78 minutes
Mean Baseline sMAPE: 35.47%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 11.02 minutes
Time-of-Day Baseline RMSE: 13.40 minutes
Time-of-Day Baseline sMAPE: 30.09%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 10.35 minutes
Day+Time Baseline RMSE: 12.93 minutes
Day+Time Baseline sMAPE: 28.57%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 14.74 minutes
Moving Average Baseline RMSE: 17.23 minutes
Moving Average Baseline sMAPE: 35.66%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 10.36 minutes
Seasonal Weekly Baseline RMSE: 1

Processing rides:  90%|█████████ | 28/31 [02:55<00:15,  5.13s/it]


LastYear Baseline MAE: 10.27 minutes
LastYear Baseline RMSE: 14.78 minutes
LastYear Baseline sMAPE: 26.03%
Baseline results saved to ../models/baseline_models/voletarium

Processing ride 29/31: volo da vinci

Processing baseline models for ride: volo da vinci
Training data size: 10309
Validation data size: 2066
  Training Mean Baseline...

Mean Baseline MAE: 6.34 minutes
Mean Baseline RMSE: 7.68 minutes
Mean Baseline sMAPE: 22.79%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 6.12 minutes
Time-of-Day Baseline RMSE: 7.38 minutes
Time-of-Day Baseline sMAPE: 23.65%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 5.84 minutes
Day+Time Baseline RMSE: 7.16 minutes
Day+Time Baseline sMAPE: 22.77%
  Training Moving Average Baseline...

Moving Average Baseline MAE: 6.36 minutes
Moving Average Baseline RMSE: 7.65 minutes
Moving Average Baseline sMAPE: 21.32%
  Training Seasonal Weekly Baseline...

Seasonal Weekly Baseline MAE: 6.72 minutes
Seasonal Weekly Baseline RMSE: 

Processing rides:  94%|█████████▎| 29/31 [03:00<00:10,  5.23s/it]


LastYear Baseline MAE: 5.57 minutes
LastYear Baseline RMSE: 7.64 minutes
LastYear Baseline sMAPE: 19.35%
Baseline results saved to ../models/baseline_models/volo_da_vinci

Processing ride 30/31: voltron nevera powered by rimac

Processing baseline models for ride: voltron nevera powered by rimac
Training data size: 0
Validation data size: 0
Skipping voltron nevera powered by rimac due to insufficient data

Processing ride 31/31: whale adventures  northern lights

Processing baseline models for ride: whale adventures  northern lights
Training data size: 10314
Validation data size: 2063
  Training Mean Baseline...

Mean Baseline MAE: 1.52 minutes
Mean Baseline RMSE: 2.31 minutes
Mean Baseline sMAPE: 70.81%
  Training Time-of-Day Baseline...

Time-of-Day Baseline MAE: 1.39 minutes
Time-of-Day Baseline RMSE: 2.24 minutes
Time-of-Day Baseline sMAPE: 61.33%
  Training Day+Time Baseline...

Day+Time Baseline MAE: 1.38 minutes
Day+Time Baseline RMSE: 2.24 minutes
Day+Time Baseline sMAPE: 60.7

Processing rides: 100%|██████████| 31/31 [03:05<00:00,  5.99s/it]


LastYear Baseline MAE: 1.11 minutes
LastYear Baseline RMSE: 2.59 minutes
LastYear Baseline sMAPE: 12.26%
Baseline results saved to ../models/baseline_models/whale_adventures__northern_lights

BASELINE MODELS SUMMARY:
Total rides processed: 30
Total model evaluations: 240

Model Performance (Average MAE):
                 model_name  mae_mean
2         LastWeek Baseline      4.52
1    Holiday-Aware Baseline      5.62
0         Day+Time Baseline      5.63
3         LastYear Baseline      5.81
7      Time-of-Day Baseline      5.90
6  Seasonal Weekly Baseline      6.83
4             Mean Baseline      6.96
5   Moving Average Baseline      6.98

Detailed summary saved to: ../models/baseline_models/detailed_baseline_summary.csv
Model comparison saved to: ../models/baseline_models/model_comparison_summary.csv
Best models per ride saved to: ../models/baseline_models/best_model_per_ride.csv





Visualizations saved to output directory.

BASELINE MODEL PROCESSING COMPLETED


## Analysis Functions for Loaded Results

In [11]:
def analyze_baseline_results(output_dir="../models/baseline_models/"):
    """Load and analyze saved baseline model results."""
    
    # Check if summary files exist
    summary_path = os.path.join(output_dir, "model_comparison_summary.csv")
    if not os.path.exists(summary_path):
        print("Summary files not found. Run the processing pipeline first.")
        return None
    
    # Load summary data
    model_summary = pd.read_csv(summary_path)
    detailed_summary = pd.read_csv(os.path.join(output_dir, "detailed_baseline_summary.csv"))
    best_per_ride = pd.read_csv(os.path.join(output_dir, "best_model_per_ride.csv"))
    
    # Display key insights
    print("="*60)
    print("BASELINE MODELS ANALYSIS")
    print("="*60)
    
    print("\n1. Overall Model Performance (sorted by average MAE):")
    print(model_summary[['model_name', 'mae_mean', 'mae_std']].sort_values('mae_mean'))
    
    print("\n2. Best performing model distribution:")
    best_counts = best_per_ride['model_name'].value_counts()
    print(best_counts)
    
    print("\n3. Top 10 rides with lowest MAE:")
    print(best_per_ride.head(10)[['ride_name', 'model_name', 'mae']])
    
    print("\n4. Worst 10 rides with highest MAE:")
    print(best_per_ride.tail(10)[['ride_name', 'model_name', 'mae']])
    
    return {
        'model_summary': model_summary,
        'detailed_summary': detailed_summary,
        'best_per_ride': best_per_ride
    }

results = analyze_baseline_results()

BASELINE MODELS ANALYSIS

1. Overall Model Performance (sorted by average MAE):
                 model_name  mae_mean  mae_std
2         LastWeek Baseline      4.52     3.43
1    Holiday-Aware Baseline      5.62     3.63
0         Day+Time Baseline      5.63     3.64
3         LastYear Baseline      5.81     3.55
7      Time-of-Day Baseline      5.90     3.86
6  Seasonal Weekly Baseline      6.83     5.11
4             Mean Baseline      6.96     4.86
5   Moving Average Baseline      6.98     5.14

2. Best performing model distribution:
model_name
LastWeek Baseline           23
Moving Average Baseline      3
LastYear Baseline            2
Seasonal Weekly Baseline     1
Holiday-Aware Baseline       1
Name: count, dtype: int64

3. Top 10 rides with lowest MAE:
                           ride_name                model_name       mae
0      madame freudenreich curiosits         LastYear Baseline  0.072604
1                      kolumbusjolle   Moving Average Baseline  0.702860
2   arena of

In [12]:
results["model_summary"]

Unnamed: 0,model_name,mae_mean,mae_std,mae_median,rmse_mean,rmse_std,rmse_median,smape_mean,smape_std,smape_median
0,Day+Time Baseline,5.63,3.64,4.86,7.4,4.82,6.56,31.13,14.09,29.1
1,Holiday-Aware Baseline,5.62,3.63,4.82,7.41,4.82,6.61,31.61,14.23,29.0
2,LastWeek Baseline,4.52,3.43,3.35,7.41,4.81,6.13,13.07,7.76,15.09
3,LastYear Baseline,5.81,3.55,5.58,8.68,5.07,8.09,16.87,9.38,17.15
4,Mean Baseline,6.96,4.86,5.22,8.54,5.88,6.94,32.94,15.01,29.16
5,Moving Average Baseline,6.98,5.14,5.67,8.85,6.48,6.87,27.5,20.81,23.34
6,Seasonal Weekly Baseline,6.83,5.11,5.78,9.65,7.08,7.7,31.31,15.05,29.69
7,Time-of-Day Baseline,5.9,3.86,4.97,7.68,5.05,6.71,31.83,14.07,29.7


In [14]:
results["detailed_summary"]

Unnamed: 0,ride_name,train_data_size,val_data_size,model_name,mae,rmse,smape
0,alpine express enzian,10302,2019,Mean Baseline,7.867062,9.636920,22.562017
1,alpine express enzian,10302,2019,Time-of-Day Baseline,6.768024,8.271249,24.178457
2,alpine express enzian,10302,2019,Day+Time Baseline,6.042877,7.480032,22.385837
3,alpine express enzian,10302,2019,Moving Average Baseline,7.044948,8.790591,22.896676
4,alpine express enzian,10302,2019,Seasonal Weekly Baseline,7.989723,10.585238,39.458133
...,...,...,...,...,...,...,...
235,whale adventures northern lights,10314,2063,Moving Average Baseline,1.638239,2.332535,67.045061
236,whale adventures northern lights,10314,2063,Seasonal Weekly Baseline,0.996122,2.378914,56.369556
237,whale adventures northern lights,10314,2063,Holiday-Aware Baseline,1.375083,2.243407,60.826685
238,whale adventures northern lights,10314,2063,LastWeek Baseline,1.048697,2.717328,16.170213


In [13]:
results["best_per_ride"]

Unnamed: 0,ride_name,model_name,mae,rmse,smape
0,madame freudenreich curiosits,LastYear Baseline,0.072604,0.602512,0.0
1,kolumbusjolle,Moving Average Baseline,0.70286,1.489051,3.714653
2,arena of football be part of it,LastWeek Baseline,0.714049,1.987732,2.332204
3,vienna wave swing glckspilz,Moving Average Baseline,0.911647,1.959635,4.412949
4,whale adventures northern lights,Seasonal Weekly Baseline,0.996122,2.378914,56.369556
5,poppy towers,Moving Average Baseline,1.162429,2.158024,6.936781
6,old mac donalds tractor fun,LastWeek Baseline,1.48349,3.02014,8.683799
7,castello dei medici,LastWeek Baseline,1.551188,2.828388,1.034564
8,vindjammer,LastWeek Baseline,1.695793,3.427279,6.069146
9,atlantis adventure,LastWeek Baseline,1.794465,3.30426,10.750283
