In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge, LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import date
import warnings
warnings.filterwarnings('ignore')


In [None]:
# ============== DATA LOADING AND PREPROCESSING ==============

def load_and_preprocess_data(file_path, ride_name):
    """Read the parquet file and return one ride's records on a 30-minute grid.

    Steps, in order: keep the rows whose ``ride_name`` matches, cast
    ``closed`` to int when that column exists, drop repeated timestamps
    (first occurrence wins), drop rows without a ``wait_time``, forward-fill
    onto a 30-minute grid, keep only months after March, and add a calendar
    ``date`` column.

    Args:
        file_path: Path handed to ``pd.read_parquet``.
        ride_name: Value matched against the ``ride_name`` column.

    Returns:
        DataFrame with ``timestamp`` as a regular column plus a ``date``
        column holding the calendar day of each row.
    """
    all_rides = pd.read_parquet(file_path)
    selected = all_rides.loc[all_rides["ride_name"] == ride_name].copy()

    # ``closed`` is optional; when present it is stored as 0/1 integers.
    if "closed" in selected.columns:
        selected["closed"] = selected["closed"].astype(int)

    # One row per timestamp, and only rows that actually carry a wait time.
    deduplicated = selected.drop_duplicates(subset=['timestamp'])
    with_wait = deduplicated.dropna(subset=['wait_time'])

    # Carry the last known observation forward onto a regular 30-minute grid.
    half_hourly = with_wait.set_index("timestamp").resample("30min").ffill()
    half_hourly = half_hourly.reset_index()

    # Keep April through December only.
    after_march = half_hourly["timestamp"].dt.month > 3
    ride_data = half_hourly[after_march]

    return ride_data.assign(date=ride_data["timestamp"].dt.date)


In [None]:
def filter_operating_data(ride_data):
    """Drop non-operating days and non-operating rows.

    Three filters are applied in order:

    1. Days on which every row is flagged ``closed`` (daily mean of the flag
       equals 1) are removed. This step is skipped when the frame has no
       ``closed`` column, matching the loader, which treats that column as
       optional.
    2. Days whose mean ``wait_time`` is below 1 are removed.
    3. Remaining rows with ``wait_time`` of 0 or less are removed.

    Args:
        ride_data: Frame with ``date`` and ``wait_time`` columns and an
            optional ``closed`` column of 0/1 values.

    Returns:
        The filtered rows of ``ride_data`` (original index preserved).
    """
    # Remove fully closed days (only possible when the flag is available).
    if "closed" in ride_data.columns:
        closed_share = ride_data.groupby('date')["closed"].mean()
        fully_closed_days = closed_share[closed_share == 1].index
        ride_data = ride_data[~ride_data['date'].isin(fully_closed_days)]

    # Remove days with (almost) no queue at all.
    daily_wait = ride_data.groupby('date')["wait_time"].mean()
    zero_wait_days = daily_wait[daily_wait < 1].index
    ride_data = ride_data[~ride_data['date'].isin(zero_wait_days)]

    # A positive wait time is used as the marker for "ride is operating".
    return ride_data[ride_data['wait_time'] > 0]

In [None]:
# ============== FEATURE ENGINEERING ==============

def create_time_features(df):
    """Derive calendar and time-of-day features from the ``timestamp`` column.

    Adds ``hour``, ``day_of_week``, ``month``, ``day_of_month``,
    ``week_of_year``, ``is_weekend``, sine/cosine encodings of hour, month
    and weekday, and two categorical hour groupings (``time_of_day`` and
    ``hour_bin``).

    Args:
        df: Frame with a datetime ``timestamp`` column. Not modified.

    Returns:
        A copy of ``df`` with the feature columns added (an existing
        ``hour`` column is overwritten from ``timestamp``).
    """
    df = df.copy()
    stamps = df['timestamp']

    # Basic time features
    df['hour'] = stamps.dt.hour
    df['day_of_week'] = stamps.dt.dayofweek
    df['month'] = stamps.dt.month
    df['day_of_month'] = stamps.dt.day
    df['week_of_year'] = stamps.dt.isocalendar().week
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Cyclical encoding so that e.g. 23:00 and 00:00 end up close together
    for column, period, prefix in (('hour', 24, 'hour'),
                                   ('month', 12, 'month'),
                                   ('day_of_week', 7, 'dow')):
        angle = 2 * np.pi * df[column] / period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)

    # Bins are right-closed: (0, 12], (12, 17], (17, 24]. include_lowest=True
    # pulls hour 0 into the first bin; without it midnight rows fall outside
    # every bin and get NaN.
    df['time_of_day'] = pd.cut(df['hour'],
                               bins=[0, 12, 17, 24],
                               labels=['morning', 'afternoon', 'evening'],
                               include_lowest=True)

    # Hour bins (more granular), same edge handling as above
    df['hour_bin'] = pd.cut(df['hour'],
                            bins=[0, 9, 12, 15, 18, 24],
                            labels=['early_morning', 'late_morning', 'early_afternoon',
                                    'late_afternoon', 'evening'],
                            include_lowest=True)

    return df

In [None]:
def create_weather_features(df):
    """Derive weather features from whichever weather columns are present.

    * ``temperature`` -> ``temp_squared``, ``temp_comfortable`` (1 when the
      value is within 15..25 inclusive) and ``temp_deviation`` (absolute
      distance from 22).
    * ``rain`` -> ``has_rain`` (1 when > 0) and categorical
      ``rain_intensity`` with bins (-0.1, 0] none, (0, 1] light,
      (1, 5] moderate and above 5 heavy.
    * ``wind`` -> ``wind_squared`` and ``high_wind`` (1 when > 15).

    Args:
        df: Input frame; any of the three weather columns may be missing.
            Not modified.

    Returns:
        A copy of ``df`` with the derived columns added.
    """
    # Work on a copy, like create_time_features does, so the caller's frame
    # is never changed as a side effect.
    df = df.copy()

    if 'temperature' in df.columns:
        temperature = df['temperature']
        df['temp_squared'] = temperature ** 2
        df['temp_comfortable'] = ((temperature >= 15) & (temperature <= 25)).astype(int)
        # Temperature deviation from ideal (22°C)
        df['temp_deviation'] = np.abs(temperature - 22)

    if 'rain' in df.columns:
        df['has_rain'] = (df['rain'] > 0).astype(int)
        # Open-ended top bin: with a finite upper edge, any reading above it
        # would fall outside all bins and become NaN.
        df['rain_intensity'] = pd.cut(df['rain'],
                                      bins=[-0.1, 0, 1, 5, np.inf],
                                      labels=['none', 'light', 'moderate', 'heavy'])

    if 'wind' in df.columns:
        df['wind_squared'] = df['wind'] ** 2
        df['high_wind'] = (df['wind'] > 15).astype(int)

    return df


In [None]:
# ============== ATTENDANCE MODEL ENSEMBLE ==============

def calculate_daily_attendance(train_data):
    """Summarise wait times per day and turn them into an attendance proxy.

    For every ``date`` the mean, max, standard deviation and median of
    ``wait_time`` and the number of rows are computed. The mean, max and
    median are each converted to a percentile rank (0-100) across days and
    blended 50/30/20 into ``attendance_score``.

    Args:
        train_data: Frame with ``date``, ``wait_time`` and ``timestamp``
            columns. Intended to be the training split only.

    Returns:
        One row per date with the daily statistics, the three individual
        percentile scores and the combined ``attendance_score``.
    """
    per_day = (
        train_data.groupby('date')
        .agg(
            daily_avg_wait=('wait_time', 'mean'),
            daily_max_wait=('wait_time', 'max'),
            daily_std_wait=('wait_time', 'std'),
            daily_median_wait=('wait_time', 'median'),
            hourly_samples=('timestamp', 'count'),
        )
        .reset_index()
    )

    # Percentile rank of each daily statistic, scaled to 0-100.
    rank_sources = (
        ('attendance_score_avg', 'daily_avg_wait'),
        ('attendance_score_max', 'daily_max_wait'),
        ('attendance_score_median', 'daily_median_wait'),
    )
    for score_column, stat_column in rank_sources:
        per_day[score_column] = per_day[stat_column].rank(pct=True) * 100

    # Weighted blend of the three percentile scores.
    per_day['attendance_score'] = (
        per_day['attendance_score_avg'] * 0.5 +
        per_day['attendance_score_max'] * 0.3 +
        per_day['attendance_score_median'] * 0.2
    )

    return per_day

In [None]:
def train_attendance_ensemble(train_data):
    """Fit a voting ensemble that maps calendar features to the daily attendance score.

    The target is the ``attendance_score`` produced by
    ``calculate_daily_attendance``; the inputs are weekday, month,
    day-of-month, a weekend flag and sine/cosine encodings of weekday and
    month.

    Args:
        train_data: Training rows, passed to ``calculate_daily_attendance``.

    Returns:
        Tuple ``(ensemble, features)``: the fitted ``VotingRegressor`` and
        the list of feature column names it was trained on, in order.
    """
    daily = calculate_daily_attendance(train_data)

    # Parse the dates once and derive every calendar feature from that.
    day_stamp = pd.to_datetime(daily['date'])
    weekday = day_stamp.dt.dayofweek
    month = day_stamp.dt.month
    daily = daily.assign(
        day_of_week=weekday,
        month=month,
        day_of_month=day_stamp.dt.day,
        is_weekend=(weekday >= 5).astype(int),
        dow_sin=np.sin(2 * np.pi * weekday / 7),
        dow_cos=np.cos(2 * np.pi * weekday / 7),
        month_sin=np.sin(2 * np.pi * month / 12),
        month_cos=np.cos(2 * np.pi * month / 12),
    )

    features = ['day_of_week', 'month', 'day_of_month', 'is_weekend',
                'dow_sin', 'dow_cos', 'month_sin', 'month_cos']
    inputs = daily[features]
    scores = daily['attendance_score']

    # One linear and two tree-based regressors, averaged by VotingRegressor.
    ensemble = VotingRegressor(estimators=[
        ('ridge', Ridge(alpha=1.0)),
        ('rf', RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)),
        ('gb', GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42)),
    ])
    ensemble.fit(inputs, scores)

    return ensemble, features

In [None]:
def predict_daily_attendance(dates, attendance_model, features):
    """Score a collection of dates with a fitted attendance model.

    The same calendar features used at training time are rebuilt from the
    dates, the columns named in ``features`` are passed to
    ``attendance_model.predict``, and the result is clipped to 0..100.

    Args:
        dates: Iterable of dates (anything ``pd.to_datetime`` accepts).
        attendance_model: Object exposing ``predict(X)``.
        features: Ordered list of feature column names the model expects.

    Returns:
        DataFrame with two columns, ``date`` and ``attendance_score``.
    """
    date_df = pd.DataFrame({'date': dates})

    # Parse once; every feature below is derived from this series.
    parsed = pd.to_datetime(date_df['date'])
    weekday = parsed.dt.dayofweek
    month = parsed.dt.month

    date_df = date_df.assign(
        day_of_week=weekday,
        month=month,
        day_of_month=parsed.dt.day,
        is_weekend=(weekday >= 5).astype(int),
        dow_sin=np.sin(2 * np.pi * weekday / 7),
        dow_cos=np.cos(2 * np.pi * weekday / 7),
        month_sin=np.sin(2 * np.pi * month / 12),
        month_cos=np.cos(2 * np.pi * month / 12),
    )

    date_df['attendance_score'] = attendance_model.predict(date_df[features])

    # The score is a percentile-style quantity, so keep it inside 0..100.
    clipped = date_df['attendance_score'].clip(0, 100)
    return date_df.assign(attendance_score=clipped)[['date', 'attendance_score']]

In [None]:
# ============== VISUALIZATION FUNCTIONS ==============

def visualize_results(y_test, y_pred, test_data, title_suffix=""):
    """Plot diagnostic charts for a set of predictions and return the per-row results.

    Four figures are shown: actual-vs-predicted scatter plus a time series,
    mean absolute error by actual wait-time range, by hour of day and by day
    of week.

    Args:
        y_test: Actual wait times, one per row of ``test_data``, same order.
        y_pred: Predicted wait times in the same order; negative values are
            clipped to 0 before plotting.
        test_data: Frame with a ``timestamp`` column. ``hour`` and
            ``day_of_week`` columns are used when present and derived from
            ``timestamp`` otherwise. The frame itself is not modified.
        title_suffix: Text appended to every plot title.

    Returns:
        DataFrame sorted by ``Timestamp`` with columns ``Actual``,
        ``Predicted``, ``Error`` (predicted minus actual), ``Timestamp``,
        ``wait_bin``, ``hour`` and ``day``.
    """
    # Ensure predictions are non-negative
    y_pred = np.maximum(y_pred, 0)

    # Grouping keys are read into locals instead of being written back onto
    # the caller's frame.
    if 'hour' in test_data.columns:
        hours = test_data['hour'].values
    else:
        hours = test_data['timestamp'].dt.hour.values
    if 'day_of_week' in test_data.columns:
        weekdays = test_data['day_of_week'].values
    else:
        weekdays = test_data['timestamp'].dt.dayofweek.values

    results_df = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred,
        'Error': y_pred - y_test,
        'Timestamp': test_data['timestamp'].values
    })

    bins = [0, 10, 20, 30, 40, 50, 60, np.inf]
    labels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60+']
    results_df['wait_bin'] = pd.cut(results_df['Actual'], bins=bins, labels=labels)

    # These arrays are in test_data row order, so they must be attached
    # BEFORE sorting; attaching them after the sort would pair each error
    # with the hour/day of a different row.
    results_df['hour'] = hours
    results_df['day'] = weekdays
    results_df = results_df.sort_values('Timestamp')

    def _mean_abs(errors):
        return np.abs(errors).mean()

    # ---- Figure 1: scatter plot and time series ----
    fig, axes = plt.subplots(2, 1, figsize=(14, 12))

    axes[0].scatter(results_df['Actual'], results_df['Predicted'], alpha=0.5)
    max_val = max(results_df['Actual'].max(), results_df['Predicted'].max())
    axes[0].plot([0, max_val], [0, max_val], 'r--')  # perfect-prediction diagonal
    axes[0].set_xlabel('Actual Wait Time (minutes)')
    axes[0].set_ylabel('Predicted Wait Time (minutes)')
    axes[0].set_title(f'Actual vs Predicted Wait Times - {title_suffix}')
    axes[0].grid(True, linestyle=':')

    mae = mean_absolute_error(results_df['Actual'], results_df['Predicted'])
    rmse = np.sqrt(mean_squared_error(results_df['Actual'], results_df['Predicted']))
    r2 = r2_score(results_df['Actual'], results_df['Predicted'])

    metrics_text = f"MAE: {mae:.2f}\nRMSE: {rmse:.2f}\nR²: {r2:.4f}"
    axes[0].text(0.05, 0.95, metrics_text, transform=axes[0].transAxes,
                 verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.5))

    axes[1].plot(results_df['Timestamp'], results_df['Actual'], label='Actual', alpha=0.7)
    axes[1].plot(results_df['Timestamp'], results_df['Predicted'], label='Predicted', alpha=0.7)
    axes[1].set_xlabel('Date')
    axes[1].set_ylabel('Wait Time (minutes)')
    axes[1].set_title(f'Actual vs Predicted Wait Times Over Time - {title_suffix}')
    axes[1].grid(True, linestyle=':')
    axes[1].legend()

    plt.tight_layout()
    plt.show()

    # ---- Figure 2: error by actual wait-time range ----
    # observed=False keeps empty ranges on the axis so the chart always
    # shows the full set of bins.
    by_bin = results_df.groupby('wait_bin', observed=False)
    bin_metrics = by_bin.agg({
        'Error': ['mean', 'std'],
        'Actual': 'count'
    })
    bin_metrics.columns = ['Mean Error', 'Std Error', 'Count']
    bin_metrics['Abs Mean Error'] = by_bin['Error'].apply(_mean_abs)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(list(bin_metrics.index.astype(str)), bin_metrics['Abs Mean Error'], alpha=0.7)
    ax.set_xlabel('Actual Wait Time Range (minutes)')
    ax.set_ylabel('Mean Absolute Error (minutes)')
    ax.set_title(f'Error by Wait Time Range - {title_suffix}')

    for position, (count, height) in enumerate(
            zip(bin_metrics['Count'], bin_metrics['Abs Mean Error'])):
        if count == 0:
            continue  # empty range: there is no bar to label
        ax.text(position, height + 0.5, f"n={count}",
                ha='center', va='bottom', fontsize=8)

    ax.grid(True, linestyle=':')
    plt.tight_layout()
    plt.show()

    # ---- Figure 3: error by hour of day ----
    hourly_abs_errors = results_df.groupby('hour')['Error'].apply(_mean_abs)

    fig, ax = plt.subplots(figsize=(12, 6))
    hourly_abs_errors.plot(kind='bar', ax=ax)
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel('Mean Absolute Error (minutes)')
    ax.set_title(f'Mean Absolute Error by Hour of Day - {title_suffix}')
    ax.grid(True, linestyle=':')
    plt.tight_layout()
    plt.show()

    # ---- Figure 4: error by day of week ----
    daily_abs_errors = results_df.groupby('day')['Error'].apply(_mean_abs)

    # Label each bar from its own weekday number; labelling by position
    # would be wrong whenever some weekday has no rows.
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_labels = [days[int(day_number)] for day_number in daily_abs_errors.index]

    fig, ax = plt.subplots(figsize=(12, 6))
    daily_abs_errors.plot(kind='bar', ax=ax)
    ax.set_xlabel('Day of Week')
    ax.set_ylabel('Mean Absolute Error (minutes)')
    ax.set_title(f'Mean Absolute Error by Day of Week - {title_suffix}')
    ax.set_xticks(range(len(day_labels)))
    ax.set_xticklabels(day_labels, rotation=0)
    ax.grid(True, linestyle=':')
    plt.tight_layout()
    plt.show()

    return results_df


In [None]:
def visualize_future_predictions(predictions_df, ride_name):
    """Show predicted wait times as a date-by-hour heatmap.

    Args:
        predictions_df: Frame with ``date``, ``hour`` and ``predicted_wait``
            columns; values sharing a date and hour are averaged by the
            pivot.
        ride_name: Ride name shown (title-cased) in the plot title.
    """
    # Rows are dates, columns are hours, cells are the predicted wait.
    wait_by_date_and_hour = predictions_df.pivot_table(
        values='predicted_wait',
        index='date',
        columns='hour',
    )

    heatmap_style = dict(annot=True, fmt='.1f', cmap='YlOrRd', linewidths=.5)

    plt.figure(figsize=(12, 8))
    sns.heatmap(wait_by_date_and_hour, **heatmap_style)

    plt.title(f'Predicted Wait Times for {ride_name.title()}')
    plt.xlabel('Hour of Day')
    plt.ylabel('Date')
    plt.tight_layout()
    plt.show()

In [None]:
# ============== WAIT TIME MODEL ENSEMBLE ==============

def prepare_model_features(df, attendance_predictions=None):
    """Assemble the feature columns used by the wait-time models.

    Optionally merges per-day attendance predictions, adds two
    attendance interaction columns, decides which categorical and numerical
    columns are available, and fills missing values (median for numerical
    columns, most frequent value for categorical ones).

    Args:
        df: Frame that already contains the time features (and weather
            features, if weather columns are present). Not modified.
        attendance_predictions: Optional frame with ``date`` and
            ``attendance_score`` columns, merged onto ``df`` by ``date``.

    Returns:
        Tuple ``(df, categorical_features, numerical_features)``: the
        prepared copy and the two lists of feature column names.
    """
    df = df.copy()

    # Merge attendance predictions if provided
    if attendance_predictions is not None:
        # A frame that already went through this function carries an
        # attendance_score column; merging on top of it would yield
        # attendance_score_x / attendance_score_y instead of the column
        # read below, so drop the stale one first.
        if 'attendance_score' in df.columns:
            df = df.drop(columns=['attendance_score'])
        df = df.merge(attendance_predictions, on='date', how='left')
        # Dates without a prediction get 50, the midpoint of the 0-100 score.
        df['attendance_score'] = df['attendance_score'].fillna(50)

    # Define features
    categorical_features = ['time_of_day', 'hour_bin']
    numerical_features = [
        'hour', 'is_weekend', 'hour_sin', 'hour_cos',
        'month_sin', 'month_cos', 'dow_sin', 'dow_cos',
        'day_of_month', 'week_of_year'
    ]

    # Add attendance if available
    if 'attendance_score' in df.columns:
        numerical_features.append('attendance_score')
        # Create interaction features
        df['attendance_hour'] = df['attendance_score'] * df['hour'] / 100
        df['attendance_weekend'] = df['attendance_score'] * df['is_weekend']
        numerical_features.extend(['attendance_hour', 'attendance_weekend'])

    # Add weather features if available
    if 'temperature' in df.columns:
        numerical_features.extend(['temperature', 'temp_squared', 'temp_comfortable', 'temp_deviation'])
    if 'has_rain' in df.columns:
        numerical_features.append('has_rain')
        categorical_features.append('rain_intensity')
    if 'wind' in df.columns:
        numerical_features.extend(['wind', 'wind_squared', 'high_wind'])

    # Handle missing values
    for col in numerical_features:
        if col in df.columns and df[col].isna().any():
            print(f"   Filling {df[col].isna().sum()} NaN values in '{col}'")
            df[col] = df[col].fillna(df[col].median())

    for col in categorical_features:
        if col in df.columns and df[col].isna().any():
            print(f"   Filling {df[col].isna().sum()} NaN values in '{col}'")
            modes = df[col].mode()
            fill_value = modes.iloc[0] if len(modes) > 0 else 'unknown'
            # A categorical column only accepts declared categories, so the
            # 'unknown' fallback has to be registered before it can be used.
            if (isinstance(df[col].dtype, pd.CategoricalDtype)
                    and fill_value not in df[col].cat.categories):
                df[col] = df[col].cat.add_categories([fill_value])
            df[col] = df[col].fillna(fill_value)

    return df, categorical_features, numerical_features

In [None]:
def build_ensemble_models(categorical_features, numerical_features):
    """Build the unfitted wait-time pipelines.

    Each pipeline one-hot encodes the categorical columns, standardises the
    numerical columns and feeds the result to one regressor.

    Args:
        categorical_features: Names of the categorical input columns.
        numerical_features: Names of the numerical input columns.

    Returns:
        List of ``(name, pipeline)`` tuples in the fixed order
        ``ridge``, ``elastic``, ``rf``, ``gb``.
    """
    def make_preprocessor():
        # Every pipeline gets its own transformer. Pipeline.fit fits its
        # steps in place, so a single shared instance would be refit by each
        # pipeline and earlier pipelines would end up using whatever the
        # last fit produced.
        return ColumnTransformer(
            transformers=[
                # handle_unknown='ignore': a category that appears only at
                # prediction time is encoded as all zeros instead of
                # raising.
                ('cat',
                 OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
                 list(categorical_features)),
                ('num', StandardScaler(), list(numerical_features))
            ]
        )

    regressors = [
        # Linear models
        ('ridge', Ridge(alpha=1.0)),
        ('elastic', ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)),
        # Tree-based models
        ('rf', RandomForestRegressor(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            random_state=42
        )),
        ('gb', GradientBoostingRegressor(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            random_state=42
        )),
    ]

    return [
        (name, Pipeline([
            ('preprocessor', make_preprocessor()),
            ('regressor', regressor)
        ]))
        for name, regressor in regressors
    ]


In [None]:
# ============== MAIN PIPELINE ==============

def _regression_metrics(y_true, y_pred):
    """Return MAE, RMSE and R² for one set of predictions as a dict."""
    return {
        'mae': mean_absolute_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'r2': r2_score(y_true, y_pred),
    }


def train_ensemble_with_attendance(file_path, ride_name, test_year=2023):
    """Main pipeline for ensemble model with attendance prediction.

    Loads and filters the ride data, builds features, splits by year
    (years before ``test_year`` train, ``test_year`` tests), fits the
    attendance model on the training split, fits the individual wait-time
    pipelines, combines them into a simple and an R²-weighted average,
    keeps whichever has the lower test MAE, and plots the results.

    Note that the choice between the two ensembles is made on the test
    split, so the reported ensemble metrics are optimistic to that extent.

    Args:
        file_path: Parquet file passed to ``load_and_preprocess_data``.
        ride_name: Ride to model.
        test_year: Calendar year held out for evaluation.

    Returns:
        Dict with keys ``models`` (name -> fitted pipeline),
        ``attendance_model``, ``attendance_features``, ``cat_features``,
        ``num_features``, ``ensemble_weights`` (array, or ``None`` when the
        simple average was selected) and ``metrics`` (``individual`` per
        model and ``ensemble`` for the selected ensemble, including its
        ``type``).

    Raises:
        ValueError: If either the training or the test split is empty.
    """
    print(f"=== Training Ensemble Model with Attendance for {ride_name} ===")

    # Step 1: Load and preprocess data
    print("\n1. Loading and preprocessing data...")
    ride_data = load_and_preprocess_data(file_path, ride_name)
    ride_data = filter_operating_data(ride_data)
    print(f"   Total data shape: {ride_data.shape}")

    # Step 2: Create features
    print("\n2. Creating features...")
    ride_data = create_time_features(ride_data)
    ride_data = create_weather_features(ride_data)

    # Step 3: Train/test split (BEFORE any model training)
    print(f"\n3. Splitting data (test year: {test_year})...")
    years = ride_data['timestamp'].dt.year
    train_data = ride_data[years < test_year].copy()
    test_data = ride_data[years == test_year].copy()
    print(f"   Train data: {len(train_data)} samples")
    print(f"   Test data: {len(test_data)} samples")

    if train_data.empty or test_data.empty:
        raise ValueError(
            f"Cannot train/evaluate '{ride_name}' with test_year={test_year}: "
            f"{len(train_data)} training rows and {len(test_data)} test rows."
        )

    # Step 4: Train attendance ensemble (on training data only)
    print("\n4. Training attendance ensemble...")
    attendance_model, attendance_features = train_attendance_ensemble(train_data)

    # Step 5: Predict attendance for train and test sets
    print("\n5. Predicting attendance...")
    train_attendance = predict_daily_attendance(
        train_data['date'].unique(), attendance_model, attendance_features)
    test_attendance = predict_daily_attendance(
        test_data['date'].unique(), attendance_model, attendance_features)

    # Step 6: Prepare features for wait time model
    print("\n6. Preparing features for wait time model...")
    train_data, cat_features, num_features = prepare_model_features(train_data, train_attendance)
    test_data, _, _ = prepare_model_features(test_data, test_attendance)

    # Step 7: Train individual models
    print("\n7. Training individual models...")
    models = build_ensemble_models(cat_features, num_features)

    X_train = train_data[cat_features + num_features]
    y_train = train_data['wait_time']
    X_test = test_data[cat_features + num_features]
    y_test = test_data['wait_time']

    individual_predictions = {}
    individual_metrics = {}

    for name, model in models:
        print(f"   Training {name}...")
        model.fit(X_train, y_train)

        y_pred = np.maximum(model.predict(X_test), 0)  # Ensure non-negative
        individual_predictions[name] = y_pred

        scores = _regression_metrics(y_test, y_pred)
        individual_metrics[name] = scores
        print(f"      MAE: {scores['mae']:.2f}, RMSE: {scores['rmse']:.2f}, R²: {scores['r2']:.4f}")

    # Step 8: Create ensemble predictions
    print("\n8. Creating ensemble predictions...")
    member_names = list(individual_predictions.keys())
    stacked = np.vstack([individual_predictions[name] for name in member_names])

    # Simple average ensemble
    ensemble_pred = stacked.mean(axis=0)

    # Weighted ensemble (based on individual R² scores). R² can be negative,
    # which would give negative weights or a zero/negative normaliser, so
    # negative scores count as zero and an all-zero (or non-finite) total
    # falls back to equal weights.
    weights = np.clip(
        np.array([individual_metrics[name]['r2'] for name in member_names], dtype=float),
        0, None)
    total = weights.sum()
    if total > 0:
        weights = weights / total
    else:
        weights = np.full(len(member_names), 1.0 / len(member_names))

    weighted_ensemble_pred = np.average(stacked, axis=0, weights=weights)

    # Evaluate both ensembles
    simple_metrics = _regression_metrics(y_test, ensemble_pred)
    weighted_metrics = _regression_metrics(y_test, weighted_ensemble_pred)

    print(f"\nEnsemble Results:")
    print(f"   Simple Average - MAE: {simple_metrics['mae']:.2f}, "
          f"RMSE: {simple_metrics['rmse']:.2f}, R²: {simple_metrics['r2']:.4f}")
    print(f"   Weighted Average - MAE: {weighted_metrics['mae']:.2f}, "
          f"RMSE: {weighted_metrics['rmse']:.2f}, R²: {weighted_metrics['r2']:.4f}")

    # Use the better ensemble
    if weighted_metrics['mae'] < simple_metrics['mae']:
        final_pred, final_metrics, final_type = weighted_ensemble_pred, weighted_metrics, "Weighted"
    else:
        final_pred, final_metrics, final_type = ensemble_pred, simple_metrics, "Simple"

    # Evaluate by wait time range
    print(f"\nMAE by Wait Time Range ({final_type} Ensemble):")
    actual_values = y_test.to_numpy()
    for min_wait, max_wait in [(0, 10), (10, 30), (30, 60), (60, np.inf)]:
        mask = (actual_values >= min_wait) & (actual_values < max_wait)
        if mask.sum() > 0:
            range_mae = mean_absolute_error(actual_values[mask], final_pred[mask])
            print(f"   {min_wait}-{max_wait if max_wait != np.inf else '∞'} min: {range_mae:.2f} (n={mask.sum()})")

    # Plot results
    print("\n9. Visualizing results...")

    # Visualize final ensemble predictions
    visualize_results(y_test, final_pred, test_data,
                      title_suffix=f"{final_type} Ensemble - {ride_name}")

    # Additional plots for model comparison
    plt.figure(figsize=(15, 5))
    model_names = member_names + [f'{final_type} Ensemble']

    # MAE comparison. The last bar is the selected ensemble's own score, not
    # a value left over from the training loop.
    plt.subplot(1, 3, 1)
    mae_values = [individual_metrics[name]['mae'] for name in member_names] + [final_metrics['mae']]
    bars = plt.bar(model_names, mae_values)
    bars[int(np.argmin(mae_values))].set_color('green')  # lowest MAE is best
    plt.xlabel('Model')
    plt.ylabel('MAE')
    plt.title('Model Comparison - MAE')
    plt.xticks(rotation=45)

    # R² comparison
    plt.subplot(1, 3, 2)
    r2_values = [individual_metrics[name]['r2'] for name in member_names] + [final_metrics['r2']]
    bars = plt.bar(model_names, r2_values)
    bars[int(np.argmax(r2_values))].set_color('green')  # highest R² is best
    plt.xlabel('Model')
    plt.ylabel('R²')
    plt.title('Model Comparison - R²')
    plt.xticks(rotation=45)

    # Feature importance from Random Forest (looked up by name rather than
    # by list position)
    plt.subplot(1, 3, 3)
    rf_model = dict(models)['rf']
    preprocessor = rf_model.named_steps['preprocessor']
    regressor = rf_model.named_steps['regressor']

    # Get feature names after preprocessing
    feature_names = []
    for step_name, _transformer, step_columns in preprocessor.transformers_:
        if step_name == 'remainder':
            continue  # not one of the two declared transformers
        if step_name == 'cat':
            # Get one-hot encoded names
            encoded_names = preprocessor.named_transformers_['cat'].get_feature_names_out(step_columns)
            feature_names.extend(encoded_names)
        else:
            feature_names.extend(step_columns)

    importances = regressor.feature_importances_
    top_indices = np.argsort(importances)[::-1][:10]  # Top 10 (fewer if the model has fewer features)
    positions = range(len(top_indices))

    plt.barh(positions, importances[top_indices])
    plt.yticks(positions, [feature_names[i] for i in top_indices])
    plt.xlabel('Importance')
    plt.title('Top 10 Feature Importances (RF)')

    plt.tight_layout()
    plt.show()

    return {
        'models': dict(models),
        'attendance_model': attendance_model,
        'attendance_features': attendance_features,
        'cat_features': cat_features,
        'num_features': num_features,
        'ensemble_weights': weights if final_type == "Weighted" else None,
        'metrics': {
            'individual': individual_metrics,
            'ensemble': {
                'mae': final_metrics['mae'],
                'rmse': final_metrics['rmse'],
                'r2': final_metrics['r2'],
                'type': final_type
            }
        }
    }

In [None]:
# ============== PREDICTION FUNCTION ==============

def predict_future_wait_times(models_dict, future_dates, future_hours):
    """Predict wait times for every combination of the given dates and hours.

    Weather is not forecast here: temperature, rain and wind are set to the
    fixed placeholders 20, 0 and 5 before the weather features are derived.

    Args:
        models_dict: Result dict of ``train_ensemble_with_attendance``
            (uses ``attendance_model``, ``attendance_features``, ``models``,
            ``cat_features``, ``num_features`` and ``ensemble_weights``).
        future_dates: Iterable of dates to predict for.
        future_hours: Iterable of whole hours to predict for on each date.

    Returns:
        DataFrame with columns ``timestamp``, ``date``, ``hour`` and
        ``predicted_wait``, one row per date/hour pair.
    """
    attendance_predictions = predict_daily_attendance(
        future_dates,
        models_dict['attendance_model'],
        models_dict['attendance_features']
    )

    # One row per (date, hour) pair, dates in the outer loop.
    grid_rows = [
        {
            'timestamp': pd.Timestamp.combine(day, pd.Timestamp(f"{hour}:00:00").time()),
            'date': day,
            'hour': hour
        }
        for day in future_dates
        for hour in future_hours
    ]
    future_df = create_time_features(pd.DataFrame(grid_rows))

    # Add default weather (can be modified if forecast available)
    future_df = future_df.assign(temperature=20, rain=0, wind=5)

    future_df = create_weather_features(future_df)
    future_df, _, _ = prepare_model_features(future_df, attendance_predictions)

    # Use exactly the columns the models were trained on, in training order.
    model_columns = models_dict['cat_features'] + models_dict['num_features']
    X_future = future_df[model_columns]

    # Per-model predictions, floored at zero.
    predictions = {
        name: np.maximum(model.predict(X_future), 0)
        for name, model in models_dict['models'].items()
    }

    # Combine: weighted average when weights were stored, plain mean otherwise.
    member_predictions = list(predictions.values())
    stored_weights = models_dict['ensemble_weights']
    if stored_weights is None:
        ensemble_pred = np.mean(member_predictions, axis=0)
    else:
        ensemble_pred = np.average(member_predictions, axis=0, weights=stored_weights)

    future_df['predicted_wait'] = ensemble_pred

    return future_df[['timestamp', 'date', 'hour', 'predicted_wait']]


In [None]:
# ============== EXAMPLE USAGE ==============

# End-to-end demo: train the ensemble for one ride, then predict and plot a
# week of future wait times. The names assigned here are top-level variables
# and stay available after this cell has run.
if __name__ == "__main__":
    # Parameters
    # Path is relative to the current working directory.
    data_path = "../data/processed/ep/merged_with_holidays.parquet"
    ride_name = "silver star"
    test_year = 2023
    
    # Train model (prints progress and shows the diagnostic plots)
    results = train_ensemble_with_attendance(data_path, ride_name, test_year)
    
    print("\nEnsemble model training completed!")
    
    # Demo: Predict future wait times
    print("\n=== PREDICTING FUTURE WAIT TIMES ===")
    
    # Define future dates (7 days): 15 May 2025 through 21 May 2025.
    # The day number is built as 15 + i, so this only works while the result
    # stays a valid day of May.
    future_dates = [date(2025, 5, 15 + i) for i in range(7)]
    future_hours = list(range(10, 20))  # 10am to 7pm
    
    print("Future dates for prediction:")
    for d in future_dates:
        print(f"  - {d}")
    
    # Make predictions: one row per (date, hour) pair
    future_predictions = predict_future_wait_times(results, future_dates, future_hours)
    
    print(f"\nGenerated {len(future_predictions)} predictions")
    print(future_predictions.head(10))
    
    # Visualize future predictions as a date-by-hour heatmap
    visualize_future_predictions(future_predictions, ride_name)