In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ============== DATA LOADING AND PREPROCESSING ==============

def load_and_preprocess_data(file_path, ride_name):
    """Load and preprocess ride data."""
    # Load data
    full_data = pd.read_parquet(file_path)
    ride_data = full_data[full_data["ride_name"] == ride_name].copy()
    
    # Convert closed column to int
    if "closed" in ride_data.columns:
        ride_data["closed"] = ride_data["closed"].astype(int)
    
    # Remove duplicates
    ride_data = ride_data.drop_duplicates(subset=['timestamp'])
    
    # Remove NaN wait times
    ride_data = ride_data.dropna(subset=['wait_time'])
    
    # Resample to 30min intervals
    ride_data = ride_data.set_index("timestamp").resample("30min").ffill().reset_index()
    
    # Filter to months after March
    ride_data = ride_data[ride_data["timestamp"].dt.month > 3]
    
    # Add date column
    ride_data['date'] = ride_data['timestamp'].dt.date
    
    return ride_data

In [3]:
def filter_operating_data(ride_data):
    """Remove closed days and filter to operating hours."""
    # Remove fully closed days
    daily_closure = ride_data.groupby('date')["closed"].mean()
    fully_closed_days = daily_closure[daily_closure == 1].index
    ride_data = ride_data[~ride_data['date'].isin(fully_closed_days)]
    
    # Remove zero wait days
    daily_wait = ride_data.groupby('date')["wait_time"].mean()
    zero_wait_days = daily_wait[daily_wait < 1].index
    ride_data = ride_data[~ride_data['date'].isin(zero_wait_days)]
    
    # Filter to operating hours (when wait_time > 0)
    ride_data = ride_data[ride_data['wait_time'] > 0]
    
    return ride_data

In [4]:
# ============== FEATURE ENGINEERING ==============

def create_time_features(df):
    """Create time-based features."""
    df = df.copy()
    
    # Basic time features
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
    
    # Cyclical encoding
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    # Time of day bins
    df['time_of_day'] = pd.cut(df['hour'], 
                               bins=[0, 12, 17, 24], 
                               labels=['morning', 'afternoon', 'evening'])
    
    return df

In [5]:
def create_weather_features(df):
    """Create weather features if available."""
    if 'temperature' in df.columns:
        df['temp_squared'] = df['temperature'] ** 2
        df['temp_comfortable'] = ((df['temperature'] >= 15) & (df['temperature'] <= 25)).astype(int)
    
    if 'rain' in df.columns:
        df['has_rain'] = (df['rain'] > 0).astype(int)
    
    return df


In [6]:
# ============== ATTENDANCE MODEL ==============

def calculate_daily_attendance(train_data):
    """Calculate daily attendance metrics from training data only."""
    daily_metrics = train_data.groupby('date').agg({
        'wait_time': ['mean', 'max', 'std']
    }).reset_index()
    
    daily_metrics.columns = ['date', 'daily_avg_wait', 'daily_max_wait', 'daily_std_wait']
    
    # Create attendance score (percentile-based)
    daily_metrics['attendance_score'] = daily_metrics['daily_avg_wait'].rank(pct=True) * 100
    
    return daily_metrics

In [7]:
def train_attendance_model(train_data):
    """Train model to predict daily attendance."""
    # Calculate daily attendance for training data
    daily_attendance = calculate_daily_attendance(train_data)
    
    # Add calendar features
    daily_attendance['day_of_week'] = pd.to_datetime(daily_attendance['date']).dt.dayofweek
    daily_attendance['month'] = pd.to_datetime(daily_attendance['date']).dt.month
    daily_attendance['is_weekend'] = (daily_attendance['day_of_week'] >= 5).astype(int)
    
    # Features and target
    features = ['day_of_week', 'month', 'is_weekend']
    target = 'attendance_score'
    
    X = daily_attendance[features]
    y = daily_attendance[target]
    
    # Simple Ridge regression
    model = Ridge(alpha=1.0)
    model.fit(X, y)
    
    return model, features

In [8]:
def predict_daily_attendance(dates, attendance_model, features):
    """Predict attendance for given dates."""
    # Create feature dataframe
    date_df = pd.DataFrame({'date': dates})
    date_df['day_of_week'] = pd.to_datetime(date_df['date']).dt.dayofweek
    date_df['month'] = pd.to_datetime(date_df['date']).dt.month
    date_df['is_weekend'] = (date_df['day_of_week'] >= 5).astype(int)
    
    # Predict
    date_df['attendance_score'] = attendance_model.predict(date_df[features])
    
    # Clip to valid range
    date_df['attendance_score'] = date_df['attendance_score'].clip(0, 100)
    
    return date_df[['date', 'attendance_score']]


In [9]:
# ============== WAIT TIME MODEL ENSEMBLE ==============

def prepare_model_features(df, attendance_predictions=None):
    """Prepare features for wait time model."""
    df = df.copy()
    
    # Merge attendance predictions if provided
    if attendance_predictions is not None:
        df = df.merge(attendance_predictions, on='date', how='left')
        # Fill missing attendance with median
        df['attendance_score'] = df['attendance_score'].fillna(50)
    
    # Define features
    categorical_features = ['time_of_day', 'hour_bin']
    numerical_features = [
        'hour', 'is_weekend', 'hour_sin', 'hour_cos', 
        'month_sin', 'month_cos', 'dow_sin', 'dow_cos',
        'day_of_month', 'week_of_year'
    ]
    
    # Add attendance if available
    if 'attendance_score' in df.columns:
        numerical_features.append('attendance_score')
        # Create interaction features
        df['attendance_hour'] = df['attendance_score'] * df['hour'] / 100
        df['attendance_weekend'] = df['attendance_score'] * df['is_weekend']
        numerical_features.extend(['attendance_hour', 'attendance_weekend'])
    
    # Add weather features if available
    if 'temperature' in df.columns:
        numerical_features.extend(['temperature', 'temp_squared', 'temp_comfortable', 'temp_deviation'])
    if 'has_rain' in df.columns:
        numerical_features.append('has_rain')
        categorical_features.append('rain_intensity')
    if 'wind' in df.columns:
        numerical_features.extend(['wind', 'wind_squared', 'high_wind'])
    
    # Handle missing values
    for col in numerical_features:
        if col in df.columns and df[col].isna().any():
            print(f"   Filling {df[col].isna().sum()} NaN values in '{col}'")
            df[col] = df[col].fillna(df[col].median())
    
    for col in categorical_features:
        if col in df.columns and df[col].isna().any():
            print(f"   Filling {df[col].isna().sum()} NaN values in '{col}'")
            df[col] = df[col].fillna(df[col].mode()[0] if len(df[col].mode()) > 0 else 'unknown')
    
    return df, categorical_features, numerical_features


def build_wait_time_model(categorical_features, numerical_features):
    """Build pipeline for wait time prediction."""
    # Preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features),
            ('num', StandardScaler(), numerical_features)
        ]
    )
    
    # Pipeline
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', Ridge(alpha=1.0))
    ])
    
    return model

In [10]:
# ============== MAIN PIPELINE ==============

def train_linear_model_with_attendance(file_path, ride_name, test_year=2023):
    """Main pipeline for linear model with attendance prediction."""
    print(f"=== Training Linear Model with Attendance for {ride_name} ===")
    
    # Step 1: Load and preprocess data
    print("\n1. Loading and preprocessing data...")
    ride_data = load_and_preprocess_data(file_path, ride_name)
    ride_data = filter_operating_data(ride_data)
    print(f"   Total data shape: {ride_data.shape}")
    
    # Step 2: Create features
    print("\n2. Creating features...")
    ride_data = create_time_features(ride_data)
    ride_data = create_weather_features(ride_data)
    
    # Step 3: Train/test split (BEFORE any model training)
    print(f"\n3. Splitting data (test year: {test_year})...")
    train_mask = ride_data['timestamp'].dt.year < test_year
    test_mask = ride_data['timestamp'].dt.year == test_year
    
    train_data = ride_data[train_mask].copy()
    test_data = ride_data[test_mask].copy()
    print(f"   Train data: {len(train_data)} samples")
    print(f"   Test data: {len(test_data)} samples")
    
    # Step 4: Train attendance model (on training data only)
    print("\n4. Training attendance model...")
    attendance_model, attendance_features = train_attendance_model(train_data)
    
    # Step 5: Predict attendance for train and test sets
    print("\n5. Predicting attendance...")
    train_dates = train_data['date'].unique()
    test_dates = test_data['date'].unique()
    
    train_attendance = predict_daily_attendance(train_dates, attendance_model, attendance_features)
    test_attendance = predict_daily_attendance(test_dates, attendance_model, attendance_features)
    
    # Step 6: Prepare features for wait time model
    print("\n6. Preparing features for wait time model...")
    train_data, cat_features, num_features = prepare_model_features(train_data, train_attendance)
    test_data, _, _ = prepare_model_features(test_data, test_attendance)
    
    # Step 7: Train wait time model
    print("\n7. Training wait time model...")
    wait_model = build_wait_time_model(cat_features, num_features)
    
    X_train = train_data[cat_features + num_features]
    y_train = train_data['wait_time']
    
    wait_model.fit(X_train, y_train)
    
    # Step 8: Evaluate on test set
    print("\n8. Evaluating model...")
    X_test = test_data[cat_features + num_features]
    y_test = test_data['wait_time']
    
    y_pred = wait_model.predict(X_test)
    y_pred = np.maximum(y_pred, 0)  # Ensure non-negative
    
    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    print(f"\nTest Set Metrics:")
    print(f"   MAE: {mae:.2f} minutes")
    print(f"   RMSE: {rmse:.2f} minutes")
    print(f"   R²: {r2:.4f}")
    
    # Evaluate by wait time range
    print(f"\nMAE by Wait Time Range:")
    for min_wait, max_wait in [(0, 10), (10, 30), (30, 60), (60, np.inf)]:
        mask = (y_test >= min_wait) & (y_test < max_wait)
        if mask.sum() > 0:
            range_mae = mean_absolute_error(y_test[mask], y_pred[mask])
            print(f"   {min_wait}-{max_wait if max_wait != np.inf else '∞'} min: {range_mae:.2f} (n={mask.sum()})")
    
    # Plot results
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([0, max(y_test)], [0, max(y_test)], 'r--')
    plt.xlabel('Actual Wait Time')
    plt.ylabel('Predicted Wait Time')
    plt.title(f'Linear Model with Attendance - {ride_name}')
    plt.text(0.05, 0.95, f'MAE: {mae:.1f}\nRMSE: {rmse:.1f}\nR²: {r2:.3f}', 
             transform=plt.gca().transAxes, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    plt.subplot(1, 2, 2)
    residuals = y_pred - y_test
    plt.scatter(y_test, residuals, alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Actual Wait Time')
    plt.ylabel('Residual')
    plt.title('Residual Plot')
    
    plt.tight_layout()
    plt.show()
    
    return {
        'wait_model': wait_model,
        'attendance_model': attendance_model,
        'attendance_features': attendance_features,
        'cat_features': cat_features,
        'num_features': num_features,
        'metrics': {'mae': mae, 'rmse': rmse, 'r2': r2}
    }

In [11]:
# Parameters
data_path = "../data/processed/ep/merged_with_holidays.parquet"
ride_name = "silver star"
test_year = 2023
    
# Train model
results = train_linear_model_with_attendance(data_path, ride_name, test_year)
    
print("\nModel training completed!")

=== Training Linear Model with Attendance for silver star ===

1. Loading and preprocessing data...
   Total data shape: (33053, 11)

2. Creating features...

3. Splitting data (test year: 2023)...
   Train data: 23988 samples
   Test data: 4673 samples

4. Training attendance model...

5. Predicting attendance...

6. Preparing features for wait time model...
   Filling 42 NaN values in 'temperature'
   Filling 42 NaN values in 'temp_squared'
   Filling 278 NaN values in 'wind'
   Filling 4 NaN values in 'time_of_day'

7. Training wait time model...


KeyError: "['hour_bin', 'rain_intensity', 'dow_sin', 'dow_cos', 'day_of_month', 'week_of_year', 'temp_deviation', 'wind_squared', 'high_wind'] not in index"