# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from datetime import datetime


def decompose_timestamp(df):
    """Derive calendar, categorical and cyclical features from ``df['timestamp']``.

    The frame is modified in place (new columns are added) and also returned.

    Args:
        df: DataFrame with a ``timestamp`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame with these additional columns:

        * ``datetime`` - the parsed timestamp.
        * ``year``, ``month``, ``day``, ``weekday``, ``hour``, ``minute`` -
          components of ``datetime`` (``weekday``: Monday=0 ... Sunday=6).
        * ``is_weekend`` - 1 when ``weekday`` is 5 or 6, otherwise 0.
        * ``part_of_day`` - 'morning' (06-11h), 'afternoon' (12-16h),
          'evening' (17-19h), otherwise 'night'.
        * ``season`` - 'winter' (Dec-Feb), 'spring' (Mar-May),
          'summer' (Jun-Aug), otherwise 'fall'.
        * ``month_sin/cos``, ``hour_sin/cos``, ``weekday_sin/cos`` -
          cyclical encodings of the corresponding component.
    """
    df['datetime'] = pd.to_datetime(df['timestamp'])
    stamp = df['datetime'].dt

    df['year'] = stamp.year
    df['month'] = stamp.month
    df['day'] = stamp.day
    df['weekday'] = stamp.weekday  # Monday=0, Sunday=6
    df['hour'] = stamp.hour
    df['minute'] = stamp.minute

    # Vectorized comparisons instead of a Python-level lambda per row.
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)

    hour = df['hour']
    df['part_of_day'] = np.select(
        [
            ((hour >= 6) & (hour < 12)).to_numpy(),
            ((hour >= 12) & (hour < 17)).to_numpy(),
            ((hour >= 17) & (hour < 20)).to_numpy(),
        ],
        ['morning', 'afternoon', 'evening'],
        default='night',  # everything outside 06-19h, including missing hours
    )

    month = df['month']
    df['season'] = np.select(
        [
            month.isin([12, 1, 2]).to_numpy(),
            month.isin([3, 4, 5]).to_numpy(),
            month.isin([6, 7, 8]).to_numpy(),
        ],
        ['winter', 'spring', 'summer'],
        default='fall',  # remaining months, including missing ones
    )

    # Sine/cosine pairs preserve the cyclical nature of these features.
    # Used directly as plain numbers (1, 2, 3, ...), a model would wrongly
    # treat e.g. month 12 (December) and month 1 (January), or hour 23 and
    # hour 0, as far apart, although these values are actually neighbours.
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['weekday_sin'] = np.sin(2 * np.pi * df['weekday'] / 7)
    df['weekday_cos'] = np.cos(2 * np.pi * df['weekday'] / 7)

    return df


def encode_categoricals(df):
    """One-hot encode the categorical columns.

    Args:
        df: DataFrame containing the columns ``ride_name``,
            ``feature_attraction_type``, ``feature_category``,
            ``part_of_day`` and ``season``.

    Returns:
        A tuple ``(encoded_df, encoder)``: a new DataFrame in which the
        categorical columns are replaced by their one-hot indicator columns
        (appended at the end, same index as ``df``), and the fitted
        ``OneHotEncoder``.
    """
    cat_cols = ['ride_name', 'feature_attraction_type', 'feature_category', 'part_of_day', 'season']

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword, so try the current name first and fall back for old releases.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    encoded_cats = encoder.fit_transform(df[cat_cols])

    # Wrap the dense array so it lines up with the original rows on concat.
    encoded_df = pd.DataFrame(
        encoded_cats,
        columns=encoder.get_feature_names_out(cat_cols),
        index=df.index
    )

    df = pd.concat([df.drop(cat_cols, axis=1), encoded_df], axis=1)

    return df, encoder


def process_boolean_features(df):
    """Convert the boolean flag columns to 0/1 integers.

    Handles ``closed``, ``is_german_holiday``, ``is_swiss_holiday`` and
    ``is_french_holiday``. The frame is modified in place and returned.

    * ``bool`` columns are cast to ``int``.
    * ``object`` columns are mapped value by value: ``'True'``/``True`` -> 1,
      ``'False'``/``False`` -> 0. Any other value (including missing ones)
      becomes NaN.
    * Columns of any other dtype are left untouched.

    Args:
        df: DataFrame containing the four flag columns.

    Returns:
        The same DataFrame with the flag columns converted.
    """
    bool_cols = ['closed', 'is_german_holiday', 'is_swiss_holiday', 'is_french_holiday']

    # A boolean column that contains missing values is stored with object
    # dtype but still holds real True/False objects, so both the textual and
    # the boolean spellings must be recognised; with string keys only, such a
    # column would be turned into all-NaN.
    flag_values = {'True': 1, 'False': 0, True: 1, False: 0}

    for col in bool_cols:
        if df[col].dtype == bool:
            df[col] = df[col].astype(int)
        elif df[col].dtype == object:
            df[col] = df[col].map(flag_values)

    return df


def normalize_numericals(df):
    """Standardize the numeric weather and ride-attribute columns.

    A fresh ``StandardScaler`` is fitted on the listed columns and the scaled
    values are written back into ``df`` in place. Missing values are not
    imputed here.

    Returns:
        A tuple ``(df, scaler)`` with the updated frame and the fitted scaler.
    """
    weather_cols = ['temperature', 'rain', 'wind']
    ride_cols = [
        'feature_max_height', 'feature_track_length', 'feature_max_speed',
        'feature_g_force', 'feature_min_age', 'feature_min_height',
        'feature_capacity_per_hour',
    ]
    num_cols = weather_cols + ride_cols

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[num_cols])
    df[num_cols] = scaled_values

    return df, scaler

# Lag features use previous values of the target variable (wait times) as predictors for future values.
def create_lag_features(df, groupby_col='ride_name', target_col='wait_time', lag_periods=(1, 2, 3, 6, 12, 24)):
    """Add lagged copies of the target column, computed separately per group.

    For every ``lag`` in ``lag_periods`` a column ``wait_time_lag_<lag>`` is
    added that holds the value of ``target_col`` ``lag`` rows earlier within
    the same ``groupby_col`` group, following the current row order of ``df``.
    Positions without an earlier value are filled with 0. The frame is
    modified in place and returned.

    Args:
        df: DataFrame containing ``groupby_col`` and ``target_col``.
        groupby_col: Column whose values define the independent series.
        target_col: Column to shift.
        lag_periods: Iterable of row offsets; may be any iterable, including
            a one-shot iterator.

    Returns:
        The same DataFrame with the lag columns added.
    """
    # Materialise once so a generator/iterator is only consumed a single time.
    lags = list(lag_periods)

    # The grouping does not depend on the lag, so build it once up front.
    target_by_group = df.groupby(groupby_col)[target_col]

    for lag in lags:
        # The first `lag` rows of each group have no predecessor -> fill with 0.
        df[f'wait_time_lag_{lag}'] = target_by_group.shift(lag).fillna(0)

    return df



def preprocess_theme_park_data(df):
    """Apply all preprocessing steps to a copy of the dataset.

    Steps, in order: timestamp decomposition, boolean conversion, lag
    features, one-hot encoding of categoricals, standardization of numeric
    columns, and finally removal of the raw ``timestamp``/``datetime`` columns.

    Args:
        df: Raw DataFrame; it is copied first, so the caller's frame is not
            modified.

    Returns:
        A tuple ``(processed_df, preprocessors)`` where ``preprocessors`` is a
        dict with the fitted ``'encoder'`` and ``'scaler'``.
    """
    df = df.copy()

    df = decompose_timestamp(df)
    df = process_boolean_features(df)

    # Lag features are grouped by `ride_name`, which encode_categoricals
    # replaces with one-hot columns, so they must be built before encoding.
    df = create_lag_features(df)

    df, encoder = encode_categoricals(df)
    df, scaler = normalize_numericals(df)

    # The raw time columns are fully represented by the derived features.
    df = df.drop(columns=['timestamp', 'datetime'], errors='ignore')

    return df, {'encoder': encoder, 'scaler': scaler}

# Run the whole pipeline; `df` is presumably the raw frame loaded in an
# earlier notebook cell. `preprocessors` holds the fitted encoder and scaler.
preprocessed_df, preprocessors = preprocess_theme_park_data(df)

# Report how the shape changed and which columns the pipeline introduced.
added_columns = preprocessed_df.columns.difference(df.columns).tolist()

print(f"Original shape: {df.shape}")
print(f"Preprocessed shape: {preprocessed_df.shape}")
print(f"New features created: {added_columns}")

