In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from datetime import datetime
import os
import pyarrow as pa
import pyarrow.parquet as pq

from pathlib import Path

In [2]:
def decompose_timestamp(df):
    """Extract temporal components from timestamp.

    Derives calendar fields, coarse categorical buckets and sine/cosine
    cyclical encodings from ``df['timestamp']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column that ``pd.to_datetime`` can parse.
        The frame is modified in place: the columns listed below are added
        and an existing ``datetime`` column is overwritten. Pass a copy if
        the original must stay untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with these columns added: ``datetime``,
        ``year``, ``month``, ``day``, ``weekday``, ``hour``, ``minute``,
        ``is_weekend``, ``part_of_day``, ``season`` and the ``*_sin`` /
        ``*_cos`` pairs for month, hour, weekday and minute.
    """
    df['datetime'] = pd.to_datetime(df['timestamp'])
    parts = df['datetime'].dt

    df['year'] = parts.year
    df['month'] = parts.month
    df['day'] = parts.day
    df['weekday'] = parts.weekday  # Monday=0, Sunday=6
    df['hour'] = parts.hour
    df['minute'] = parts.minute

    weekday = df['weekday']
    hour = df['hour']
    month = df['month']

    # Vectorized instead of a per-row lambda. A missing weekday (NaT
    # timestamp) compares False and therefore yields 0.
    df['is_weekend'] = (weekday >= 5).astype('int64')

    # Conditions are evaluated in order; anything not matched (including
    # missing hours) falls through to 'night'.
    df['part_of_day'] = np.select(
        [
            ((hour >= 6) & (hour < 12)).to_numpy(),
            ((hour >= 12) & (hour < 17)).to_numpy(),
            ((hour >= 17) & (hour < 20)).to_numpy(),
        ],
        ['morning', 'afternoon', 'evening'],
        default='night',
    )

    # Months 9-11 (and missing months) fall through to 'fall'.
    df['season'] = np.select(
        [
            month.isin([12, 1, 2]).to_numpy(),
            month.isin([3, 4, 5]).to_numpy(),
            month.isin([6, 7, 8]).to_numpy(),
        ],
        ['winter', 'spring', 'summer'],
        default='fall',
    )

    # Cyclical encoding: map each periodic field onto the unit circle so the
    # end of a period sits next to its start.
    for name, period in (('month', 12), ('hour', 24), ('weekday', 7), ('minute', 60)):
        angle = 2 * np.pi * df[name] / period
        df[f'{name}_sin'] = np.sin(angle)
        df[f'{name}_cos'] = np.cos(angle)

    return df


def process_boolean_features(df):
    """Convert boolean features to integers.

    Handles the flag columns ``closed``, ``is_german_holiday``,
    ``is_swiss_holiday`` and ``is_french_holiday`` when they are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Columns of dtype ``bool`` become integers.
        Object/string columns are mapped value by value: ``True``/``'True'``
        to 1 and ``False``/``'False'`` to 0; any other value (including
        missing ones) becomes NaN. Columns of other dtypes are left as is.
    """
    bool_cols = ['closed', 'is_german_holiday', 'is_swiss_holiday', 'is_french_holiday']

    # Accept real booleans as well as their string spellings. A bool column
    # with missing values is stored as object dtype holding Python bools, and
    # a string-only table would turn every one of them into NaN.
    # True/False hash like 1/0, so 0/1 integers in an object column map too.
    flag_to_int = {True: 1, False: 0, 'True': 1, 'False': 0}

    for col in bool_cols:
        if col not in df.columns:
            continue

        column = df[col]
        if column.dtype == bool:
            df[col] = column.astype(int)
        elif pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df[col] = column.map(flag_to_int)

    return df




In [None]:
def _merge_running_moments(count, mean, m2, values):
    """Fold one batch of numeric values into per-column running moments.

    Uses the pairwise (Chan et al.) update so that the result equals the
    statistics of all rows seen so far. NaNs are ignored column by column.

    Parameters
    ----------
    count, mean, m2 : numpy.ndarray of shape (n_columns,)
        Running number of non-NaN values, running mean, and running sum of
        squared deviations from the mean.
    values : numpy.ndarray of shape (n_rows, n_columns)
        Float batch; NaN marks a missing value.

    Returns
    -------
    tuple of numpy.ndarray
        Updated ``(count, mean, m2)``.
    """
    valid = ~np.isnan(values)
    batch_count = valid.sum(axis=0).astype(np.float64)
    # Guard the divisions for columns that are entirely missing in this batch.
    safe_batch_count = np.where(batch_count > 0, batch_count, 1.0)
    batch_mean = np.where(valid, values, 0.0).sum(axis=0) / safe_batch_count
    batch_m2 = (np.where(valid, values - batch_mean, 0.0) ** 2).sum(axis=0)

    total = count + batch_count
    safe_total = np.where(total > 0, total, 1.0)
    delta = batch_mean - mean
    new_mean = mean + delta * (batch_count / safe_total)
    new_m2 = m2 + batch_m2 + delta ** 2 * (count * batch_count / safe_total)
    return total, new_mean, new_m2


def preprocess_theme_park_data_memory_efficient(df, output_file='processed_data.parquet', batch_size=100000, temp_dir='temp_efficient'):
    """
    Memory-efficient implementation that processes the entire dataset for scaling/encoding
    but operates in batches to maintain memory efficiency.

    Two passes are made over ``df`` in slices of ``batch_size`` rows:

    1. Collect the distinct values of the categorical columns and the
       per-column mean/variance of the numeric columns.
    2. Transform each slice (timestamp decomposition, boolean conversion,
       one-hot encoding, standard scaling) and write it to a temporary
       parquet file in ``temp_dir``.

    The temporary files are then appended into ``output_file`` and deleted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Must contain ``timestamp`` and ``ride_name``; the numeric
        columns ``temperature``, ``rain`` and ``wind`` are scaled when
        present. ``df`` itself is not modified (each slice is copied).
    output_file : str or path-like
        Destination parquet file; an existing file is replaced.
    batch_size : int
        Number of rows per slice.
    temp_dir : str
        Directory for intermediate files. It is created if needed, and every
        ``*.parquet`` file already in it is deleted at start.

    Returns
    -------
    dict
        ``{'encoder': OneHotEncoder, 'scaler': StandardScaler}``, ready for
        ``transform`` on new data.

    Notes
    -----
    The scaler uses the population variance (ddof=0) of the non-missing
    values of each column, which is what ``StandardScaler.fit`` computes.
    Columns with zero variance get a scale of 1 so they are only centered.
    """
    os.makedirs(temp_dir, exist_ok=True)

    # Remove leftovers from an earlier (possibly interrupted) run.
    for f in os.listdir(temp_dir):
        if f.endswith('.parquet'):
            os.remove(os.path.join(temp_dir, f))

    total_rows = len(df)
    print(f"Total rows to process: {total_rows}")

    print("Phase 1: Calculating statistics for encoding and scaling...")

    cat_cols = ['ride_name', 'part_of_day', 'season', 'year']
    num_cols = [col for col in ['temperature', 'rain', 'wind'] if col in df.columns]

    # Per-column running moments; counts are per column because each numeric
    # column can have a different number of missing values.
    count_all = np.zeros(len(num_cols))
    mean_all = np.zeros(len(num_cols))
    m2_all = np.zeros(len(num_cols))

    cat_values = {col: set() for col in cat_cols}

    # First pass: decompose timestamps and collect statistics
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        # Timestamp decomposition is needed here because part_of_day, season
        # and year are derived columns.
        batch = decompose_timestamp(df.iloc[start_idx:end_idx].copy())

        # Collect unique values for categorical columns
        for col in cat_cols:
            cat_values[col].update(batch[col].dropna().astype(str).unique())

        if num_cols:
            values = batch[num_cols].to_numpy(dtype=np.float64, na_value=np.nan)
            count_all, mean_all, m2_all = _merge_running_moments(count_all, mean_all, m2_all, values)
            del values

        progress = (end_idx / total_rows) * 100
        print(f"Statistics collection progress: {progress:.2f}%")

        # Release memory
        del batch

    # Create the scaler and populate its fitted attributes from the statistics
    scaler = StandardScaler()
    if num_cols:
        var_all = m2_all / np.where(count_all > 0, count_all, 1.0)
        scale_all = np.sqrt(var_all)
        # A (near-)zero scale would turn the column into inf/NaN on transform.
        scale_all[scale_all < 10 * np.finfo(np.float64).eps] = 1.0
        seen = count_all.astype(np.int64)

        scaler.mean_ = mean_all
        scaler.var_ = var_all
        scaler.scale_ = scale_all
        scaler.n_features_in_ = len(num_cols)
        scaler.n_samples_seen_ = int(seen[0]) if np.all(seen == seen[0]) else seen
        scaler.feature_names_in_ = np.array(num_cols, dtype=object)

    # Create encoder with predefined categories (sorted for a stable column order)
    categories = [np.array(sorted(cat_values[col])) for col in cat_cols]

    encoder = OneHotEncoder(
        sparse_output=False,
        handle_unknown='ignore',
        categories=categories
    )

    # The categories are fixed above; fitting on one row only initialises the
    # encoder's fitted state.
    dummy_data = pd.DataFrame([[categories[i][0] for i in range(len(cat_cols))]], columns=cat_cols)
    encoder.fit(dummy_data)

    print("Statistics calculated. Starting data transformation...")

    batch_files = []

    # Second pass: transform each slice and spill it to disk
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        batch = df.iloc[start_idx:end_idx].copy()

        batch_num = (start_idx // batch_size) + 1
        print(f"Processing batch {batch_num}: rows {start_idx} to {end_idx}")

        batch = decompose_timestamp(batch)

        # Raw calendar fields are dropped; their cyclical encodings remain.
        batch = batch.drop(columns=['month', 'day', 'hour', 'minute'], errors='ignore')
        batch = process_boolean_features(batch)

        try:
            # Categories were collected as strings, so encode strings too.
            for col in cat_cols:
                batch[col] = batch[col].astype(str)

            encoded_cats = encoder.transform(batch[cat_cols])
            encoded_df = pd.DataFrame(
                encoded_cats,
                columns=encoder.get_feature_names_out(cat_cols),
                index=batch.index
            )
        except Exception as e:
            print(f"Error during encoding: {e}")
            print(f"Unique values: {[batch[col].unique()[:5] for col in cat_cols]}")
            raise

        if num_cols:
            batch[num_cols] = scaler.transform(batch[num_cols])

        batch = pd.concat([batch.drop(cat_cols, axis=1), encoded_df], axis=1)

        cols_to_drop = ['timestamp', 'datetime']
        batch = batch.drop(columns=[col for col in cols_to_drop if col in batch.columns])

        temp_file = os.path.join(temp_dir, f"batch_{batch_num}.parquet")
        batch.to_parquet(temp_file, index=False)
        batch_files.append(temp_file)

        del batch
        del encoded_df

        progress = (end_idx / total_rows) * 100
        print(f"Transformation progress: {progress:.2f}%")

    print(f"All batches processed. Creating final output file...")

    if os.path.exists(output_file):
        os.remove(output_file)

    # Combine a few temporary files at a time to bound peak memory.
    concat_batch_size = 5
    for i in range(0, len(batch_files), concat_batch_size):
        batch_group = batch_files[i:i+concat_batch_size]
        print(f"Combining batch files {i+1} to {min(i+concat_batch_size, len(batch_files))}")

        combined_df = pd.concat([pd.read_parquet(file) for file in batch_group], ignore_index=True)

        # The first group creates the file; later groups are appended to it.
        combined_df.to_parquet(output_file, index=False, engine='fastparquet', append=(i > 0))

        del combined_df

    print(f"All data combined and saved to {output_file}")

    # Clean up temporary files
    for file in batch_files:
        os.remove(file)

    print("Temporary files removed")

    transformers = {'encoder': encoder, 'scaler': scaler}
    return transformers

In [4]:
# Load the bucketed, cleaned wait-time table, show its columns, then discard
# the `feature_*` columns and `date` before preprocessing.
data_input_dir = "../data/processed"
input_file = os.path.join(data_input_dir, "ep", "bucket_cleaned_wait_times.parquet")
ep_df = pd.read_parquet(input_file)
print(ep_df.columns.unique())

unused_columns = [
    'feature_attraction_type',
    'feature_category',
    'feature_max_height',
    'feature_track_length',
    'feature_max_speed',
    'feature_g_force',
    'feature_min_age',
    'feature_min_height',
    'feature_capacity_per_hour',
    'date',
]
# errors='ignore' keeps the cell re-runnable if some columns are already gone.
ep_df = ep_df.drop(columns=unused_columns, errors='ignore')

Index(['ride_name', 'timestamp', 'wait_time', 'closed', 'temperature', 'rain',
       'wind', 'is_german_holiday', 'is_swiss_holiday', 'is_french_holiday',
       'feature_attraction_type', 'feature_category', 'feature_max_height',
       'feature_track_length', 'feature_max_speed', 'feature_g_force',
       'feature_min_age', 'feature_min_height', 'feature_capacity_per_hour',
       'date', 'datetime', 'time_bucket', 'day_of_week'],
      dtype='object')


In [5]:
# Calendar years covered by the data. Going through the `.dt` accessor works
# regardless of whether `Series.unique()` returns a DatetimeArray (which has
# `.year`) or a plain NumPy array (which does not).
years = ep_df["timestamp"].dt.year.unique()
print(set(years))

{np.int32(2017), np.int32(2018), np.int32(2019), np.int32(2020), np.int32(2021), np.int32(2022), np.int32(2023), np.int32(2024)}


In [None]:
# Run the full preprocessing pipeline and keep the fitted transformers.
data_dir = Path('../data')
output_path = data_dir.joinpath('processed', 'ep', 'final_cleaned_processed_wait_times.parquet')

# Make sure the destination folder exists before anything is written.
output_path.parent.mkdir(parents=True, exist_ok=True)
transformers = preprocess_theme_park_data_memory_efficient(
    ep_df,
    output_path,
    batch_size=1_000_000,
)


Total rows to process: 904296
Phase 1: Calculating statistics for encoding and scaling...
Statistics collection progress: 100.00%
Statistics calculated. Starting data transformation...
Processing batch 1: rows 0 to 904296
Transformation progress: 100.00%
All batches processed. Creating final output file...
Combining batch files 1 to 1
All data combined and saved to ../data/processed/ep/final_cleaned_processed_wait_times.parquet
Temporary files removed


In [None]:
# Read the processed file back for a quick look at its columns.
parquet_file = pq.ParquetFile(output_path)
all_columns = parquet_file.schema.names

# Skip every column whose name starts with "feature"; that prefix already
# covers "feature_attraction_type" and "feature_category".
columns_to_read = [col for col in all_columns if not col.startswith("feature")]

# Pass the filtered list to the reader (it was previously built but unused).
table = pq.read_table(output_path, columns=columns_to_read)
ep_df_preview = table.slice(0, 1000000).to_pandas()


In [8]:
# Number of columns in the processed frame.
ep_df_preview.shape[1]

67

In [9]:
# List the column names of the processed frame (one-hot columns included).
ep_df_preview.columns

Index(['wait_time', 'closed', 'temperature', 'rain', 'wind',
       'is_german_holiday', 'is_swiss_holiday', 'is_french_holiday',
       'time_bucket', 'day_of_week', 'weekday', 'is_weekend', 'month_sin',
       'month_cos', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos',
       'minute_sin', 'minute_cos', 'ride_name_alpine express enzian',
       'ride_name_arena of football  be part of it', 'ride_name_arthur',
       'ride_name_atlantica supersplash', 'ride_name_atlantis adventure',
       'ride_name_baaa express', 'ride_name_blue fire megacoaster',
       'ride_name_castello dei medici', 'ride_name_dancing dingie',
       'ride_name_euromir', 'ride_name_eurosat  cancan coaster',
       'ride_name_eurotower', 'ride_name_fjordrafting',
       'ride_name_jim button  journey through morrowland',
       'ride_name_josefinas magical imperial journey',
       'ride_name_kolumbusjolle', 'ride_name_madame freudenreich curiosits',
       'ride_name_matterhornblitz', 'ride_name_old mac d

In [None]:
# Reload the processed file without the ride/season/part-of-day one-hot
# columns so the remaining features are easier to inspect.
parquet_file = pq.ParquetFile(output_path)
all_columns = parquet_file.schema.names

excluded_prefixes = ("ride_name", "season", "part")
columns_to_read = [col for col in all_columns if not col.startswith(excluded_prefixes)]

table = pq.read_table(output_path, columns=columns_to_read)
ep_df_column_analyze = table.slice(0, 1000000).to_pandas()


In [11]:
print(sorted(ep_df_column_analyze["month_sin"].unique()))
print(sorted(ep_df_column_analyze["month_cos"].unique()))

[np.float64(-1.0), np.float64(-0.8660254037844386), np.float64(-0.8660254037844384), np.float64(-0.5000000000000004), np.float64(-0.4999999999999997), np.float64(-2.4492935982947064e-16), np.float64(1.2246467991473532e-16), np.float64(0.49999999999999994), np.float64(0.8660254037844387)]
[np.float64(-1.0), np.float64(-0.8660254037844388), np.float64(-0.8660254037844387), np.float64(-0.5000000000000004), np.float64(-0.4999999999999998), np.float64(-1.8369701987210297e-16), np.float64(0.5000000000000001), np.float64(0.8660254037844384), np.float64(1.0)]


In [12]:
# Columns left after excluding the ride/season/part-of-day one-hot groups.
ep_df_column_analyze.columns

Index(['wait_time', 'closed', 'temperature', 'rain', 'wind',
       'is_german_holiday', 'is_swiss_holiday', 'is_french_holiday',
       'time_bucket', 'day_of_week', 'weekday', 'is_weekend', 'month_sin',
       'month_cos', 'hour_sin', 'hour_cos', 'weekday_sin', 'weekday_cos',
       'minute_sin', 'minute_cos', 'year_2017', 'year_2018', 'year_2019',
       'year_2020', 'year_2021', 'year_2022', 'year_2023', 'year_2024'],
      dtype='object')

In [13]:
def _report_binary_columns(df, columns):
    """Print, for each column, whether its distinct values are limited to 0 and 1."""
    for col in columns:
        unique_vals = df[col].unique()
        if np.all(np.isin(unique_vals, [0, 1])):
            print(f"✓ {col} contains only 0 and 1 as expected")
        else:
            print(f"Warning: {col} contains values other than 0 and 1: {unique_vals}")


def _report_exactly_one_hot(df, prefix, label, warning):
    """Print whether the one-hot columns starting with ``prefix`` sum to 1 in every row.

    ``warning`` is the message printed on failure; it may contain ``{n_bad}``,
    which is replaced by the number of offending rows. The check is skipped
    (with a note) when no column carries the prefix.
    """
    group_cols = [col for col in df.columns if col.startswith(prefix)]
    if not group_cols:
        print(f"Skipped: no '{prefix}*' columns found, cannot check {label} assignments")
        return

    row_sums = df[group_cols].sum(axis=1)
    bad_rows = row_sums != 1
    if bad_rows.any():
        print(warning.format(n_bad=int(bad_rows.sum())))
        print(f"Min: {row_sums.min()}, Max: {row_sums.max()}")
    else:
        print(f"✓ Each observation has exactly one {label} assigned")


def sanity_check(data_path):
    """
    Perform sanity checks on the processed theme park data

    Every check prints either a "✓" line or a "Warning:" line; nothing is
    raised when a check fails.

    Parameters:
    -----------
    data_path : str
        Path to the processed parquet file

    Returns:
    --------
    pandas.DataFrame
        The loaded data, unchanged.
    """
    print("Loading processed data...")
    df = pd.read_parquet(data_path)

    print(f"\n==== Basic Information ====")
    print(f"Data shape: {df.shape}")

    # Check 1: Missing values
    print("\n==== Check 1: Missing Values ====")
    missing = df.isna().sum()
    if missing.sum() > 0:
        print(f"Warning: Missing values found!")
        print(missing[missing > 0])
    else:
        print("✓ No missing values found")

    # Check 2: Feature ranges
    print("\n==== Check 2: Feature Ranges ====")

    # Cyclical features should be between -1 and 1
    cyclical_cols = [col for col in df.columns if col.endswith('_sin') or col.endswith('_cos')]
    for col in cyclical_cols:
        min_val, max_val = df[col].min(), df[col].max()
        if min_val < -1.1 or max_val > 1.1:  # allow small floating point errors
            print(f"Warning: {col} range is [{min_val:.2f}, {max_val:.2f}], expected [-1, 1]")
        else:
            print(f"✓ {col} is within expected range [-1, 1]")

    # One-hot encoded features should be 0 or 1. 'year_' is included because
    # the preprocessing step one-hot encodes the year as well.
    one_hot_prefixes = ('ride_name_', 'part_of_day_', 'season_', 'year_')
    one_hot_cols = [col for col in df.columns if col.startswith(one_hot_prefixes)]
    _report_binary_columns(df, one_hot_cols)

    # Boolean features should be 0 or 1
    bool_cols = ['closed', 'is_german_holiday', 'is_swiss_holiday', 'is_french_holiday', 'is_weekend']
    bool_cols = [col for col in bool_cols if col in df.columns]
    _report_binary_columns(df, bool_cols)

    # Check 3: Consistency checks
    print("\n==== Check 3: Consistency Checks ====")

    # Weekday features should be consistent with is_weekend
    if 'weekday' in df.columns and 'is_weekend' in df.columns:
        weekend_mask = df['weekday'] >= 5
        is_weekend_mask = df['is_weekend'] == 1
        n_inconsistent = (weekend_mask != is_weekend_mask).sum()

        if n_inconsistent > 0:
            print(f"Warning: 'weekday' and 'is_weekend' are inconsistent in {n_inconsistent} rows")
        else:
            print("✓ 'weekday' and 'is_weekend' are consistent")

    # Each one-hot group should have exactly one active column per row
    one_hot_groups = [
        ('ride_name_', 'ride', "Warning: Some rows have {n_bad} ride assignments that don't sum to 1"),
        ('part_of_day_', 'part_of_day', "Warning: Some rows have part_of_day assignments that don't sum to 1"),
        ('season_', 'season', "Warning: Some rows have season assignments that don't sum to 1"),
        ('year_', 'year', "Warning: Some rows have year assignments that don't sum to 1"),
    ]
    for prefix, label, warning in one_hot_groups:
        _report_exactly_one_hot(df, prefix, label, warning)

    # Check 4: Scaled numerical features
    print("\n==== Check 4: Scaled Numerical Features ====")
    num_cols = ['temperature', 'rain', 'wind', 'year']
    num_cols = [col for col in num_cols if col in df.columns]

    for col in num_cols:
        mean, std = df[col].mean(), df[col].std()
        if abs(mean) > 0.1 or abs(std - 1) > 0.1:
            print(f"Warning: {col} may not be properly scaled. Mean: {mean:.4f}, Std: {std:.4f}")
        else:
            print(f"✓ {col} appears properly scaled (mean ≈ 0, std ≈ 1)")

    # Check 5: Correlations between cyclical features
    print("\n==== Check 5: Cyclical Feature Correlations ====")
    cyclical_bases = ['month', 'hour', 'weekday', 'minute']
    for base in cyclical_bases:
        sin_col = f'{base}_sin'
        cos_col = f'{base}_cos'

        if sin_col in df.columns and cos_col in df.columns:
            corr = df[sin_col].corr(df[cos_col])
            if abs(corr) > 0.1:
                print(f"Warning: Correlation between {sin_col} and {cos_col} is {corr:.4f}, expected near 0")
            else:
                print(f"✓ {sin_col} and {cos_col} have low correlation as expected")

    # Check 6: Wait time distribution
    if 'wait_time' in df.columns:
        print("\n==== Check 6: Wait Time Distribution ====")
        wait_time = df['wait_time']
        print(f"Wait time min: {wait_time.min()}, mean: {wait_time.mean():.2f}, max: {wait_time.max()}")

        if wait_time.min() < 0:
            print(f"Warning: Negative wait times found: {wait_time[wait_time < 0].count()} values")

        # Check for extreme outliers (> 5 std from mean)
        mean, std = wait_time.mean(), wait_time.std()
        outliers = wait_time[(wait_time > mean + 5*std) | (wait_time < mean - 5*std)]
        if len(outliers) > 0:
            print(f"Warning: {len(outliers)} extreme wait time outliers found")
            print(f"Outlier values: {sorted(outliers.unique())}")
        else:
            print("✓ No extreme outliers in wait times")

    # Check 7: Closed rides should have wait time 0 or NaN
    if 'closed' in df.columns and 'wait_time' in df.columns:
        print("\n==== Check 7: Closed Rides and Wait Times ====")
        closed_rides = df[df['closed'] == 1]
        if len(closed_rides) > 0:
            # NaN > 0 is False, so missing wait times are not flagged.
            invalid_waits = closed_rides[closed_rides['wait_time'] > 0]
            if len(invalid_waits) > 0:
                print(f"Warning: {len(invalid_waits)} closed rides have wait times > 0")
                print(f"Example: {invalid_waits[['wait_time']].head()}")
            else:
                print("✓ All closed rides have wait time 0 or NaN as expected")
        else:
            print("No closed rides in the dataset")

    # Check 8: inspection of cyclical features
    print("\n==== Check 8: Inspection of Cyclical Features ====")

    print(f"Cyclical encodings should form circular patterns when sin/cos components are plotted against each other")
    # Same bases as Check 5, so the minute encoding is verified as well.
    for base in cyclical_bases:
        sin_col = f'{base}_sin'
        cos_col = f'{base}_cos'
        if sin_col not in df.columns or cos_col not in df.columns:
            continue

        circle_check = np.sqrt(df[sin_col]**2 + df[cos_col]**2)
        if (abs(circle_check - 1) > 0.1).any():
            print(f"Warning: {base} cyclical encoding doesn't maintain unit circle (sin²+cos²=1)")
            print(f"Min: {circle_check.min():.4f}, Max: {circle_check.max():.4f}")
        else:
            print(f"✓ {base} cyclical encoding maintains unit circle property")

    print("\n==== Summary ====")
    print("Sanity check complete. Review the warnings above if any.")

    return df

# Run the checks on the processed file; the returned frame is displayed below.
sanity_check(output_path)

Loading processed data...



==== Basic Information ====
Data shape: (904296, 67)

==== Check 1: Missing Values ====
temperature      4823
rain           146719
wind            12948
dtype: int64

==== Check 2: Feature Ranges ====
✓ month_sin is within expected range [-1, 1]
✓ month_cos is within expected range [-1, 1]
✓ hour_sin is within expected range [-1, 1]
✓ hour_cos is within expected range [-1, 1]
✓ weekday_sin is within expected range [-1, 1]
✓ weekday_cos is within expected range [-1, 1]
✓ minute_sin is within expected range [-1, 1]
✓ minute_cos is within expected range [-1, 1]
✓ ride_name_alpine express enzian contains only 0 and 1 as expected
✓ ride_name_arena of football  be part of it contains only 0 and 1 as expected
✓ ride_name_arthur contains only 0 and 1 as expected
✓ ride_name_atlantica supersplash contains only 0 and 1 as expected
✓ ride_name_atlantis adventure contains only 0 and 1 as expected
✓ ride_name_baaa express contains only 0 and 1 as expected
✓ ride_name_blue fire megacoaster contain

Unnamed: 0,wait_time,closed,temperature,rain,wind,is_german_holiday,is_swiss_holiday,is_french_holiday,time_bucket,day_of_week,...,season_summer,season_winter,year_2017,year_2018,year_2019,year_2020,year_2021,year_2022,year_2023,year_2024
0,0.0,0,0.043878,-0.283247,-0.987521,0,0,0,2017-05-23 09:00:00,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0,0.043878,-0.283247,-0.987521,0,0,0,2017-05-23 09:00:00,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0,0.043878,-0.283247,-0.987521,0,0,0,2017-05-23 09:00:00,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0,0.043878,-0.283247,-0.987521,0,0,0,2017-05-23 09:00:00,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0,0.043878,-0.283247,-0.987521,0,0,0,2017-05-23 09:00:00,1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
904291,20.0,0,-2.523810,-0.283247,-0.696543,0,0,0,2024-12-31 17:30:00,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
904292,20.0,0,-2.523810,-0.283247,-0.696543,0,0,0,2024-12-31 17:30:00,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
904293,5.0,0,-2.523810,-0.283247,-0.696543,0,0,0,2024-12-31 17:30:00,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
904294,0.0,0,-2.523810,-0.283247,-0.696543,0,0,0,2024-12-31 17:30:00,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Explanation for Warning
- Warning: Correlation between month_sin and month_cos is -0.2893, expected near 0
    - Seasonality in the data: the park has more data points from certain months, and three months were dropped entirely, so the months are not spread evenly around the circle
- Warning: Correlation between hour_sin and hour_cos is -0.6495, expected near 0
    - Most data points fall between 10 AM and 6 PM, so the hours cover only part of the circle, which creates a correlation
- Warning: Correlation between minute_sin and minute_cos is -1.0000, expected near 0
    - A perfect -1.0 correlation between the sin and cos components typically happens when the values are concentrated at a few specific points (like only 0, 15, 30, 45 minutes). This is the case in the bucket variant