In [1]:
import os
import re

import pandas as pd

## Load the data

In [2]:
data_input_dir = "../data/raw"
ep_input_dir = os.path.join(data_input_dir, "51")
rulantica_input_dir = os.path.join(data_input_dir, "309")

data_output_dir = "../data/processed"
ep_output_dir = os.path.join(data_output_dir, "ep")
rulantica_output_dir = os.path.join(data_output_dir, "rulantica")

In [3]:
ep_weather_df = pd.read_parquet(os.path.join(ep_input_dir, "merged_weather.parquet"))
ep_rides_df = pd.read_parquet(os.path.join(ep_input_dir, "merged_rides.parquet"))

rulantica_weather_df = pd.read_parquet(os.path.join(rulantica_input_dir, "merged_weather.parquet"))
rulantica_rides_df = pd.read_parquet(os.path.join(rulantica_input_dir, "merged_rides.parquet"))

print(f"Europa Park weather shape: {ep_weather_df.shape}")
print(f"Europa Park rides shape: {ep_rides_df.shape}")
print(f"Rulantica weather shape: {rulantica_weather_df.shape}")
print(f"Rulantica rides shape: {rulantica_rides_df.shape}")

print(ep_weather_df.head())
print(ep_rides_df.head())
print(rulantica_weather_df.head())
print(rulantica_rides_df.head())

Europa Park weather shape: (19943, 4)
Europa Park rides shape: (6268117, 3)
Rulantica weather shape: (18191, 4)
Rulantica rides shape: (2215265, 3)
            timestamp  temperature  rain  wind
0 2024-08-13 09:00:00         22.0   0.0   1.2
1 2024-08-13 10:00:00         22.4   0.0   2.5
2 2024-08-13 11:00:00         25.7   0.0   1.1
3 2024-08-13 12:00:00         25.7   0.0   2.4
4 2024-08-13 13:00:00         25.8   0.0   3.3
            timestamp                ride_name  wait_time
0 2019-06-11 09:00:00  Alpine Express 'Enzian'        5.0
1 2019-06-11 09:05:00  Alpine Express 'Enzian'        5.0
2 2019-06-11 09:10:00  Alpine Express 'Enzian'        1.0
3 2019-06-11 09:15:00  Alpine Express 'Enzian'        1.0
4 2019-06-11 09:20:00  Alpine Express 'Enzian'        1.0
            timestamp  temperature  rain  wind
0 2022-10-28 10:00:00         14.9   0.0   2.6
1 2022-10-28 11:00:00         17.4   0.0   2.8
2 2022-10-28 12:00:00         19.2   0.0   2.9
3 2022-10-28 13:00:00         20.4

## Clean EuropaPark data

### Clean Ride Names

First, list all the uncleaned ride names in the data

In [4]:
unique_rides = ep_rides_df["ride_name"].unique()
unique_rides

array(["Alpine Express 'Enzian'", 'Arena of Football - Be Part of It!',
       'ARTHUR', 'Atlantica SuperSplash', 'Atlantis Adventure',
       'Ba-a-a Express', 'blue fire Megacoaster', 'Castello dei Medici',
       'Dancing Dingie', 'Euro-Mir', 'Euro-Tower',
       'Eurosat - CanCan Coaster', 'Fjord-Rafting',
       'Jim Button – Journey through Morrowland',
       'Josefina’s Magical Imperial Journey', 'Kolumbusjolle',
       'Madame Freudenreich Curiosités', 'Matterhorn-Blitz',
       "Old Mac Donald's Tractor Fun", 'Pegasus', 'Poppy Towers',
       'Silver Star', 'Swiss Bob Run', 'Tirol Log Flume',
       "Vienna Wave Swing - 'Glückspilz'", 'Vindjammer', 'Voletarium',
       'Volo da Vinci', 'Water rollercoaster Poseidon',
       'Whale Adventures - Northern Lights', 'Eurosat Coastiality',
       'Pirates in Batavia', 'Snorri Touren',
       'VirtualLine: blue fire Megacoaster', 'VirtualLine: Euro-Mir',
       'VirtualLine: Voletarium',
       'VirtualLine: Water Rollercoaster Pose

Lowercase all ride names and remove all special characters

In [5]:
def standardize_ride_names(ride_name):
    ride_name = ride_name.lower() # convert to lowercase
    ride_name = re.sub(r"[^a-z0-9\s]", "", ride_name) # remove special characters (keeping letters, numbers, and spaces)
    return ride_name

ep_rides_df["ride_name"] = ep_rides_df["ride_name"].apply(standardize_ride_names)

Remove all "VirtualLine" and "VR" rides as they are not physical rides. Also remove various "missing" rides.

In [6]:
# remove any "VirtualLine" or "VR" rides as we're only interested in waiting times of physical rides
ep_rides_df = ep_rides_df[~ep_rides_df["ride_name"].str.contains("virtualline")]
ep_rides_df = ep_rides_df[~ep_rides_df["ride_name"].str.contains("vr")]

# remove various unknown rides
ep_rides_df = ep_rides_df[ep_rides_df["ride_name"] != "unknown2"]
ep_rides_df = ep_rides_df[ep_rides_df["ride_name"] != "-8"]
ep_rides_df = ep_rides_df[ep_rides_df["ride_name"] != "8"]
ep_rides_df = ep_rides_df[ep_rides_df["ride_name"] != "0"]
ep_rides_df = ep_rides_df[ep_rides_df["ride_name"] != "904"]

Some rides are duplicated with different names, so we will merge them into one name.

In [7]:
# map some rides to their correct names
ride_mapping = {
    "express des alpes enzian": "alpine express enzian",
    "bobsleigh suisse": "swiss bob run",
    "les rapides du tyrol": "tirol log flume",
    "water rollercoaster poseidon": "poseidon",
    "fjordrafting mit der welt der trolle": "fjordrafting",
    "eurosat coastiality": "eurosat  cancan coaster",
    "chaises volantes silvretta nova": "vienna wave swing",
    "a la dcouverte datlantis": "atlantis adventure",
    "old 99": "jim button  journey through morrowland",
    "la grotte des pirates": "pirates in batavia",
    "les golettes de colomb": "kolumbusjolle",
    "les radeaux de la jungle": "josefinas magical imperial journey",
    "tourlicot": "poppy towers",
    "chateau hante": "castello dei medici",
    "univers de lnergie": "madame freudenreich curiosits"
}

ep_rides_df["ride_name"] = ep_rides_df["ride_name"].replace(ride_mapping)

List the ride names after cleaning

In [8]:
unique_rides = ep_rides_df["ride_name"].unique()
unique_rides

array(['alpine express enzian', 'arena of football  be part of it',
       'arthur', 'atlantica supersplash', 'atlantis adventure',
       'baaa express', 'blue fire megacoaster', 'castello dei medici',
       'dancing dingie', 'euromir', 'eurotower',
       'eurosat  cancan coaster', 'fjordrafting',
       'jim button  journey through morrowland',
       'josefinas magical imperial journey', 'kolumbusjolle',
       'madame freudenreich curiosits', 'matterhornblitz',
       'old mac donalds tractor fun', 'pegasus', 'poppy towers',
       'silver star', 'swiss bob run', 'tirol log flume',
       'vienna wave swing  glckspilz', 'vindjammer', 'voletarium',
       'volo da vinci', 'poseidon', 'whale adventures  northern lights',
       'pirates in batavia', 'snorri touren',
       'voltron nevera powered by rimac', 'bellevue ferris wheel',
       'tnnevirvel'], dtype=object)

### Align Timestamps

Count the first and last time for all days. This should correspond to the opening and closing time of the park.

In [9]:
def get_first_and_last_timestamps(df):
  time_summary = df.groupby(ep_rides_df["timestamp"].dt.date)["timestamp"].agg(["min", "max"])
  time_summary["min"] = time_summary["min"].dt.time
  time_summary["max"] = time_summary["max"].dt.time

  return time_summary["min"].value_counts(), time_summary["max"].value_counts()

get_first_and_last_timestamps(ep_rides_df)

(min
 09:00:00    1512
 11:00:00     338
 09:01:00      22
 09:02:00      13
 09:04:00      11
 09:03:00       9
 09:05:00       7
 11:02:00       6
 11:01:00       5
 11:04:00       4
 11:10:00       3
 09:10:00       3
 13:45:00       2
 09:40:00       1
 16:30:00       1
 09:30:00       1
 10:40:00       1
 11:30:00       1
 12:42:00       1
 12:10:00       1
 15:28:00       1
 15:10:00       1
 09:50:00       1
 11:03:00       1
 09:09:00       1
 Name: count, dtype: int64,
 max
 17:55:00    917
 18:55:00    369
 20:00:00    143
 19:00:00    106
 19:55:00    102
 20:30:00     52
 18:30:00     50
 18:00:00     42
 19:30:00     39
 17:59:00     17
 17:56:00     16
 17:58:00     14
 21:00:00     13
 18:58:00      7
 21:30:00      7
 17:54:00      6
 17:57:00      5
 18:57:00      5
 17:50:00      3
 18:56:00      3
 18:59:00      3
 17:35:00      2
 23:55:00      2
 19:10:00      2
 18:50:00      2
 23:50:00      1
 19:50:00      1
 10:00:00      1
 19:20:00      1
 17:07:00      1
 2

We see that almost all days have an opening time after 09:00 and closing time before 21:00. We will remove all data points outside this range.

To check the result, we again print the first and last time for all days after cleaning.

In [10]:
# constrain ride times to 09:00 - 21:00
ep_rides_df = ep_rides_df[(ep_rides_df["timestamp"].dt.time >= pd.to_datetime("09:00").time()) & (ep_rides_df["timestamp"].dt.time <= pd.to_datetime("21:00").time())]

get_first_and_last_timestamps(ep_rides_df)

(min
 09:00:00    1512
 11:00:00     338
 09:01:00      22
 09:02:00      13
 09:04:00      11
 09:03:00       9
 09:05:00       7
 11:02:00       6
 11:01:00       5
 11:04:00       4
 11:10:00       3
 09:10:00       3
 13:45:00       2
 09:40:00       1
 16:30:00       1
 09:30:00       1
 10:40:00       1
 11:30:00       1
 12:42:00       1
 12:10:00       1
 15:28:00       1
 15:10:00       1
 09:50:00       1
 11:03:00       1
 09:09:00       1
 Name: count, dtype: int64,
 max
 17:55:00    917
 18:55:00    369
 20:00:00    143
 19:00:00    106
 19:55:00    102
 20:30:00     52
 18:30:00     50
 18:00:00     42
 19:30:00     39
 21:00:00     24
 17:59:00     17
 17:56:00     16
 17:58:00     14
 18:58:00      7
 17:54:00      6
 17:57:00      5
 18:57:00      5
 18:56:00      3
 17:50:00      3
 18:59:00      3
 17:35:00      2
 18:50:00      2
 19:10:00      2
 19:50:00      1
 19:20:00      1
 17:07:00      1
 19:45:00      1
 19:25:00      1
 12:50:00      1
 10:00:00      1
 1

We want to have a continuous time series from the park opening on the first day to the park closing on the last day withg a measurement for each ride all 5 minutes.
To achieve this, we will first create a continuous time series with all 5 minute intervals from the first to the last time of the data.
Then we will merge the data with the continuous time series to fill in missing values.

In [11]:
def merge_rides_with_full_grid(rides_df, park_open_time, park_close_time, frequency):
    first_data = ep_rides_df["timestamp"].min()
    last_data = ep_rides_df["timestamp"].max()

    all_dates = pd.date_range(start=first_data, end=last_data).date

    # Generate all possible timestamps (every x minutes from the starting to the closing time)
    time_range = pd.date_range(park_open_time, park_close_time, freq=frequency).time

    # Create a DataFrame with all possible combinations of (date, time, ride)
    full_grid = pd.MultiIndex.from_product(
        [all_dates, time_range, unique_rides], 
        names=["date", "time", "ride_name"]
    ).to_frame(index=False)

    # Extract date and time separately from existing data
    rides_df["date"] = rides_df["timestamp"].dt.date
    rides_df["time"] = rides_df["timestamp"].dt.time

    # Merge full grid with original data
    merged_df = full_grid.merge(rides_df, on=["date", "time", "ride_name"], how="left")

    # Reconstruct timestamp column from date and time columns
    merged_df["timestamp"] = pd.to_datetime(merged_df["date"].astype(str) + " " + merged_df["time"].astype(str))

    return merged_df

merged_df = merge_rides_with_full_grid(ep_rides_df, "09:00", "21:00", "5min")
merged_df

Unnamed: 0,date,time,ride_name,timestamp,wait_time
0,2017-05-23,09:00:00,alpine express enzian,2017-05-23 09:00:00,1.0
1,2017-05-23,09:00:00,arena of football be part of it,2017-05-23 09:00:00,
2,2017-05-23,09:00:00,arthur,2017-05-23 09:00:00,1.0
3,2017-05-23,09:00:00,atlantica supersplash,2017-05-23 09:00:00,1.0
4,2017-05-23,09:00:00,atlantis adventure,2017-05-23 09:00:00,1.0
...,...,...,...,...,...
14196898,2024-12-31,21:00:00,pirates in batavia,2024-12-31 21:00:00,
14196899,2024-12-31,21:00:00,snorri touren,2024-12-31 21:00:00,
14196900,2024-12-31,21:00:00,voltron nevera powered by rimac,2024-12-31 21:00:00,
14196901,2024-12-31,21:00:00,bellevue ferris wheel,2024-12-31 21:00:00,


We want to know whether the park is open or closed at each time point. We will add a column "open" with 1 for open and 0 for closed. We choose the first time of the day as the opening time and the last time of the day as the closing time. We also add in completely missing days.

In [12]:
# returns all dates for which there is no data available in the rides DataFrame
def get_dates_with_missing_data(rides_df, merged_df):
  all_dates = merged_df["timestamp"].dt.date.unique()
  dates_with_data = rides_df["timestamp"].dt.date.unique()
  missing_dates = set(all_dates) - set(dates_with_data)
  return missing_dates

missing_dates = get_dates_with_missing_data(ep_rides_df, merged_df)

In [13]:
def add_closed_column(rides_df, merged_df, missing_dates):
  # Find the actual opening and closing time per day
  daily_min_time = rides_df.groupby("date")["time"].min()
  daily_max_time = rides_df.groupby("date")["time"].max()

  # Merge the daily open/close times to full grid
  merged_df = merged_df.merge(daily_min_time, on="date", how="left", suffixes=("", "_open"))
  merged_df = merged_df.merge(daily_max_time, on="date", how="left", suffixes=("", "_close"))

  # Determine when the park is closed
  merged_df["closed"] = (merged_df["time"] < merged_df["time_open"]) | (merged_df["time"] > merged_df["time_close"])

  # Set to closed if there's no data for a compelte day available
  merged_df.loc[merged_df["date"].isin(missing_dates), "closed"] = True

  return merged_df

merged_df = add_closed_column(ep_rides_df, merged_df, missing_dates)
merged_df

Unnamed: 0,date,time,ride_name,timestamp,wait_time,time_open,time_close,closed
0,2017-05-23,09:00:00,alpine express enzian,2017-05-23 09:00:00,1.0,09:00:00,18:30:00,False
1,2017-05-23,09:00:00,arena of football be part of it,2017-05-23 09:00:00,,09:00:00,18:30:00,False
2,2017-05-23,09:00:00,arthur,2017-05-23 09:00:00,1.0,09:00:00,18:30:00,False
3,2017-05-23,09:00:00,atlantica supersplash,2017-05-23 09:00:00,1.0,09:00:00,18:30:00,False
4,2017-05-23,09:00:00,atlantis adventure,2017-05-23 09:00:00,1.0,09:00:00,18:30:00,False
...,...,...,...,...,...,...,...,...
14196898,2024-12-31,21:00:00,pirates in batavia,2024-12-31 21:00:00,,11:00:00,18:55:00,True
14196899,2024-12-31,21:00:00,snorri touren,2024-12-31 21:00:00,,11:00:00,18:55:00,True
14196900,2024-12-31,21:00:00,voltron nevera powered by rimac,2024-12-31 21:00:00,,11:00:00,18:55:00,True
14196901,2024-12-31,21:00:00,bellevue ferris wheel,2024-12-31 21:00:00,,11:00:00,18:55:00,True


We force the wait time to be exactly 0 if the park is determined to be closed.

In [14]:
# Set wait_time = 0 only when the park is closed
merged_df.loc[merged_df["closed"], "wait_time"] = 0
merged_df

Unnamed: 0,date,time,ride_name,timestamp,wait_time,time_open,time_close,closed
0,2017-05-23,09:00:00,alpine express enzian,2017-05-23 09:00:00,1.0,09:00:00,18:30:00,False
1,2017-05-23,09:00:00,arena of football be part of it,2017-05-23 09:00:00,,09:00:00,18:30:00,False
2,2017-05-23,09:00:00,arthur,2017-05-23 09:00:00,1.0,09:00:00,18:30:00,False
3,2017-05-23,09:00:00,atlantica supersplash,2017-05-23 09:00:00,1.0,09:00:00,18:30:00,False
4,2017-05-23,09:00:00,atlantis adventure,2017-05-23 09:00:00,1.0,09:00:00,18:30:00,False
...,...,...,...,...,...,...,...,...
14196898,2024-12-31,21:00:00,pirates in batavia,2024-12-31 21:00:00,0.0,11:00:00,18:55:00,True
14196899,2024-12-31,21:00:00,snorri touren,2024-12-31 21:00:00,0.0,11:00:00,18:55:00,True
14196900,2024-12-31,21:00:00,voltron nevera powered by rimac,2024-12-31 21:00:00,0.0,11:00:00,18:55:00,True
14196901,2024-12-31,21:00:00,bellevue ferris wheel,2024-12-31 21:00:00,0.0,11:00:00,18:55:00,True


As the next action, we restore the timestamp column and drop any intermediate columns that were created during the cleaning process.

In [15]:
def restore_timestamp_and_drop_columns(merged_df):
  # Restore timestamp column
  merged_df["timestamp"] = pd.to_datetime(merged_df["date"].astype(str) + " " + merged_df["time"].astype(str))

  # Drop unnecessary columns
  merged_df = merged_df.drop(columns=["date", "time", "time_open", "time_close"])

  return merged_df

merged_df = restore_timestamp_and_drop_columns(merged_df)
merged_df

Unnamed: 0,ride_name,timestamp,wait_time,closed
0,alpine express enzian,2017-05-23 09:00:00,1.0,False
1,arena of football be part of it,2017-05-23 09:00:00,,False
2,arthur,2017-05-23 09:00:00,1.0,False
3,atlantica supersplash,2017-05-23 09:00:00,1.0,False
4,atlantis adventure,2017-05-23 09:00:00,1.0,False
...,...,...,...,...
14196898,pirates in batavia,2024-12-31 21:00:00,0.0,True
14196899,snorri touren,2024-12-31 21:00:00,0.0,True
14196900,voltron nevera powered by rimac,2024-12-31 21:00:00,0.0,True
14196901,bellevue ferris wheel,2024-12-31 21:00:00,0.0,True


The weather data is only available all 30 minutes, so we will forward fill the weather data to have a value for each 5 minute interval.

In [16]:
# Resample weather data to every 5 minutes using forward fill
def resample_weather_data(weather_df, frequency):
    weather_df = weather_df.set_index("timestamp").resample(frequency).ffill().reset_index()
    return weather_df

ep_weather_df = resample_weather_data(ep_weather_df, "5min")
ep_weather_df

Unnamed: 0,timestamp,temperature,rain,wind
0,2017-05-23 09:00:00,18.7,0.0,1.1
1,2017-05-23 09:05:00,18.7,0.0,1.1
2,2017-05-23 09:10:00,18.7,0.0,1.1
3,2017-05-23 09:15:00,18.7,0.0,1.1
4,2017-05-23 09:20:00,18.7,0.0,1.1
...,...,...,...,...
800468,2024-12-31 18:40:00,-1.0,0.0,2.8
800469,2024-12-31 18:45:00,-1.0,0.0,2.8
800470,2024-12-31 18:50:00,-1.0,0.0,2.8
800471,2024-12-31 18:55:00,-1.0,0.0,2.8


The two dataframes are then merged on the timestamp column to create the final dataset.

In [17]:
# Merge the weather data into the ride schedule
def merge_weather_data(merged_df, weather_df):
    merged_df = merged_df.merge(weather_df, on="timestamp", how="left")
    return merged_df

merged_df = merge_weather_data(merged_df, ep_weather_df)
merged_df

Unnamed: 0,ride_name,timestamp,wait_time,closed,temperature,rain,wind
0,alpine express enzian,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
1,arena of football be part of it,2017-05-23 09:00:00,,False,18.7,0.0,1.1
2,arthur,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
3,atlantica supersplash,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
4,atlantis adventure,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
...,...,...,...,...,...,...,...
14196898,pirates in batavia,2024-12-31 21:00:00,0.0,True,,,
14196899,snorri touren,2024-12-31 21:00:00,0.0,True,,,
14196900,voltron nevera powered by rimac,2024-12-31 21:00:00,0.0,True,,,
14196901,bellevue ferris wheel,2024-12-31 21:00:00,0.0,True,,,


When the park is closed we don't have any weather data available from the source. We filled this using forward fill before while upsampling the weather data. To make it consistent we set the weather data to `Na` when the park is closed.

In [18]:
# Set weather data to missing when the park is closed
merged_df.loc[merged_df["closed"], ["temperature", "rain", "wind"]] = pd.NA
merged_df

Unnamed: 0,ride_name,timestamp,wait_time,closed,temperature,rain,wind
0,alpine express enzian,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
1,arena of football be part of it,2017-05-23 09:00:00,,False,18.7,0.0,1.1
2,arthur,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
3,atlantica supersplash,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
4,atlantis adventure,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
...,...,...,...,...,...,...,...
14196898,pirates in batavia,2024-12-31 21:00:00,0.0,True,,,
14196899,snorri touren,2024-12-31 21:00:00,0.0,True,,,
14196900,voltron nevera powered by rimac,2024-12-31 21:00:00,0.0,True,,,
14196901,bellevue ferris wheel,2024-12-31 21:00:00,0.0,True,,,


In [19]:
ep_merged_df = merged_df.copy()

In [20]:
# sort the data by the timestamp
ep_merged_df = ep_merged_df.sort_values("timestamp")
ep_merged_df

Unnamed: 0,ride_name,timestamp,wait_time,closed,temperature,rain,wind
0,alpine express enzian,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
20,poppy towers,2017-05-23 09:00:00,0.0,False,18.7,0.0,1.1
21,silver star,2017-05-23 09:00:00,0.0,False,18.7,0.0,1.1
22,swiss bob run,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
23,tirol log flume,2017-05-23 09:00:00,1.0,False,18.7,0.0,1.1
...,...,...,...,...,...,...,...
14196880,fjordrafting,2024-12-31 21:00:00,0.0,True,,,
14196881,jim button journey through morrowland,2024-12-31 21:00:00,0.0,True,,,
14196882,josefinas magical imperial journey,2024-12-31 21:00:00,0.0,True,,,
14196884,madame freudenreich curiosits,2024-12-31 21:00:00,0.0,True,,,


## Clean Rulantica data

### Clean Ride Names

First, list all the uncleaned ride names in the data

In [21]:
unique_rides = rulantica_rides_df["ride_name"].unique()
unique_rides

array(['Dugdrob', 'Hugin', 'Isbrekker', 'Munin', "Snorri's Saga",
       'Stormvind', 'Svalgur Rytt', 'Två Fall', 'Vildfål', 'Vildstrøm',
       'Tønnevirvel', 'Vikingløp', 'Falrok'], dtype=object)

Here, it seems that we don't need to remove any rides (no duplicates, no virtual line, no unknown).
Thus we only need to lowercase all ride names and remove all special characters.

In [22]:
rulantica_rides_df["ride_name"] = rulantica_rides_df["ride_name"].apply(standardize_ride_names)

In [23]:
unique_rides = rulantica_rides_df["ride_name"].unique()
unique_rides

array(['dugdrob', 'hugin', 'isbrekker', 'munin', 'snorris saga',
       'stormvind', 'svalgur rytt', 'tv fall', 'vildfl', 'vildstrm',
       'tnnevirvel', 'vikinglp', 'falrok'], dtype=object)

### Align Timestamps

Again, we want to check the opening and closing time of the park.

In [24]:
get_first_and_last_timestamps(rulantica_rides_df)

(min
 10:00:00    241
 09:30:00    188
 09:32:00     17
 10:10:00     16
 09:31:00     15
            ... 
 15:48:00      1
 20:50:00      1
 14:00:00      1
 12:55:00      1
 12:30:00      1
 Name: count, Length: 91, dtype: int64,
 max
 21:55:00    559
 21:57:00     25
 21:56:00     17
 21:58:00     12
 21:59:00      9
 09:50:00      3
 22:00:00      3
 21:00:00      2
 17:15:00      2
 12:45:00      2
 16:30:00      2
 20:55:00      2
 16:25:00      1
 11:35:00      1
 15:10:00      1
 20:00:00      1
 12:40:00      1
 15:25:00      1
 11:00:00      1
 13:55:00      1
 21:45:00      1
 10:10:00      1
 14:55:00      1
 12:00:00      1
 18:55:00      1
 12:05:00      1
 21:30:00      1
 10:30:00      1
 15:40:00      1
 11:05:00      1
 17:50:00      1
 19:00:00      1
 16:15:00      1
 14:10:00      1
 17:02:00      1
 14:08:00      1
 20:02:00      1
 16:07:00      1
 18:45:00      1
 18:00:00      1
 20:35:00      1
 15:05:00      1
 13:00:00      1
 18:35:00      1
 14:45:00      

Here it seems that 09:00 to 21:00 is a good range for the park opening hours. THis also aligns to the opening times communicated by the park.

In [25]:
# constrain ride times to 09:00 - 22:00
rulantica_rides_df = rulantica_rides_df[(rulantica_rides_df["timestamp"].dt.time >= pd.to_datetime("09:00").time()) & (rulantica_rides_df["timestamp"].dt.time <= pd.to_datetime("22:00").time())]

get_first_and_last_timestamps(ep_rides_df)

(min
 09:00:00    1512
 11:00:00     338
 09:01:00      22
 09:02:00      13
 09:04:00      11
 09:03:00       9
 09:05:00       7
 11:02:00       6
 11:01:00       5
 11:04:00       4
 11:10:00       3
 09:10:00       3
 13:45:00       2
 09:40:00       1
 16:30:00       1
 09:30:00       1
 10:40:00       1
 11:30:00       1
 12:42:00       1
 12:10:00       1
 15:28:00       1
 15:10:00       1
 09:50:00       1
 11:03:00       1
 09:09:00       1
 Name: count, dtype: int64,
 max
 17:55:00    917
 18:55:00    369
 20:00:00    143
 19:00:00    106
 19:55:00    102
 20:30:00     52
 18:30:00     50
 18:00:00     42
 19:30:00     39
 21:00:00     24
 17:59:00     17
 17:56:00     16
 17:58:00     14
 18:58:00      7
 17:54:00      6
 17:57:00      5
 18:57:00      5
 18:56:00      3
 17:50:00      3
 18:59:00      3
 17:35:00      2
 18:50:00      2
 19:10:00      2
 19:50:00      1
 19:20:00      1
 17:07:00      1
 19:45:00      1
 19:25:00      1
 12:50:00      1
 10:00:00      1
 1

The same procedure as for the EuropaPark data is applied to create a continuous time series and merge the data

In [26]:
merged_df = merge_rides_with_full_grid(rulantica_rides_df, "09:00", "22:00", "5min")
missing_dates = get_dates_with_missing_data(rulantica_rides_df, merged_df)
merged_df = add_closed_column(rulantica_rides_df, merged_df, missing_dates)
merged_df.loc[merged_df["closed"], "wait_time"] = 0
merged_df = restore_timestamp_and_drop_columns(merged_df)
merged_df

Unnamed: 0,ride_name,timestamp,wait_time,closed
0,dugdrob,2017-05-23 09:00:00,0.0,True
1,hugin,2017-05-23 09:00:00,0.0,True
2,isbrekker,2017-05-23 09:00:00,0.0,True
3,munin,2017-05-23 09:00:00,0.0,True
4,snorris saga,2017-05-23 09:00:00,0.0,True
...,...,...,...,...
5673975,vildfl,2024-12-31 22:00:00,0.0,True
5673976,vildstrm,2024-12-31 22:00:00,0.0,True
5673977,tnnevirvel,2024-12-31 22:00:00,0.0,True
5673978,vikinglp,2024-12-31 22:00:00,0.0,True


In [27]:
rulantica_weather_df = resample_weather_data(rulantica_weather_df, "5min")
merged_df = merge_weather_data(merged_df, rulantica_weather_df)
merged_df.loc[merged_df["closed"], ["temperature", "rain", "wind"]] = pd.NA

rulantica_merged_df = merged_df.copy()
rulantica_merged_df

Unnamed: 0,ride_name,timestamp,wait_time,closed,temperature,rain,wind
0,dugdrob,2017-05-23 09:00:00,0.0,True,,,
1,hugin,2017-05-23 09:00:00,0.0,True,,,
2,isbrekker,2017-05-23 09:00:00,0.0,True,,,
3,munin,2017-05-23 09:00:00,0.0,True,,,
4,snorris saga,2017-05-23 09:00:00,0.0,True,,,
...,...,...,...,...,...,...,...
5673975,vildfl,2024-12-31 22:00:00,0.0,True,,,
5673976,vildstrm,2024-12-31 22:00:00,0.0,True,,,
5673977,tnnevirvel,2024-12-31 22:00:00,0.0,True,,,
5673978,vikinglp,2024-12-31 22:00:00,0.0,True,,,


In [28]:
# sort the data by the timestamp
rulantica_merged_df = rulantica_merged_df.sort_values("timestamp")
rulantica_merged_df

Unnamed: 0,ride_name,timestamp,wait_time,closed,temperature,rain,wind
0,dugdrob,2017-05-23 09:00:00,0.0,True,,,
12,falrok,2017-05-23 09:00:00,0.0,True,,,
11,vikinglp,2017-05-23 09:00:00,0.0,True,,,
10,tnnevirvel,2017-05-23 09:00:00,0.0,True,,,
8,vildfl,2017-05-23 09:00:00,0.0,True,,,
...,...,...,...,...,...,...,...
5673968,hugin,2024-12-31 22:00:00,0.0,True,,,
5673967,dugdrob,2024-12-31 22:00:00,0.0,True,,,
5673978,vikinglp,2024-12-31 22:00:00,0.0,True,,,
5673972,stormvind,2024-12-31 22:00:00,0.0,True,,,


## Save the data to disk

In [31]:
os.makedirs(ep_output_dir, exist_ok=True)
os.makedirs(rulantica_output_dir, exist_ok=True)

ep_merged_df.to_parquet(os.path.join(ep_output_dir, "merged_data.parquet"), index=False)
rulantica_merged_df.to_parquet(os.path.join(rulantica_output_dir, "merged_data.parquet"), index=False)