In [33]:
import pandas as pd

attendance = pd.read_csv("data/attendance.csv")
entity_schedule = pd.read_csv("data/entity_schedule.csv")
link_atrraction_park = pd.read_csv("data/link_attraction_park.csv")
waiting_times = pd.read_csv("data/waiting_times.csv")
weather_data = pd.read_csv("data/weather_data.csv")

In [34]:
# Filtering for only Port Aventura rides
filtered_attractions = link_atrraction_park[
    link_atrraction_park["ATTRACTION;PARK"].str.contains("PortAventura World")
]
ride_names = filtered_attractions["ATTRACTION;PARK"].str.split(";", expand=True)[0]
waiting_times = waiting_times[
    waiting_times["ENTITY_DESCRIPTION_SHORT"].isin(ride_names)
]

# Converting to datetime
entity_schedule["DEB_TIME"] = pd.to_datetime(entity_schedule["DEB_TIME"])
entity_schedule["FIN_TIME"] = pd.to_datetime(entity_schedule["FIN_TIME"])

waiting_times["DEB_TIME"] = pd.to_datetime(waiting_times["DEB_TIME"])
waiting_times["FIN_TIME"] = pd.to_datetime(waiting_times["FIN_TIME"])

# Excluding Tivoli Gardens park and Tivoli Gardens rides
entity_schedule = entity_schedule[
    entity_schedule["ENTITY_DESCRIPTION_SHORT"] != "Tivoli Gardens"
]
park_closures = entity_schedule[entity_schedule["ENTITY_TYPE"] == "PARK"]

entity_schedule_rides = entity_schedule[
    entity_schedule["ENTITY_DESCRIPTION_SHORT"].isin(ride_names)
]
ride_closures = entity_schedule[entity_schedule["ENTITY_TYPE"] == "ATTR"]

# Create an interval index for park closures
park_intervals = pd.IntervalIndex.from_arrays(
    park_closures["DEB_TIME"], park_closures["FIN_TIME"], closed="both"
)

# Find which waiting_times fall into any park closure
mask_park = waiting_times["DEB_TIME"].apply(lambda x: park_intervals.contains(x).any())

# Remove affected waiting times
waiting_times = waiting_times[~mask_park]

# Initialize mask for ride closures
mask_ride = pd.Series(False, index=waiting_times.index)

# Loop through each ride and apply the interval check
for ride_name, ride_group in ride_closures.groupby("ENTITY_DESCRIPTION_SHORT"):
    # Create an IntervalIndex for the ride closure times
    ride_intervals = pd.IntervalIndex.from_arrays(
        ride_group["DEB_TIME"], ride_group["FIN_TIME"], closed="both"
    )

    # Mask waiting times for this specific ride that fall within the ride closure interval
    mask_ride_for_ride = waiting_times[
        (waiting_times["ENTITY_DESCRIPTION_SHORT"] == ride_name)
        & waiting_times["DEB_TIME"].apply(lambda x: ride_intervals.contains(x).any())
    ].index

    # Update the global mask
    mask_ride.loc[mask_ride_for_ride] = True

# Remove waiting times affected by ride closures
waiting_times = waiting_times[~mask_ride]


In [35]:
# Processing weather data
relevant_weather_data = weather_data[
    ["dt_iso", "temp", "humidity", "wind_speed", "clouds_all", "rain_1h", "snow_1h"]
]
relevant_weather_data["dt_iso"] = relevant_weather_data["dt_iso"].str.split("+").str[0]
relevant_weather_data["date"] = pd.to_datetime(
    relevant_weather_data["dt_iso"].str.split(" ").str[0]
)
relevant_weather_data["time"] = relevant_weather_data["dt_iso"].str.split(" ").str[1]
relevant_weather_data["hour"] = (
    relevant_weather_data["time"].str.split(":").str[0].astype(int) + 1
)
relevant_weather_data = relevant_weather_data.drop(columns=["dt_iso", "time"])
relevant_weather_data.fillna(0, inplace=True)

# One hot encoding times
waiting_times["DEB_TIME"] = waiting_times["DEB_TIME"].astype(str)
waiting_times["date"] = pd.to_datetime(waiting_times["DEB_TIME"].str.split(" ").str[0])
waiting_times["time"] = waiting_times["DEB_TIME"].str.split(" ").str[1]

# Encode the hour and quarter linearly in the time column
waiting_times["hour"] = waiting_times["time"].str.split(":").str[0].astype(int)
waiting_times["minute"] = waiting_times["time"].str.split(":").str[1].astype(int)
waiting_times["time_encoded"] = waiting_times["hour"] * 4 + waiting_times["minute"] / 15

# Merge the weather data
waiting_times = pd.merge(
    waiting_times, relevant_weather_data, on=["date", "hour"], how="left"
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_weather_data['dt_iso'] = relevant_weather_data['dt_iso'].str.split('+').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_weather_data['date'] = pd.to_datetime(relevant_weather_data['dt_iso'].str.split(' ').str[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_weathe

In [36]:
import numpy as np

waiting_times["day_of_week"] = waiting_times["date"].dt.dayofweek
waiting_times["day_of_week_sin"] = np.sin(2 * np.pi * waiting_times["day_of_week"] / 7)
waiting_times["day_of_week_cos"] = np.cos(2 * np.pi * waiting_times["day_of_week"] / 7)

waiting_times["year"] = waiting_times["date"].dt.year
waiting_times["month"] = waiting_times["date"].dt.month
waiting_times["day"] = waiting_times["date"].dt.day
waiting_times = pd.get_dummies(waiting_times, columns=["year", "month", "day"])

waiting_times.drop(columns=["time", "hour", "minute", "day_of_week"], inplace=True)

In [37]:
# One hot encoding ride names
ride_names = waiting_times["ENTITY_DESCRIPTION_SHORT"].unique()
ride_names_encoded = pd.get_dummies(waiting_times["ENTITY_DESCRIPTION_SHORT"])
waiting_times = pd.concat([waiting_times, ride_names_encoded], axis=1)

In [38]:
# Dropping any rows with downtime for training
waiting_times = waiting_times[waiting_times["DOWNTIME"] == 0]

# Dropping columns that are not useful for training
waiting_times.drop(
    columns=[
        "WORK_DATE",
        "FIN_TIME",
        "NB_UNITS",
        "GUEST_CARRIED",
        "NB_MAX_UNIT",
        "ADJUST_CAPACITY",
        "OPEN_TIME",
        "UP_TIME",
        "CAPACITY",
        "DOWNTIME",
        "ENTITY_DESCRIPTION_SHORT",
    ],
    inplace=True,
)

In [39]:
# Adding attendance as a feature
attendance = attendance[attendance["FACILITY_NAME"] == "PortAventura Park"].rename(
    columns={"USAGE_DATE": "date"}
)
waiting_times = pd.merge(waiting_times, attendance, on="date", how="left")

In [40]:
waiting_times.columns

Index(['DEB_TIME', 'DEB_TIME_HOUR', 'WAIT_TIME_MAX', 'date', 'time_encoded',
       'temp', 'humidity', 'wind_speed', 'clouds_all', 'rain_1h', 'snow_1h',
       'day_of_week_sin', 'day_of_week_cos', 'year_2018', 'year_2019',
       'year_2020', 'year_2021', 'year_2022', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12', 'day_1', 'day_2', 'day_3', 'day_4',
       'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11',
       'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18',
       'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25',
       'day_26', 'day_27', 'day_28', 'day_29', 'day_30', 'day_31',
       'Bumper Cars', 'Bungee Jump', 'Circus Train', 'Crazy Dance',
       'Dizzy Dropper', 'Drop Tower', 'Flying Coaster', 'Free Fall',
       'Giant Wheel', 'Giga Coaster', 'Go-Karts', 'Haunted House',
       'Himalaya Ride', 'Inverted Coaster', 'Ki

In [41]:
waiting_times.to_csv("data/processed_data_waiting_times.csv")

In [1]:
import pandas as pd

waiting_times = pd.read_csv("data/processed_data_waiting_times.csv")


waiting_times.drop(columns=["DEB_TIME", "DEB_TIME_HOUR", "FACILITY_NAME"], inplace=True)

In [13]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold

# Define the number of splits for cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Initialize lists to store the results
train_rmse_list = []
val_rmse_list = []

waiting_times["date"] = pd.to_datetime(waiting_times["date"])
cutoff_date = waiting_times["date"].max() - pd.DateOffset(months=3)
train_set = waiting_times[waiting_times["date"] < cutoff_date]
val_set = waiting_times[waiting_times["date"] >= cutoff_date]
X_train = train_set.drop(columns=["WAIT_TIME_MAX", "date"])
y_train = train_set["WAIT_TIME_MAX"]
X_val = val_set.drop(columns=["WAIT_TIME_MAX", "date"])
y_val = val_set["WAIT_TIME_MAX"]

# Train the final model on the entire training data
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=10,
    random_state=42,
    subsample=0.9,
    gamma=0.1,
)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

# Predict on the final validation set
y_pred_final = model.predict(X_val)

np.where(y_pred_final < 0, 0, y_pred_final)

# Evaluate the final model
final_rmse = root_mean_squared_error(y_val, y_pred_final)
print(f"Final Validation RMSE: {final_rmse}")

[0]	validation_0-rmse:22.98672
[100]	validation_0-rmse:12.01347
[200]	validation_0-rmse:11.48751
[300]	validation_0-rmse:11.48942
[400]	validation_0-rmse:11.51301
[500]	validation_0-rmse:11.57714
[600]	validation_0-rmse:11.60122
[700]	validation_0-rmse:11.62659
[800]	validation_0-rmse:11.65505
[900]	validation_0-rmse:11.72716
[999]	validation_0-rmse:11.77313
Final Validation RMSE: 11.773126602172852


In [27]:
pred_df = X_val.drop(
    columns=[
        "temp",
        "humidity",
        "wind_speed",
        "clouds_all",
        "rain_1h",
        "snow_1h",
        "day_of_week_sin",
        "day_of_week_cos",
        "attendance",
    ]
)
pred_df["WAIT_TIME_MAX"] = y_pred_final
pred_df.loc[pred_df["WAIT_TIME_MAX"] < 0, "WAIT_TIME_MAX"] = 0

In [61]:
from sklearn.metrics import mean_absolute_error

final_rmse = root_mean_squared_error(y_val, pred_df["WAIT_TIME_MAX"])
final_mae = mean_absolute_error(y_val, pred_df["WAIT_TIME_MAX"])
print(f"Final Validation RMSE: {final_rmse}")
print(f"Final Validation MAE: {final_mae}")

KeyError: 'WAIT_TIME_MAX'

In [59]:
waiting_times = pd.read_csv("data/processed_data_waiting_times.csv")
pred_df = waiting_times[
    [
        "DEB_TIME",
        "date",
        "Bumper Cars",
        "Bungee Jump",
        "Circus Train",
        "Crazy Dance",
        "Dizzy Dropper",
        "Drop Tower",
        "Flying Coaster",
        "Free Fall",
        "Giant Wheel",
        "Giga Coaster",
        "Go-Karts",
        "Haunted House",
        "Himalaya Ride",
        "Inverted Coaster",
        "Kiddie Coaster",
        "Merry Go Round",
        "Oz Theatre",
        "Rapids Ride",
        "Roller Coaster",
        "Spinning Coaster",
        "Spiral Slide",
        "Superman Ride",
        "Swing Ride",
        "Vertical Drop",
        "Water Ride",
        "Zipline",
    ]
]

pred_df["ride"] = pred_df[
    [
        "Bumper Cars",
        "Bungee Jump",
        "Circus Train",
        "Crazy Dance",
        "Dizzy Dropper",
        "Drop Tower",
        "Flying Coaster",
        "Free Fall",
        "Giant Wheel",
        "Giga Coaster",
        "Go-Karts",
        "Haunted House",
        "Himalaya Ride",
        "Inverted Coaster",
        "Kiddie Coaster",
        "Merry Go Round",
        "Oz Theatre",
        "Rapids Ride",
        "Roller Coaster",
        "Spinning Coaster",
        "Spiral Slide",
        "Superman Ride",
        "Swing Ride",
        "Vertical Drop",
        "Water Ride",
        "Zipline",
    ]
].idxmax(axis=1)


pred_df["date"] = pd.to_datetime(pred_df["date"])
pred_df = pred_df[pred_df["date"] >= cutoff_date]
pred_df["pred"] = y_pred_final
pred_df.loc[pred_df["pred"] < 0, "pred"] = 0

pred_df.drop(
    columns=[
        "Bumper Cars",
        "Bungee Jump",
        "Circus Train",
        "Crazy Dance",
        "Dizzy Dropper",
        "Drop Tower",
        "Flying Coaster",
        "Free Fall",
        "Giant Wheel",
        "Giga Coaster",
        "Go-Karts",
        "Haunted House",
        "Himalaya Ride",
        "Inverted Coaster",
        "Kiddie Coaster",
        "Merry Go Round",
        "Oz Theatre",
        "Rapids Ride",
        "Roller Coaster",
        "Spinning Coaster",
        "Spiral Slide",
        "Superman Ride",
        "Swing Ride",
        "Vertical Drop",
        "Water Ride",
        "Zipline",
        "date",
    ],
    inplace=True,
)

pred_df.to_csv("data/waiting_time_predictions.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df['ride'] = pred_df[['Bumper Cars', 'Bungee Jump', 'Circus Train', 'Crazy Dance',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df["date"] = pd.to_datetime(pred_df["date"])


In [62]:
final_rmse = root_mean_squared_error(y_val, pred_df["pred"])
final_mae = mean_absolute_error(y_val, pred_df["pred"])
print(f"Final Validation RMSE: {final_rmse}")
print(f"Final Validation MAE: {final_mae}")

Final Validation RMSE: 11.723296165466309
Final Validation MAE: 5.6422038078308105
