In [1]:
import os
import datetime

import pandas as pd
import holidays

## Load the Data

In [2]:
# Root of the processed data; the "ep" subfolder holds both the input and the output.
data_input_dir = "../data/processed"
ep_dir = os.path.join(data_input_dir, "ep")

# Input: merged frame. Output: the same frame with holiday flags added.
ep_input_file = os.path.join(ep_dir, "merged_data.parquet")
ep_output_file = os.path.join(ep_dir, "merged_with_holidays.parquet")

We first read the merged Europa-Park data.

In [3]:
# Load the merged frame and report its (rows, columns) shape.
ep_df = pd.read_parquet(path=ep_input_file)
print(ep_df.shape)

(14196903, 7)


We need holiday data from 2017 to 2024 as we have data in this range

In [4]:
# Years covered by the data; the holiday calendars below are generated for this span.
# The year component is extracted once instead of once per aggregate.
timestamp_years = ep_df["timestamp"].dt.year
years = range(timestamp_years.min(), timestamp_years.max() + 1)
print(years)

range(2017, 2025)


## Swiss Holidays

Since each canton in Switzerland has its own holidays, we will first check which holidays are most common in the cantons of Switzerland. We will then use this data to create a list of holidays for the entire country.

In [5]:
# Subdivision codes passed to the holidays package, one per Swiss canton.
swiss_cantons = [
    "ZH", "BE", "LU", "UR", "SZ", "OW", "NW", "GL", "ZG",
    "FR", "SO", "BS", "BL", "SH", "AR", "AI", "SG", "GR",
    "AG", "TG", "TI", "VD", "VS", "NE", "GE", "JU",
]

In [6]:
# Tally, per holiday name, how many calendar entries carry it across all
# cantons and all years in `years`.
swiss_holiday_tally = {}
for canton in swiss_cantons:
    canton_calendar = holidays.country_holidays("CH", years=years, subdiv=canton)
    for _date, holiday_name in canton_calendar.items():
        swiss_holiday_tally[holiday_name] = swiss_holiday_tally.get(holiday_name, 0) + 1

# List of (name, count) pairs, most frequent first; sorted() is stable, so
# ties keep their first-seen order.
swiss_holiday_count = sorted(swiss_holiday_tally.items(), key=lambda item: item[1], reverse=True)
print(swiss_holiday_count)

[("New Year's Day", 208), ('Ascension Day', 208), ('National Day', 208), ('Christmas Day', 208), ('Good Friday', 184), ('Easter Monday', 152), ('Whit Monday', 152), ("Saint Stephen's Day", 120), ("All Saints' Day", 104), ('Corpus Christi', 96), ('Assumption Day', 88), ('Immaculate Conception', 80), ('Labor Day', 72), ("Saint Berchtold's Day", 58), ("Saint Joseph's Day", 40), ('Epiphany', 24), ('Saint Nicholas of Flüe', 8), ('Battle of Naefels Victory Day', 8), ('Saints Peter and Paul', 8), ('Prayer Monday', 8), ('Republic Day', 8), ('Genevan Fast', 8), ('Restoration Day', 8), ('Independence Day', 8)]


We see that in the range of 2017 to 2024, New Year's Day, Ascension Day, National Day and Christmas Day are common holidays in all cantons (208 = 26 cantons × 8 years). However, there are many other days that are present in many cantons.
We use a threshold of 70 occurrences during the 8 years to consider a holiday as common.

In [7]:
# Minimum tally (summed over cantons and years) for a holiday to count as common.
swiss_common_threshold = 70

common_swiss_holidays = []
for holiday_name, occurrences in swiss_holiday_count:
    if occurrences >= swiss_common_threshold:
        common_swiss_holidays.append(holiday_name)
print(common_swiss_holidays)

["New Year's Day", 'Ascension Day', 'National Day', 'Christmas Day', 'Good Friday', 'Easter Monday', 'Whit Monday', "Saint Stephen's Day", "All Saints' Day", 'Corpus Christi', 'Assumption Day', 'Immaculate Conception', 'Labor Day']


In [8]:
# Resolve every common Swiss holiday name to a concrete date for each year.
# The Aargau ("AG") calendar is used for the lookup because it turns out to
# contain all of the common holidays; the assert below guards that assumption.
# One calendar per year is built up front instead of one per (holiday, year) pair.
aargau_calendars = {
    year: holidays.country_holidays("CH", years=year, subdiv="AG")
    for year in years
}

swiss_holiday_dates = []
for holiday in common_swiss_holidays:
    for year, calendar in aargau_calendars.items():
        matching_dates = calendar.get_named(holiday, lookup="exact")
        assert matching_dates, f"Could not find date for {holiday} in {year}"
        swiss_holiday_dates.append(matching_dates[0])

# De-duplicate and order chronologically.
swiss_holiday_dates = sorted(set(swiss_holiday_dates))
print(swiss_holiday_dates)

[datetime.date(2017, 1, 1), datetime.date(2017, 4, 14), datetime.date(2017, 4, 17), datetime.date(2017, 5, 1), datetime.date(2017, 5, 25), datetime.date(2017, 6, 5), datetime.date(2017, 6, 15), datetime.date(2017, 8, 1), datetime.date(2017, 8, 15), datetime.date(2017, 11, 1), datetime.date(2017, 12, 8), datetime.date(2017, 12, 25), datetime.date(2017, 12, 26), datetime.date(2018, 1, 1), datetime.date(2018, 3, 30), datetime.date(2018, 4, 2), datetime.date(2018, 5, 1), datetime.date(2018, 5, 10), datetime.date(2018, 5, 21), datetime.date(2018, 5, 31), datetime.date(2018, 8, 1), datetime.date(2018, 8, 15), datetime.date(2018, 11, 1), datetime.date(2018, 12, 8), datetime.date(2018, 12, 25), datetime.date(2018, 12, 26), datetime.date(2019, 1, 1), datetime.date(2019, 4, 19), datetime.date(2019, 4, 22), datetime.date(2019, 5, 1), datetime.date(2019, 5, 30), datetime.date(2019, 6, 10), datetime.date(2019, 6, 20), datetime.date(2019, 8, 1), datetime.date(2019, 8, 15), datetime.date(2019, 11, 1)

## German Holidays

We apply the same logic for Germany as for Switzerland.

In [9]:
# Subdivision codes passed to the holidays package, one per German federal state.
german_states = [
    "BW", "BY", "BE", "BB", "HB", "HH", "HE", "MV",
    "NI", "NW", "RP", "SL", "SN", "ST", "SH", "TH",
]

In [10]:
# Tally, per holiday name, how many calendar entries carry it across all
# states and all years in `years`.
german_holiday_tally = {}
for state in german_states:
    state_calendar = holidays.country_holidays("DE", years=years, subdiv=state)
    for _date, holiday_name in state_calendar.items():
        german_holiday_tally[holiday_name] = german_holiday_tally.get(holiday_name, 0) + 1

# List of (name, count) pairs, most frequent first; sorted() is stable, so
# ties keep their first-seen order.
german_holiday_count = sorted(german_holiday_tally.items(), key=lambda item: item[1], reverse=True)
print(german_holiday_count)

[("New Year's Day", 128), ('Good Friday', 128), ('Easter Monday', 128), ('Labor Day', 128), ('Ascension Day', 128), ('Whit Monday', 128), ('German Unity Day', 128), ('Christmas Day', 128), ('Second Day of Christmas', 128), ('Reformation Day', 79), ('Corpus Christi', 48), ("All Saints' Day", 40), ('Epiphany', 24), ("International Women's Day", 8), ('Easter Sunday', 8), ('Whit Sunday', 8), ('Assumption Day', 8), ('Repentance and Prayer Day', 8), ("World Children's Day", 6), ('75th anniversary of the liberation from Nazism and the end of the Second World War in Europe', 1)]


In [11]:
# Minimum tally (summed over states and years) for a holiday to count as common.
german_common_threshold = 40

common_german_holidays = []
for holiday_name, occurrences in german_holiday_count:
    if occurrences >= german_common_threshold:
        common_german_holidays.append(holiday_name)
print(common_german_holidays)

["New Year's Day", 'Good Friday', 'Easter Monday', 'Labor Day', 'Ascension Day', 'Whit Monday', 'German Unity Day', 'Christmas Day', 'Second Day of Christmas', 'Reformation Day', 'Corpus Christi', "All Saints' Day"]


In [12]:
# Resolve every common German holiday name to a concrete date for each year.
# The "BW" calendar is consulted first, with "HH" as a fallback for holidays
# that "BW" does not list in a given year.
# One calendar per state and year is built up front instead of two per
# (holiday, year) pair.
bw_calendars = {
    year: holidays.country_holidays("DE", years=year, subdiv="BW")
    for year in years
}
hh_calendars = {
    year: holidays.country_holidays("DE", years=year, subdiv="HH")
    for year in years
}

german_holiday_dates = []
for holiday in common_german_holidays:
    for year in years:
        # `or` only evaluates the HH lookup when the BW lookup comes back empty.
        matching_dates = (
            bw_calendars[year].get_named(holiday, lookup="exact")
            or hh_calendars[year].get_named(holiday, lookup="exact")
        )
        assert matching_dates, f"Could not find date for {holiday} in {year}"
        german_holiday_dates.append(matching_dates[0])

# De-duplicate and order chronologically.
german_holiday_dates = sorted(set(german_holiday_dates))
print(german_holiday_dates)

[datetime.date(2017, 1, 1), datetime.date(2017, 4, 14), datetime.date(2017, 4, 17), datetime.date(2017, 5, 1), datetime.date(2017, 5, 25), datetime.date(2017, 6, 5), datetime.date(2017, 6, 15), datetime.date(2017, 10, 3), datetime.date(2017, 10, 31), datetime.date(2017, 11, 1), datetime.date(2017, 12, 25), datetime.date(2017, 12, 26), datetime.date(2018, 1, 1), datetime.date(2018, 3, 30), datetime.date(2018, 4, 2), datetime.date(2018, 5, 1), datetime.date(2018, 5, 10), datetime.date(2018, 5, 21), datetime.date(2018, 5, 31), datetime.date(2018, 10, 3), datetime.date(2018, 10, 31), datetime.date(2018, 11, 1), datetime.date(2018, 12, 25), datetime.date(2018, 12, 26), datetime.date(2019, 1, 1), datetime.date(2019, 4, 19), datetime.date(2019, 4, 22), datetime.date(2019, 5, 1), datetime.date(2019, 5, 30), datetime.date(2019, 6, 10), datetime.date(2019, 6, 20), datetime.date(2019, 10, 3), datetime.date(2019, 10, 31), datetime.date(2019, 11, 1), datetime.date(2019, 12, 25), datetime.date(2019,

## French Holidays

In [13]:
# Subdivision codes passed to the holidays package for France.
# NOTE(review): apart from "GES", these codes look like overseas territories,
# so the "common" French holidays derived from them may not match mainland
# France -- confirm this selection is intended.
french_territories = [
    "BL", "GES", "GP", "GY", "MF", "MQ",
    "NC", "PF", "RE", "WF", "YT",
]

In [14]:
# Tally, per holiday name, how many calendar entries carry it across all
# listed French subdivisions and all years in `years`.
french_holiday_tally = {}
for territory in french_territories:
    territory_calendar = holidays.country_holidays("FR", years=years, subdiv=territory)
    for _date, holiday_name in territory_calendar.items():
        french_holiday_tally[holiday_name] = french_holiday_tally.get(holiday_name, 0) + 1

# List of (name, count) pairs, most frequent first; sorted() is stable, so
# ties keep their first-seen order.
french_holiday_count = sorted(french_holiday_tally.items(), key=lambda item: item[1], reverse=True)
print(french_holiday_count)

[("New Year's Day", 88), ('Labor Day', 88), ('Victory Day', 88), ('National Day', 88), ('Armistice Day', 88), ('Easter Monday', 88), ('Ascension Day', 88), ('Assumption Day', 88), ("All Saints' Day", 88), ('Christmas Day', 88), ('Whit Monday', 87), ('Abolition of Slavery', 54), ('Good Friday', 32), ('Feast of Victor Schoelcher', 16), ("Saint Stephen's Day", 8), ('Mi-Careme', 8), ('Citizenship Day', 8), ('Missionary Day', 8), ('Internal Autonomy Day', 8), ('Feast of Saint Peter Chanel', 8), ('Festival of the Territory', 8), ('Abolition of Slavery; Whit Monday', 1)]


In [15]:
# Minimum tally (summed over subdivisions and years) for a holiday to count as common.
french_common_threshold = 30

common_french_holidays = []
for holiday_name, occurrences in french_holiday_count:
    if occurrences >= french_common_threshold:
        common_french_holidays.append(holiday_name)
print(common_french_holidays)

["New Year's Day", 'Labor Day', 'Victory Day', 'National Day', 'Armistice Day', 'Easter Monday', 'Ascension Day', 'Assumption Day', "All Saints' Day", 'Christmas Day', 'Whit Monday', 'Abolition of Slavery', 'Good Friday']


In [16]:
# Resolve every common French holiday name to a concrete date for each year.
# The "GP" calendar is used for the lookup; the assert below guards the
# assumption that it lists every common holiday in every year.
# NOTE(review): "GP" appears to be an overseas subdivision -- confirm its
# dates are the ones wanted for the French holiday flag.
# One calendar per year is built up front instead of one per (holiday, year) pair.
gp_calendars = {
    year: holidays.country_holidays("FR", years=year, subdiv="GP")
    for year in years
}

french_holiday_dates = []
for holiday in common_french_holidays:
    for year, calendar in gp_calendars.items():
        matching_dates = calendar.get_named(holiday, lookup="exact")
        assert matching_dates, f"Could not find date for {holiday} in {year}"
        french_holiday_dates.append(matching_dates[0])

# De-duplicate and order chronologically.
french_holiday_dates = sorted(set(french_holiday_dates))
print(french_holiday_dates)

[datetime.date(2017, 1, 1), datetime.date(2017, 4, 14), datetime.date(2017, 4, 17), datetime.date(2017, 5, 1), datetime.date(2017, 5, 8), datetime.date(2017, 5, 25), datetime.date(2017, 5, 27), datetime.date(2017, 6, 5), datetime.date(2017, 7, 14), datetime.date(2017, 8, 15), datetime.date(2017, 11, 1), datetime.date(2017, 11, 11), datetime.date(2017, 12, 25), datetime.date(2018, 1, 1), datetime.date(2018, 3, 30), datetime.date(2018, 4, 2), datetime.date(2018, 5, 1), datetime.date(2018, 5, 8), datetime.date(2018, 5, 10), datetime.date(2018, 5, 21), datetime.date(2018, 5, 27), datetime.date(2018, 7, 14), datetime.date(2018, 8, 15), datetime.date(2018, 11, 1), datetime.date(2018, 11, 11), datetime.date(2018, 12, 25), datetime.date(2019, 1, 1), datetime.date(2019, 4, 19), datetime.date(2019, 4, 22), datetime.date(2019, 5, 1), datetime.date(2019, 5, 8), datetime.date(2019, 5, 27), datetime.date(2019, 5, 30), datetime.date(2019, 6, 10), datetime.date(2019, 7, 14), datetime.date(2019, 8, 15)

## Add holiday columns

Finally we add a column for each country marking the day as a holiday or not.

In [17]:
# Calendar date of every row, computed once: `.dt.date` builds a Python
# `datetime.date` object per row, so repeating it for each flag column would
# triple that work on a frame of this size.
row_dates = ep_df["timestamp"].dt.date

# One boolean flag column per country: True when the row's date is a holiday there.
ep_df["is_german_holiday"] = row_dates.isin(german_holiday_dates)
ep_df["is_swiss_holiday"] = row_dates.isin(swiss_holiday_dates)
ep_df["is_french_holiday"] = row_dates.isin(french_holiday_dates)

Check the example of the 1st of November 2023, which is the "Allerheiligen" (All Saints' Day) holiday in all three countries.

In [20]:
# Spot check: the first rows recorded on 1 November 2023 should be flagged in all three columns.
holiday_check_date = datetime.date(2023, 11, 1)
ep_df.loc[ep_df["timestamp"].dt.date == holiday_check_date].head(5)

Unnamed: 0,ride_name,timestamp,wait_time,closed,temperature,rain,wind,is_german_holiday,is_swiss_holiday,is_french_holiday
12028277,pegasus,2023-11-01 09:00:00,1.0,False,10.0,0.0,1.4,True,True,True
12028278,poppy towers,2023-11-01 09:00:00,,False,10.0,0.0,1.4,True,True,True
12028279,silver star,2023-11-01 09:00:00,5.0,False,10.0,0.0,1.4,True,True,True
12028280,swiss bob run,2023-11-01 09:00:00,1.0,False,10.0,0.0,1.4,True,True,True
12028281,tirol log flume,2023-11-01 09:00:00,,False,10.0,0.0,1.4,True,True,True


Also check the day after that, the 2nd of November 2023, which was a regular Thursday.

In [21]:
# Spot check: the first rows recorded on 2 November 2023 should carry no holiday flag.
regular_check_date = datetime.date(2023, 11, 2)
ep_df.loc[ep_df["timestamp"].dt.date == regular_check_date].head(5)

Unnamed: 0,ride_name,timestamp,wait_time,closed,temperature,rain,wind,is_german_holiday,is_swiss_holiday,is_french_holiday
12033352,pegasus,2023-11-02 09:00:00,1.0,False,9.9,2.5,2.9,False,False,False
12033353,poppy towers,2023-11-02 09:00:00,,False,9.9,2.5,2.9,False,False,False
12033354,silver star,2023-11-02 09:00:00,5.0,False,9.9,2.5,2.9,False,False,False
12033355,swiss bob run,2023-11-02 09:00:00,1.0,False,9.9,2.5,2.9,False,False,False
12033356,tirol log flume,2023-11-02 09:00:00,,False,9.9,2.5,2.9,False,False,False


In [19]:
# Persist the frame, including the holiday flag columns, without writing the index.
ep_df.to_parquet(path=ep_output_file, index=False)