# Public and School Holidays for Schleswig-Holstein, 2013-2019
The data directory now contains two additional files:
- official_holidays.csv
- school_holidays.csv

In [None]:
# read both csv files and display their shape
import pandas as pd
import sys
sys.path.append('..')  # Add project root to path for imports
from utils import plot_missing_heatmap

# Load the two raw holiday tables from the data directory.
df_official_holidays = pd.read_csv("../data/official_holidays.csv")
df_school_holidays = pd.read_csv("../data/school_holidays.csv")

# For each table, print its (rows, columns) shape and show the first rows.
for _frame in (df_official_holidays, df_school_holidays):
    print(_frame.shape)
    display(_frame.head())


In [None]:
# Build a daily calendar from 2013-01-01 to 2019-12-31 (both ends included)
# and attach two 0/1 flag columns: public_holiday and school_holiday.
date_range = pd.date_range(start='2013-01-01', end='2019-12-31', freq='D')
df_holidays = pd.DataFrame({'date': date_range})

# --- Public holidays ---------------------------------------------------
# Parse the holiday dates so they can be compared with the calendar column.
df_official_holidays['Date'] = pd.to_datetime(df_official_holidays['Date'])
# A calendar day is a public holiday when it appears in the official list.
is_public_holiday = df_holidays['date'].isin(df_official_holidays['Date'])
df_holidays['public_holiday'] = is_public_holiday.astype('int64')

# --- School holidays ---------------------------------------------------
# Parse both range boundaries of every school-holiday period.
for boundary in ('Start Date', 'End Date'):
    df_school_holidays[boundary] = pd.to_datetime(df_school_holidays[boundary])

# A calendar day is a school holiday when it falls inside at least one
# period; both the start and the end day count as part of the period.
in_school_break = pd.Series(False, index=df_holidays.index)
for first_day, last_day in zip(df_school_holidays['Start Date'],
                               df_school_holidays['End Date']):
    in_school_break = in_school_break | df_holidays['date'].between(first_day, last_day)
df_holidays['school_holiday'] = in_school_break.astype('int64')

# Show the size plus the first and last rows of the finished calendar.
print(df_holidays.shape)
display(df_holidays.head())
display(df_holidays.tail())

In [None]:
# Create the next_day_holiday column: the public_holiday flag of the row
# that follows. df_holidays is expected to hold one row per consecutive
# calendar day, so "next row" means "next day".
# The last row has no successor; fill_value=0 avoids the NaN (and the
# resulting float column) that a plain shift(-1) would leave there.
# NOTE(review): the day after the final calendar date is outside the table
# and is therefore recorded as 0 - confirm that this is the wanted value
# for the last day (e.g. when the following day is New Year's Day).
df_holidays['next_day_holiday'] = df_holidays['public_holiday'].shift(-1, fill_value=0)

# Write to csv only after the derived column exists, so that the saved
# file has the same columns as the frame used in the rest of the notebook.
df_holidays.to_csv("../data/processed/df_holidays.csv", index=False)

df_holidays.tail()

In [None]:
# Attach the holiday flags to the weather-extended dataset.
df_merged_extended_weather = pd.read_csv("../data/processed/df_merged_extended_weather.csv")

# Both join keys are reduced to plain date strings so that any time-of-day
# part cannot prevent matching rows from lining up.
_weather_days = pd.to_datetime(df_merged_extended_weather['Datum'])
df_merged_extended_weather['Datum'] = _weather_days.dt.date.astype(str)

_holiday_days = pd.to_datetime(df_holidays['date'])
df_holidays['date'] = _holiday_days.dt.strftime('%Y-%m-%d')

# Left join: every row of the weather data is kept; the holiday columns are
# filled where a calendar day with the same date string exists. The
# right-hand key 'date' duplicates 'Datum' and is removed afterwards.
df_merged_with_holidays = df_merged_extended_weather.merge(
    df_holidays,
    how='left',
    left_on='Datum',
    right_on='date',
).drop(columns=['date'])

display(df_merged_with_holidays.shape)
df_merged_with_holidays.head()



In [None]:
# Turn the join key back into real datetimes.
df_merged_with_holidays['Datum'] = pd.to_datetime(df_merged_with_holidays['Datum'])

# Sum the 0/1 flags over all rows of the merged data and report the totals.
num_public_holidays = df_merged_with_holidays['public_holiday'].sum()
num_school_holidays = df_merged_with_holidays['school_holiday'].sum()
print(f"Number of Public Holidays in Merged Data: {num_public_holidays}")
print(f"Number of School Holidays in Merged Data: {num_school_holidays}")

# Disabled: missing-value heatmap over every column of the merged frame.
#plot_missing_heatmap(df_merged_with_holidays, "Missing Values After Merging Holidays")

# Work on a copy so the merged frame itself keeps its 0/1 flags.
# In the copy, every flag value of 1 is replaced by a missing value; the
# missing-value heatmap then marks the flagged days of each holiday column.
_flag_columns = ['public_holiday', 'next_day_holiday', 'school_holiday']
df_modified = df_merged_with_holidays.copy()
for _flag in _flag_columns:
    df_modified[_flag] = df_modified[_flag].replace(1, pd.NA)

plot_missing_heatmap(
    df_modified[['Datum'] + _flag_columns],
    "Missing Values in Holiday Columns After Merging Holidays",
)

In [None]:
# Save the weather data together with the holiday columns as a CSV file;
# index=False leaves the row index out of the file.
df_merged_with_holidays.to_csv(
    path_or_buf="../data/processed/df_extended_weather_holidays.csv",
    index=False,
)
