<h1> Engineer Holidayness Feature </h1>

In [43]:
from pathlib import Path
import sys
import pandas as pd

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src import config as cfg
from src import eda_utils
from src import plotting_utils
from src import config as cfg
from src import filter_group_aggr_utils as fga

from typing import Tuple
import matplotlib.pyplot as plt
import numpy as np
import math


<h2> Build the Baden-Württemberg DataFrame </h2>

We load the DataFrames from the `../data/processed` folder.

In [44]:
# Folder holding the preprocessed Parquet exports (relative to the working directory).
DATA_PROCESSED = Path("../data/processed")

# --- Read both processed tables ---
opsd_parquet_path = DATA_PROCESSED.joinpath("OPSD_60min_de_lu_df.parquet")
holiday_parquet_path = DATA_PROCESSED.joinpath("DE_hol_df.parquet")
OPSD_60min_DE_LU_df = pd.read_parquet(opsd_parquet_path)
DE_hol_df = pd.read_parquet(holiday_parquet_path)

# --- Sanity check: report (rows, columns) of each table ---
print("Loaded DataFrames:")
print(f"OPSD_60min_DE_LU_df.shape: {OPSD_60min_DE_LU_df.shape}")
print(f"DE_hol_df.shape: {DE_hol_df.shape}")


Loaded DataFrames:
OPSD_60min_DE_LU_df.shape: (50401, 45)
DE_hol_df.shape: (7670, 37)


We make copies of the DataFrames that keep only the features relevant to **Baden-Württemberg**.

In [None]:
# Keep the date / interval columns plus the two Baden-Württemberg flags, as an
# independent copy of the nationwide holiday table.
DE_BW_hol_df = DE_hol_df.loc[
    :,
    [
        "local_date",
        "local_start",
        "local_end",
        "utc_start",
        "utc_end",
        "DE_BW_hol",
        "DE_BW_school_free",
    ],
].copy()
DE_BW_hol_df.info()

In [46]:
# Select every column whose name mentions the TransnetBW control area, plus the
# timestamp columns, preserving the original column order.
BW_cols = [
    name
    for name in OPSD_60min_DE_LU_df.columns
    if any(token in name for token in ("transnetbw", "timestamp"))
]
print(BW_cols)

['utc_timestamp', 'cet_cest_timestamp', 'DE_transnetbw_load_actual_entsoe_transparency', 'DE_transnetbw_load_forecast_entsoe_transparency', 'DE_transnetbw_solar_generation_actual', 'DE_transnetbw_wind_onshore_generation_actual']


In [47]:
# Restrict the OPSD frame to the columns listed in BW_cols; the copy keeps columns
# added to this frame later from touching OPSD_60min_DE_LU_df.
OPSD_60min_BW_df = OPSD_60min_DE_LU_df[BW_cols].copy()
# DataFrame.info() writes its summary to stdout and returns None, so it is called
# directly: wrapping it in print() appends a stray "None" line to the output.
OPSD_60min_BW_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50401 entries, 0 to 50400
Data columns (total 6 columns):
 #   Column                                           Non-Null Count  Dtype                        
---  ------                                           --------------  -----                        
 0   utc_timestamp                                    50401 non-null  datetime64[ns, UTC]          
 1   cet_cest_timestamp                               50401 non-null  datetime64[ns, Europe/Berlin]
 2   DE_transnetbw_load_actual_entsoe_transparency    50400 non-null  float64                      
 3   DE_transnetbw_load_forecast_entsoe_transparency  50376 non-null  float64                      
 4   DE_transnetbw_solar_generation_actual            50386 non-null  float64                      
 5   DE_transnetbw_wind_onshore_generation_actual     50400 non-null  float64                      
dtypes: datetime64[ns, Europe/Berlin](1), datetime64[ns, UTC](1), float64(4)
memory usage: 2

We initialize the holiday and school-free columns of `OPSD_60min_BW_df`.

In [52]:
# Create both flag columns with a default of 0, using a NumPy uint8 scalar.
for flag_col in ("DE_BW_hol", "DE_BW_school_free"):
    OPSD_60min_BW_df[flag_col] = np.uint8(0)

For each hourly `utc_timestamp` in `OPSD_60min_BW_df`, we look up the row of `DE_BW_hol_df` whose window contains it (`utc_start < utc_timestamp <= utc_end`) and copy that row's holiday and school-free flags into the matching row of `OPSD_60min_BW_df`; timestamps without a matching window keep the initial value 0.
 

In [53]:
# Look up the holiday / school-free flags for all hourly timestamps in one vectorized
# pass. Each row of DE_BW_hol_df is treated as the window (utc_start, utc_end], i.e.
# the match condition is utc_start < utc_timestamp <= utc_end. A single interval
# lookup avoids scanning the whole holiday table once per timestamp.
# NOTE(review): a timestamp equal to utc_start falls into the *previous* window; if
# utc_timestamp marks the start of an hour, confirm this boundary is intended.
flag_cols = ["DE_BW_hol", "DE_BW_school_free"]
hol_windows = pd.IntervalIndex.from_arrays(
    DE_BW_hol_df["utc_start"], DE_BW_hol_df["utc_end"], closed="right"
)
if hol_windows.is_overlapping:
    # The lookup needs at most one window per timestamp; fail loudly rather than
    # silently picking one of several candidate rows.
    raise ValueError("DE_BW_hol_df contains overlapping (utc_start, utc_end] windows.")

# Positional index of the matching window per timestamp, or -1 when none contains it.
window_pos = hol_windows.get_indexer(OPSD_60min_BW_df["utc_timestamp"])
has_window = window_pos >= 0

# Only matched rows are written, so unmatched rows keep their current flag values.
OPSD_60min_BW_df.loc[has_window, flag_cols] = (
    DE_BW_hol_df[flag_cols].to_numpy(dtype=np.uint8)[window_pos[has_window]]
)

print("Flags assigned by timestamp iteration.")
print(OPSD_60min_BW_df[["utc_timestamp", "DE_BW_hol", "DE_BW_school_free"]].head())


Flags assigned by timestamp iteration.
              utc_timestamp  DE_BW_hol  DE_BW_school_free
0 2014-12-31 23:00:00+00:00          0                  0
1 2015-01-01 00:00:00+00:00          1                  1
2 2015-01-01 01:00:00+00:00          1                  1
3 2015-01-01 02:00:00+00:00          1                  1
4 2015-01-01 03:00:00+00:00          1                  1


In [55]:
# Narrow the frame to the timestamp, the actual load and the two holiday flags.
# The explicit .copy() yields an independent frame (as for the other column subsets
# in this notebook), so later assignments on it cannot raise SettingWithCopyWarning.
BW_load_hol_df = OPSD_60min_BW_df[
    [
        "utc_timestamp",
        "DE_transnetbw_load_actual_entsoe_transparency",
        "DE_BW_hol",
        "DE_BW_school_free",
    ]
].copy()
BW_load_hol_df

Unnamed: 0,utc_timestamp,DE_transnetbw_load_actual_entsoe_transparency,DE_BW_hol,DE_BW_school_free
0,2014-12-31 23:00:00+00:00,,0,0
1,2015-01-01 00:00:00+00:00,5307.0,1,1
2,2015-01-01 01:00:00+00:00,5087.0,1,1
3,2015-01-01 02:00:00+00:00,4906.0,1,1
4,2015-01-01 03:00:00+00:00,4865.0,1,1
...,...,...,...,...
50396,2020-09-30 19:00:00+00:00,7657.0,0,0
50397,2020-09-30 20:00:00+00:00,7072.0,0,0
50398,2020-09-30 21:00:00+00:00,6465.0,0,0
50399,2020-09-30 22:00:00+00:00,5963.0,0,0
