## Create new varible Holidays for Schleswig-Holstein
### Add a variable to identify public holidays in Schleswig-Holstein for the dataset period.


In [None]:
import pandas as pd
from datetime import datetime

# Run the repository script that performs the merge (umsatz + test first),
# then load and print a brief summary of the generated CSV.
import subprocess, sys
from pathlib import Path
import pandas as pd

script = 'scripts/merge_datasets_fixed.py'
print(f"Running merge script: {script}")
res = subprocess.run([sys.executable, script], capture_output=True, text=True)
print(res.stdout)
if res.returncode != 0:
    print('Merge script failed with return code', res.returncode)
    print(res.stderr)
else:
    out = Path('combined_data_outer.csv')
    if out.exists():
        df = pd.read_csv(out, parse_dates=['Datum'])
        print('Merged file loaded: rows=', len(df))
        print('Date range:', df['Datum'].min(), 'to', df['Datum'].max())
        print('Holidays:', df['is_holiday'].sum() if 'is_holiday' in df.columns else 'no column')
    else:
        print('Expected output file not found:', out)

print(f"Defined {len(schleswig_holstein_holidays)} holidays for Schleswig-Holstein (2013-2018)")
print("Year distribution:")
for year in years:
    holidays_in_year = sum(1 for h in schleswig_holstein_holidays if h.year == year)
    print(f"  {year}: {holidays_in_year} holidays")


Defined 90 holidays for Schleswig-Holstein (2013-2018)
Year distribution:
  2013: 15 holidays
  2014: 15 holidays
  2015: 15 holidays
  2016: 15 holidays
  2017: 15 holidays
  2018: 15 holidays


### Create a function to add a holiday indicator to the merged dataset.


In [6]:
def add_holiday_indicator(df, holidays, date_column='Datum'):
    """
    Add a holiday indicator column to a DataFrame.
    
    Parameters:
    -----------
    df : pd.DataFrame
        The DataFrame to which the holiday indicator will be added
    holidays : list
        List of pd.Timestamp objects representing holidays
    date_column : str
        The name of the date column in the DataFrame (default: 'Datum')
    
    Returns:
    --------
    pd.DataFrame
        DataFrame with a new 'is_holiday' column (1 if holiday, 0 otherwise)
    """
    df = df.copy()
    # Create a boolean column checking if each date is in the holidays list
    df['is_holiday'] = df[date_column].isin(holidays).astype(int)
    return df

print("Holiday indicator function defined successfully")
print("This function will add an 'is_holiday' column to the merged dataset")


Holiday indicator function defined successfully
This function will add an 'is_holiday' column to the merged dataset


# This code merges the given CSV Datasets and can be used to combine new datasets to it.

## Load and Merge Datasets (CAREFUL: This cell creats a new csv file)
Now proceed with loading the CSV files and performing the outer join. The holiday indicator will be added after the merge.


In [7]:
import pandas as pd
from pathlib import Path

# --- 1. Define File Names (repo-root-relative paths so nbconvert works) ---
file_umsatz = '1_DatasetCharacteristics/raw_data/umsatzdaten_gekuerzt.csv'
file_test = '1_DatasetCharacteristics/raw_data/test.csv'
file_kiwo = '1_DatasetCharacteristics/raw_data/kiwo.csv'
file_wetter = '1_DatasetCharacteristics/raw_data/wetter.csv'
file_niederschlag = '1_DatasetCharacteristics/raw_data/Niederschlag.csv'

try:
    # --- 2. Load Umsatz and Test, then concatenate them ---
    print(f"Loading {file_umsatz}...")
    df_umsatz = pd.read_csv(file_umsatz, parse_dates=['Datum'])

    print(f"Loading {file_test}...")
    df_test = pd.read_csv(file_test, parse_dates=['Datum'])

    # Concatenate Umsatz and test rows first (append test rows to umsatz)
    df_umsatz_combined = pd.concat([df_umsatz, df_test], ignore_index=True)
    print(f"Combined umsatz + test: {len(df_umsatz_combined)} rows")

    # --- 3. Load other CSVs ---
    print(f"Loading {file_kiwo}...")
    df_kiwo = pd.read_csv(file_kiwo, parse_dates=['Datum'])

    print(f"Loading {file_wetter}...")
    df_wetter = pd.read_csv(file_wetter, parse_dates=['Datum'])

    print(f"Loading {file_niederschlag}...")
    df_niederschlag = pd.read_csv(file_niederschlag, parse_dates=['Datum'])

    # --- 4. Set 'Datum' as the Index for All DataFrames ---
    df_umsatz_combined = df_umsatz_combined.set_index('Datum')
    df_kiwo = df_kiwo.set_index('Datum')
    df_wetter = df_wetter.set_index('Datum')
    df_niederschlag = df_niederschlag.set_index('Datum')

    print("All files loaded and indexed on 'Datum'.")

    # --- 5. Perform the Outer Join ---
    other_dfs = [df_kiwo, df_wetter, df_niederschlag]
    print("Performing outer join with umsatz+test as base...")
    final_df = df_umsatz_combined.join(other_dfs, how='outer')

    # --- 6. Clean Up and Add Holiday Indicator ---
    final_df = final_df.reset_index()
    final_df = final_df.sort_values(by='Datum')
    final_df = add_holiday_indicator(final_df, schleswig_holstein_holidays, date_column='Datum')
    print("Added holiday indicator variable to merged dataset")

    # --- 7. Display Results and Save to File ---
    print("\n--- Join Complete ---")
    print("First 5 rows of the combined data:")
    print(final_df.head())

    print("\nLast 5 rows of the combined data:")
    print(final_df.tail())

    print(f"\nTotal rows in combined file: {len(final_df)}")
    print(f"Holidays found in dataset: {final_df['is_holiday'].sum()}")

    output_filename = 'combined_data_outer.csv'
    final_df.to_csv(output_filename, index=False)
    print(f"\nSuccessfully saved combined data to '{output_filename}'")

except FileNotFoundError as e:
    print(f"\nError: File not found.")
    print(f"Details: {e}")
    print("Please make sure all CSV files are present in '1_DatasetCharacteristics/raw_data/'.")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")


Loading 1_DatasetCharacteristics/raw_data/umsatzdaten_gekuerzt.csv...

Error: File not found.
Details: [Errno 2] No such file or directory: '1_DatasetCharacteristics/raw_data/umsatzdaten_gekuerzt.csv'
Please make sure all CSV files are present in '1_DatasetCharacteristics/raw_data/'.


## Merge test.csv to the Dataset
Append the test data to the merged dataset for extended coverage.


In [8]:
try:
    # The merged file was created in the previous cell: 'combined_data_outer.csv'
    output_filename = 'combined_data_outer.csv'
    print(f"Loading merged file '{output_filename}'...")
    df_combined = pd.read_csv(output_filename, parse_dates=['Datum'])
    print(f"Merged file loaded: {len(df_combined)} rows")
    print("Date range:", df_combined['Datum'].min(), "to", df_combined['Datum'].max())
    if 'is_holiday' in df_combined.columns:
        print(f"Holidays in merged data: {df_combined['is_holiday'].sum()}")
    else:
        print("Holiday indicator column not present in merged file.")
except Exception as e:
    print(f"Error loading merged file: {e}")


Loading merged file 'combined_data_outer.csv'...
Merged file loaded: 11782 rows
Date range: 2012-01-01 00:00:00 to 2019-12-31 00:00:00
Holidays in merged data: 294
