## Create new varible Holidays for Schleswig-Holstein
### Add a variable to identify public holidays in Schleswig-Holstein for the dataset period.


In [1]:
import pandas as pd
from datetime import datetime

# --- Define Holidays for Schleswig-Holstein (2013-2018) ---
# This creates a list of public holidays in Schleswig-Holstein for all years in the dataset
# These dates are used to identify holidays in the dataset

# Fixed holidays (same date every year)
def get_fixed_holidays(years):
    """Generate fixed holidays for given years"""
    fixed = []
    for year in years:
        fixed.extend([
            pd.Timestamp(f'{year}-01-01'),  # New Year's Day
            pd.Timestamp(f'{year}-05-01'),  # Labour Day
            pd.Timestamp(f'{year}-10-03'),  # German Unity Day
            pd.Timestamp(f'{year}-10-31'),  # Reformation Day
            pd.Timestamp(f'{year}-11-01'),  # All Saints' Day
            pd.Timestamp(f'{year}-12-25'),  # Christmas Day
            pd.Timestamp(f'{year}-12-26'),  # Boxing Day
        ])
    return fixed

# Easter-based holidays (moveable dates)
# These are calculated based on Easter Sunday
easter_dates = {
    2013: pd.Timestamp('2013-03-31'),
    2014: pd.Timestamp('2014-04-20'),
    2015: pd.Timestamp('2015-04-05'),
    2016: pd.Timestamp('2016-03-27'),
    2017: pd.Timestamp('2017-04-16'),
    2018: pd.Timestamp('2018-04-01'),
}

def get_easter_holidays(easter_dates):
    """Generate Easter-based holidays from Easter Sunday dates"""
    easter_holidays = []
    for year, easter in easter_dates.items():
        # Good Friday (2 days before Easter)
        easter_holidays.append(easter - pd.Timedelta(days=2))
        # Easter Sunday
        easter_holidays.append(easter)
        # Easter Monday (1 day after Easter)
        easter_holidays.append(easter + pd.Timedelta(days=1))
        # Ascension Day (39 days after Easter)
        easter_holidays.append(easter + pd.Timedelta(days=39))
        # Whit Sunday (49 days after Easter)
        easter_holidays.append(easter + pd.Timedelta(days=49))
        # Whit Monday (50 days after Easter)
        easter_holidays.append(easter + pd.Timedelta(days=50))
        # Corpus Christi (60 days after Easter)
        easter_holidays.append(easter + pd.Timedelta(days=60))
    return easter_holidays

# Carnival Tuesday (moveable, 47 days before Easter)
carnival_dates = {
    2013: pd.Timestamp('2013-02-12'),
    2014: pd.Timestamp('2014-03-04'),
    2015: pd.Timestamp('2015-02-17'),
    2016: pd.Timestamp('2016-02-09'),
    2017: pd.Timestamp('2017-02-28'),
    2018: pd.Timestamp('2018-02-13'),
}

years = [2013, 2014, 2015, 2016, 2017, 2018]

# Combine all holidays
schleswig_holstein_holidays = (
    get_fixed_holidays(years) + 
    get_easter_holidays(easter_dates) + 
    list(carnival_dates.values())
)

# Remove duplicates and sort
schleswig_holstein_holidays = sorted(list(set(schleswig_holstein_holidays)))

print(f"Defined {len(schleswig_holstein_holidays)} holidays for Schleswig-Holstein (2013-2017)")
print("Year distribution:")
for year in years:
    holidays_in_year = sum(1 for h in schleswig_holstein_holidays if h.year == year)
    print(f"  {year}: {holidays_in_year} holidays")


Defined 90 holidays for Schleswig-Holstein (2013-2017)
Year distribution:
  2013: 15 holidays
  2014: 15 holidays
  2015: 15 holidays
  2016: 15 holidays
  2017: 15 holidays
  2018: 15 holidays


### Create a function to add a holiday indicator to the merged dataset.


In [2]:
def add_holiday_indicator(df, holidays, date_column='Datum'):
    """
    Add a holiday indicator column to a DataFrame.
    
    Parameters:
    -----------
    df : pd.DataFrame
        The DataFrame to which the holiday indicator will be added
    holidays : list
        List of pd.Timestamp objects representing holidays
    date_column : str
        The name of the date column in the DataFrame (default: 'Datum')
    
    Returns:
    --------
    pd.DataFrame
        DataFrame with a new 'is_holiday' column (1 if holiday, 0 otherwise)
    """
    df = df.copy()
    # Create a boolean column checking if each date is in the holidays list
    df['is_holiday'] = df[date_column].isin(holidays).astype(int)
    return df

print("Holiday indicator function defined successfully")
print("This function will add an 'is_holiday' column to the merged dataset")


Holiday indicator function defined successfully
This function will add an 'is_holiday' column to the merged dataset


# This code merges the given CSV Datasets and can be used to combine new datasets to it.

## Load and Merge Datasets (CAREFUL: This cell creats a new csv file)
Now proceed with loading the CSV files and performing the outer join. The holiday indicator will be added after the merge.


In [3]:
import pandas as pd

# --- 1. Define File Names ---
file_umsatz = '/workspaces/team3_goodweather/1_DatasetCharacteristics/raw_data/umsatzdaten_gekuerzt.csv'
file_kiwo = '/workspaces/team3_goodweather/1_DatasetCharacteristics/raw_data/kiwo.csv'
file_wetter = '/workspaces/team3_goodweather/1_DatasetCharacteristics/raw_data/wetter.csv'
file_niederschlag = '/workspaces/team3_goodweather/1_DatasetCharacteristics/raw_data/Niederschlag.csv'


try:
    # --- 2. Load CSVs and Parse 'Datum' Column ---
    # We use parse_dates=['Datum'] to ensure Pandas treats 
    # the join column as a date, which is crucial for accuracy.
    print(f"Loading {file_umsatz}...")
    df_umsatz = pd.read_csv(file_umsatz, parse_dates=['Datum'])
    
    print(f"Loading {file_kiwo}...")
    df_kiwo = pd.read_csv(file_kiwo, parse_dates=['Datum'])
    
    print(f"Loading {file_wetter}...")
    df_wetter = pd.read_csv(file_wetter, parse_dates=['Datum'])

    print(f"Loading {file_niederschlag}...")
    df_niederschlag = pd.read_csv(file_niederschlag, parse_dates=['Datum'])

    # --- 3. Set 'Datum' as the Index for All DataFrames ---
    # Joining on the index is a very efficient way to combine DataFrames.
    df_umsatz = df_umsatz.set_index('Datum')
    df_kiwo = df_kiwo.set_index('Datum')
    df_wetter = df_wetter.set_index('Datum')
    df_niederschlag = df_niederschlag.set_index('Datum')
    
    print("All files loaded and indexed on 'Datum'.")

    # --- 4. Perform the Outer Join ---
    # We start with the largest DataFrame (df_umsatz) and join the others to it.
    # how='outer' is the key part: it includes all dates from all three files.
    # If a date exists in one file but not another, the missing data
    # will be filled with 'NaN' (Not a Number).
    
    # We can join a list of DataFrames at once.
    other_dfs = [df_kiwo, df_wetter, df_niederschlag]
    
    print("Performing outer join...")
    final_df = df_umsatz.join(other_dfs, how='outer')

    # --- 5. Clean Up and Add Holiday Indicator ---
    
    # After joining, 'Datum' is the index. Let's make it a regular column again.
    final_df = final_df.reset_index()
    
    # You may want to sort by date to make the final file logical
    final_df = final_df.sort_values(by='Datum')
    
    # Add the holiday indicator variable
    final_df = add_holiday_indicator(final_df, schleswig_holstein_holidays, date_column='Datum')
    print("Added holiday indicator variable to merged dataset")

    # --- 6. Display Results and Save to File ---
    print("\n--- Join Complete ---")
    print("First 5 rows of the combined data:")
    print(final_df.head())
    
    print("\nLast 5 rows of the combined data:")
    print(final_df.tail())

    print(f"\nTotal rows in combined file: {len(final_df)}")
    print(f"Holidays found in dataset: {final_df['is_holiday'].sum()}")
    
    # Save the final result to a new CSV
    output_filename = 'combined_data_outer.csv'
    final_df.to_csv(output_filename, index=False)
    
    print(f"\nSuccessfully saved combined data to '{output_filename}'")

except FileNotFoundError as e:
    print(f"\nError: File not found.")
    print(f"Details: {e}")
    print("Please make sure all CSV files (umsatzdaten_gekuerzt.csv, kiwo.csv, wetter.csv, Niederschlag.csv) are in the same directory as the script.")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")


Loading /workspaces/team3_goodweather/1_DatasetCharacteristics/raw_data/umsatzdaten_gekuerzt.csv...
Loading /workspaces/team3_goodweather/1_DatasetCharacteristics/raw_data/kiwo.csv...
Loading /workspaces/team3_goodweather/1_DatasetCharacteristics/raw_data/wetter.csv...
Loading /workspaces/team3_goodweather/1_DatasetCharacteristics/raw_data/Niederschlag.csv...
All files loaded and indexed on 'Datum'.
Performing outer join...
Added holiday indicator variable to merged dataset

--- Join Complete ---
First 5 rows of the combined data:
       Datum  id  Warengruppe  Umsatz  KielerWoche  Bewoelkung  Temperatur  \
0 2012-01-01 NaN          NaN     NaN          NaN         8.0      9.8250   
1 2012-01-02 NaN          NaN     NaN          NaN         7.0      7.4375   
2 2012-01-03 NaN          NaN     NaN          NaN         8.0      5.5375   
3 2012-01-04 NaN          NaN     NaN          NaN         4.0      5.6875   
4 2012-01-05 NaN          NaN     NaN          NaN         6.0      5.300