# Data preparation
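This notebook prepares the data for analysis: it loads the sales, weather, and Kieler Woche CSV files, parses and validates their `Datum` columns, left-joins weather and Kieler Woche onto the sales records by date, and saves the result as `merged_daily_sales_weather.csv`.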

## Import libraries

In [3]:
import numpy as np 
from pathlib import Path
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

## Define file paths

In [4]:
# Folder that holds both the input CSV files and the merged output;
# "." means the notebook's current working directory.
DATA_DIR = Path(".")

# Input files.
SALES_PATH = DATA_DIR.joinpath("umsatzdaten_gekuerzt.csv")
WEATHER_PATH = DATA_DIR.joinpath("wetter.csv")
KIELER_PATH = DATA_DIR.joinpath("kiwo.csv")

# Destination of the merged table written at the end of the notebook.
OUTPUT_PATH = DATA_DIR.joinpath("merged_daily_sales_weather.csv")


## Load datasets

In [6]:
# Read the three CSV inputs, in the same order as the path constants.
sales, weather, kieler = map(
    pd.read_csv, (SALES_PATH, WEATHER_PATH, KIELER_PATH)
)

# Sanity check: (rows, columns) of every table.
for shape_label, loaded in (
    ("Sales:", sales),
    ("Weather:", weather),
    ("Kieler Woche:", kieler),
):
    print(shape_label, loaded.shape)

# Column names, to confirm that each table carries the `Datum` join key.
for loaded in (sales, weather, kieler):
    print(loaded.columns)



Sales: (9334, 4)
Weather: (2601, 5)
Kieler Woche: (72, 2)
Index(['id', 'Datum', 'Warengruppe', 'Umsatz'], dtype='object')
Index(['Datum', 'Bewoelkung', 'Temperatur', 'Windgeschwindigkeit',
       'Wettercode'],
      dtype='object')
Index(['Datum', 'KielerWoche'], dtype='object')


## Clean datasets

In [7]:
# Each table paired with the message to raise if its dates fail to parse.
date_checks = (
    (sales, "Sales has invalid dates"),
    (weather, "Weather has invalid dates"),
    (kieler, "Kieler Woche has invalid dates"),
)

# Pass 1: convert every `Datum` column to datetime in place.
# errors="coerce" turns unparseable entries into NaT instead of raising,
# so the check below can report which table is affected.
for frame, _ in date_checks:
    frame["Datum"] = pd.to_datetime(frame["Datum"], errors="coerce")

# Pass 2: fail fast if any table ended up with NaT in `Datum`.
for frame, failure_msg in date_checks:
    assert not frame["Datum"].isna().any(), failure_msg


## Merge datasets

In [10]:
# Attach weather and then Kieler Woche information to every sales row.
# how="left" keeps all sales rows; validate="many_to_one" makes pandas raise
# if the right-hand table contains duplicate `Datum` keys.
merged = (
    sales
    .merge(weather, how="left", on="Datum", validate="many_to_one")
    .merge(kieler, how="left", on="Datum", validate="many_to_one")
    # Dates absent from the Kieler Woche table come out of the left join as
    # NaN; record them as 0 and store the flag as an integer.
    .assign(KielerWoche=lambda df: df["KielerWoche"].fillna(0).astype(int))
)

# Weather features may still be missing after the join; report how many.
weather_cols = ["Bewoelkung", "Temperatur", "Windgeschwindigkeit", "Wettercode"]
missing_per_feature = merged[weather_cols].isna().sum()

print("Missing values per weather feature:")
print(missing_per_feature)

print("Final dataset shape:", merged.shape)
print(merged.head())

# The target column must be present and complete before the table is saved.
assert "Umsatz" in merged.columns, "Target variable missing"
assert merged["Umsatz"].notna().all(), "Sales should not have missing values"

# Persist the merged table without the DataFrame index.
merged.to_csv(OUTPUT_PATH, index=False)
print(f"Saved merged dataset to {OUTPUT_PATH}")


Missing values per weather feature:
Bewoelkung               70
Temperatur               16
Windgeschwindigkeit      16
Wettercode             2325
dtype: int64
Final dataset shape: (9334, 9)
        id      Datum  Warengruppe      Umsatz  Bewoelkung  Temperatur  \
0  1307011 2013-07-01            1  148.828353         6.0     17.8375   
1  1307021 2013-07-02            1  159.793757         3.0     17.3125   
2  1307031 2013-07-03            1  111.885594         7.0     21.0750   
3  1307041 2013-07-04            1  168.864941         7.0     18.8500   
4  1307051 2013-07-05            1  171.280754         5.0     19.9750   

   Windgeschwindigkeit  Wettercode  KielerWoche  
0                 15.0        20.0            0  
1                 10.0         NaN            0  
2                  6.0        61.0            0  
3                  7.0        20.0            0  
4                 12.0         NaN            0  
Saved merged dataset to merged_daily_sales_weather.csv
