In [1]:
# import of modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import os

# create dataframe using relative path
input_path = "../processed_data/combined_data_imputed.csv"
print(f"Loading data from: {os.path.abspath(input_path)}")
df = pd.read_csv(input_path)

Loading data from: /home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_imputed.csv


In [2]:
# Hot encode the variable Warengruppe

# First, let's check the unique values in Warengruppe
print("Unique values in Warengruppe:")
print(df['Warengruppe'].unique())
print(f"\nNumber of unique categories: {df['Warengruppe'].nunique()}")
print(f"Value counts (including NaN):\n{df['Warengruppe'].value_counts(dropna=False)}")

# Check for missing values
missing_count = df['Warengruppe'].isna().sum()
print(f"\nMissing values in Warengruppe: {missing_count}")

# Create hot encoded (dummy) variables for Warengruppe
# Using dummy_na=True to create a column for missing values as well
# dtype=int ensures we get 1/0 instead of True/False
warengruppe_dummies = pd.get_dummies(df['Warengruppe'], prefix='Warengruppe', dummy_na=True, dtype=int)

# Add the dummy variables to the dataframe
df = pd.concat([df, warengruppe_dummies], axis=1)

# Display the new columns
print(f"\nNew hot encoded columns added:")
hot_encoded_cols = [col for col in df.columns if col.startswith('Warengruppe_')]
print(hot_encoded_cols)

# Show a sample of the data with hot encoded variables (including non-NaN values)
print(f"\nSample of data with hot encoded Warengruppe variables (showing rows with valid Warengruppe):")
non_nan_sample = df[df['Warengruppe'].notna()][['Warengruppe'] + hot_encoded_cols].head(10)
print(non_nan_sample)

# Show summary of hot encoded columns
print(f"\nSummary of hot encoded variables:")
for col in hot_encoded_cols:
    ones_count = df[col].sum()
    print(f"{col}: {ones_count} ones (1s)")

Unique values in Warengruppe:
[ 1.  3.  4.  5.  2.  6. nan]

Number of unique categories: 6
Value counts (including NaN):
Warengruppe
1.0    2174
3.0    2174
5.0    2174
2.0    2174
4.0    2120
6.0     348
NaN      47
Name: count, dtype: int64

Missing values in Warengruppe: 47

New hot encoded columns added:
['Warengruppe_1.0', 'Warengruppe_2.0', 'Warengruppe_3.0', 'Warengruppe_4.0', 'Warengruppe_5.0', 'Warengruppe_6.0', 'Warengruppe_nan']

Sample of data with hot encoded Warengruppe variables (showing rows with valid Warengruppe):
   Warengruppe  Warengruppe_1.0  Warengruppe_2.0  Warengruppe_3.0  \
0          1.0                1                0                0   
1          3.0                0                0                1   
2          4.0                0                0                0   
3          5.0                0                0                0   
4          2.0                0                1                0   
5          2.0                0                

In [3]:
# Hot encode weekdays from the Datum column

# Ensure Datum is in datetime format (it should already be from previous cells)
df['Datum'] = pd.to_datetime(df['Datum'])

# Extract the day of week (0=Monday, 1=Tuesday, ..., 6=Sunday)
df['weekday'] = df['Datum'].dt.dayofweek

# Create hot encoded columns for each weekday
# Monday = 0, Tuesday = 1, Wednesday = 2, Thursday = 3, Friday = 4, Saturday = 5, Sunday = 6
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

for i, day_name in enumerate(weekday_names):
    df[f'Weekday_{day_name}'] = (df['weekday'] == i).astype(int)

# Display the distribution of weekdays
print("Weekday distribution:")
weekday_counts = df['weekday'].value_counts().sort_index()
for i, day_name in enumerate(weekday_names):
    count = weekday_counts.get(i, 0)
    percentage = (count / len(df)) * 100
    print(f"{day_name}: {count} days ({percentage:.1f}%)")

# Show the new hot encoded columns
weekday_columns = [f'Weekday_{day}' for day in weekday_names]
print(f"\nNew weekday columns created:")
for col in weekday_columns:
    print(f"  - {col}")

# Verify hot encoding (exactly one weekday should be 1 for each row)
weekday_sum = df[weekday_columns].sum(axis=1)
print(f"\nVerification - Each row should sum to 1:")
print(f"All rows sum to 1: {(weekday_sum == 1).all()}")
print(f"Min sum: {weekday_sum.min()}, Max sum: {weekday_sum.max()}")

# Show sample of the weekday hot encoding
print(f"\nSample of weekday hot encoding:")
sample_cols = ['Datum', 'weekday'] + weekday_columns
print(df[sample_cols].head(10))

# Drop the temporary weekday column as we only need the hot encoded versions
df = df.drop('weekday', axis=1)

Weekday distribution:
Monday: 1599 days (14.3%)
Tuesday: 1608 days (14.3%)
Wednesday: 1587 days (14.2%)
Thursday: 1609 days (14.4%)
Friday: 1585 days (14.1%)
Saturday: 1608 days (14.3%)
Sunday: 1615 days (14.4%)

New weekday columns created:
  - Weekday_Monday
  - Weekday_Tuesday
  - Weekday_Wednesday
  - Weekday_Thursday
  - Weekday_Friday
  - Weekday_Saturday
  - Weekday_Sunday

Verification - Each row should sum to 1:
All rows sum to 1: True
Min sum: 1, Max sum: 1

Sample of weekday hot encoding:
       Datum  weekday  Weekday_Monday  Weekday_Tuesday  Weekday_Wednesday  \
0 2013-07-01        0               1                0                  0   
1 2013-07-01        0               1                0                  0   
2 2013-07-01        0               1                0                  0   
3 2013-07-01        0               1                0                  0   
4 2013-07-01        0               1                0                  0   
5 2013-07-02        1            

In [4]:
# Calculation of 7-day rolling average for Temperature

# Bringing 'Datum' in datetime format and sort by date
df['Datum'] = pd.to_datetime(df['Datum'])
df = df.sort_values('Datum')

# Calculate rolling average over 7-day window for Temperature
df['Temperatur_7day_rolling'] = df['Temperatur'].rolling(window=7, min_periods=1).mean()

# Display first few rows to verify the calculation
print("First 10 rows with rolling average:")
print(df[['Datum', 'Temperatur', 'Temperatur_7day_rolling']].head(10))

# Display basic statistics of the rolling average
print("\nBasic statistics for 7-day rolling average:")
print(df['Temperatur_7day_rolling'].describe())

First 10 rows with rolling average:
       Datum  Temperatur  Temperatur_7day_rolling
0 2013-07-01     17.8375                  17.8375
1 2013-07-01     17.8375                  17.8375
2 2013-07-01     17.8375                  17.8375
3 2013-07-01     17.8375                  17.8375
4 2013-07-01     17.8375                  17.8375
5 2013-07-02     17.3125                  17.7500
6 2013-07-02     17.3125                  17.6875
7 2013-07-02     17.3125                  17.6125
8 2013-07-02     17.3125                  17.5375
9 2013-07-02     17.3125                  17.4625

Basic statistics for 7-day rolling average:
count    11211.000000
mean        12.035811
std          7.086378
min         -8.064286
25%          6.301786
50%         11.575000
75%         17.926786
max         31.379591
Name: Temperatur_7day_rolling, dtype: float64


In [5]:
# Categorize Niederschlag variable into dry and wet categories

# First, let's examine the Niederschlag variable
print("Niederschlag variable analysis:")
print(f"Data type: {df['Niederschlag'].dtype}")
print(f"Basic statistics:\n{df['Niederschlag'].describe()}")
print(f"\nMissing values: {df['Niederschlag'].isna().sum()}")
print(f"Number of zero values: {(df['Niederschlag'] == 0).sum()}")
print(f"Number of positive values: {(df['Niederschlag'] > 0).sum()}")

# Create hot encoded categories based on Niederschlag values
# Niederschlag_trocken = 1 when Niederschlag = 0 (dry conditions)
# Niederschlag_nass = 1 when Niederschlag > 0 (wet conditions)

df['Niederschlag_trocken'] = (df['Niederschlag'] == 0).astype(int)
df['Niederschlag_nass'] = (df['Niederschlag'] > 0).astype(int)

# Verify the categorization
print(f"\nCategorization results:")
print(f"Niederschlag_trocken (dry days): {df['Niederschlag_trocken'].sum()} days")
print(f"Niederschlag_nass (wet days): {df['Niederschlag_nass'].sum()} days")
print(f"Total rows: {len(df)}")

# Verify that the sum equals total rows (excluding NaN values if any)
valid_niederschlag = df['Niederschlag'].notna().sum()
total_categorized = df['Niederschlag_trocken'].sum() + df['Niederschlag_nass'].sum()
print(f"Valid Niederschlag values: {valid_niederschlag}")
print(f"Total categorized: {total_categorized}")

# Show sample data
print(f"\nSample of categorized data:")
sample_cols = ['Datum', 'Niederschlag', 'Niederschlag_trocken', 'Niederschlag_nass']
print(df[sample_cols].head(10))

# Show distribution
print(f"\nDistribution of categories:")
print(f"Proportion of dry days: {df['Niederschlag_trocken'].mean():.3f}")
print(f"Proportion of wet days: {df['Niederschlag_nass'].mean():.3f}")

Niederschlag variable analysis:
Data type: float64
Basic statistics:
count    11211.000000
mean         2.041459
std          4.037744
min          0.000000
25%          0.000000
50%          0.100000
75%          2.200000
max         37.700000
Name: Niederschlag, dtype: float64

Missing values: 0
Number of zero values: 5314
Number of positive values: 5897

Categorization results:
Niederschlag_trocken (dry days): 5314 days
Niederschlag_nass (wet days): 5897 days
Total rows: 11211
Valid Niederschlag values: 11211
Total categorized: 11211

Sample of categorized data:
       Datum  Niederschlag  Niederschlag_trocken  Niederschlag_nass
0 2013-07-01           0.3                     0                  1
1 2013-07-01           0.3                     0                  1
2 2013-07-01           0.3                     0                  1
3 2013-07-01           0.3                     0                  1
4 2013-07-01           0.3                     0                  1
5 2013-07-02        

In [6]:
# Calculation of 7-day rolling average for Niederschlag

# Bringing 'Datum' in datetime format and sort by date
df['Datum'] = pd.to_datetime(df['Datum'])
df = df.sort_values('Datum')

# Calculate rolling average over 7-day window for Niederschlag
df['Niederschlag_7day_rolling'] = df['Niederschlag'].rolling(window=7, min_periods=1).mean()

# Display first few rows to verify the calculation
print("First 10 rows with rolling average:")
print(df[['Datum', 'Niederschlag', 'Niederschlag_7day_rolling']].head(10))

# Display basic statistics of the rolling average
print("\nBasic statistics for 7-day rolling average:")
print(df['Niederschlag_7day_rolling'].describe())

First 10 rows with rolling average:
       Datum  Niederschlag  Niederschlag_7day_rolling
0 2013-07-01           0.3                   0.300000
1 2013-07-01           0.3                   0.300000
2 2013-07-01           0.3                   0.300000
3 2013-07-01           0.3                   0.300000
4 2013-07-01           0.3                   0.300000
5 2013-07-02           0.1                   0.266667
6 2013-07-02           0.1                   0.242857
7 2013-07-02           0.1                   0.214286
8 2013-07-02           0.1                   0.185714
9 2013-07-02           0.1                   0.157143

Basic statistics for 7-day rolling average:
count    11211.000000
mean         2.040442
std          3.293091
min          0.000000
25%          0.000000
50%          0.600000
75%          2.771429
max         36.014286
Name: Niederschlag_7day_rolling, dtype: float64


In [7]:
#weathercode category encoding

print("--- Analysis of 'Wettercode' Column ---")

# 1. Check for Missing Values
# isnull() checks every row, and sum() counts how many are True.
missing_count = df['Wettercode'].isnull().sum()
print(f"Number of missing values: {missing_count}")

# 2. Get the percentage of missing values
# It is often helpful to know if the missing data is 1% or 50% of the dataset.
missing_percentage = (missing_count / len(df)) * 100
print(f"Percentage missing: {missing_percentage:.2f}%")

# 3. Get a General Summary
# .describe() is smart:
# - If the column is Numbers: It gives mean, min, max, std.
# - If the column is Text/Categories: It gives count, unique, top, frequency.
print("\n--- Summary Statistics ---")
print(df['Wettercode'].describe())

# 4. Check the Distribution (Value Counts)
# Since this is a "Code" (likely categories like 1=Sun, 2=Cloud), 
# value_counts() is usually more useful than mean/average.
# dropna=False ensures we see the missing values in this list too.
print("\n--- Distribution of Codes (Top 10) ---")
print(df['Wettercode'].value_counts(dropna=False).head(10))

--- Analysis of 'Wettercode' Column ---
Number of missing values: 0
Percentage missing: 0.00%

--- Summary Statistics ---
count    11211.000000
mean        35.432343
std         27.121781
min          0.000000
25%         10.000000
50%         21.000000
75%         61.000000
max         95.000000
Name: Wettercode, dtype: float64

--- Distribution of Codes (Top 10) ---
Wettercode
61.0    3578
21.0    1804
0.0     1510
5.0     1041
10.0     958
63.0     768
20.0     390
95.0     203
65.0     195
22.0     142
Name: count, dtype: int64


In [8]:
def categorize_wetter(code):
    """
    Maps WMO codes (0-99) to 10 simplified categories.
    Handles NaN values safely.
    """
    if pd.isna(code):
        return -1  # Indicate missing value with -1
    
    # Ensure code is an integer (in case it's stored as float 10.0 or string "10")
    try:
        c = int(code)
    except ValueError:
        return "Invalid Format"

    # --- 1. Cloud / Stable ---
    if 0 <= c <= 3:
        return 1 

    # --- 2. Haze / Dust / Sand ---
    # 04-09 (Haze/Dust), 30-35 (Sandstorms)
    elif (4 <= c <= 9) or (30 <= c <= 35):
        return 2 

    # --- 3. Fog / Mist ---
    # 10-12 (Mist), 40-49 (Fog)
    elif (10 <= c <= 12) or (40 <= c <= 49):
        return 3

    # --- 10. Thunderstorm (Checked early to catch special codes 17, 19, 29) ---
    # 17 (Thunder audible), 19 (Tornado), 29 (Past Thunder), 91-99 (Current Thunder)
    elif c in [17, 19, 29] or (91 <= c <= 99):
        return 10

    # --- 4. Past / Vicinity ---
    # 13-16 (Vicinity), 18 (Squalls), 20-28 (Past Precip/Fog)
    elif (13 <= c <= 18) or (20 <= c <= 28):
        return 4

    # --- 7. Freezing / Mix (Checked before Drizzle/Rain to catch freezing variants) ---
    # 56-57 (Freezing Drizzle), 66-67 (Freezing Rain), 68-69 (Sleet)
    elif c in [56, 57] or (66 <= c <= 69):
        return 7

    # --- 5. Drizzle ---
    # 50-55, 58-59 (Remaining Drizzle codes)
    elif 50 <= c <= 59:
        return 5

    # --- 6. Rain ---
    # 60-65 (Rain)
    elif 60 <= c <= 65:
        return 6

    # --- 8. Snow ---
    # 36-39 (Drifting Snow), 70-79 (Snowfall)
    elif (36 <= c <= 39) or (70 <= c <= 79):
        return 8

    # --- 9. Showers ---
    # 80-90 (Rain, Snow, or Hail Showers)
    elif 80 <= c <= 90:
        return 9

    else:
        return "Other"

# Apply the function to create a new column
# We use .apply() to run the function on every row
df['Weather_Category'] = df['Wettercode'].apply(categorize_wetter)

# --- Analysis of the new Categories ---

print("--- New Category Distribution ---")
# Check how many rows fall into each of our 10 categories
print(df['Weather_Category'].value_counts())

print("\n--- Sample Check ---")
# Show original code next to new category to verify
print(df[['Wettercode', 'Weather_Category']].sample(10))

--- New Category Distribution ---
Weather_Category
6     4541
4     2412
1     1515
3     1134
2     1041
8      236
10     213
7       60
5       59
Name: count, dtype: int64

--- Sample Check ---
      Wettercode  Weather_Category
237         63.0                 6
8760        71.0                 8
4232         0.0                 1
4095        61.0                 6
6051        61.0                 6
6752        61.0                 6
5813        63.0                 6
2156        61.0                 6
3358        63.0                 6
9909        10.0                 3


In [9]:
#one-hot encoding of the new Weather_Category_ID column

print("--- One-Hot Encoding Weather Categories ---")

# 1. Perform One-Hot Encoding
# columns=['Weather_Category']: Only encode this specific column
# prefix='W_Cat': Adds this text to the new column names (e.g., W_Cat_1, W_Cat_2)
# dtype=int: Ensures the result is 0 and 1 (not True/False)
df_encoded = pd.get_dummies(df, columns=['Weather_Category'], prefix='W_Cat', dtype=int)

# 2. Force all 11 columns to exist (Robustness Step)
# If your specific dataset is missing a category (e.g., no "Tornadoes" happened),
# get_dummies won't create a column for it. Machine Learning models often crash
# if the number of columns changes. This block fixes that.

expected_categories = [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

for cat in expected_categories:
    col_name = f"W_Cat_{cat}"
    if col_name not in df_encoded.columns:
        print(f"Category {cat} was missing in data. Creating empty column: {col_name}")
        df_encoded[col_name] = 0

# 3. Reorder columns for neatness (Optional)
# This sorts them so W_Cat_-1 comes first or last, and 1-10 are in order.
encoded_cols = [f"W_Cat_{cat}" for cat in expected_categories]
# We keep all original columns + the sorted encoded columns
other_cols = [c for c in df_encoded.columns if c not in encoded_cols]
df = df_encoded[other_cols + encoded_cols]

# --- Verification ---

print("\n--- New Columns Created ---")
# Filter just to show the new columns
weather_columns = [c for c in df.columns if 'W_Cat_' in c]
print(df[weather_columns].head())

print("\n--- Summary of Hot Encoding ---")
# Check the sums to see how many 1s are in each column
print(df[weather_columns].sum())

--- One-Hot Encoding Weather Categories ---
Category -1 was missing in data. Creating empty column: W_Cat_-1
Category 9 was missing in data. Creating empty column: W_Cat_9

--- New Columns Created ---
   W_Cat_-1  W_Cat_1  W_Cat_2  W_Cat_3  W_Cat_4  W_Cat_5  W_Cat_6  W_Cat_7  \
0         0        0        0        0        1        0        0        0   
1         0        0        0        0        1        0        0        0   
2         0        0        0        0        1        0        0        0   
3         0        0        0        0        1        0        0        0   
4         0        0        0        0        1        0        0        0   

   W_Cat_8  W_Cat_9  W_Cat_10  
0        0        0         0  
1        0        0         0  
2        0        0         0  
3        0        0         0  
4        0        0         0  

--- Summary of Hot Encoding ---
W_Cat_-1       0
W_Cat_1     1515
W_Cat_2     1041
W_Cat_3     1134
W_Cat_4     2412
W_Cat_5       59
W_C

In [10]:
# Create hot encoded temperature categories based on monthly temperature averages

# First, ensure we have the date in datetime format and extract month
df['Datum'] = pd.to_datetime(df['Datum'])

# Calculate monthly mean temperature for the entire dataset
monthly_temp_means = df.groupby(df['Datum'].dt.month)['Temperatur'].mean()
print("Monthly mean temperatures across entire dataset:")
print(monthly_temp_means)

# Create a mapping of month to mean temperature for easy lookup
month_temp_dict = monthly_temp_means.to_dict()

# Define thresholds for categorization (in degrees Celsius)
# Cold: more than 3°C below monthly average
# Normal: within ±3°C of monthly average  
# Hot: more than 3°C above monthly average
cold_threshold = -3.0
hot_threshold = 3.0

# Categorize temperature based on difference from monthly mean
def categorize_temperature(row):
    month = row['Datum'].month
    monthly_mean = month_temp_dict[month]
    temp_diff = row['Temperatur'] - monthly_mean
    
    if temp_diff <= cold_threshold:
        return 'cold'
    elif temp_diff >= hot_threshold:
        return 'hot'
    else:
        return 'normal'

# Apply categorization and create hot-encoded columns directly
temp_categories = df.apply(categorize_temperature, axis=1)

# Create hot-encoded columns for temperature categories
df['Temperatur_kalt'] = (temp_categories == 'cold').astype(int)
df['Temperatur_normal'] = (temp_categories == 'normal').astype(int)
df['Temperatur_warm'] = (temp_categories == 'hot').astype(int)

# Display results
print(f"\nTemperature categorization results:")
print(f"Cold days (Temperatur_kalt): {df['Temperatur_kalt'].sum()}")
print(f"Normal days (Temperatur_normal): {df['Temperatur_normal'].sum()}")
print(f"Hot days (Temperatur_warm): {df['Temperatur_warm'].sum()}")
print(f"Total days: {len(df)}")

# Show distribution by category
print(f"\nDistribution of temperature categories:")
category_counts = temp_categories.value_counts()
for category in ['cold', 'normal', 'hot']:
    count = category_counts.get(category, 0)
    percentage = (count / len(df)) * 100
    print(f"{category.capitalize()}: {count} days ({percentage:.1f}%)")

# Show sample data with new features
print(f"\nSample of data with new temperature features:")
temp_cols = ['Temperatur_kalt', 'Temperatur_normal', 'Temperatur_warm']
print(df[temp_cols].head(10))

# Verify hot encoding (should sum to total number of rows)
total_encoded = df['Temperatur_kalt'].sum() + df['Temperatur_normal'].sum() + df['Temperatur_warm'].sum()
print(f"\nVerification - Total hot-encoded values: {total_encoded} (should equal {len(df)})")

Monthly mean temperatures across entire dataset:
Datum
1      2.728366
2      4.358531
3      7.103450
4     10.960906
5     15.542939
6     19.035405
7     21.071860
8     20.345694
9     17.185659
10    12.918950
11     7.444665
12     5.513308
Name: Temperatur, dtype: float64

Temperature categorization results:
Cold days (Temperatur_kalt): 2092
Normal days (Temperatur_normal): 6956
Hot days (Temperatur_warm): 2163
Total days: 11211

Distribution of temperature categories:
Cold: 2092 days (18.7%)
Normal: 6956 days (62.0%)
Hot: 2163 days (19.3%)

Sample of data with new temperature features:
   Temperatur_kalt  Temperatur_normal  Temperatur_warm
0                1                  0                0
1                1                  0                0
2                1                  0                0
3                1                  0                0
4                1                  0                0
5                1                  0                0
6             

In [11]:
# Save the updated dataframe with new features to CSV file

# Define the output path (relative to notebooks directory)
output_path = "../processed_data/combined_data_final_imputed.csv"

# Save the dataframe with new features
df.to_csv(output_path, index=False)

# Verify the save was successful
print(f"Dataframe saved successfully to: {output_path}")
print(f"Full path: {os.path.abspath(output_path)}")
print(f"Shape of saved dataframe: {df.shape}")
print(f"Columns in final dataframe: {len(df.columns)}")

# Display all the new feature columns that were added
warengruppe_features = [col for col in df.columns if col.startswith('Warengruppe_')]
niederschlag_features = ['Niederschlag_trocken', 'Niederschlag_nass', 'Niederschlag_7day_rolling']
temperature_features = ['Temperatur_7day_rolling', 'Temperatur_kalt', 'Temperatur_normal', 'Temperatur_warm']
weather_features = ['Weather_Category']

all_new_features = warengruppe_features + niederschlag_features + temperature_features + weather_features

print(f"\nAll new features added ({len(all_new_features)} total):")
print("Warengruppe features:")
for feature in warengruppe_features:
    print(f"  - {feature}")
print("Niederschlag features:")
for feature in niederschlag_features:
    print(f"  - {feature}")
print("Temperature features:")
for feature in temperature_features:
    print(f"  - {feature}")
print("Weather features:")
for feature in weather_features:
    print(f"  - {feature}")

# Show a preview of the final dataframe structure
print(f"\nPreview of final dataframe:")
print(df.head())

Dataframe saved successfully to: ../processed_data/combined_data_final_imputed.csv
Full path: /home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_final_imputed.csv
Shape of saved dataframe: (11211, 48)
Columns in final dataframe: 48

All new features added (15 total):
Warengruppe features:
  - Warengruppe_1.0
  - Warengruppe_2.0
  - Warengruppe_3.0
  - Warengruppe_4.0
  - Warengruppe_5.0
  - Warengruppe_6.0
  - Warengruppe_nan
Niederschlag features:
  - Niederschlag_trocken
  - Niederschlag_nass
  - Niederschlag_7day_rolling
Temperature features:
  - Temperatur_7day_rolling
  - Temperatur_kalt
  - Temperatur_normal
  - Temperatur_warm
Weather features:
  - Weather_Category

Preview of final dataframe:
       Datum         id  Warengruppe      Umsatz  KielerWoche  Bewoelkung  \
0 2013-07-01  1307011.0          1.0  148.828353          0.0         6.0   
1 2013-07-01  1307013.0          3.0  201.198426          0.0         6.0   
2 2013-07-01  130701