In [2]:
# import of modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

# create dataframe
df = pd.read_csv("/workspaces/team3_goodweather/1_DatasetCharacteristics/processed_data/combined_data_outer_with_test.csv")

In [3]:
# Hot encode the variable Warengruppe

# First, let's check the unique values in Warengruppe
print("Unique values in Warengruppe:")
print(df['Warengruppe'].unique())
print(f"\nNumber of unique categories: {df['Warengruppe'].nunique()}")
print(f"Value counts (including NaN):\n{df['Warengruppe'].value_counts(dropna=False)}")

# Check for missing values
missing_count = df['Warengruppe'].isna().sum()
print(f"\nMissing values in Warengruppe: {missing_count}")

# Create hot encoded (dummy) variables for Warengruppe
# Using dummy_na=True to create a column for missing values as well
# dtype=int ensures we get 1/0 instead of True/False
warengruppe_dummies = pd.get_dummies(df['Warengruppe'], prefix='Warengruppe', dummy_na=True, dtype=int)

# Add the dummy variables to the dataframe
df = pd.concat([df, warengruppe_dummies], axis=1)

# Display the new columns
print(f"\nNew hot encoded columns added:")
hot_encoded_cols = [col for col in df.columns if col.startswith('Warengruppe_')]
print(hot_encoded_cols)

# Show a sample of the data with hot encoded variables (including non-NaN values)
print(f"\nSample of data with hot encoded Warengruppe variables (showing rows with valid Warengruppe):")
non_nan_sample = df[df['Warengruppe'].notna()][['Warengruppe'] + hot_encoded_cols].head(10)
print(non_nan_sample)

# Show summary of hot encoded columns
print(f"\nSummary of hot encoded variables:")
for col in hot_encoded_cols:
    ones_count = df[col].sum()
    print(f"{col}: {ones_count} ones (1s)")

Unique values in Warengruppe:
[nan  1.  3.  4.  5.  2.  6.]

Number of unique categories: 6
Value counts (including NaN):
Warengruppe
1.0    2174
3.0    2174
5.0    2174
2.0    2174
4.0    2120
NaN     618
6.0     348
Name: count, dtype: int64

Missing values in Warengruppe: 618

New hot encoded columns added:
['Warengruppe_1.0', 'Warengruppe_2.0', 'Warengruppe_3.0', 'Warengruppe_4.0', 'Warengruppe_5.0', 'Warengruppe_6.0', 'Warengruppe_nan']

Sample of data with hot encoded Warengruppe variables (showing rows with valid Warengruppe):
     Warengruppe  Warengruppe_1.0  Warengruppe_2.0  Warengruppe_3.0  \
417          1.0                1                0                0   
418          3.0                0                0                1   
419          4.0                0                0                0   
420          5.0                0                0                0   
421          2.0                0                1                0   
422          2.0                0 

In [4]:
# Calculation of 7-day rolling average for Temperature

# Bringing 'Datum' in datetime format and sort by date
df['Datum'] = pd.to_datetime(df['Datum'])
df = df.sort_values('Datum')

# Calculate rolling average over 7-day window for Temperature
df['Temperatur_7day_rolling'] = df['Temperatur'].rolling(window=7, min_periods=1).mean()

# Display first few rows to verify the calculation
print("First 10 rows with rolling average:")
print(df[['Datum', 'Temperatur', 'Temperatur_7day_rolling']].head(10))

# Display basic statistics of the rolling average
print("\nBasic statistics for 7-day rolling average:")
print(df['Temperatur_7day_rolling'].describe())

First 10 rows with rolling average:
       Datum  Temperatur  Temperatur_7day_rolling
0 2012-01-01    9.825000                 9.825000
1 2012-01-02    7.437500                 8.631250
2 2012-01-03    5.537500                 7.600000
3 2012-01-04    5.687500                 7.121875
4 2012-01-05    5.300000                 6.757500
5 2012-01-06    2.625000                 6.068750
6 2012-01-07    6.528571                 6.134439
7 2012-01-08    5.962500                 5.582653
8 2012-01-09    5.150000                 5.255867
9 2012-01-10    6.112500                 5.338010

Basic statistics for 7-day rolling average:
count    11596.000000
mean        12.007136
std          7.076709
min         -8.064286
25%          6.300893
50%         11.607143
75%         17.918750
max         31.379591
Name: Temperatur_7day_rolling, dtype: float64


In [5]:
# Categorize Niederschlag variable into dry and wet categories

# First, let's examine the Niederschlag variable
print("Niederschlag variable analysis:")
print(f"Data type: {df['Niederschlag'].dtype}")
print(f"Basic statistics:\n{df['Niederschlag'].describe()}")
print(f"\nMissing values: {df['Niederschlag'].isna().sum()}")
print(f"Number of zero values: {(df['Niederschlag'] == 0).sum()}")
print(f"Number of positive values: {(df['Niederschlag'] > 0).sum()}")

# Create hot encoded categories based on Niederschlag values
# Niederschlag_trocken = 1 when Niederschlag = 0 (dry conditions)
# Niederschlag_nass = 1 when Niederschlag > 0 (wet conditions)

df['Niederschlag_trocken'] = (df['Niederschlag'] == 0).astype(int)
df['Niederschlag_nass'] = (df['Niederschlag'] > 0).astype(int)

# Verify the categorization
print(f"\nCategorization results:")
print(f"Niederschlag_trocken (dry days): {df['Niederschlag_trocken'].sum()} days")
print(f"Niederschlag_nass (wet days): {df['Niederschlag_nass'].sum()} days")
print(f"Total rows: {len(df)}")

# Verify that the sum equals total rows (excluding NaN values if any)
valid_niederschlag = df['Niederschlag'].notna().sum()
total_categorized = df['Niederschlag_trocken'].sum() + df['Niederschlag_nass'].sum()
print(f"Valid Niederschlag values: {valid_niederschlag}")
print(f"Total categorized: {total_categorized}")

# Show sample data
print(f"\nSample of categorized data:")
sample_cols = ['Datum', 'Niederschlag', 'Niederschlag_trocken', 'Niederschlag_nass']
print(df[sample_cols].head(10))

# Show distribution
print(f"\nDistribution of categories:")
print(f"Proportion of dry days: {df['Niederschlag_trocken'].mean():.3f}")
print(f"Proportion of wet days: {df['Niederschlag_nass'].mean():.3f}")

Niederschlag variable analysis:
Data type: float64
Basic statistics:
count    11779.000000
mean         2.051481
std          4.077174
min          0.000000
25%          0.000000
50%          0.100000
75%          2.200000
max         61.600000
Name: Niederschlag, dtype: float64

Missing values: 3
Number of zero values: 5546
Number of positive values: 6233

Categorization results:
Niederschlag_trocken (dry days): 5546 days
Niederschlag_nass (wet days): 6233 days
Total rows: 11782
Valid Niederschlag values: 11779
Total categorized: 11779

Sample of categorized data:
       Datum  Niederschlag  Niederschlag_trocken  Niederschlag_nass
0 2012-01-01          14.0                     0                  1
1 2012-01-02           0.0                     1                  0
2 2012-01-03          20.8                     0                  1
3 2012-01-04          19.7                     0                  1
4 2012-01-05           3.3                     0                  1
5 2012-01-06        

In [6]:
# Calculation of 7-day rolling average for Niederschlag

# Bringing 'Datum' in datetime format and sort by date
df['Datum'] = pd.to_datetime(df['Datum'])
df = df.sort_values('Datum')

# Calculate rolling average over 7-day window for Niederschlag
df['Niederschlag_7day_rolling'] = df['Niederschlag'].rolling(window=7, min_periods=1).mean()

# Display first few rows to verify the calculation
print("First 10 rows with rolling average:")
print(df[['Datum', 'Niederschlag', 'Niederschlag_7day_rolling']].head(10))

# Display basic statistics of the rolling average
print("\nBasic statistics for 7-day rolling average:")
print(df['Niederschlag_7day_rolling'].describe())

First 10 rows with rolling average:
       Datum  Niederschlag  Niederschlag_7day_rolling
0 2012-01-01          14.0                  14.000000
1 2012-01-02           0.0                   7.000000
2 2012-01-03          20.8                  11.600000
3 2012-01-04          19.7                  13.625000
4 2012-01-05           3.3                  11.560000
5 2012-01-06           0.7                   9.750000
6 2012-01-07           5.8                   9.185714
7 2012-01-08           0.3                   7.228571
8 2012-01-09           3.0                   7.657143
9 2012-01-10           0.2                   4.714286

Basic statistics for 7-day rolling average:
count    11782.000000
mean         2.054044
std          3.253750
min          0.000000
25%          0.000000
50%          0.657143
75%          2.800000
max         36.014286
Name: Niederschlag_7day_rolling, dtype: float64


In [7]:
#weathercode category encoding

print("--- Analysis of 'Wettercode' Column ---")

# 1. Check for Missing Values
# isnull() checks every row, and sum() counts how many are True.
missing_count = df['Wettercode'].isnull().sum()
print(f"Number of missing values: {missing_count}")

# 2. Get the percentage of missing values
# It is often helpful to know if the missing data is 1% or 50% of the dataset.
missing_percentage = (missing_count / len(df)) * 100
print(f"Percentage missing: {missing_percentage:.2f}%")

# 3. Get a General Summary
# .describe() is smart:
# - If the column is Numbers: It gives mean, min, max, std.
# - If the column is Text/Categories: It gives count, unique, top, frequency.
print("\n--- Summary Statistics ---")
print(df['Wettercode'].describe())

# 4. Check the Distribution (Value Counts)
# Since this is a "Code" (likely categories like 1=Sun, 2=Cloud), 
# value_counts() is usually more useful than mean/average.
# dropna=False ensures we see the missing values in this list too.
print("\n--- Distribution of Codes (Top 10) ---")
print(df['Wettercode'].value_counts(dropna=False).head(10))

--- Analysis of 'Wettercode' Column ---
Number of missing values: 2997
Percentage missing: 25.44%

--- Summary Statistics ---
count    8785.000000
mean       36.456346
std        27.293699
min         0.000000
25%        10.000000
50%        22.000000
75%        61.000000
max        95.000000
Name: Wettercode, dtype: float64

--- Distribution of Codes (Top 10) ---
Wettercode
NaN     2997
61.0    2836
21.0    1220
0.0     1051
10.0     890
5.0      783
63.0     599
20.0     315
95.0     179
22.0     118
Name: count, dtype: int64


In [10]:
def categorize_wetter(code):
    """
    Maps WMO codes (0-99) to 10 simplified categories.
    Handles NaN values safely.
    """
    if pd.isna(code):
        return -1  # Indicate missing value with -1
    
    # Ensure code is an integer (in case it's stored as float 10.0 or string "10")
    try:
        c = int(code)
    except ValueError:
        return "Invalid Format"

    # --- 1. Cloud / Stable ---
    if 0 <= c <= 3:
        return 1 

    # --- 2. Haze / Dust / Sand ---
    # 04-09 (Haze/Dust), 30-35 (Sandstorms)
    elif (4 <= c <= 9) or (30 <= c <= 35):
        return 2 

    # --- 3. Fog / Mist ---
    # 10-12 (Mist), 40-49 (Fog)
    elif (10 <= c <= 12) or (40 <= c <= 49):
        return 3

    # --- 10. Thunderstorm (Checked early to catch special codes 17, 19, 29) ---
    # 17 (Thunder audible), 19 (Tornado), 29 (Past Thunder), 91-99 (Current Thunder)
    elif c in [17, 19, 29] or (91 <= c <= 99):
        return 10

    # --- 4. Past / Vicinity ---
    # 13-16 (Vicinity), 18 (Squalls), 20-28 (Past Precip/Fog)
    elif (13 <= c <= 18) or (20 <= c <= 28):
        return 4

    # --- 7. Freezing / Mix (Checked before Drizzle/Rain to catch freezing variants) ---
    # 56-57 (Freezing Drizzle), 66-67 (Freezing Rain), 68-69 (Sleet)
    elif c in [56, 57] or (66 <= c <= 69):
        return 7

    # --- 5. Drizzle ---
    # 50-55, 58-59 (Remaining Drizzle codes)
    elif 50 <= c <= 59:
        return 5

    # --- 6. Rain ---
    # 60-65 (Rain)
    elif 60 <= c <= 65:
        return 6

    # --- 8. Snow ---
    # 36-39 (Drifting Snow), 70-79 (Snowfall)
    elif (36 <= c <= 39) or (70 <= c <= 79):
        return 8

    # --- 9. Showers ---
    # 80-90 (Rain, Snow, or Hail Showers)
    elif 80 <= c <= 90:
        return 9

    else:
        return "Other"

# Apply the function to create a new column
# We use .apply() to run the function on every row
df['Weather_Category'] = df['Wettercode'].apply(categorize_wetter)

# --- Analysis of the new Categories ---

print("--- New Category Distribution ---")
# Check how many rows fall into each of our 10 categories
print(df['Weather_Category'].value_counts())

print("\n--- Sample Check ---")
# Show original code next to new category to verify
print(df[['Wettercode', 'Weather_Category']].sample(10))

--- New Category Distribution ---
Weather_Category
 6     3551
-1     2997
 4     1749
 1     1067
 3     1053
 2      783
 8      225
 10     194
 5       72
 7       51
 9       40
Name: count, dtype: int64

--- Sample Check ---
      Wettercode  Weather_Category
4115         NaN                -1
2560         NaN                -1
4941        21.0                 4
4105         NaN                -1
7845         NaN                -1
4565         0.0                 1
1867         NaN                -1
1336        61.0                 6
3407         NaN                -1
7646         NaN                -1


In [11]:
# Save the updated dataframe with new features to CSV file

# Define the output path
output_path = "/workspaces/team3_goodweather/1_DatasetCharacteristics/processed_data/combined_data_final.csv"

# Save the dataframe with new features
df.to_csv(output_path, index=False)

# Verify the save was successful
print(f"Dataframe saved successfully to: {output_path}")
print(f"Shape of saved dataframe: {df.shape}")
print(f"Columns in final dataframe: {len(df.columns)}")

# Display the new feature columns that were added
new_features = ['Temperatur_7day_rolling'] + [col for col in df.columns if col.startswith('Warengruppe_')]
print(f"\nNew features added:")
for feature in new_features:
    print(f"  - {feature}")

# Show a preview of the final dataframe structure
print(f"\nPreview of final dataframe:")
print(df.head())

Dataframe saved successfully to: /workspaces/team3_goodweather/1_DatasetCharacteristics/processed_data/combined_data_final.csv
Shape of saved dataframe: (11782, 23)
Columns in final dataframe: 23

New features added:
  - Temperatur_7day_rolling
  - Warengruppe_1.0
  - Warengruppe_2.0
  - Warengruppe_3.0
  - Warengruppe_4.0
  - Warengruppe_5.0
  - Warengruppe_6.0
  - Warengruppe_nan

Preview of final dataframe:
       Datum  id  Warengruppe  Umsatz  KielerWoche  Bewoelkung  Temperatur  \
0 2012-01-01 NaN          NaN     NaN          NaN         8.0      9.8250   
1 2012-01-02 NaN          NaN     NaN          NaN         7.0      7.4375   
2 2012-01-03 NaN          NaN     NaN          NaN         8.0      5.5375   
3 2012-01-04 NaN          NaN     NaN          NaN         4.0      5.6875   
4 2012-01-05 NaN          NaN     NaN          NaN         6.0      5.3000   

   Windgeschwindigkeit  Wettercode  Niederschlag  ...  Warengruppe_3.0  \
0                 14.0        58.0         