In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

In [2]:
# Load the outputs of the preceding notebooks.
#   df     : 6.2_missing_value_imputation.csv - the frame this notebook extends
#   df_old : 6.1_weather_parameters.csv - used below to look at the weather codes
# TODO(review): confirm that 6.2_missing_value_imputation.csv is the latest upstream output.
# NOTE(review): both inputs use absolute /workspaces/... paths while the save cell
# at the end writes to a relative '../sourcedata/...' path - consider unifying them.

df = pd.read_csv("/workspaces/bakery_sales_prediction/sourcedata/cleaned_data/6.2_missing_value_imputation.csv")
df_old = pd.read_csv("/workspaces/bakery_sales_prediction/sourcedata/cleaned_data/6.1_weather_parameters.csv")

print(df.head(), "\n")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()

        Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0  2013-07-01         6.0     17.8375                 15.0        20.0   
1  2013-07-01         6.0     17.8375                 15.0        20.0   
2  2013-07-01         6.0     17.8375                 15.0        20.0   
3  2013-07-01         6.0     17.8375                 15.0        20.0   
4  2013-07-01         6.0     17.8375                 15.0        20.0   

   KielerWoche  Warengruppe      Umsatz  Feiertage  Ferientage  Niederschlag  \
0          0.0          1.0  148.828353        0.0         1.0           0.3   
1          0.0          2.0  535.856285        0.0         1.0           0.3   
2          0.0          3.0  201.198426        0.0         1.0           0.3   
3          0.0          4.0   65.890169        0.0         1.0           0.3   
4          0.0          5.0  317.475875        0.0         1.0           0.3   

   imp_mask_Bewoelkung  imp_mask_Temperatur  imp_mask_Windgeschwindigkeit 

### Decode weather codes and divide into new classes

In [3]:
# Print how often each weather code occurs (which codes are present at all?).
# Only the counts from df_old are shown; the counts on df were computed and then
# immediately overwritten without being displayed, so that dead step is dropped.
# NOTE(review): df_old is presumably the pre-imputation data (judging by the file
# names), i.e. these are the observed codes only - confirm.
value_counts = df_old['Wettercode'].value_counts().sort_index()
print(value_counts)

Wettercode
0.0      815
3.0        5
5.0      562
10.0     742
17.0       5
20.0     244
21.0    1013
22.0      88
28.0      63
45.0      96
49.0      48
53.0      31
61.0    2271
63.0     535
65.0     111
68.0      12
69.0      27
71.0      81
73.0      48
75.0      25
77.0      45
79.0       5
95.0     137
Name: count, dtype: int64


In [4]:
# Lookup table: weather code -> German description, taken from
# http://www.seewetter-kiel.de/seewetter/daten_symbole.htm
# Only the codes that show up in the frequency table above have an entry.
weather_code_descriptions = {
    0: "Bewoelkungsentwicklung nicht beobachtet",
    3: "Zunehmende Bewoelkung",
    5: "Trockener Dunst",
    10: "Feuchter Dunst",
    17: "Gewitter mit hoerbarem Donner, aber kein Niederschlag an der Station",
    20: "Spruehregen oder Schneegriesel hat aufgehoert",
    21: "Regen hat aufgehoert",
    22: "Schneefall hat aufgehoert",
    28: "Nebel hat sich aufgeloest",
    45: "Nebel, Himmel ist nicht erkennbar, Nebel unveraendert",
    49: "Nebel mit Reifbildung, Himmel ist nicht erkennbar",
    53: "Durchgehender maeßiger und nicht gefrierender Spruehregen",
    61: "Durchgehender leichter und nicht gefrierender Regen",
    63: "Durchgehender maeßiger nicht gefrierender Regen",
    65: "Durchgehender starker und nicht gefrierender Regen",
    68: "Leichter Schneeregen",
    69: "Maeßiger oder starker Schneeregen",
    71: "Durchgehender leichter Schneefall",
    73: "Durchgehender maeßiger Schneefall",
    75: "Durchgehender starker Schneefall",
    77: "Schneegrieseln mit oder ohne Nebel",
    79: "Eiskoerner (gefrorene Regentropfen)",
    95: "Leichtes oder maeßiges Gewitter mit Regen oder Schnee"
}


def get_weather_description(code):
    """Return the description stored for ``code`` in ``weather_code_descriptions``.

    Codes without an entry yield the placeholder "Description not available".
    """
    if code in weather_code_descriptions:
        return weather_code_descriptions[code]
    return "Description not available"


# Example lookup:
get_weather_description(53)

'Durchgehender maeßiger und nicht gefrierender Spruehregen'

In [5]:
# Coarser weather classes: numeric class id -> class name.
# The class names double as the join key to the code lists defined below.
new_weather_code_descriptions = {
    0 : "nicht_beobachtet",
    1 : "Wolken",
    2 : "Dunst_oder_Nebel",
    3 : "sich_ankuendigender_Regen",
    4 : "Gewitter",
    5 : "Boeen_und_Sturm",
    6 : "Niederschlag_nass_leicht", # light rain; also holds the 'precipitation has stopped' codes, since it cannot have 'stopped' all day long, so it presumably rained earlier
    7 : "Niederschlag_nass_stark", # moderate or heavy rain
    8 : "Niederschlag_trocken", # snow; slight or moderate drifting snow
    9 : "Schneesturm_Hagel", # heavy drifting snow
    10 : "Sandsturm"
}

# Original weather codes belonging to each class; the trailing comment is the
# class id from new_weather_code_descriptions. Together the lists contain every
# integer from 0 to 99 exactly once.
nicht_beobachtet =  [0] # class 0
Wolken = [1, 2, 3] # class 1
Dunst_oder_Nebel = [4, 5, 6, 7, 8, 9, 10, 11, 12, 28, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49] # class 2
sich_ankuendigender_Regen = [14, 15, 16] # class 3
Gewitter = [13, 17, 29, 91, 92, 93, 94, 95, 96, 97, 98, 99] # class 4
Boeen_und_Sturm = [18, 19] # class 5
Niederschlag_nass_leicht = [20, 21, 23, 24, 25, 27, 50, 51, 56, 58, 60, 61, 66, 68, 80, 83, 87] # class 6
Niederschlag_nass_stark = [52, 53, 54, 55, 57, 59, 62, 63, 64, 65, 67, 69, 81, 82, 84, 88] # class 7
Niederschlag_trocken = [22, 26, 36, 38, 70, 71, 72, 73, 74, 75, 77, 78, 85, 86] # class 8
Schneesturm_Hagel = [37, 39, 76, 79, 89, 90] # class 9
Sandsturm = [30, 31, 32, 33, 34, 35] # class 10


In [6]:
# Build the lookup "original weather code -> new numeric class id" and use it
# to add the column 'Wetterklasse' to df.

# Class name -> list of original codes. The names must match the values of
# new_weather_code_descriptions, because the name is the join key below.
weather_classes = {
    "nicht_beobachtet" : nicht_beobachtet,
    "Wolken" : Wolken,
    "Dunst_oder_Nebel" : Dunst_oder_Nebel,
    "sich_ankuendigender_Regen" : sich_ankuendigender_Regen,
    "Gewitter" : Gewitter,
    "Boeen_und_Sturm" : Boeen_und_Sturm,
    "Niederschlag_nass_leicht" : Niederschlag_nass_leicht,
    "Niederschlag_nass_stark" : Niederschlag_nass_stark,
    "Niederschlag_trocken" : Niederschlag_trocken,
    "Schneesturm_Hagel" : Schneesturm_Hagel,
    "Sandsturm" : Sandsturm
}

# A class name without a code list would leave all of its codes unmapped.
# Report such a mismatch instead of skipping the class without any notice.
classes_without_codes = [
    description
    for description in new_weather_code_descriptions.values()
    if description not in weather_classes
]
if classes_without_codes:
    print("Warning: no code list defined for classes:", classes_without_codes)

# Flatten to {original code: class id}, walking the classes in id order.
weather_code_mapping = {
    code: new_code
    for new_code, description in new_weather_code_descriptions.items()
    for code in weather_classes.get(description, [])
}

# Debugging: print the mapping dictionary to ensure it is correct
print("Weather Code Mapping:", weather_code_mapping)

# Map the weather codes to the new numeric codes; codes that are not a key of
# the mapping become NaN.
df['Wetterklasse'] = df['Wettercode'].map(weather_code_mapping)

# Show every row whose weather code could not be mapped
missing_wetterklasse = df[df['Wetterklasse'].isna()]
if missing_wetterklasse.empty:
    print("No rows with missing 'Wetterklasse'.")
else:
    print("Rows with missing 'Wetterklasse':")
    print(missing_wetterklasse)

# Confirm that the new column is present
print("DataFrame Columns:", df.columns)
# Preview the first rows only instead of dumping the whole frame
print(df.head())

Weather Code Mapping: {0: 0, 1: 1, 2: 1, 3: 1, 4: 2, 5: 2, 6: 2, 7: 2, 8: 2, 9: 2, 10: 2, 11: 2, 12: 2, 28: 2, 40: 2, 41: 2, 42: 2, 43: 2, 44: 2, 45: 2, 46: 2, 47: 2, 48: 2, 49: 2, 14: 3, 15: 3, 16: 3, 13: 4, 17: 4, 29: 4, 91: 4, 92: 4, 93: 4, 94: 4, 95: 4, 96: 4, 97: 4, 98: 4, 99: 4, 18: 5, 19: 5, 20: 6, 21: 6, 23: 6, 24: 6, 25: 6, 27: 6, 50: 6, 51: 6, 56: 6, 58: 6, 60: 6, 61: 6, 66: 6, 68: 6, 80: 6, 83: 6, 87: 6, 52: 7, 53: 7, 54: 7, 55: 7, 57: 7, 59: 7, 62: 7, 63: 7, 64: 7, 65: 7, 67: 7, 69: 7, 81: 7, 82: 7, 84: 7, 88: 7, 22: 8, 26: 8, 36: 8, 38: 8, 70: 8, 71: 8, 72: 8, 73: 8, 74: 8, 75: 8, 77: 8, 78: 8, 85: 8, 86: 8, 37: 9, 39: 9, 76: 9, 79: 9, 89: 9, 90: 9, 30: 10, 31: 10, 32: 10, 33: 10, 34: 10, 35: 10}
No rows with missing 'Wetterklasse'.
DataFrame Columns: Index(['Datum', 'Bewoelkung', 'Temperatur', 'Windgeschwindigkeit',
       'Wettercode', 'KielerWoche', 'Warengruppe', 'Umsatz', 'Feiertage',
       'Ferientage', 'Niederschlag', 'imp_mask_Bewoelkung',
       'imp_mask_Tempera

In [7]:
# Summary statistics of all numeric columns
print("describe: \n", df.describe(), "\n")

# Missing values per column: isnull() flags every cell and sum() adds up the
# True flags column by column. (value_counts() on the boolean frame instead
# counted how often each row-wise True/False pattern occurs.)
print("NaN: \n", df.isnull().sum(), "\n")

describe: 
         Bewoelkung   Temperatur  Windgeschwindigkeit   Wettercode  \
count  9334.000000  9334.000000          9334.000000  9334.000000   
mean      4.737840    12.028093            10.975145    32.387615   
std       2.643655     7.229432             4.130766    27.354896   
min       0.000000    -8.475000             3.000000     0.000000   
25%       3.000000     6.237500             8.000000     5.000000   
50%       6.000000    11.625000            10.000000    21.000000   
75%       7.000000    17.962500            13.000000    61.000000   
max       8.000000    31.437500            35.000000    95.000000   

       KielerWoche  Warengruppe       Umsatz    Feiertage   Ferientage  \
count  9334.000000  9334.000000  9334.000000  9334.000000  9334.000000   
mean      0.023891     3.088172   206.749044     0.018856     0.166060   
std       0.152718     1.489002   144.545189     0.136023     0.372154   
min       0.000000     1.000000     7.051201     0.000000     0.000000

In [9]:
# Sanity check: (rows, columns) of df now that 'Wetterklasse' has been added
print(df.shape)

(9334, 17)


### Filter classes with too few objects

In [10]:
# Which weather classes are frequent enough to keep? Classes with very few rows
# may only add noise to the model, so a minimum relative frequency is applied.

# Cutoff as a fraction of all classified rows (0.01 corresponds to 1 %)
threshold_percentage = 0.01

# Rows per class, ordered by class id
value_counts = df['Wetterklasse'].value_counts().sort_index()

# Convert the fraction into an absolute minimum row count
total_counts = value_counts.sum()
threshold_count = threshold_percentage * total_counts

# Keep the classes that reach the minimum row count
frequent_classes = value_counts.loc[value_counts.ge(threshold_count)]

print("Total occurrences per weather code:")
print(value_counts)
print(f"\nFiltered weather codes (above threshold {threshold_percentage}):")
print(frequent_classes)

# List the class ids with their names as a reminder
print("\nReminder: Weather Code Mapping:")
for class_id in new_weather_code_descriptions:
    print(f"{class_id}: {new_weather_code_descriptions[class_id]}")


Total occurrences per weather code:
Wetterklasse
0    1497
1      25
2    2234
4     177
6    4317
7     766
8     308
9      10
Name: count, dtype: int64

Filtered weather codes (above threshold 0.01):
Wetterklasse
0    1497
2    2234
4     177
6    4317
7     766
8     308
Name: count, dtype: int64

Reminder: Weather Code Mapping:
0: nicht_beobachtet
1: Wolken
2: Dunst_oder_Nebel
3: sich_ankuendigender_Regen
4: Gewitter
5: Boeen_und_Sturm
6: Niederschlag_nass_leicht
7: Niederschlag_nass_stark
8: Niederschlag_trocken
9: Schneesturm_Hagel
10: Sandsturm


In [12]:
# Check structure: (rows, columns) of df right before it is saved.
# NOTE(review): in the cells visible here frequent_classes is only computed and
# never applied to df, so no rows are removed - confirm whether a filtering step
# is missing on purpose.
print(df.shape)

(9334, 17)


### Save result:

In [13]:
 # Save the updated data to new CSV files
# Write df as comma-separated CSV without the index column.
output_path = '../sourcedata/cleaned_data/6.3_weather_class.csv'
df.to_csv(output_path, sep=',', index=False)