# Feature engineering for weather data

In [1]:
# Read in packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv

In [2]:
# Load the bakery weather observations and convert the date column to datetime.
# NOTE(review): both paths below are absolute; the notebook only runs where
# this exact directory layout exists.
weather = pd.read_csv("/workspaces/bakery_sales_prediction/sourcedata/wetter.csv")
weather["Datum"] = pd.to_datetime(weather["Datum"])

# Location of the additional daily climate file from DWD
file_path = "/workspaces/bakery_sales_prediction/sourcedata/produkt_klima_tag_19740101_20231231_02564.txt"

# Read the semicolon-separated file (-999 is treated as missing, MESS_DATUM is
# parsed as a date), rename the two columns of interest and keep only those.
# The column name " RSK" carries a leading space in the file header.
# The "  FM" column (presumably wind speed in m/s) is currently not used.
prec = (
    pd.read_csv(file_path, sep=';', na_values='-999', parse_dates=['MESS_DATUM'])
    .rename(columns={'MESS_DATUM': 'Datum', " RSK": "Niederschlag"})
    [["Datum", "Niederschlag"]]
)

# Show the first rows of the precipitation data
print(prec.head())

# Combine both sources on the date; the inner join keeps only dates that are
# present in both frames
weather_data = weather.merge(prec, on='Datum', how='inner')

# Show the merged frame
print(weather_data)

       Datum  Niederschlag
0 1974-01-01           NaN
1 1974-01-02           NaN
2 1974-01-03           NaN
3 1974-01-04           NaN
4 1974-01-05           NaN
          Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0    2012-01-01         8.0      9.8250                   14        58.0   
1    2012-01-02         7.0      7.4375                   12         NaN   
2    2012-01-03         8.0      5.5375                   18        63.0   
3    2012-01-04         4.0      5.6875                   19        80.0   
4    2012-01-05         6.0      5.3000                   23        80.0   
...         ...         ...         ...                  ...         ...   
2594 2019-07-28         3.0     23.3500                   14         5.0   
2595 2019-07-29         6.0     25.2500                    7        61.0   
2596 2019-07-30         7.0     20.7375                    8        61.0   
2597 2019-07-31         6.0     20.4500                    7        61.0   
25

In [3]:
# Derive calendar features from the date column in a single step.
# Note: `week` is the ISO calendar week, so the first days of January can fall
# into week 52/53 of the previous ISO year while `year` is the calendar year.
weather_data = weather_data.assign(
    year=weather_data['Datum'].dt.year,
    month=weather_data['Datum'].dt.month,
    week=weather_data['Datum'].dt.isocalendar().week,
    day_of_week=weather_data['Datum'].dt.dayofweek,  # Monday = 0 ... Sunday = 6
    day_of_month=weather_data['Datum'].dt.day,
)
print(weather_data.head())


       Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0 2012-01-01         8.0      9.8250                   14        58.0   
1 2012-01-02         7.0      7.4375                   12         NaN   
2 2012-01-03         8.0      5.5375                   18        63.0   
3 2012-01-04         4.0      5.6875                   19        80.0   
4 2012-01-05         6.0      5.3000                   23        80.0   

   Niederschlag  year  month  week  day_of_week  day_of_month  
0          14.0  2012      1    52            6             1  
1           0.0  2012      1     1            0             2  
2          20.8  2012      1     1            1             3  
3          19.7  2012      1     1            2             4  
4           3.3  2012      1     1            3             5  


In [4]:
# Map a month number to its meteorological season
def get_season(month):
    """Return the season name for a month number.

    Parameters
    ----------
    month : int
        Month number; 12, 1, 2 -> 'winter', 3-5 -> 'spring', 6-8 -> 'summer'.

    Returns
    -------
    str
        One of 'winter', 'spring', 'summer' or 'autumn'. Every value that is
        not listed for the first three seasons (including 9, 10, 11) yields
        'autumn'.
    """
    months_by_season = (
        ('winter', (12, 1, 2)),
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
    )
    for season_name, season_months in months_by_season:
        if month in season_months:
            return season_name
    # Fallback branch: anything not matched above is labelled autumn
    return 'autumn'
	
# Label every row with its season, derived from the month column
weather_data = weather_data.assign(season=weather_data['month'].apply(get_season))

# One 0/1 indicator column per season
# (open question: possibly too generic to help the predictions?)
weather_data = weather_data.assign(
    **{
        season_name: weather_data['season'].eq(season_name).astype(int)
        for season_name in ('winter', 'spring', 'summer', 'autumn')
    }
)

print(weather_data.head())


       Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0 2012-01-01         8.0      9.8250                   14        58.0   
1 2012-01-02         7.0      7.4375                   12         NaN   
2 2012-01-03         8.0      5.5375                   18        63.0   
3 2012-01-04         4.0      5.6875                   19        80.0   
4 2012-01-05         6.0      5.3000                   23        80.0   

   Niederschlag  year  month  week  day_of_week  day_of_month  season  winter  \
0          14.0  2012      1    52            6             1  winter       1   
1           0.0  2012      1     1            0             2  winter       1   
2          20.8  2012      1     1            1             3  winter       1   
3          19.7  2012      1     1            2             4  winter       1   
4           3.3  2012      1     1            3             5  winter       1   

   spring  summer  autumn  
0       0       0       0  
1       0       0 

In [98]:
# check what exactly this means and does! (Suggestion by ChatGPT)
# Create rolling averages for temperature, precipitation and wind speed
# weather_data['temp_7d_avg'] = weather_data['Temperatur'].rolling(window=7).mean()
# weather_data['wind_7d_avg'] = weather_data['Windgeschwindigkeit'].rolling(window=7).mean()
# weather_data['prec_7d_avg'] = weather_data['Niederschlag'].rolling(window=7).mean()

# Create lag features
# weather_data['temp_lag_1'] = weather_data['Temperatur'].shift(1)
# weather_data['wind_lag_1'] = weather_data['Windgeschwindigkeit'].shift(1)
# weather_data['prec_lag_1'] = weather_data['Niederschlag'].shift(1)

# Fill NA values created by rolling and lag features
# weather_data.fillna(method='bfill', inplace=True)

# Normalize the weather features
# from sklearn.preprocessing import StandardScaler

# weather_features = ['Temperatur', 'Windgeschwindigkeit', 'Niederschlag', 'temp_7d_avg', 'wind_7d_avg', 'temp_lag_1', 'wind_lag_1', 'prec_7d_avg', 'prec_lag_1']
# scaler = StandardScaler()
# weather_data[weather_features] = scaler.fit_transform(weather_data[weather_features])
# print(weather_data.head())


In [None]:
# Open ToDos:
# Create categories for temperature depending on season
# or calculate the deviation from monthly temperature
# or define warm / cold day depending on the season (e.g. in winter 10°C is a warm day, in summer that would be a cold day)

In [5]:
# Thresholds used to flag extreme weather conditions
VERY_HOT_THRESHOLD = 25  # flag when temperature > 25 degrees Celsius
VERY_COLD_THRESHOLD = 0  # flag when temperature < 0 degrees Celsius
VERY_RAINY_THRESHOLD = 10 # flag when precipitation > 10 mm
HIGH_WIND_THRESHOLD = 15  # flag when wind speed > 15 m/s
SUNNY_DAY_THRESHOLD = 2  # flag when cloudiness <= 2 (presumably oktas, 0-8 — TODO confirm)

# Build the 0/1 indicator columns.
# NOTE(review): a comparison against NaN evaluates to False, so rows with a
# missing measurement end up as 0 in the corresponding flag.
weather_data = weather_data.assign(
    very_hot=weather_data['Temperatur'].gt(VERY_HOT_THRESHOLD).astype(int),
    very_cold=weather_data['Temperatur'].lt(VERY_COLD_THRESHOLD).astype(int),
    very_rainy=weather_data['Niederschlag'].gt(VERY_RAINY_THRESHOLD).astype(int),
    high_wind=weather_data['Windgeschwindigkeit'].gt(HIGH_WIND_THRESHOLD).astype(int),
    sunny_days=weather_data['Bewoelkung'].le(SUNNY_DAY_THRESHOLD).astype(int),
)

# Print the whole frame (pandas truncates the display) to check all features
print(weather_data)


          Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0    2012-01-01         8.0      9.8250                   14        58.0   
1    2012-01-02         7.0      7.4375                   12         NaN   
2    2012-01-03         8.0      5.5375                   18        63.0   
3    2012-01-04         4.0      5.6875                   19        80.0   
4    2012-01-05         6.0      5.3000                   23        80.0   
...         ...         ...         ...                  ...         ...   
2594 2019-07-28         3.0     23.3500                   14         5.0   
2595 2019-07-29         6.0     25.2500                    7        61.0   
2596 2019-07-30         7.0     20.7375                    8        61.0   
2597 2019-07-31         6.0     20.4500                    7        61.0   
2598 2019-08-01         5.0     21.0625                    9        61.0   

      Niederschlag  year  month  week  day_of_week  ...  season winter  \
0            

In [6]:
# Calculate "feels like" temperature (wind chill)
# The wind-chill formula is only defined for cold, windy conditions. Outside
# that range (e.g. calm wind, where it would collapse to 13.12 + 0.6215 * T)
# the measured air temperature is used instead.
WIND_CHILL_MAX_TEMP_C = 10.0    # formula applies at or below this air temperature
WIND_CHILL_MIN_WIND_KMH = 4.8   # formula applies above this wind speed


def wind_chill_celsius(temp_c, wind_kmh):
    """Return the "feels like" temperature in degrees Celsius.

    Parameters
    ----------
    temp_c : pandas.Series
        Air temperature in degrees Celsius.
    wind_kmh : pandas.Series
        Wind speed in km/h, aligned with ``temp_c``.

    Returns
    -------
    pandas.Series
        The wind-chill value where ``temp_c <= 10`` and ``wind_kmh > 4.8``;
        the unchanged air temperature elsewhere. Rows with a missing wind
        speed or missing temperature are NaN.
    """
    wind_term = wind_kmh ** 0.16
    chill = 13.12 + 0.6215 * temp_c - 11.37 * wind_term + 0.3965 * temp_c * wind_term
    in_valid_range = (temp_c <= WIND_CHILL_MAX_TEMP_C) & (wind_kmh > WIND_CHILL_MIN_WIND_KMH)
    # Keep the air temperature outside the validity range, wind chill inside it
    feels_like = temp_c.where(~in_valid_range, chill)
    # A missing wind speed stays missing instead of silently becoming temp_c
    return feels_like.where(wind_kmh.notna())


# NOTE(review): the factor 3.6 assumes Windgeschwindigkeit is given in m/s —
# confirm against the data description.
weather_data["Windgeschwindigkeit_km_h"] = weather_data["Windgeschwindigkeit"] * 3.6
weather_data["feels_like_temperature"] = wind_chill_celsius(
    weather_data["Temperatur"], weather_data["Windgeschwindigkeit_km_h"]
)
print(weather_data.head())

# export file again
# weather_data.to_csv("Files/weather_data.csv", index = False)


       Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0 2012-01-01         8.0      9.8250                   14        58.0   
1 2012-01-02         7.0      7.4375                   12         NaN   
2 2012-01-03         8.0      5.5375                   18        63.0   
3 2012-01-04         4.0      5.6875                   19        80.0   
4 2012-01-05         6.0      5.3000                   23        80.0   

   Niederschlag  year  month  week  day_of_week  ...  spring summer  autumn  \
0          14.0  2012      1    52            6  ...       0      0       0   
1           0.0  2012      1     1            0  ...       0      0       0   
2          20.8  2012      1     1            1  ...       0      0       0   
3          19.7  2012      1     1            2  ...       0      0       0   
4           3.3  2012      1     1            3  ...       0      0       0   

   very_hot  very_cold  very_rainy  high_wind  sunny_days  \
0         0          0   