In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

In [2]:
# Load the output of the previous script/notebook ("6.4_temperature_deviation").
# The path is relative to the notebook's working directory, matching the
# '../sourcedata/cleaned_data/' convention used when this notebook saves its
# result, so the notebook does not depend on one machine's absolute layout.

df = pd.read_csv("../sourcedata/cleaned_data/6.4_temperature_deviation.csv")

print(df.head(), "\n")

        Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0  2013-07-01         6.0     17.8375                 15.0        20.0   
1  2013-07-01         6.0     17.8375                 15.0        20.0   
2  2013-07-01         6.0     17.8375                 15.0        20.0   
3  2013-07-01         6.0     17.8375                 15.0        20.0   
4  2013-07-01         6.0     17.8375                 15.0        20.0   

   KielerWoche  Warengruppe      Umsatz  Feiertage  Ferientage  ...  \
0          0.0          1.0  148.828353        0.0         1.0  ...   
1          0.0          2.0  535.856285        0.0         1.0  ...   
2          0.0          3.0  201.198426        0.0         1.0  ...   
3          0.0          4.0   65.890169        0.0         1.0  ...   
4          0.0          5.0  317.475875        0.0         1.0  ...   

   imp_mask_Bewoelkung  imp_mask_Temperatur  imp_mask_Windgeschwindigkeit  \
0                    0                    0        

### Create variables from date

In [9]:
# Work on an independent (deep) copy so the loaded frame `df` is not modified
# by the feature-engineering cells below.
df_w = df.copy(deep=True)

In [10]:
# Parse 'Datum' into pandas datetimes. With errors='coerce', values that
# cannot be parsed become NaT instead of raising an exception.
parsed_datum = pd.to_datetime(df_w['Datum'], errors='coerce')
df_w['Datum'] = parsed_datum

# Print all column dtypes to confirm 'Datum' is now a datetime column
print(df_w.dtypes)

Datum                           datetime64[ns]
Bewoelkung                             float64
Temperatur                             float64
Windgeschwindigkeit                    float64
Wettercode                             float64
KielerWoche                            float64
Warengruppe                            float64
Umsatz                                 float64
Feiertage                              float64
Ferientage                             float64
Niederschlag                           float64
imp_mask_Bewoelkung                      int64
imp_mask_Temperatur                      int64
imp_mask_Windgeschwindigkeit             int64
imp_mask_Niederschlag                    int64
imp_mask_Wettercode                      int64
Wetterklasse                             int64
temp_7d_avg                            float64
temp_diff_next_day                     float64
warmer_than_week_before                float64
colder_than_week_before                float64
dtype: object

In [11]:
# Derive calendar features from 'Datum' through the datetime accessor
datum_parts = df_w['Datum'].dt
df_w['year'] = datum_parts.year
df_w['month'] = datum_parts.month
df_w['week'] = datum_parts.isocalendar().week  # ISO calendar week number
df_w['weekday'] = datum_parts.dayofweek        # Monday = 0 ... Sunday = 6
df_w['day_month'] = datum_parts.day            # day within the month
print(df_w.head())


       Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0 2013-07-01         6.0     17.8375                 15.0        20.0   
1 2013-07-01         6.0     17.8375                 15.0        20.0   
2 2013-07-01         6.0     17.8375                 15.0        20.0   
3 2013-07-01         6.0     17.8375                 15.0        20.0   
4 2013-07-01         6.0     17.8375                 15.0        20.0   

   KielerWoche  Warengruppe      Umsatz  Feiertage  Ferientage  ...  \
0          0.0          1.0  148.828353        0.0         1.0  ...   
1          0.0          2.0  535.856285        0.0         1.0  ...   
2          0.0          3.0  201.198426        0.0         1.0  ...   
3          0.0          4.0   65.890169        0.0         1.0  ...   
4          0.0          5.0  317.475875        0.0         1.0  ...   

   Wetterklasse  temp_7d_avg  temp_diff_next_day  warmer_than_week_before  \
0             6      18.7625             -0.9250         

In [12]:
# Summary statistics of the extracted month column
month_summary = df_w["month"].describe()
print(month_summary)

count    9334.000000
mean        6.654810
std         3.461902
min         1.000000
25%         4.000000
50%         7.000000
75%        10.000000
max        12.000000
Name: month, dtype: float64


In [13]:
# define the season
def get_season(month):
    """Map a month number to a season label and a numeric season code.

    Parameters
    ----------
    month : int
        Month number from 1 (January) to 12 (December). Integer-valued
        floats such as ``7.0`` are accepted as well.

    Returns
    -------
    tuple of (str, int)
        ``('spring', 1)`` for months 3-5, ``('summer', 2)`` for months 6-8,
        ``('autumn', 3)`` for months 9-11 and ``('winter', 4)`` for
        months 12, 1 and 2.

    Raises
    ------
    ValueError
        If ``month`` is not one of 1..12 (for example NaN coming from a
        missing or unparseable date). Such values used to fall through to
        the catch-all branch and were silently labelled as autumn.
    """
    season_by_month = {
        12: ('winter', 4), 1: ('winter', 4), 2: ('winter', 4),
        3: ('spring', 1), 4: ('spring', 1), 5: ('spring', 1),
        6: ('summer', 2), 7: ('summer', 2), 8: ('summer', 2),
        9: ('autumn', 3), 10: ('autumn', 3), 11: ('autumn', 3),
    }
    try:
        return season_by_month[month]
    except (KeyError, TypeError):
        # KeyError: value outside 1..12 (including NaN);
        # TypeError: unhashable input that can never be a month number.
        raise ValueError(
            f"month must be an integer from 1 to 12, got {month!r}"
        ) from None
	
# get_season returns a (label, code) pair per row; split the pairs into two
# parallel sequences and store them as separate columns.
season_labels, season_codes = zip(*df_w['month'].apply(get_season))
df_w['season_str'] = season_labels
df_w['season'] = season_codes


# Alternative (not used): one binary 0/1 column per season, e.g.
#   (df_w['season_str'] == 'winter').astype(int)
# and likewise for 'spring', 'summer' and 'autumn'.
# Probably too generic for predictions?


print(df_w.head())


       Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0 2013-07-01         6.0     17.8375                 15.0        20.0   
1 2013-07-01         6.0     17.8375                 15.0        20.0   
2 2013-07-01         6.0     17.8375                 15.0        20.0   
3 2013-07-01         6.0     17.8375                 15.0        20.0   
4 2013-07-01         6.0     17.8375                 15.0        20.0   

   KielerWoche  Warengruppe      Umsatz  Feiertage  Ferientage  ...  \
0          0.0          1.0  148.828353        0.0         1.0  ...   
1          0.0          2.0  535.856285        0.0         1.0  ...   
2          0.0          3.0  201.198426        0.0         1.0  ...   
3          0.0          4.0   65.890169        0.0         1.0  ...   
4          0.0          5.0  317.475875        0.0         1.0  ...   

   temp_diff_next_day  warmer_than_week_before  colder_than_week_before  year  \
0             -0.9250                   0.0000       

In [14]:
# Display the DataFrame after the previous steps; it now carries the added
# columns year, month, week, weekday, day_month, season_str and season.
df_w


Unnamed: 0,Datum,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Warengruppe,Umsatz,Feiertage,Ferientage,...,temp_diff_next_day,warmer_than_week_before,colder_than_week_before,year,month,week,weekday,day_month,season_str,season
0,2013-07-01,6.0,17.8375,15.0,20.0,0.0,1.0,148.828353,0.0,1.0,...,-0.925000,0.000000,0.925,2013,7,27,0,1,summer,2
1,2013-07-01,6.0,17.8375,15.0,20.0,0.0,2.0,535.856285,0.0,1.0,...,-0.925000,0.000000,0.925,2013,7,27,0,1,summer,2
2,2013-07-01,6.0,17.8375,15.0,20.0,0.0,3.0,201.198426,0.0,1.0,...,-0.925000,0.000000,0.925,2013,7,27,0,1,summer,2
3,2013-07-01,6.0,17.8375,15.0,20.0,0.0,4.0,65.890169,0.0,1.0,...,-0.925000,0.000000,0.925,2013,7,27,0,1,summer,2
4,2013-07-01,6.0,17.8375,15.0,20.0,0.0,5.0,317.475875,0.0,1.0,...,2.312500,2.312500,0.000,2013,7,27,0,1,summer,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9329,2018-07-31,5.0,31.2875,11.0,21.0,0.0,1.0,123.075682,0.0,1.0,...,5.733929,5.733929,0.000,2018,7,31,1,31,summer,2
9330,2018-07-31,5.0,31.2875,11.0,21.0,0.0,2.0,586.081666,0.0,1.0,...,4.812500,4.812500,0.000,2018,7,31,1,31,summer,2
9331,2018-07-31,5.0,31.2875,11.0,21.0,0.0,3.0,285.872616,0.0,1.0,...,3.850000,3.850000,0.000,2018,7,31,1,31,summer,2
9332,2018-07-31,5.0,31.2875,11.0,21.0,0.0,4.0,57.102795,0.0,1.0,...,2.887500,2.887500,0.000,2018,7,31,1,31,summer,2


### Save result:

In [15]:
 # Save the updated data to new CSV files
output_path = '../sourcedata/cleaned_data/6.5_date_variables.csv'
# Comma-separated output; index=False keeps the row index out of the file
df_w.to_csv(output_path, sep=',', index=False)