In [1]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

In [2]:
# load data (latest version of output from script/notebook before) - "6.3_weather_class"

df = pd.read_csv("/workspaces/bakery_sales_prediction/sourcedata/cleaned_data/6.3_weather_class.csv")

print("df:", "\n", df.head(), "\n")


df: 
         Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0  2013-07-01         6.0     17.8375                 15.0        20.0   
1  2013-07-01         6.0     17.8375                 15.0        20.0   
2  2013-07-01         6.0     17.8375                 15.0        20.0   
3  2013-07-01         6.0     17.8375                 15.0        20.0   
4  2013-07-01         6.0     17.8375                 15.0        20.0   

   KielerWoche  Warengruppe      Umsatz  Feiertage  Ferientage  Niederschlag  \
0          0.0          1.0  148.828353        0.0         1.0           0.3   
1          0.0          2.0  535.856285        0.0         1.0           0.3   
2          0.0          3.0  201.198426        0.0         1.0           0.3   
3          0.0          4.0   65.890169        0.0         1.0           0.3   
4          0.0          5.0  317.475875        0.0         1.0           0.3   

   imp_mask_Bewoelkung  imp_mask_Temperatur  imp_mask_Windgeschwindi

### (Seasonal) Temperature deviation

We tried to includ a sudden change in temperature as a variable because a sudden weather change might influence the behaviour of people.

In [3]:
# Step 1: Calculate the 7-day rolling average for temperature
df['temp_7d_avg'] = df['Temperatur'].rolling(window=7).mean()

# Step 2: Fill NA values created by rolling average
df['temp_7d_avg'].fillna(method='bfill', inplace=True)

# Step 3: Calculate the difference for the next day
df['temp_diff_next_day'] = df['Temperatur'].shift(-1) - df['temp_7d_avg']

# Step 4: Create 'warmer than week before' and 'colder than week before' columns
df['warmer_than_week_before'] = df['temp_diff_next_day'].apply(lambda x: x if x > 0 else 0)
df['colder_than_week_before'] = df['temp_diff_next_day'].apply(lambda x: -x if x < 0 else 0)

# Display the first few rows to check the results
print(df[['Temperatur', 'temp_7d_avg', 'temp_diff_next_day', 'warmer_than_week_before', 'colder_than_week_before']].head(15))
df.describe()


    Temperatur  temp_7d_avg  temp_diff_next_day  warmer_than_week_before  \
0      17.8375    18.762500           -0.925000                 0.000000   
1      17.8375    18.762500           -0.925000                 0.000000   
2      17.8375    18.762500           -0.925000                 0.000000   
3      17.8375    18.762500           -0.925000                 0.000000   
4      17.8375    18.762500            2.312500                 2.312500   
5      21.0750    18.762500            2.312500                 2.312500   
6      21.0750    18.762500            2.312500                 2.312500   
7      21.0750    19.225000            1.850000                 1.850000   
8      21.0750    19.687500            1.387500                 1.387500   
9      21.0750    20.150000           -1.300000                 0.000000   
10     18.8500    20.294643           -1.444643                 0.000000   
11     18.8500    20.439286           -1.589286                 0.000000   
12     18.85

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['temp_7d_avg'].fillna(method='bfill', inplace=True)
  df['temp_7d_avg'].fillna(method='bfill', inplace=True)


Unnamed: 0,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,KielerWoche,Warengruppe,Umsatz,Feiertage,Ferientage,Niederschlag,imp_mask_Bewoelkung,imp_mask_Temperatur,imp_mask_Windgeschwindigkeit,imp_mask_Niederschlag,imp_mask_Wettercode,Wetterklasse,temp_7d_avg,temp_diff_next_day,warmer_than_week_before,colder_than_week_before
count,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9334.0,9333.0,9334.0,9334.0
mean,4.73784,12.028093,10.975145,32.387615,0.023891,3.088172,206.749044,0.018856,0.16606,2.112246,0.007499,0.001714,0.001714,0.0,0.249089,4.180309,12.024418,0.00491,0.764356,0.759447
std,2.643655,7.229432,4.130766,27.354896,0.152718,1.489002,144.545189,0.136023,0.372154,4.146853,0.086279,0.041369,0.041369,0.0,0.432509,2.580527,7.085437,2.148256,1.310568,1.317505
min,0.0,-8.475,3.0,0.0,0.0,1.0,7.051201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-7.528571,-16.475,0.0,0.0
25%,3.0,6.2375,8.0,5.0,0.0,2.0,96.897441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,6.248661,-1.035714,0.0,0.0
50%,6.0,11.625,10.0,21.0,0.0,3.0,161.900831,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,6.0,11.713393,-0.032398,0.0,0.03227
75%,7.0,17.9625,13.0,61.0,0.0,4.0,280.644663,0.0,0.0,2.3,0.0,0.0,0.0,0.0,0.0,6.0,18.035714,1.089286,1.089286,1.035714
max,8.0,31.4375,35.0,95.0,1.0,6.0,1879.461831,1.0,1.0,37.7,1.0,1.0,1.0,0.0,1.0,9.0,29.3625,14.910714,14.910714,16.475


In [4]:
# Save the updated data to new CSV files
df.to_csv('../sourcedata/cleaned_data/6.4_temperature_deviation.csv', sep=',', index=False)