In [12]:
# import packages
import pandas as pd
import csv

In [13]:
# Read the two cleaned CSV exports (going by the file names: the rows that
# have a Wettercode and the rows where it is missing).
# TODO: confirm that the "5_*" files are the latest cleaned version.
has_code_csv = "/workspaces/bakery_sales_prediction/sourcedata/cleaned_data/5_has_wettercode.csv"
missing_code_csv = "/workspaces/bakery_sales_prediction/sourcedata/cleaned_data/5_missing_wettercode.csv"

df_wetter = pd.read_csv(has_code_csv)
df_no_wetter = pd.read_csv(missing_code_csv)

# Preview the first rows of each frame under its own label.
for label, frame in (("df_wetter:", df_wetter), ("df_no_wetter:", df_no_wetter)):
    print(label, "\n", frame.head(), "\n")

df_wetter: 
         Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0  2013-07-01         6.0     17.8375                 15.0        20.0   
1  2013-07-01         6.0     17.8375                 15.0        20.0   
2  2013-07-01         6.0     17.8375                 15.0        20.0   
3  2013-07-01         6.0     17.8375                 15.0        20.0   
4  2013-07-01         6.0     17.8375                 15.0        20.0   

   KielerWoche  Warengruppe      Umsatz  Feiertage  Ferientage  
0          0.0          1.0  148.828353        0.0         1.0  
1          0.0          2.0  535.856285        0.0         1.0  
2          0.0          3.0  201.198426        0.0         1.0  
3          0.0          4.0   65.890169        0.0         1.0  
4          0.0          5.0  317.475875        0.0         1.0   

df_no_wetter: 
         Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0  2013-07-02         3.0     17.3125                 10.0  

### Merge the data so that all rows are together again (valid values and NaN):

In [14]:
# Stack the two frames and renumber the rows 0..n-1 in one step
# (ignore_index=True replaces the separate in-place reset_index call).
merged_df = pd.concat([df_wetter, df_no_wetter], ignore_index=True)

# Parse the Datum column into datetime64 values.
merged_df["Datum"] = pd.to_datetime(merged_df["Datum"])

print(merged_df.head(), "\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
merged_df.info()

       Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0 2013-07-01         6.0     17.8375                 15.0        20.0   
1 2013-07-01         6.0     17.8375                 15.0        20.0   
2 2013-07-01         6.0     17.8375                 15.0        20.0   
3 2013-07-01         6.0     17.8375                 15.0        20.0   
4 2013-07-01         6.0     17.8375                 15.0        20.0   

   KielerWoche  Warengruppe      Umsatz  Feiertage  Ferientage  
0          0.0          1.0  148.828353        0.0         1.0  
1          0.0          2.0  535.856285        0.0         1.0  
2          0.0          3.0  201.198426        0.0         1.0  
3          0.0          4.0   65.890169        0.0         1.0  
4          0.0          5.0  317.475875        0.0         1.0   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9334 entries, 0 to 9333
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype         
---  

### Add precipitation data from external source:

DWD:
https://opendata.dwd.de/climate_environment/CDC/observations_germany/climate/daily/kl/historical/
Station: Kiel Holtenau

In [15]:
# Path to the additional daily weather data downloaded from DWD
file_path = "/workspaces/bakery_sales_prediction/sourcedata/produkt_klima_tag_19740101_20231231_02564.txt"

# Read the semicolon-separated file, parse MESS_DATUM as dates, and treat -999 as NaN
df_prec = pd.read_csv(file_path, sep=';', na_values='-999', parse_dates=['MESS_DATUM'])
# Safeguard: guarantees a datetime64 column; dates already parsed by read_csv stay unchanged
df_prec["MESS_DATUM"] = pd.to_datetime(df_prec["MESS_DATUM"])
print(f"Niederschlagsdaten: \n{df_prec.head()}")

# Rename MESS_DATUM -> Datum and ' RSK' -> Niederschlag, then keep only these two columns.
# NOTE(review): the ' RSK' key includes a leading space — presumably the raw
# header is space-padded; verify against the file if the selection below fails.
df_prec = df_prec.rename(columns={'MESS_DATUM': 'Datum', ' RSK': 'Niederschlag'})[['Datum', 'Niederschlag']]

# Display the first few rows of the reduced DataFrame
print(f"\nNiederschlagsdaten Auswahl: \n{df_prec.head()}")

# Left merge: every row of merged_df is kept; dates without a match in df_prec
# get NaN in Niederschlag.
# - Dropping an existing Niederschlag column first keeps this cell re-runnable;
#   otherwise a second run would create Niederschlag_x / Niederschlag_y and the
#   describe() call below would raise a KeyError.
# - validate='many_to_one' raises a MergeError if a date occurs more than once
#   in df_prec, which would otherwise silently duplicate rows of merged_df.
merged_df = pd.merge(
    merged_df.drop(columns='Niederschlag', errors='ignore'),
    df_prec,
    on='Datum',
    how='left',
    validate='many_to_one',
)

# Display the merged DataFrame
print(f"\nNeuer df Wetter mit Niederschlag: \n{merged_df}")

# Check for statistics
print(f"\nStatistics for precipitation: \n{merged_df['Niederschlag'].describe()} \n")

# info() prints its own report and returns None; print() around it would add a "None" line
merged_df.info()


Niederschlagsdaten: 
   STATIONS_ID MESS_DATUM  QN_3    FX    FM  QN_4   RSK  RSKF   SDK  SHK_TAG  \
0         2564 1974-01-01   5.0   6.2   2.7   NaN   NaN   NaN   NaN      NaN   
1         2564 1974-01-02   5.0   7.7   3.2   NaN   NaN   NaN   NaN      NaN   
2         2564 1974-01-03   5.0   7.2   3.2   NaN   NaN   NaN   NaN      NaN   
3         2564 1974-01-04   5.0   6.7   3.5   NaN   NaN   NaN   NaN      NaN   
4         2564 1974-01-05   5.0   9.2   3.4   NaN   NaN   NaN   NaN      NaN   

     NM   VPM    PM   TMK   UPM   TXK   TNK   TGK  eor  
0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  eor  
1   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  eor  
2   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  eor  
3   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  eor  
4   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  eor  

Niederschlagsdaten Auswahl: 
       Datum  Niederschlag
0 1974-01-01           NaN
1 1974-01-02           NaN
2 1974-01-03           NaN
3 1974-01-04      

### Save the result:

In [16]:
# Print the merged frame for a visual check (pandas abbreviates long frames
# in the printed output).
print(merged_df)

          Datum  Bewoelkung  Temperatur  Windgeschwindigkeit  Wettercode  \
0    2013-07-01         6.0     17.8375                 15.0        20.0   
1    2013-07-01         6.0     17.8375                 15.0        20.0   
2    2013-07-01         6.0     17.8375                 15.0        20.0   
3    2013-07-01         6.0     17.8375                 15.0        20.0   
4    2013-07-01         6.0     17.8375                 15.0        20.0   
...         ...         ...         ...                  ...         ...   
9329 2018-07-31         5.0     31.2875                 11.0         NaN   
9330 2018-07-31         5.0     31.2875                 11.0         NaN   
9331 2018-07-31         5.0     31.2875                 11.0         NaN   
9332 2018-07-31         5.0     31.2875                 11.0         NaN   
9333 2018-07-31         5.0     31.2875                 11.0         NaN   

      KielerWoche  Warengruppe      Umsatz  Feiertage  Ferientage  \
0             0.0 

In [17]:
# Write the merged frame to a single comma-separated CSV file, leaving out the
# row index.
# NOTE(review): this path is relative to the kernel's working directory, while
# the input files are read via absolute paths — confirm it resolves to the
# intended folder.
output_csv = '../sourcedata/cleaned_data/6.1_weather_parameters-2.csv'
merged_df.to_csv(output_csv, sep=',', index=False)