In [2]:
import pandas as pd
import numpy as np
import os

In [5]:
# Assuming the new dataset is saved as 'bakery_sales.csv'
# The dataset has columns: 'date', 'category', 'sales_volume', 'temperature', 'cloud_coverage'
dataf = pd.read_csv("https://raw.githubusercontent.com/FabsenMc/bakery_prediction/main/0_DataPreparation/dataf.csv")

# Convert the date column to datetime
dataf['Datum'] = pd.to_datetime(dataf['Datum'])

# Pivot the data to have one row per date and one column per category
pivot_dataf = dataf.pivot_table(index='Datum', columns='Warengruppe', values='Umsatz').reset_index()

#New created columns get the prefix 'Warengruppe_'
pivot_dataf.columns = ['Datum'] + ['Warengruppe_' + str(col) for col in pivot_dataf.columns if col != 'Datum']

print(pivot_dataf.head())

# Add temperature and cloud coverage columns to the pivoted data
pivot_dataf = pivot_dataf.merge(dataf[['Datum', 'Temperatur_Kategorie', 'Bewoelkung']].drop_duplicates(), on='Datum')

# Display the first few rows of the prepared data
print(pivot_dataf.head())

# Ensure no missing values by filling them with zeros
pivot_dataf.fillna(0, inplace=True)

print(pivot_dataf.head())

# Define the feature columns and target columns
feature_columns = ['Temperatur_Kategorie', 'Bewoelkung']
target_columns = pivot_dataf.columns[pivot_dataf.columns.str.startswith('Warengruppe_')]

# Definieren der Datumsgrenzen
train_start_date = '2013-07-01'
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'

# Split the data into training (3 years) and validation (1 year)
train_data = pivot_dataf[(pivot_dataf['Datum']>= train_start_date) & (dataf['Datum'] <= train_end_date)]
val_data = pivot_dataf[(pivot_dataf['Datum']> train_end_date) & (dataf['Datum'] <= validation_end_date)]

# Überprüfen der Dimensionen der Datensätze
print("Training dataset dimensions:", train_data.shape)
print("Validation dataset dimensions:", val_data.shape)

       Datum  Warengruppe_1.0  Warengruppe_2.0  Warengruppe_3.0  \
0 2013-07-01       148.828353       535.856285       201.198426   
1 2013-07-02       159.793757       546.780787       265.261254   
2 2013-07-03       111.885594       427.343259       210.260241   
3 2013-07-04       168.864941       454.859641       190.686641   
4 2013-07-05       171.280754       492.818804       181.644870   

   Warengruppe_4.0  Warengruppe_5.0  Warengruppe_6.0  
0        65.890169       317.475875              NaN  
1        74.543917       383.628682              NaN  
2        69.262728       305.523072              NaN  
3        61.490175       308.408168              NaN  
4        86.759861       355.518770              NaN  
       Datum  Warengruppe_1.0  Warengruppe_2.0  Warengruppe_3.0  \
0 2013-07-01       148.828353       535.856285       201.198426   
1 2013-07-02       159.793757       546.780787       265.261254   
2 2013-07-03       111.885594       427.343259       210.260241   

  train_data = pivot_dataf[(pivot_dataf['Datum']>= train_start_date) & (dataf['Datum'] <= train_end_date)]
  val_data = pivot_dataf[(pivot_dataf['Datum']> train_end_date) & (dataf['Datum'] <= validation_end_date)]
