### Import Libraries and Data

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Import Data
data = pd.read_csv("/workspaces/bakery_prediction/0_DataPreparation/dataf_knn.csv", usecols=['Umsatz', 'Datum', 'Warengruppe', 'Temperatur', 'Wochentag_MDMDFSS', 'Verbraucherpreisindex', 'FerienSH', 'Feiertag', 'Jahreszeit_FSHW', 'Wochenende', 'Windgeschwindigkeit', 'Wettercode'])


data['Temperatur_Cluster'] = 0  # Alle Werte auf 0 setzen

# Setzen Sie die Werte in 'Temperatur_Cluster' basierend auf den Werten in 'Temperatur'
data['Temperatur_Cluster'] = data['Temperatur'].apply(lambda x: 1 if x < 10 else (2 if x < 20 else 3))

# Konvertieren Sie die 'Datum'-Spalte in ein Datumsformat
data['Datum'] = pd.to_datetime(data['Datum'])

data['Jahreszeit'] = 0  # Alle Werte auf 0 setzen

# Setzen Sie die Werte in 'Jahreszeit' auf 1, wenn das Datum von Dezember bis Februar liegt
data['Jahreszeit'] = data['Datum'].apply(lambda x: 4 if x.month in [12, 1, 2] else 1 if x.month in [3, 4, 5] else 2 if x.month in [6, 7, 8] else 3)



### Data Preparation

In [3]:
# Define categorical features
categorical_features = ['Warengruppe', 'Temperatur_Cluster', 'Jahreszeit', 'Wochentag_MDMDFSS', 'FerienSH', 'Feiertag', 'Jahreszeit_FSHW', 'Wochenende', 'Feiertag']

# Inspect data types and unique values for categorical columns
print(data[categorical_features].dtypes)
print("Unique Values:\n", data[categorical_features].apply(lambda x: x.unique()))

# Ensure categorical columns are treated as categories
for col in categorical_features:
    data[col] = data[col].astype('category')

# Encode categorical variables using pd.get_dummies
features = pd.get_dummies(data[categorical_features], drop_first=True, dtype=int)

# Include any numeric columns that are not categorical
numerical_features = data[['Verbraucherpreisindex', 'Windgeschwindigkeit', 'Temperatur', 'Wettercode']]

# Construct the prepared data set including the dependent variable ('label')
prepared_data = pd.concat([data[['Umsatz']], features, numerical_features], axis=1).dropna()

# Handle missing values by removing rows with any missing values
prepared_data = prepared_data.dropna()

# Display the shape of the prepared data set
print(prepared_data.shape)
# Display the first few rows of the prepared data set
prepared_data.head()


Warengruppe           int64
Temperatur_Cluster    int64
Jahreszeit            int64
Wochentag_MDMDFSS     int64
FerienSH              int64
Feiertag              int64
Jahreszeit_FSHW       int64
Wochenende            int64
Feiertag              int64
dtype: object
Unique Values:
 Warengruppe           [4, 3, 2, 1, 5, 6]
Temperatur_Cluster             [1, 2, 3]
Jahreszeit                  [4, 1, 2, 3]
Wochentag_MDMDFSS     [6, 1, 2, 3, 4, 5]
FerienSH                          [1, 0]
Feiertag                          [1, 0]
Jahreszeit_FSHW             [4, 1, 2, 3]
Wochenende                        [1, 0]
Feiertag                          [1, 0]
dtype: object
(10437, 27)


Unnamed: 0,Umsatz,Warengruppe_2,Warengruppe_3,Warengruppe_4,Warengruppe_5,Warengruppe_6,Temperatur_Cluster_2,Temperatur_Cluster_3,Jahreszeit_2,Jahreszeit_3,...,Feiertag_1,Jahreszeit_FSHW_2,Jahreszeit_FSHW_3,Jahreszeit_FSHW_4,Wochenende_1,Feiertag_1.1,Verbraucherpreisindex,Windgeschwindigkeit,Temperatur,Wettercode
0,129.473516,0,0,1,0,0,0,0,0,0,...,1,0,0,1,1,1,90.6,14,9.825,58
1,179.073831,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,90.6,12,7.4375,65
2,181.361481,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,90.6,18,5.5375,63
3,161.093549,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,90.6,19,5.6875,80
4,184.942302,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,90.6,23,5.3,80


### Selection of Training and Validation Data

In [4]:
# Set a random seed for reproducibility
np.random.seed(42)

# Shuffle the data
prepared_data = prepared_data.sample(frac=1).reset_index(drop=True)

# Calculate the number of rows for each dataset
n_total = len(prepared_data)
# n_training = int(0.7 * n_total)
n_training = int(0.8 * n_total)
n_validation = int(0.20 * n_total)

# Split the features and labels for training, validation, and test
training_data = prepared_data.iloc[:n_training]
validation_data = prepared_data.iloc[n_training:n_training+n_validation]
# test_data = prepared_data.iloc[n_training+n_validation:]

# Separating features and labels
training_features = training_data.drop('Umsatz', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1)
# test_features = test_data.drop('Umsatz', axis=1)

training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
# test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
# print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
# print("Test labels dimensions:", test_labels.shape)


Training features dimensions: (8349, 26)
Validation features dimensions: (2087, 26)

Training labels dimensions: (8349, 1)
Validation labels dimensions: (2087, 1)


#### Data Export

In [5]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
# test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
# test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")