### Import Libraries and Data

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Import Data
# data = pd.read_csv("/workspaces/bakery_prediction/3_Model/Bjoerns_NN-Vorschlag/data_import.csv", usecols=['Datum', 'Warengruppe', 'FerienSH', 'Feiertag', 'Weihnachtsmarkt', 'Temperatur', 'Wochenende' ,'Umsatz'])

data = pd.read_csv("/workspaces/bakery_prediction/3_Model/Bjoerns_NN-Vorschlag/no_missing_data_gesamt.csv", usecols=['Datum', 'Warengruppe', 'FerienSH', 'Feiertag', 'Weihnachtsmarkt', 'Temperatur', 'Wochenende' ,'Umsatz'])


data['Temperatur_Cluster'] = 0  # Alle Werte auf 0 setzen

# Setzen Sie die Werte in 'Temperatur_Cluster' basierend auf den Werten in 'Temperatur'
data['Temperatur_Cluster'] = data['Temperatur'].apply(lambda x: 1 if x < 10 else (2 if x < 20 else 3))

data['Jahreszeit'] = 0  # Alle Werte auf 0 setzen

# Funktion zur Bestimmung der Jahreszeit basierend auf dem Monat
def get_season(date):
    month = date.month
    if month in [3, 4, 5]:
        return 1  # Frühling
    elif month in [6, 7, 8]:
        return 2  # Sommer
    elif month in [9, 10, 11]:
        return 3  # Herbst
    else:
        return 4  # Winter

# Konvertieren Sie die 'Datum'-Spalte in ein Datumsformat
data['Datum'] = pd.to_datetime(data['Datum'])

# Setzen Sie die Werte in 'Jahreszeit' basierend auf den Werten in 'Datum'
data['Jahreszeit'] = data['Datum'].apply(get_season)

data.head()  # Print first few rows to verify

Unnamed: 0,Datum,Warengruppe,Umsatz,Temperatur,FerienSH,Feiertag,Weihnachtsmarkt,Wochenende,Temperatur_Cluster,Jahreszeit
0,2013-07-01,1,148.828353,17.8375,1,0,0,0,2,2
1,2013-07-01,2,535.856285,17.8375,1,0,0,0,2,2
2,2013-07-01,3,201.198426,17.8375,1,0,0,0,2,2
3,2013-07-01,4,65.890169,17.8375,1,0,0,0,2,2
4,2013-07-01,5,317.475875,17.8375,1,0,0,0,2,2


### Data Preparation

In [2]:
# Define categorical features
categorical_features = ['Warengruppe', 'Temperatur_Cluster', 'Jahreszeit']

# Inspect data types and unique values for categorical columns
print(data[categorical_features].dtypes)
print("Unique Values:\n",data[categorical_features].apply(lambda x: x.unique()))

# Ensure categorical columns are treated as categories
for col in categorical_features:
    data[col] = data[col].astype('category')

# Encode categorical variables using pd.get_dummies
features = pd.get_dummies(data[categorical_features], drop_first=True, dtype=int)

# Include any numeric columns that are not categorical
numeric_features = ['FerienSH', 'Feiertag', 'Weihnachtsmarkt', 'Wochenende']

# Construct the prepared data set including the dependent variable ('label')
prepared_data = pd.concat([data[['Umsatz']], features], axis=1).dropna()

# Handle missing values by removing rows with any missing values
prepared_data = prepared_data.dropna()

# Display the shape of the prepared data set
print(prepared_data.shape)
# Display the first few rows of the prepared data set
prepared_data.head()


Warengruppe           int64
Temperatur_Cluster    int64
Jahreszeit            int64
dtype: object
Unique Values:
 Warengruppe           [1, 2, 3, 4, 5, 6, 0]
Temperatur_Cluster                [2, 3, 1]
Jahreszeit                     [2, 3, 4, 1]
dtype: object
(9372, 12)


Unnamed: 0,Umsatz,Warengruppe_1,Warengruppe_2,Warengruppe_3,Warengruppe_4,Warengruppe_5,Warengruppe_6,Temperatur_Cluster_2,Temperatur_Cluster_3,Jahreszeit_2,Jahreszeit_3,Jahreszeit_4
0,148.828353,1,0,0,0,0,0,1,0,1,0,0
1,535.856285,0,1,0,0,0,0,1,0,1,0,0
2,201.198426,0,0,1,0,0,0,1,0,1,0,0
3,65.890169,0,0,0,1,0,0,1,0,1,0,0
4,317.475875,0,0,0,0,1,0,1,0,1,0,0


### Selection of Training, Validation and Test Data

In [3]:
# Set a random seed for reproducibility
np.random.seed(42)

# Shuffle the data
prepared_data = prepared_data.sample(frac=1).reset_index(drop=True)

# Calculate the number of rows for each dataset
n_total = len(prepared_data)
n_training = int(0.7 * n_total)
n_validation = int(0.20 * n_total)

# Split the features and labels for training, validation, and test
training_data = prepared_data.iloc[:n_training]
validation_data = prepared_data.iloc[n_training:n_training+n_validation]
test_data = prepared_data.iloc[n_training+n_validation:]

# Separating features and labels
training_features = training_data.drop('Umsatz', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1)
test_features = test_data.drop('Umsatz', axis=1)

training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


Training features dimensions: (6560, 11)
Validation features dimensions: (1874, 11)
Test features dimensions: (938, 11)

Training labels dimensions: (6560, 1)
Validation labels dimensions: (1874, 1)
Test labels dimensions: (938, 1)


#### Data Export

In [4]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")