## Model Data Preparation

Diese Data Preparation gilt für jede Warengruppe. Sie muss immer ausgeführt werden, bevor das eigene Modell durchlaufen wird.

Basiert auf einer Kopie von neural_net_data_preparation.ipynb mit unseren Daten.

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Read the merged data set from its CSV file (absolute path)
csv_path = '/workspaces/DS_ML_Gr_1.5/2_BaselineModel/merged_data_new.csv'
data = pd.read_csv(csv_path)

# Show the first rows as a quick sanity check of the import
data.head()

Unnamed: 0,Datum,Inflationsrate,Heimspiel,Weihnachtsmarkt,Markt,Faehrverkaehr,Kreuzfahrverkehr,Temperatur,Monat,Jahreszeit,...,Sonnenaufgang,Sonnenuntergang,Tageslaenge,Niederschlag,Sonnenschein (h),Schneehoehe,Sonnenschein,Tageslaenge (dezimal),KielerWoche,Werktag
0,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7.0,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,,1
1,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7.0,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,,1
2,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7.0,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,,1
3,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7.0,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,,1
4,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7.0,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,,1


In [3]:
# Indicator columns that are one-hot encoded below
categorical_features = [
    'Heimspiel', 'Weihnachtsmarkt', 'Markt',
    'Frühling', 'Sommer', 'Herbst', 'Winter',
    'Temp_warm', 'Temp_cold', 'Temp_average',
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
    'Schulferien', 'Semesterferien', 'Feiertage',
    'KielerWoche', 'Werktag',
]

# Numeric columns that are carried over unchanged
numeric_features = [
    'Inflationsrate',
    'Temperatur',
    'Niederschlag',
    'Schneehoehe',
    'Sonnenschein',
]

# Report dtype and distinct values of each categorical column before converting
print(data[categorical_features].dtypes)
print("Unique Values:\n",data[categorical_features].apply(lambda column: column.unique()))

# Cast all categorical columns to the pandas 'category' dtype in one step
data = data.astype({name: 'category' for name in categorical_features})

# One-hot encode the categorical columns; the first level of each is dropped.
# NOTE(review): with the default dummy_na=False, rows holding NaN in a
# categorical column get 0 in all of its dummies -- confirm this is intended.
features = pd.get_dummies(data[categorical_features], drop_first=True, dtype=int)

# Append the numeric columns to the encoded features
for name in numeric_features:
    features[name] = data[name]

# TODO: cruise and ferry traffic features are still missing

# Assemble the prepared data set: dependent variable 'Umsatz' first, then the features
prepared_data = pd.concat([data[['Umsatz']], features], axis=1)


# Once data imputation is in place, uncomment the lines below

# Display the shape of the prepared data set
#print(prepared_data.shape)
# Display the first few rows of the prepared data set
#prepared_data.head()

Heimspiel          float64
Weihnachtsmarkt    float64
Markt              float64
Frühling           float64
Sommer             float64
Herbst             float64
Winter             float64
Temp_warm          float64
Temp_cold          float64
Temp_average       float64
Monday               int64
Tuesday              int64
Wednesday            int64
Thursday             int64
Friday               int64
Saturday             int64
Sunday               int64
Schulferien        float64
Semesterferien     float64
Feiertage          float64
KielerWoche        float64
Werktag              int64
dtype: object
Unique Values:
 Heimspiel               [0.0, 1.0]
Weihnachtsmarkt         [0.0, 1.0]
Markt                   [0.0, 1.0]
Frühling           [0.0, 1.0, nan]
Sommer             [1.0, 0.0, nan]
Herbst             [0.0, 1.0, nan]
Winter             [0.0, 1.0, nan]
Temp_warm          [0.0, 1.0, nan]
Temp_cold          [0.0, 1.0, nan]
Temp_average       [1.0, 0.0, nan]
Monday                    

In [None]:
# Temporary stop-gap because proper data imputation is not implemented yet.
# This cell must be removed/adapted once an imputation method is in place!

# Drop every row that still contains a missing value
prepared_data = prepared_data.dropna()

# Map textual placeholders to zero: strings containing 'NaN' and a lone '-'
# (the CSV preview shows '-' in the 'Schneehoehe' column)
prepared_data = prepared_data.replace(to_replace=r'NaN', value='0', regex=True)
prepared_data = prepared_data.replace(to_replace=r'^-$', value='0', regex=True)

# The replacements above insert the *string* '0', so the affected columns are
# still text columns. Convert every non-numeric column to real numbers so the
# exported feature matrix is numeric; entries that cannot be parsed become NaN.
non_numeric_columns = [
    column for column in prepared_data.columns
    if not pd.api.types.is_numeric_dtype(prepared_data[column])
]
for column in non_numeric_columns:
    prepared_data[column] = pd.to_numeric(prepared_data[column], errors='coerce')

# Treat anything that could not be parsed like the other placeholders (-> 0)
prepared_data = prepared_data.fillna(0)

# Display the shape of the prepared data set
print(prepared_data.shape)
# Display the first few rows of the prepared data set
prepared_data.head()

In [18]:
# Seed NumPy's global random state so the shuffle below is reproducible
np.random.seed(42)

# Randomly reorder all rows and renumber the index from 0
prepared_data = prepared_data.sample(frac=1).reset_index(drop=True)

# Row counts per split: 70 % training, 20 % validation, the remainder is test
n_total = len(prepared_data)
n_training = int(0.7 * n_total)
n_validation = int(0.20 * n_total)
validation_end = n_training + n_validation

# Slice the shuffled frame into three consecutive, non-overlapping parts
training_data = prepared_data.iloc[:n_training]
validation_data = prepared_data.iloc[n_training:validation_end]
test_data = prepared_data.iloc[validation_end:]

# Per split: features are every column except 'Umsatz', labels are 'Umsatz' only
training_features = training_data.drop(columns='Umsatz')
training_labels = training_data[['Umsatz']]

validation_features = validation_data.drop(columns='Umsatz')
validation_labels = validation_data[['Umsatz']]

test_features = test_data.drop(columns='Umsatz')
test_labels = test_data[['Umsatz']]

# Report the dimensions of all six frames
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)

Training features dimensions: (5780, 25)
Validation features dimensions: (1651, 25)
Test features dimensions: (827, 25)

Training labels dimensions: (5780, 1)
Validation labels dimensions: (1651, 1)
Test labels dimensions: (827, 1)


In [19]:
# Make sure the target folder for the pickle files exists
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Frames to export, keyed by file name (without extension); features first, then labels
pickle_exports = {
    "training_features": training_features,
    "validation_features": validation_features,
    "test_features": test_features,
    "training_labels": training_labels,
    "validation_labels": validation_labels,
    "test_labels": test_labels,
}

# Write each frame to <subdirectory>/<name>.pkl
for file_stem, frame in pickle_exports.items():
    frame.to_pickle(f"{subdirectory}/{file_stem}.pkl")