## Model Data Preparation

Diese Data Preparation gilt für jede Warengruppe. Vorher immer ausführen, bevor eigens Modell durchgelaufen wird.

Basiert auf eine Kopie von neural_net_data_preparation.ipynb mit unseren Daten.

In [79]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Import Data
data = pd.read_csv('/workspaces/DS_ML_Gr_1.5/2_BaselineModel/merged_data_new.csv')
data.head()  # Print first few rows to verify

Unnamed: 0,Datum,Inflationsrate,Heimspiel,Weihnachtsmarkt,Markt,Faehrverkaehr,Kreuzfahrverkehr,Temperatur,Monat,Jahreszeit,...,Sonnenaufgang,Sonnenuntergang,Tageslaenge,Niederschlag,Sonnenschein (h),Schneehoehe,Sonnenschein,Tageslaenge (dezimal),KielerWoche,Werktag
0,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,0.0,1
1,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,0.0,1
2,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,0.0,1
3,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,0.0,1
4,2013-07-01,1.53,0.0,0.0,0.0,1.576.718,419.447,17.8375,7,Sommer,...,4:50:12,21:56:46,17:06:34,0.3,6,-,0.350877,17.1,0.0,1


In [80]:
# Define categorical features
categorical_features = ['Heimspiel',
                        'Weihnachtsmarkt',
                        'Markt',
                        'Frühling',
                        'Sommer',
                        'Herbst',
                        'Winter',
                        'Temp_warm',
                        'Temp_cold',
                        'Temp_average',
                        'Monday',
                        'Tuesday',
                        'Wednesday',
                        'Thursday',
                        'Friday',
                        'Saturday',
                        'Sunday',
                        'Schulferien',
                        'Semesterferien',
                        'Feiertage',
                        'KielerWoche',
                        'Werktag']

# Inspect data types and unique values for categorical columns
print(data[categorical_features].dtypes)
print("Unique Values:\n",data[categorical_features].apply(lambda x: x.unique()))

# Ensure categorical columns are treated as categories
for col in categorical_features:
    data[col] = data[col].astype('category')

# Encode categorical variables using pd.get_dummies
features = pd.get_dummies(data[categorical_features], drop_first=True, dtype=int)

# Include any numeric columns that are not categorical
features['Inflationsrate'] = data['Inflationsrate']
features['Temperatur'] = data['Temperatur']
features['Niederschlag'] = data['Niederschlag']
features['Schneehoehe'] = data['Schneehoehe']
features['Sonnenschein'] = data['Sonnenschein']
features['Kreuzfahrverkehr'] = data['Kreuzfahrverkehr']
features['Faehrverkaehr'] = data['Faehrverkaehr']
features['Warengruppe'] = data['Warengruppe']
features['Datum'] = data['Datum']



# Construct the prepared data set including the dependent variable --> Umsatz
prepared_data = pd.concat([data[['Umsatz']], features], axis=1)


# Wenn Data imputation fertig ist, #wegnehmen

# Display the shape of the prepared data set
#print(prepared_data.shape)
# Display the first few rows of the prepared data set
#prepared_data.head()

Heimspiel          float64
Weihnachtsmarkt    float64
Markt              float64
Frühling             int64
Sommer               int64
Herbst               int64
Winter               int64
Temp_warm          float64
Temp_cold          float64
Temp_average       float64
Monday               int64
Tuesday              int64
Wednesday            int64
Thursday             int64
Friday               int64
Saturday             int64
Sunday               int64
Schulferien        float64
Semesterferien     float64
Feiertage          float64
KielerWoche        float64
Werktag              int64
dtype: object
Unique Values:
 Heimspiel               [0.0, 1.0]
Weihnachtsmarkt         [0.0, 1.0]
Markt                   [0.0, 1.0]
Frühling                    [0, 1]
Sommer                      [1, 0]
Herbst                      [0, 1]
Winter                      [0, 1]
Temp_warm          [0.0, 1.0, nan]
Temp_cold          [0.0, 1.0, nan]
Temp_average       [1.0, 0.0, nan]
Monday                    

In [81]:


# Dieser Teil muss für die Imputationsverfahren unbedingt raus/angepasst werden!
prepared_data = prepared_data.replace(to_replace=r'NaN', value='0', regex=True)
prepared_data = prepared_data.replace(to_replace=r'^-$', value='0', regex=True)
prepared_data = prepared_data.fillna(0)
prepared_data = prepared_data.dropna()

# Display the shape of the prepared data set
print(prepared_data.shape)
# Display the first few rows of the prepared data set
print(prepared_data.head())

# Kontrolle: Überprüfen, ob noch fehlende Werte vorhanden sind
missing_values = prepared_data.isnull().sum()
print("Anzahl fehlender Werte pro Spalte:")
print(missing_values)



(9737, 32)
       Umsatz  Heimspiel_1.0  Weihnachtsmarkt_1.0  Markt_1.0  Frühling_1  \
0  148.828353              0                    0          0           0   
1  535.856285              0                    0          0           0   
2  201.198426              0                    0          0           0   
3   65.890169              0                    0          0           0   
4  317.475875              0                    0          0           0   

   Sommer_1  Herbst_1  Winter_1  Temp_warm_1.0  Temp_cold_1.0  ...  Werktag_1  \
0         1         0         0              0              0  ...          1   
1         1         0         0              0              0  ...          1   
2         1         0         0              0              0  ...          1   
3         1         0         0              0              0  ...          1   
4         1         0         0              0              0  ...          1   

   Inflationsrate  Temperatur  Niederschlag  

In [82]:
#Splitting des Datensets in die jeweiligen Warengruppen

#Bennenung der Warengruppne für dataframe Namen
warengruppe_namen = {
    1: 'Brot',
    2: 'Broetchen',
    3: 'Crossaint',
    4: 'Konditorei',
    5: 'Kuchen',
    6: 'Saisonbrot'
}

# Ursprüngliche DataFrames filtern nach Warengruppe
warengruppe_dataframes = {}
for i, name in warengruppe_namen.items():
    var_name = f"df_{name}_W{i}"  # Name erstellen nach: df_Brot_W1
    warengruppe_dataframes[var_name] = prepared_data[
        (prepared_data['Warengruppe'] == i) &
        (prepared_data['Datum'] <= '2018-07-31')
    ]

# Daten bis 2019-07-31 unabhängig der Warengruppe hinzufügen
new_data = prepared_data[
    (prepared_data['Datum'] > '2018-07-31') &
    (prepared_data['Datum'] <= '2019-08-31')
]

for var_name, df in warengruppe_dataframes.items():
    updated_df = pd.concat([df, new_data], ignore_index=True)
    globals()[var_name] = updated_df

column_names = df_Brot_W1.columns.tolist()
print(column_names)

# Ergebnisse ausgeben
print("df_Brot_W1:")
print(df_Brot_W1.tail()) #Luisa

print("df_Broetchen_W2:")
print(df_Broetchen_W2.tail()) #Luisa

print("df_Crossaint_W3:")
print(df_Crossaint_W3.tail()) #Nina

print("df_Konditorei_W4:")
print(df_Konditorei_W4.tail()) #Wiebke

print("df_Kuchen_W5:")
print(df_Kuchen_W5.tail()) #Nina

print("df_Saisonbrot_W6:")
print(df_Saisonbrot_W6.tail()) #Wiebke



['Umsatz', 'Heimspiel_1.0', 'Weihnachtsmarkt_1.0', 'Markt_1.0', 'Frühling_1', 'Sommer_1', 'Herbst_1', 'Winter_1', 'Temp_warm_1.0', 'Temp_cold_1.0', 'Temp_average_1.0', 'Monday_1', 'Tuesday_1', 'Wednesday_1', 'Thursday_1', 'Friday_1', 'Saturday_1', 'Sunday_1', 'Schulferien_1.0', 'Semesterferien_1.0', 'Feiertage_1.0', 'KielerWoche_1.0', 'Werktag_1', 'Inflationsrate', 'Temperatur', 'Niederschlag', 'Schneehoehe', 'Sonnenschein', 'Kreuzfahrverkehr', 'Faehrverkaehr', 'Warengruppe', 'Datum']
df_Brot_W1:
      Umsatz  Heimspiel_1.0  Weihnachtsmarkt_1.0  Markt_1.0  Frühling_1  \
2179     0.0              1                    0          0           0   
2180     0.0              0                    0          0           0   
2181     0.0              0                    0          0           0   
2182     0.0              0                    0          0           0   
2183     0.0              0                    0          0           0   

      Sommer_1  Herbst_1  Winter_1  Temp_warm_1

#### W1_Brot

In [83]:
data = df_Brot_W1


# Define your date thresholds
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'
test_end_date='2019-07-30'

#Keep only wanted features
data=data[['Datum',
           'Umsatz',
           'Warengruppe',
            #Hier Features hinzufügen und Namen anpassen:
           'Sommer_1',
            'Winter_1',
            'Temp_average_1.0',
            'Temp_warm_1.0',
            'Temp_cold_1.0',
            'Feiertage_1.0',
            'Schulferien_1.0',
            'Semesterferien_1.0',
            'Monday_1',
            'Tuesday_1',
            'Wednesday_1',
            'Thursday_1',
            'Friday_1',
            'Saturday_1',
            'Sunday_1',
            'Weihnachtsmarkt_1.0',
            'Markt_1.0',
            'Heimspiel_1.0',
            'KielerWoche_1.0']]

# Convert to datetime if not already
data['Datum'] = pd.to_datetime(data['Datum'])

# Split the data based on the date thresholds
training_data = data[data['Datum'] <= train_end_date]
validation_data = data[(data['Datum'] > train_end_date) & (data['Datum'] <= validation_end_date)]
test_data = data[(data['Datum'] > validation_end_date) & (data['Datum'] <= test_end_date)]

# Separating features and labels
training_features = training_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)
test_features = test_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)

training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)




Training features dimensions: (1462, 19)
Validation features dimensions: (357, 19)
Test features dimensions: (364, 19)

Training labels dimensions: (1462, 1)
Validation labels dimensions: (357, 1)
Test labels dimensions: (364, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Datum'] = pd.to_datetime(data['Datum'])


In [84]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data//W1_Brot"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")

#### W2_Broetchen

In [85]:
data = df_Broetchen_W2

# Define your date thresholds
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'
test_end_date='2019-07-30'

#Keep only wanted features
data=data[['Umsatz',
           'Datum',
           'Warengruppe',
            #Hier Features hinzufügen und Namen anpassen:
            'Werktag_1',
            'Weihnachtsmarkt_1.0',
            'Markt_1.0',
            'Herbst_1',
            'Frühling_1',
            'Sommer_1',
            'Winter_1',
            'Temp_average_1.0',
            'Temp_warm_1.0',
            'Temp_cold_1.0',
            'Feiertage_1.0',
            'Schulferien_1.0',
            'Semesterferien_1.0',
            'Heimspiel_1.0',
            'KielerWoche_1.0']]

# Convert to datetime if not already
data['Datum'] = pd.to_datetime(data['Datum'])

# Split the data based on the date thresholds
training_data = data[data['Datum'] <= train_end_date]
validation_data = data[(data['Datum'] > train_end_date) & (data['Datum'] <= validation_end_date)]
test_data = data[(data['Datum'] > validation_end_date) & (data['Datum'] <= test_end_date)]

# Separating features and labels
training_features = training_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)
test_features = test_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)

training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


Training features dimensions: (1462, 15)
Validation features dimensions: (357, 15)
Test features dimensions: (364, 15)

Training labels dimensions: (1462, 1)
Validation labels dimensions: (357, 1)
Test labels dimensions: (364, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Datum'] = pd.to_datetime(data['Datum'])


In [86]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data//W2_Broetchen"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")

#### W3_Croissants

In [87]:
print(column_names)

['Umsatz', 'Heimspiel_1.0', 'Weihnachtsmarkt_1.0', 'Markt_1.0', 'Frühling_1', 'Sommer_1', 'Herbst_1', 'Winter_1', 'Temp_warm_1.0', 'Temp_cold_1.0', 'Temp_average_1.0', 'Monday_1', 'Tuesday_1', 'Wednesday_1', 'Thursday_1', 'Friday_1', 'Saturday_1', 'Sunday_1', 'Schulferien_1.0', 'Semesterferien_1.0', 'Feiertage_1.0', 'KielerWoche_1.0', 'Werktag_1', 'Inflationsrate', 'Temperatur', 'Niederschlag', 'Schneehoehe', 'Sonnenschein', 'Kreuzfahrverkehr', 'Faehrverkaehr', 'Warengruppe', 'Datum']


In [88]:
data = df_Crossaint_W3

# Define your date thresholds
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'
test_end_date='2019-07-30'

#Keep only wanted features
data=data[['Umsatz',
           'Datum',
           'Warengruppe',
            #Hier Features hinzufügen und Namen anpassen:
           'Sommer_1',
           'Winter_1',
           'Temp_cold_1.0',
           'Temp_warm_1.0',
           'Temp_average_1.0',
           'Monday_1',
           'Wednesday_1',
           'Thursday_1',
           'Saturday_1',
           'Sunday_1',
           'Schulferien_1.0',
           'Semesterferien_1.0',
           'Feiertage_1.0',
           'Markt_1.0',   
           'Sonnenschein',
           'Niederschlag',
           'Tageslaenge',
           'Weihnachtsmarkt_1.0']]




# Convert to datetime if not already
data['Datum'] = pd.to_datetime(data['Datum'])

# Split the data based on the date thresholds
training_data = data[data['Datum'] <= train_end_date]
validation_data = data[(data['Datum'] > train_end_date) & (data['Datum'] <= validation_end_date)]
test_data = data[(data['Datum'] > validation_end_date) & (data['Datum'] <= test_end_date)]

# Separating features and labels
training_features = training_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)
test_features = test_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)

training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


Training features dimensions: (1462, 17)
Validation features dimensions: (357, 17)
Test features dimensions: (364, 17)

Training labels dimensions: (1462, 1)
Validation labels dimensions: (357, 1)
Test labels dimensions: (364, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Datum'] = pd.to_datetime(data['Datum'])


In [89]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data//W3_Croissants"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")

#### W4_Konditorei

In [90]:
data = df_Konditorei_W4

# Define your date thresholds
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'
test_end_date='2019-07-30'

#Keep only wanted features
data=data[['Umsatz','Datum','Warengruppe','Inflationsrate','Heimspiel_1.0','Markt_1.0', 'Temperatur', 'Schulferien_1.0','Sommer_1.0','Herbst_1.0','Saturday_1.0','KielerWoche_1.0','Feiertage_1.0', 'Niederschlag','Schneehoehe','Sonnenschein','Weihnachtsmarkt_1.0','Semesterferien_1.0','Werktag_1.0']]

# Convert to datetime if not already
data['Datum'] = pd.to_datetime(data['Datum'])

# Split the data based on the date thresholds
training_data = data[data['Datum'] <= train_end_date]
validation_data = data[(data['Datum'] > train_end_date) & (data['Datum'] <= validation_end_date)]
test_data = data[(data['Datum'] > validation_end_date) & (data['Datum'] <= test_end_date)]

# Separating features and labels
training_features = training_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)
test_features = test_data.drop('Umsatz', axis=1).drop('Datum', axis=1).drop('Warengruppe', axis=1)

training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


KeyError: "['Sommer_1.0', 'Herbst_1.0', 'Saturday_1.0', 'Werktag_1.0'] not in index"

In [91]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data//W4_Konditorei"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")

#### W5_Kuchen

In [92]:
data = df_Kuchen_W5

# Define your date thresholds
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'
test_end_date='2019-07-30'

#Keep only wanted features
data=data[['Umsatz',
           'Datum',
           'Warengruppe',
            #Hier Features hinzufügen und Namen anpassen:
           'Markt_1.0',
           'Sonnenschein',
           'Niederschlag',
           'Tageslaenge',
           'Sommer_1',
           'Herbst_1',
           'Winter_1',
           'Wednesday_1',
           'Friday_1',
           'Saturday_1',
           'Sunday_1',
           'Semesterferien_1.0',
           'Temp_average_1.0',
           'KielerWoche_1.0',
           'Werktag_1']]

# Convert to datetime if not already
data['Datum'] = pd.to_datetime(data['Datum'])

# Split the data based on the date thresholds
training_data = data[data['Datum'] <= train_end_date]
validation_data = data[(data['Datum'] > train_end_date) & (data['Datum'] <= validation_end_date)]
test_data = data[(data['Datum'] > validation_end_date) & (data['Datum'] <= test_end_date)]

# Separating features and labels
training_features = training_data.drop(['Umsatz','Datum','Warengruppe'], axis=1)
validation_features = validation_data.drop(['Umsatz','Datum','Warengruppe'], axis=1)
test_features = test_data.drop(['Umsatz','Datum','Warengruppe'], axis=1)

training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


Training features dimensions: (1462, 14)
Validation features dimensions: (357, 14)
Test features dimensions: (364, 14)

Training labels dimensions: (1462, 1)
Validation labels dimensions: (357, 1)
Test labels dimensions: (364, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Datum'] = pd.to_datetime(data['Datum'])


In [93]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data//W5_Kuchen"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")

#### W6_Saisonbrot

In [94]:
data = df_Saisonbrot_W6
print (data.columns)
# Define your date thresholds
train_end_date = '2017-07-31'
validation_end_date = '2018-07-31'
test_end_date='2019-07-30'

#Keep only wanted features
data=data[['Umsatz','Datum','Warengruppe','Weihnachtsmarkt_1.0','Temperatur' ,'Friday_1.0','Sonnenschein','Werktag_1.0']]

# Convert to datetime if not already
data['Datum'] = pd.to_datetime(data['Datum'])

# Split the data based on the date thresholds
training_data = data[data['Datum'] <= train_end_date]
validation_data = data[(data['Datum'] > train_end_date) & (data['Datum'] <= validation_end_date)]
test_data = data[(data['Datum'] > validation_end_date) & (data['Datum'] <= test_end_date)]

# Separating features and labels
training_features = training_data.drop(['Umsatz','Datum','Warengruppe'], axis=1)
validation_features = validation_data.drop(['Umsatz','Datum','Warengruppe'], axis=1)
test_features = test_data.drop(['Umsatz','Datum','Warengruppe'], axis=1)



training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


Index(['Umsatz', 'Heimspiel_1.0', 'Weihnachtsmarkt_1.0', 'Markt_1.0',
       'Frühling_1', 'Sommer_1', 'Herbst_1', 'Winter_1', 'Temp_warm_1.0',
       'Temp_cold_1.0', 'Temp_average_1.0', 'Monday_1', 'Tuesday_1',
       'Wednesday_1', 'Thursday_1', 'Friday_1', 'Saturday_1', 'Sunday_1',
       'Schulferien_1.0', 'Semesterferien_1.0', 'Feiertage_1.0',
       'KielerWoche_1.0', 'Werktag_1', 'Inflationsrate', 'Temperatur',
       'Niederschlag', 'Schneehoehe', 'Sonnenschein', 'Kreuzfahrverkehr',
       'Faehrverkaehr', 'Warengruppe', 'Datum'],
      dtype='object')


KeyError: "['Friday_1.0', 'Werktag_1.0'] not in index"

In [95]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data//W6_Saisonbrot"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")