### Import Libraries and Data

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Import Data
data = pd.read_csv("/home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/combined_data_final_imputed.csv")
data.head()  # Print first few rows to verify

Unnamed: 0,Datum,id,Warengruppe,Umsatz,KielerWoche,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,Niederschlag,...,W_Cat_4,W_Cat_5,W_Cat_6,W_Cat_7,W_Cat_8,W_Cat_9,W_Cat_10,Temperatur_kalt,Temperatur_normal,Temperatur_warm
0,2013-07-01,1307011.0,1.0,148.828353,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0
1,2013-07-01,1307013.0,3.0,201.198426,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0
2,2013-07-01,1307014.0,4.0,65.890169,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0
3,2013-07-01,1307015.0,5.0,317.475875,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0
4,2013-07-01,1307012.0,2.0,535.856285,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0


### Data Preparation

In [2]:
# Define categorical features that need encoding
# KielerWoche and is_holiday are categorical
categorical_features = ['KielerWoche', 'is_holiday']

# Inspect data types and unique values for categorical columns
print(data[categorical_features].dtypes)
print("Unique Values:\n",data[categorical_features].apply(lambda x: x.unique()))

# Ensure categorical columns are treated as categories
for col in categorical_features:
    data[col] = data[col].astype('category')

# Encode categorical variables using pd.get_dummies
features = pd.get_dummies(data[categorical_features], dtype=int)

# Include numeric columns (weather data, rolling averages, etc.)
numeric_features = ['Bewoelkung', 'Temperatur', 'Windgeschwindigkeit', 'Wettercode', 'Niederschlag',
                   'Temperatur_7day_rolling', 'Niederschlag_7day_rolling']
for col in numeric_features:
    features[col] = data[col]

# Include already one-hot encoded features (Warengruppe, Weekday, W_Cat, Temperatur categories, Niederschlag categories)
encoded_features = ['Warengruppe_1.0', 'Warengruppe_2.0', 'Warengruppe_3.0', 'Warengruppe_4.0', 'Warengruppe_5.0', 'Warengruppe_6.0',
                   'Weekday_Monday', 'Weekday_Tuesday', 'Weekday_Wednesday', 'Weekday_Thursday', 'Weekday_Friday', 'Weekday_Saturday', 'Weekday_Sunday',
                   'W_Cat_-1', 'W_Cat_1', 'W_Cat_2', 'W_Cat_3', 'W_Cat_4', 'W_Cat_5', 'W_Cat_6', 'W_Cat_7', 'W_Cat_8', 'W_Cat_9', 'W_Cat_10',
                   'Temperatur_kalt', 'Temperatur_normal', 'Temperatur_warm',
                   'Niederschlag_trocken', 'Niederschlag_nass']
for col in encoded_features:
    features[col] = data[col]

# Construct the prepared data set including the dependent variable ('Umsatz') and metadata
prepared_data = pd.concat([data[['Datum', 'id', 'Umsatz']], features], axis=1)

# Convert Datum to datetime for proper sorting
prepared_data['Datum'] = pd.to_datetime(prepared_data['Datum'])

# Separate data with Umsatz (training/validation) from data without Umsatz (prediction)
data_with_umsatz = prepared_data[prepared_data['Umsatz'].notna()].copy()
data_without_umsatz = prepared_data[prepared_data['Umsatz'].isna()].copy()

print(f"Data with Umsatz (for training): {len(data_with_umsatz)} rows")
print(f"Date range: {data_with_umsatz['Datum'].min()} to {data_with_umsatz['Datum'].max()}")
print(f"\nData without Umsatz (for prediction): {len(data_without_umsatz)} rows")
print(f"Date range: {data_without_umsatz['Datum'].min()} to {data_without_umsatz['Datum'].max()}")

# Display the shape of the prepared data set
print(f"\nTotal prepared data shape: {prepared_data.shape}")
# Display the first few rows of the prepared data set
prepared_data.head()


KielerWoche    float64
is_holiday       int64
dtype: object
Unique Values:
    KielerWoche  is_holiday
0          0.0           0
1          1.0           1
Data with Umsatz (for training): 9334 rows
Date range: 2013-07-01 00:00:00 to 2018-07-31 00:00:00

Data without Umsatz (for prediction): 1877 rows
Date range: 2013-12-25 00:00:00 to 2019-07-30 00:00:00

Total prepared data shape: (11211, 43)


Unnamed: 0,Datum,id,Umsatz,KielerWoche_0.0,KielerWoche_1.0,is_holiday_0,is_holiday_1,Bewoelkung,Temperatur,Windgeschwindigkeit,...,W_Cat_6,W_Cat_7,W_Cat_8,W_Cat_9,W_Cat_10,Temperatur_kalt,Temperatur_normal,Temperatur_warm,Niederschlag_trocken,Niederschlag_nass
0,2013-07-01,1307011.0,148.828353,1,0,1,0,6.0,17.8375,15.0,...,0,0,0,0,0,1,0,0,0,1
1,2013-07-01,1307013.0,201.198426,1,0,1,0,6.0,17.8375,15.0,...,0,0,0,0,0,1,0,0,0,1
2,2013-07-01,1307014.0,65.890169,1,0,1,0,6.0,17.8375,15.0,...,0,0,0,0,0,1,0,0,0,1
3,2013-07-01,1307015.0,317.475875,1,0,1,0,6.0,17.8375,15.0,...,0,0,0,0,0,1,0,0,0,1
4,2013-07-01,1307012.0,535.856285,1,0,1,0,6.0,17.8375,15.0,...,0,0,0,0,0,1,0,0,0,1


### Selection of Training, Validation and Test Data

In [3]:
# TIME-BASED SPLIT: Use chronological order instead of random shuffle
# Training/Validation: All data with Umsatz (2013-07-01 to 2018-07-31)
# Prediction: All data without Umsatz (2018-08-01 to 2019-07-30)

# Sort data with Umsatz by date to maintain temporal order
data_with_umsatz = data_with_umsatz.sort_values('Datum').reset_index(drop=True)

# Split data with Umsatz into training (80%) and validation (20%) based on time
n_total = len(data_with_umsatz)
n_training = int(0.8 * n_total)

training_data = data_with_umsatz.iloc[:n_training]
validation_data = data_with_umsatz.iloc[n_training:]

print(f"Training data: {len(training_data)} rows")
print(f"Date range: {training_data['Datum'].min()} to {training_data['Datum'].max()}")
print(f"\nValidation data: {len(validation_data)} rows")
print(f"Date range: {validation_data['Datum'].min()} to {validation_data['Datum'].max()}")

# Prepare prediction data - filter to only include data from 2018-08-01 onwards
prediction_data = data_without_umsatz[data_without_umsatz['Datum'] >= '2018-08-01'].copy()
prediction_data = prediction_data.sort_values('Datum').reset_index(drop=True)
print(f"\nPrediction data: {len(prediction_data)} rows")
print(f"Date range: {prediction_data['Datum'].min()} to {prediction_data['Datum'].max()}")

# Define feature columns (exclude Datum, id, Umsatz)
feature_columns = [col for col in prepared_data.columns if col not in ['Datum', 'id', 'Umsatz']]

# Separating features and labels for training and validation
training_features = training_data[feature_columns]
validation_features = validation_data[feature_columns]

training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]

# Prepare prediction features (no labels available)
prediction_features = prediction_data[feature_columns]
prediction_ids = prediction_data[['id', 'Datum']]

# Print dimensions of the dataframes
print("\n=== Data Dimensions ===")
print("Training features:", training_features.shape)
print("Validation features:", validation_features.shape)
print("Prediction features:", prediction_features.shape)
print()
print("Training labels:", training_labels.shape)
print("Validation labels:", validation_labels.shape)


Training data: 7467 rows
Date range: 2013-07-01 00:00:00 to 2017-07-26 00:00:00

Validation data: 1867 rows
Date range: 2017-07-26 00:00:00 to 2018-07-31 00:00:00

Prediction data: 1839 rows
Date range: 2018-08-01 00:00:00 to 2019-07-30 00:00:00

=== Data Dimensions ===
Training features: (7467, 40)
Validation features: (1867, 40)
Prediction features: (1839, 40)

Training labels: (7467, 1)
Validation labels: (1867, 1)


#### Data Export

In [4]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
prediction_features.to_pickle(f"{subdirectory}/prediction_features.pkl")

training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
prediction_ids.to_pickle(f"{subdirectory}/prediction_ids.pkl")

print("\nData successfully exported to pickle files:")
print(f"- {subdirectory}/training_features.pkl")
print(f"- {subdirectory}/training_labels.pkl")
print(f"- {subdirectory}/validation_features.pkl")
print(f"- {subdirectory}/validation_labels.pkl")
print(f"- {subdirectory}/prediction_features.pkl")
print(f"- {subdirectory}/prediction_ids.pkl (contains id and Datum for predictions)")



Data successfully exported to pickle files:
- pickle_data/training_features.pkl
- pickle_data/training_labels.pkl
- pickle_data/validation_features.pkl
- pickle_data/validation_labels.pkl
- pickle_data/prediction_features.pkl
- pickle_data/prediction_ids.pkl (contains id and Datum for predictions)
