### Import Libraries and Data

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Import Data
data = pd.read_csv("/home/codespace/team3_goodweather-1/1_DatasetCharacteristics/processed_data/train_set_imputed.csv")
data.head()  # Print first few rows to verify

Unnamed: 0,Datum,id,Warengruppe,Umsatz,KielerWoche,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,Niederschlag,...,W_Cat_4,W_Cat_5,W_Cat_6,W_Cat_7,W_Cat_8,W_Cat_9,W_Cat_10,Temperatur_kalt,Temperatur_normal,Temperatur_warm
0,2013-07-01,1307011.0,1.0,148.828353,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0
1,2013-07-01,1307013.0,3.0,201.198426,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0
2,2013-07-01,1307014.0,4.0,65.890169,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0
3,2013-07-01,1307015.0,5.0,317.475875,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0
4,2013-07-01,1307012.0,2.0,535.856285,0.0,6.0,17.8375,15.0,20.0,0.3,...,1,0,0,0,0,0,0,1,0,0


### Data Preparation

In [None]:
# Define categorical features
# All features like Warengruppe, KielerWoche and hot encoded features are categorical
categorical_features = ['bathrooms', 'condition']

# Inspect data types and unique values for categorical columns
print(data[categorical_features].dtypes)
print("Unique Values:\n",data[categorical_features].apply(lambda x: x.unique()))

# Ensure categorical columns are treated as categories
for col in categorical_features:
    data[col] = data[col].astype('category')

# Encode categorical variables using pd.get_dummies
features = pd.get_dummies(data[categorical_features], dtype=int)

# Include any numeric columns that are not categorical
# numeric columns are e.g. Temperatur, Niederschlage etc. and our rolling averages
features['sqft_living15'] = data['sqft_living15']

# Construct the prepared data set including the dependent variable ('label')
prepared_data = pd.concat([data[['Umsatz']], features], axis=1)

# Keep IDs separate for later reference
ids = data['id']  # oder data.index falls bereits als Index gesetzt

# Handle missing values by removing rows with any missing values
prepared_data = prepared_data.dropna()

# Display the shape of the prepared data set
print(prepared_data.shape)
# Display the first few rows of the prepared data set
prepared_data.head()


bathrooms    category
condition    category
dtype: object
Unique Values:
 bathrooms    [1.00, 2.25, 2.00, 3.50, 2.50, ..., 0.50, 0.00...
condition    [3, 4, 5, 1, 2]
Categories (5, int64): [1, 2, ...
dtype: object
(17290, 37)


Unnamed: 0,price,bathrooms_0.0,bathrooms_0.5,bathrooms_0.75,bathrooms_1.0,bathrooms_1.25,bathrooms_1.5,bathrooms_1.75,bathrooms_2.0,bathrooms_2.25,...,bathrooms_6.75,bathrooms_7.5,bathrooms_7.75,bathrooms_8.0,condition_1,condition_2,condition_3,condition_4,condition_5,sqft_living15
0,548500.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1720
1,625504.0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1160
2,349500.0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1600
3,730000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3000
4,630000.0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1880


### Selection of Training, Validation and Test Data

In [None]:
# Set a random seed for reproducibility
np.random.seed(42)

# Shuffle the data
prepared_data = prepared_data.sample(frac=1).reset_index(drop=True)

# Calculate the number of rows for each dataset
# Split test so only lines without missing umsatz are in test set
n_total = len(prepared_data)
n_training = int(0.7 * n_total)
n_validation = int(0.20 * n_total)

# Split the features and labels for training, validation, and test
training_data = prepared_data.iloc[:n_training]
validation_data = prepared_data.iloc[n_training:n_training+n_validation]
test_data = prepared_data.iloc[n_training+n_validation:]

# Separating features and labels
training_features = training_data.drop('price', axis=1)
validation_features = validation_data.drop('price', axis=1)
test_features = test_data.drop('price', axis=1)

training_labels = training_data[['price']]
validation_labels = validation_data[['price']]
test_labels = test_data[['price']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)


Training features dimensions: (12103, 36)
Validation features dimensions: (3458, 36)
Test features dimensions: (1729, 36)

Training labels dimensions: (12103, 1)
Validation labels dimensions: (3458, 1)
Test labels dimensions: (1729, 1)


#### Data Export

In [None]:
# Create subdirectory for the pickle files
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")