# Random Forest

In [10]:
import pandas as pd

import sys
sys.path.append('../0_DataPreparation')
from utils import plot_missing_heatmap


# Read the three cleaned splits from ../data/processed; the "Datum" column is
# converted to datetime as part of each load.
df_train_clean = pd.read_csv("../data/processed/df_train_data_cleaned.csv").assign(
    Datum=lambda frame: pd.to_datetime(frame['Datum'])
)
df_validation_clean = pd.read_csv("../data/processed/df_validation_data_cleaned.csv").assign(
    Datum=lambda frame: pd.to_datetime(frame['Datum'])
)
df_test_clean = pd.read_csv("../data/processed/df_test_data_cleaned.csv").assign(
    Datum=lambda frame: pd.to_datetime(frame['Datum'])
)

# Report each split's (rows, columns) before any derived column is added.
print(f"Train data shape: {df_train_clean.shape}")
print(f"Validation data shape: {df_validation_clean.shape}")
print(f"Test data shape: {df_test_clean.shape}")

# Derive the calendar month from "Datum" for the train and validation splits
# (the test split is not modified in this cell).
df_train_clean = df_train_clean.assign(month=df_train_clean['Datum'].dt.month)
df_validation_clean = df_validation_clean.assign(month=df_validation_clean['Datum'].dt.month)

Train data shape: (7487, 17)
Validation data shape: (1841, 18)
Test data shape: (1830, 16)


In [12]:
# Baseline random forest: fit on the training split, then evaluate on the
# validation split (R2 and MAPE).
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# Columns excluded from the feature matrix: the date, the target itself,
# the rolling-7 column and temperature_2m_mean.
drop_columns = ['Datum', 'Umsatz_umsatz', 'umsatz_rolling7', 'temperature_2m_mean']

# Training features X and target y taken from df_train_clean.
X = df_train_clean.drop(columns=drop_columns)
y = df_train_clean['Umsatz_umsatz']

# One-hot encode any non-numeric feature columns.
X = pd.get_dummies(X, drop_first=True)

# Train model
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X, y)

# Feature importances, largest first
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances)

# R2 on the same rows the model was fitted on. Kept under its own name so the
# float does not share the identifier of sklearn.metrics.r2_score used below.
r2_train = rf.score(X, y)
print(f"R2 Score on Training Data: {r2_train}")

# Build the validation features with the same drop list and encoding.
X_val = df_validation_clean.drop(columns=drop_columns)
X_val = pd.get_dummies(X_val, drop_first=True)
y_val = df_validation_clean['Umsatz_umsatz']

# get_dummies runs independently on each frame, so the validation columns are
# not guaranteed to match the training columns (the raw frames already differ
# in column count). Report any difference, then force the training layout:
# columns unknown to the model are dropped, absent ones are filled with 0.
missing_in_val = X.columns.difference(X_val.columns)
extra_in_val = X_val.columns.difference(X.columns)
if len(missing_in_val) > 0 or len(extra_in_val) > 0:
    print(
        "Validation features aligned to training columns: "
        f"zero-filled {list(missing_in_val)}, dropped {list(extra_in_val)}"
    )
X_val = X_val.reindex(columns=X.columns, fill_value=0)

# Predict on validation data and compute R2 and MAPE.
y_pred = rf.predict(X_val)
r2_score_val = r2_score(y_val, y_pred)
mape_val = mean_absolute_percentage_error(y_val, y_pred)
print(f"R2 Score on Validation Data: {r2_score_val}")
print(f"MAPE on Validation Data: {mape_val}")


Warengruppe_umsatz             0.649346
Temperatur_weather             0.136403
day_of_week                    0.058931
school_holiday                 0.032472
month                          0.029033
sunshine_hours                 0.017848
Windgeschwindigkeit_weather    0.017686
sunshine_duration              0.015902
next_day_holiday               0.013886
Bewoelkung_weather             0.010347
public_holiday                 0.007284
is_sunday                      0.006260
is_saturday                    0.002807
KielerWoche_kiwo               0.001796
dtype: float64
R2 Score on Training Data: 0.9825224804346953
R2 Score on Validation Data: 0.753744693429515
MAPE on Validation Data: 0.21369789097648734


In [None]:
# Stack the training rows and the validation rows into one frame with a fresh
# 0..n-1 index; this combined frame is bound to df_train_complete.
df_train_complete = pd.concat(
    (df_train_clean, df_validation_clean),
    axis=0,
    ignore_index=True,
)

# Show the size of the combined frame, then preview the test split.
print(f"Complete Train data shape: {df_train_complete.shape}")
df_test_clean.head()




In [None]:
# Use random forest regression to train on df_train_complete (train + validation)
# and predict on df_test_clean.

drop_columns = ['Datum', 'umsatz_rolling7', 'temperature_2m_mean']

TARGET = "Umsatz"
# Rename the target and product-group columns of the combined frame to their
# suffix-free names. Rebinding (instead of inplace=True) keeps the step a plain
# expression; columns that are already renamed are simply left untouched.
df_train_complete = df_train_complete.rename(
    columns={'Umsatz_umsatz': TARGET, 'Warengruppe_umsatz': "Warengruppe"}
)

# Add TARGET to drop_columns to exclude the target from features
drop_columns_complete = drop_columns + [TARGET]

X_complete = df_train_complete.drop(columns=drop_columns_complete)
y_complete = df_train_complete[TARGET]
X_complete = pd.get_dummies(X_complete, drop_first=True)
rf_complete = RandomForestRegressor(n_estimators=400, random_state=42)
rf_complete.fit(X_complete, y_complete)

# R2 on the rows the model was fitted on (there is no held-out split at this point).
r2_score_complete = rf_complete.score(X_complete, y_complete)
print(f"R2 Score on Complete Training Data: {r2_score_complete}")

# Feature importances, largest first
importances_complete = pd.Series(rf_complete.feature_importances_, index=X_complete.columns).sort_values(ascending=False)
print(importances_complete)

# Prepare df_test_clean for prediction.
drop_columns_test = ['Datum', 'temperature_2m_mean', 'id_test', TARGET]

# Rename Warengruppe_test so the product-group column carries the same name as
# in the training features, and add the month column derived from Datum.
df_test_clean = df_test_clean.rename(columns={'Warengruppe_test': 'Warengruppe'})
df_test_clean['month'] = df_test_clean['Datum'].dt.month

# Placeholder TARGET column on the test frame. It is listed in drop_columns_test,
# so it never reaches the model.
df_test_clean[TARGET] = 0
X_test = df_test_clean.drop(columns=drop_columns_test)
X_test = pd.get_dummies(X_test, drop_first=True)

# reindex() below fills every training feature that is absent from the test
# frame with 0. That keeps predict() running but would hide a schema problem
# (e.g. a rename that did not match), so list such features first.
missing_in_test = X_complete.columns.difference(X_test.columns)
if len(missing_in_test) > 0:
    print(f"WARNING: test data lacks training features, filled with 0: {list(missing_in_test)}")

# Reindex X_test to match X_complete columns (same set, same order), filling missing with 0
X_test = X_test.reindex(columns=X_complete.columns, fill_value=0)

y_test_pred = rf_complete.predict(X_test)

# Sample predictions
print(y_test_pred[:5])

In [None]:
# Show how many predictions were produced.
print(y_test_pred.shape)

# Build the submission table: the id_test column of df_test_clean (exposed as
# "id") next to the predictions (as "umsatz"), then write it to CSV without
# the index.
submission_df = (
    df_test_clean[['id_test']]
    .rename(columns={'id_test': 'id'})
    .assign(umsatz=y_test_pred)
)

submission_df.to_csv("../data/processed/predicted_rf_submission.csv", index=False)
