# Baseline Model

# Import of relevant libraries

In [2]:
# Import the required libraries
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load datasets

In [3]:
# Load the training, validation and test datasets.
#
# FIXME(review): all three URLs below point to the same file
# (train_dataf_knn.csv). As long as that is the case, the "validation" and
# "test" frames are copies of the training frame, and any metric computed on
# them measures fit on the training data. Point url_validation and url_test
# at the real validation/test files once their names are confirmed.
url_train = "https://raw.githubusercontent.com/FabsenMc/bakery_prediction/main/0_DataPreparation/train_dataf_knn.csv"
url_validation = "https://raw.githubusercontent.com/FabsenMc/bakery_prediction/main/0_DataPreparation/train_dataf_knn.csv"
url_test = "https://raw.githubusercontent.com/FabsenMc/bakery_prediction/main/0_DataPreparation/train_dataf_knn.csv"


def _load_and_report(url):
    """Read the CSV at ``url``, print a short report, and return the frame.

    The report consists of the frame's shape, the number of missing values
    per column, and the frame itself.
    """
    frame = pd.read_csv(url)
    print(frame.shape)
    # Number of missing values per column (NaN in numeric arrays, None or NaN
    # in object arrays, NaT in datetime-like arrays).
    print(frame.isnull().sum())
    print(frame)
    return frame


# One shared helper for all three datasets: the previous copy-pasted version
# printed the validation frame's report in place of the test frame's.
train_data = _load_and_report(url_train)
validation_data = _load_and_report(url_validation)
test_data = _load_and_report(url_test)


(7523, 20)
Datum                    0
KielerWoche              0
Warengruppe              0
Umsatz                   0
Bewoelkung               0
Temperatur               0
Windgeschwindigkeit      0
Wettercode               0
FerienSH                 0
Feiertag                 0
Umschlag                 0
Weihnachtsmarkt          0
Verbraucherpreisindex    0
Regen                    0
Schnee                   0
Wochentag_MDMDFSS        0
Wochenende               0
Jahreszeit_FSHW          0
Temperatur_Kategorie     0
THW_heimspiel            0
dtype: int64
           Datum  KielerWoche  Warengruppe      Umsatz  Bewoelkung  \
0     2013-07-01          0.0          1.0  148.828353         6.0   
1     2013-07-01          0.0          2.0  535.856285         6.0   
2     2013-07-01          0.0          3.0  201.198426         6.0   
3     2013-07-01          0.0          4.0   65.890169         6.0   
4     2013-07-01          0.0          5.0  317.475875         6.0   
...          ...

# Drop the "Datum" column so that model selection can run

In [4]:
# Remove the 'Datum' column from all datasets.
# errors='ignore' keeps this cell idempotent: the frames are reassigned to the
# same names, so on a second run of the cell the column is already gone and a
# plain drop() would raise KeyError.
train_data = train_data.drop(columns=['Datum'], errors='ignore')
validation_data = validation_data.drop(columns=['Datum'], errors='ignore')
test_data = test_data.drop(columns=['Datum'], errors='ignore')

# Stepwise feature selection based on the training data
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """Select features by forward-backward stepwise OLS regression.

    Each pass first tries to add the excluded column with the smallest
    p-value (if it is below ``threshold_in``) and then tries to drop the
    included column with the largest p-value (if it is above
    ``threshold_out``). The loop stops after a pass that changes nothing.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target values passed to ``sm.OLS`` as the dependent variable.
    initial_list : iterable of column names, optional
        Features to start with. ``None`` (the default) starts with no
        features. The argument is copied, never modified.
    threshold_in : float
        A feature is added when its p-value is below this value.
    threshold_out : float
        A feature is dropped when its p-value is above this value.
    verbose : bool
        When true, print every add/drop decision.

    Returns
    -------
    list
        Names of the selected columns, in the order they were added.
    """
    included = list(initial_list) if initial_list is not None else []
    while True:
        changed = False

        # Forward step. Candidates keep the column order of X so that ties
        # between equal p-values are resolved the same way on every run
        # (iterating over a set gives no such guarantee).
        excluded = [column for column in X.columns if column not in included]
        # Explicit float dtype: without it the Series is created as object
        # dtype (or with a deprecation warning, depending on pandas version).
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when nothing is left to try -> no add
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # Backward step. Nothing can be dropped while no feature is included,
        # so the fit is skipped in that case.
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # Look the p-values up by feature name instead of slicing off the
            # first entry: add_constant does not prepend 'const' when the data
            # already contains a constant column, and a positional slice would
            # then discard a real feature.
            pvalues = model.pvalues[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

# Target variable and features for the training data
X_train = train_data.drop(columns=['Umsatz'])
y_train = train_data['Umsatz']

# Run the stepwise feature selection
selected_features = stepwise_selection(X_train, y_train)

# Fit the model on the selected features.
# has_constant='add' always prepends the intercept column. With the default
# ('skip') add_constant silently omits it whenever the data already contains a
# non-zero constant column, which would make the training and prediction
# design matrices disagree.
model = sm.OLS(y_train, sm.add_constant(X_train[selected_features], has_constant='add')).fit()

# Target variable and features for the validation data
X_val = validation_data.drop(columns=['Umsatz'])
y_val = validation_data['Umsatz']

# Predictions on the validation data (same design-matrix layout as in training)
y_val_pred = model.predict(sm.add_constant(X_val[selected_features], has_constant='add'))

# Compute the validation metrics
r2_val = r2_score(y_val, y_val_pred)
mse_val = mean_squared_error(y_val, y_val_pred)
mae_val = mean_absolute_error(y_val, y_val_pred)

print(f'Validierungs-R²: {r2_val:.2f}')
print(f'Validierungs-MSE: {mse_val:.2f}')
print(f'Validierungs-MAE: {mae_val:.2f}')

Add  Temperatur                     with p-value 2.47916e-80
Add  Wochenende                     with p-value 3.03835e-45
Add  FerienSH                       with p-value 1.61338e-33
Add  Feiertag                       with p-value 2.75595e-13
Add  Verbraucherpreisindex          with p-value 4.5982e-09
Add  Warengruppe                    with p-value 5.35926e-06
Add  Weihnachtsmarkt                with p-value 6.03155e-05
Add  KielerWoche                    with p-value 0.00179142
Validierungs-R²: 0.11
Validierungs-MSE: 19471.38
Validierungs-MAE: 110.41


In [5]:
# Target variable and features for the test data
X_test = test_data.drop(columns=['Umsatz'])
y_test = test_data['Umsatz']

# Predictions on the test data.
# has_constant='add' forces the intercept column to be present. With the
# default ('skip') add_constant leaves it out whenever the selected test
# columns happen to contain a non-zero constant column, and predict() would
# then receive a design matrix that does not match the fitted model.
y_test_pred = model.predict(sm.add_constant(X_test[selected_features], has_constant='add'))

# Compute the test metrics
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print(f'Test-R²: {r2_test:.2f}')
print(f'Test-MSE: {mse_test:.2f}')
print(f'Test-MAE: {mae_test:.2f}')

Test-R²: 0.11
Test-MSE: 19471.38
Test-MAE: 110.41


# Feature selection and Model selection

In [6]:
# Stepwise feature selection based on the training data
def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.01,
                       threshold_out=0.05,
                       verbose=True):
    """Select features by forward-backward stepwise OLS regression.

    Each pass first tries to add the excluded column with the smallest
    p-value (if it is below ``threshold_in``) and then tries to drop the
    included column with the largest p-value (if it is above
    ``threshold_out``). The loop stops after a pass that changes nothing.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Target values passed to ``sm.OLS`` as the dependent variable.
    initial_list : iterable of column names, optional
        Features to start with. ``None`` (the default) starts with no
        features. The argument is copied, never modified.
    threshold_in : float
        A feature is added when its p-value is below this value.
    threshold_out : float
        A feature is dropped when its p-value is above this value.
    verbose : bool
        When true, print every add/drop decision.

    Returns
    -------
    list
        Names of the selected columns, in the order they were added.
    """
    included = list(initial_list) if initial_list is not None else []
    while True:
        changed = False

        # Forward step. Candidates keep the column order of X so that ties
        # between equal p-values are resolved the same way on every run
        # (iterating over a set gives no such guarantee).
        excluded = [column for column in X.columns if column not in included]
        # Explicit float dtype: without it the Series is created as object
        # dtype (or with a deprecation warning, depending on pandas version).
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()  # NaN when nothing is left to try -> no add
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # Backward step. Nothing can be dropped while no feature is included,
        # so the fit is skipped in that case.
        if included:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
            # Look the p-values up by feature name instead of slicing off the
            # first entry: add_constant does not prepend 'const' when the data
            # already contains a constant column, and a positional slice would
            # then discard a real feature.
            pvalues = model.pvalues[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

# Split the training frame into the target column and the candidate features
y_train = train_data['Umsatz']
X_train = train_data.drop(columns=['Umsatz'])

# Pick the features with the stepwise procedure ...
selected_features = stepwise_selection(X_train, y_train)

# ... and fit an OLS model (with intercept) on exactly those features
model = sm.OLS(
    endog=y_train,
    exog=sm.add_constant(X_train[selected_features]),
).fit()


Add  Temperatur                     with p-value 2.47916e-80
Add  Wochenende                     with p-value 3.03835e-45
Add  FerienSH                       with p-value 1.61338e-33
Add  Feiertag                       with p-value 2.75595e-13
Add  Verbraucherpreisindex          with p-value 4.5982e-09
Add  Warengruppe                    with p-value 5.35926e-06
Add  Weihnachtsmarkt                with p-value 6.03155e-05
Add  KielerWoche                    with p-value 0.00179142


In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import statsmodels.api as sm

# Prepare the data
X_train = train_data.drop(columns=['Umsatz'])
y_train = train_data['Umsatz']
X_val = validation_data.drop(columns=['Umsatz'])
y_val = validation_data['Umsatz']
X_test = test_data.drop(columns=['Umsatz'])
y_test = test_data['Umsatz']

# Random forest model.
# A fixed random_state makes the forest's random sampling repeatable, so the
# chosen hyperparameters and the reported metrics can be reproduced on a
# fresh kernel.
RANDOM_STATE = 42
rf = RandomForestRegressor(random_state=RANDOM_STATE)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],  # 'auto' removed and replaced by 'sqrt'
    'max_depth': [10, 20, 30, None]
}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model found by the grid search
best_rf = grid_search.best_estimator_

# Predictions and evaluation on the validation data
y_val_pred = best_rf.predict(X_val)
r2_val = r2_score(y_val, y_val_pred)
mse_val = mean_squared_error(y_val, y_val_pred)
mae_val = mean_absolute_error(y_val, y_val_pred)

print(f'Validation-R²: {r2_val:.2f}')
print(f'Validation-MSE: {mse_val:.2f}')
print(f'Validation-MAE: {mae_val:.2f}')

# Predictions and evaluation on the test data
y_test_pred = best_rf.predict(X_test)
r2_test = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print(f'Test-R²: {r2_test:.2f}')
print(f'Test-MSE: {mse_test:.2f}')
print(f'Test-MAE: {mae_test:.2f}')

# Print the equation of a linear model fitted with statsmodels on all features
X_train_with_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_with_const).fit()
coefficients = model.params
print("Lineare Modellgleichung:")
# Take each name together with its coefficient straight from the fitted
# parameters, so a label can never be paired with the wrong value.
equation = "Umsatz = " + " + ".join(f"{coef:.2f}*{name}" for name, coef in coefficients.items())
print(equation)


Validation-R²: 0.97
Validation-MSE: 689.34
Validation-MAE: 16.44
Test-R²: 0.97
Test-MSE: 689.34
Test-MAE: 16.44
Lineare Modellgleichung:
Umsatz = 1227.66*const + 32.02*KielerWoche + -4.57*Warengruppe + 0.34*Bewoelkung + 3.40*Temperatur + 0.07*Windgeschwindigkeit + 0.09*Wettercode + 44.90*FerienSH + 85.44*Feiertag + -10.10*Umschlag + -21.15*Weihnachtsmarkt + -11.44*Verbraucherpreisindex + -5.91*Regen + 4.69*Schnee + 1.35*Wochentag_MDMDFSS + 47.21*Wochenende + -1.15*Jahreszeit_FSHW + 5.35*Temperatur_Kategorie + -7.21*THW_heimspiel


In [13]:
# Example of a prediction function
def predict(features, coefficients):
    """Evaluate a linear model for a single observation.

    Parameters
    ----------
    features : pandas.Series
        Feature values of one observation, indexed by feature name.
    coefficients : mapping or pandas.Series
        Model coefficients keyed by feature name. Must contain the
        intercept under the key ``'const'``.

    Returns
    -------
    The intercept plus the sum of ``coefficient * value`` over every feature
    that has a coefficient. Features without a coefficient are ignored.

    Raises
    ------
    KeyError
        If ``coefficients`` has no ``'const'`` entry.
    """
    const = coefficients['const']
    # Skip a 'const' entry in the features: the intercept is already added
    # once above, and a row taken from an add_constant() frame would
    # otherwise contribute it a second time.
    return const + sum(
        coefficients[name] * features[name]
        for name in features.index
        if name != 'const' and name in coefficients
    )

# Example prediction for one new data point: the first row of the test features
new_data_point = X_test.iloc[0]
predicted_value = predict(features=new_data_point, coefficients=coefficients)
print(f"Vorhergesagter Umsatz: {predicted_value:.2f}")


Vorhergesagter Umsatz: 271.05
