# Capstone Projekt Rossmann

# Feature Engineering

## Feature Engineering

In [457]:
import pandas as pd
import numpy as np
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from pandas.api.types import infer_dtype



pd.set_option('display.max_columns', None)

In [458]:
df = pd.read_csv('weekly_sales_with_store_info.csv')
df['Date'] = pd.to_datetime(df['Date'])

In [459]:
print(df.info())
df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150525 entries, 0 to 150524
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Store                      150525 non-null  int64         
 1   Date                       150525 non-null  datetime64[ns]
 2   CW                         150525 non-null  int64         
 3   Month                      150525 non-null  int64         
 4   Year                       150525 non-null  int64         
 5   DayOfWeek                  150525 non-null  int64         
 6   Sales                      150525 non-null  int64         
 7   SalesPerCustomer           145809 non-null  float64       
 8   SalesPerOpenDay            145815 non-null  float64       
 9   Customers                  150525 non-null  int64         
 10  CustomersPerOpenDay        145815 non-null  float64       
 11  Open                       150525 non-null  int64   

Unnamed: 0,Store,Date,CW,Month,Year,DayOfWeek,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,IsCompetition,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Promo2Member,Promo2Active
67420,500,2014-01-26,4,1,2014,6,33262,14.046453,5543.666667,2368,394.666667,6,5,1,0,0,0,0,0,d,c,10690.0,9.0,2007.0,1,1,5.0,2013.0,"Jan,Apr,Jul,Oct",1,1
109384,811,2013-09-01,35,9,2013,6,27569,6.314475,4594.833333,4366,727.666667,6,5,1,0,0,0,0,0,a,a,410.0,9.0,2012.0,1,0,,,,0,0
110097,816,2014-05-25,21,5,2014,6,28720,6.725995,4786.666667,4270,711.666667,6,5,1,0,0,0,0,0,c,c,460.0,,,0,0,,,,0,0
68833,510,2015-04-12,15,4,2015,6,36703,9.33681,7340.6,3931,786.2,5,0,0,b,1,5,1,1,a,c,8260.0,,,0,0,,,,0,0
88727,658,2013-08-18,33,8,2013,6,42307,10.845168,7051.166667,3901,650.166667,6,5,1,0,0,5,1,0,d,c,520.0,,,0,1,37.0,2009.0,"Jan,Apr,Jul,Oct",1,0


### Handle Missing Values

In [460]:
df.isna().sum()

Store                            0
Date                             0
CW                               0
Month                            0
Year                             0
DayOfWeek                        0
Sales                            0
SalesPerCustomer              4716
SalesPerOpenDay               4710
Customers                        0
CustomersPerOpenDay           4710
Open                             0
Promo                            0
IsPromo                          0
StateHoliday                     0
IsStateHoliday                   0
SchoolHoliday                    0
IsSchoolHoliday                  0
NumStateHoliday                  0
StoreType                        0
Assortment                       0
CompetitionDistance            405
CompetitionOpenSinceMonth    47790
CompetitionOpenSinceYear     47790
IsCompetition                    0
Promo2                           0
Promo2SinceWeek              73440
Promo2SinceYear              73440
PromoInterval       

#### SalesPerCustomer, SalesPerOpenday, CustomersPerOpenday

In [461]:
# As the store were closed, we can fill the nans with 0

# fill nans with 0 for listed columns
columns_to_fill = ['SalesPerCustomer', 'SalesPerOpenDay', 'CustomersPerOpenDay']
df_nans_handeled = df.fillna({col: 0 for col in columns_to_fill})
df_nans_handeled


Unnamed: 0,Store,Date,CW,Month,Year,DayOfWeek,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,IsCompetition,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Promo2Member,Promo2Active
0,1,2013-01-06,1,1,2013,6,19340,7.736000,4835.000000,2500,625.000000,4,0,0,a,1,6,1,1,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
1,1,2013-01-13,2,1,2013,6,32952,8.410413,5492.000000,3918,653.000000,6,5,1,0,0,5,1,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
2,1,2013-01-20,3,1,2013,6,25978,7.602575,4329.666667,3417,569.500000,6,0,0,0,0,0,0,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
3,1,2013-01-27,4,1,2013,6,33071,8.563180,5511.833333,3862,643.666667,6,5,1,0,0,0,0,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
4,1,2013-02-03,5,2,2013,6,28693,8.057568,4782.166667,3561,593.500000,6,0,0,0,0,0,0,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150520,1115,2015-07-05,27,7,2015,6,48130,16.140174,8021.666667,2982,497.000000,6,5,1,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0
150521,1115,2015-07-12,28,7,2015,6,36233,14.315685,6038.833333,2531,421.833333,6,0,0,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0
150522,1115,2015-07-19,29,7,2015,6,45927,15.023553,7654.500000,3057,509.500000,6,5,1,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0
150523,1115,2015-07-26,30,7,2015,6,35362,14.122204,5893.666667,2504,417.333333,6,0,0,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0


In [462]:
df_nans_handeled.isna().sum()

Store                            0
Date                             0
CW                               0
Month                            0
Year                             0
DayOfWeek                        0
Sales                            0
SalesPerCustomer                 0
SalesPerOpenDay                  0
Customers                        0
CustomersPerOpenDay              0
Open                             0
Promo                            0
IsPromo                          0
StateHoliday                     0
IsStateHoliday                   0
SchoolHoliday                    0
IsSchoolHoliday                  0
NumStateHoliday                  0
StoreType                        0
Assortment                       0
CompetitionDistance            405
CompetitionOpenSinceMonth    47790
CompetitionOpenSinceYear     47790
IsCompetition                    0
Promo2                           0
Promo2SinceWeek              73440
Promo2SinceYear              73440
PromoInterval       

#### CompetitionDistance

In [463]:
# Stores with no CompetitionDistance information
print("Stores with no CompetitionDistance information:", df_nans_handeled[(df_nans_handeled['CompetitionDistance'].isna())]['Store'].unique())

print("StoreType of store 291", df_nans_handeled[(df_nans_handeled['Store'] == 291)]['StoreType'].unique())
print("StoreType of store 622", df_nans_handeled[(df_nans_handeled['Store'] == 622)]['StoreType'].unique())
print("StoreType of store 879", df_nans_handeled[(df_nans_handeled['Store'] == 879)]['StoreType'].unique())

Stores with no CompetitionDistance information: [291 622 879]
StoreType of store 291 ['d']
StoreType of store 622 ['a']
StoreType of store 879 ['d']


In [464]:
# As store 291, 622 and 879 have no CompetitionDistance information, we can fill them with the median value of the column

# median competition distance for store type a
median_competition_distance_a = df_nans_handeled[(df_nans_handeled['StoreType'] == 'a')]['CompetitionDistance'].median()
# median competition distance for store type d
median_competition_distance_d = df_nans_handeled[(df_nans_handeled['StoreType'] == 'd')]['CompetitionDistance'].median()

# fill nans for storetype a with median_competition_distance_a
df_nans_handeled.loc[(df_nans_handeled['Store'] == 291), 'CompetitionDistance'] = median_competition_distance_a
# fill nans for storetype d with median_competition_distance_d
df_nans_handeled.loc[(df_nans_handeled['Store'] == 622), 'CompetitionDistance'] = median_competition_distance_d
# fill nans for storetype d with median_competition_distance_d
df_nans_handeled.loc[(df_nans_handeled['Store'] == 879), 'CompetitionDistance'] = median_competition_distance_d



In [465]:
df_nans_handeled.isna().sum()

Store                            0
Date                             0
CW                               0
Month                            0
Year                             0
DayOfWeek                        0
Sales                            0
SalesPerCustomer                 0
SalesPerOpenDay                  0
Customers                        0
CustomersPerOpenDay              0
Open                             0
Promo                            0
IsPromo                          0
StateHoliday                     0
IsStateHoliday                   0
SchoolHoliday                    0
IsSchoolHoliday                  0
NumStateHoliday                  0
StoreType                        0
Assortment                       0
CompetitionDistance              0
CompetitionOpenSinceMonth    47790
CompetitionOpenSinceYear     47790
IsCompetition                    0
Promo2                           0
Promo2SinceWeek              73440
Promo2SinceYear              73440
PromoInterval       

#### CompetitionOpenSinceMonth, CompetitionOpenSinceYear

In [466]:
# CompetitionOpenSinceMonth and CompetitionOpenSinceYear can be deleted as they are reflected in IsCompetition
df_nans_handeled = df_nans_handeled.drop(columns=['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'])

In [467]:
df_nans_handeled.isna().sum()

Store                      0
Date                       0
CW                         0
Month                      0
Year                       0
DayOfWeek                  0
Sales                      0
SalesPerCustomer           0
SalesPerOpenDay            0
Customers                  0
CustomersPerOpenDay        0
Open                       0
Promo                      0
IsPromo                    0
StateHoliday               0
IsStateHoliday             0
SchoolHoliday              0
IsSchoolHoliday            0
NumStateHoliday            0
StoreType                  0
Assortment                 0
CompetitionDistance        0
IsCompetition              0
Promo2                     0
Promo2SinceWeek        73440
Promo2SinceYear        73440
PromoInterval          73440
Promo2Member               0
Promo2Active               0
dtype: int64

#### Promo2SinceWeek, Promo2SinceYear

In [468]:
# Promo2SinceWeek and Promo2SinceYear can be deleted as they are reflected in Promo2Member
df_nans_handeled = df_nans_handeled.drop(columns=['Promo2SinceWeek', 'Promo2SinceYear'])

In [469]:
df_nans_handeled.isna().sum()

Store                      0
Date                       0
CW                         0
Month                      0
Year                       0
DayOfWeek                  0
Sales                      0
SalesPerCustomer           0
SalesPerOpenDay            0
Customers                  0
CustomersPerOpenDay        0
Open                       0
Promo                      0
IsPromo                    0
StateHoliday               0
IsStateHoliday             0
SchoolHoliday              0
IsSchoolHoliday            0
NumStateHoliday            0
StoreType                  0
Assortment                 0
CompetitionDistance        0
IsCompetition              0
Promo2                     0
PromoInterval          73440
Promo2Member               0
Promo2Active               0
dtype: int64

#### PromoInterval

In [470]:
df_nans_handeled[(df_nans_handeled['Promo2'] == 1) & (df_nans_handeled['PromoInterval'].isna())]

Unnamed: 0,Store,Date,CW,Month,Year,DayOfWeek,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,IsCompetition,Promo2,PromoInterval,Promo2Member,Promo2Active


In [471]:
# As if the store is not participating in Promo2, PromoInterval is 0, we can fill the nans with 0
df_nans_handeled['PromoInterval'] = df_nans_handeled['PromoInterval'].fillna(0)

In [472]:
df_nans_handeled.isna().sum()

Store                  0
Date                   0
CW                     0
Month                  0
Year                   0
DayOfWeek              0
Sales                  0
SalesPerCustomer       0
SalesPerOpenDay        0
Customers              0
CustomersPerOpenDay    0
Open                   0
Promo                  0
IsPromo                0
StateHoliday           0
IsStateHoliday         0
SchoolHoliday          0
IsSchoolHoliday        0
NumStateHoliday        0
StoreType              0
Assortment             0
CompetitionDistance    0
IsCompetition          0
Promo2                 0
PromoInterval          0
Promo2Member           0
Promo2Active           0
dtype: int64

### Remove not needed Features

In [473]:
# Store is just an ID
#df_nans_handeled = df_nans_handeled.drop(columns=['Store'])

# Date is an object and is reflected by CW, Month and Year
df_nans_handeled = df_nans_handeled.drop(columns=['Date'])

# DayOfWeek is not relevant in weekly data
df_nans_handeled = df_nans_handeled.drop(columns=['DayOfWeek'])

df_nans_handeled

Unnamed: 0,Store,CW,Month,Year,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,IsCompetition,Promo2,PromoInterval,Promo2Member,Promo2Active
0,1,1,1,2013,19340,7.736000,4835.000000,2500,625.000000,4,0,0,a,1,6,1,1,c,a,1270.0,1,0,0,0,0
1,1,2,1,2013,32952,8.410413,5492.000000,3918,653.000000,6,5,1,0,0,5,1,0,c,a,1270.0,1,0,0,0,0
2,1,3,1,2013,25978,7.602575,4329.666667,3417,569.500000,6,0,0,0,0,0,0,0,c,a,1270.0,1,0,0,0,0
3,1,4,1,2013,33071,8.563180,5511.833333,3862,643.666667,6,5,1,0,0,0,0,0,c,a,1270.0,1,0,0,0,0
4,1,5,2,2013,28693,8.057568,4782.166667,3561,593.500000,6,0,0,0,0,0,0,0,c,a,1270.0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150520,1115,27,7,2015,48130,16.140174,8021.666667,2982,497.000000,6,5,1,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0
150521,1115,28,7,2015,36233,14.315685,6038.833333,2531,421.833333,6,0,0,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0
150522,1115,29,7,2015,45927,15.023553,7654.500000,3057,509.500000,6,5,1,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0
150523,1115,30,7,2015,35362,14.122204,5893.666667,2504,417.333333,6,0,0,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0


In [474]:
#df_cleaned

### Categorical Feature Encoding

In [475]:
df_nans_handeled.select_dtypes(include='object').columns

Index(['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval'], dtype='object')

In [476]:
# check if a column contains mixed data typesrom pandas.api.types import infer_dtype

for col in ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']:
    dtype = infer_dtype(df_nans_handeled[col])
    print(f"Data type of {col}: {dtype}")


Data type of StateHoliday: string
Data type of StoreType: string
Data type of Assortment: string
Data type of PromoInterval: mixed-integer


In [477]:
# Convert mixed columns
cols_to_convert = ['PromoInterval']
df_nans_handeled[cols_to_convert] = df_nans_handeled[cols_to_convert].astype(str)

In [478]:
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore': um Fehler zu vermeiden, wenn die Trainingsdaten Klassen/Kategorien enthalten, die nicht in den Trainingsdaten vertreten sind
# sparse=False: stellt sicher, dass die kodierten Spalten als NumPy-Array zurückgegeben werden (anstelle einer Sparse-Matrix).
OneHotEnc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Es dürfen nur die Kategorialen Spalten üergeben werden, nicht das ganze df
encoded_array = OneHotEnc.fit_transform(df_nans_handeled[['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']])
tmp_cat = pd.DataFrame(encoded_array, columns=OneHotEnc.get_feature_names_out(), index=df_nans_handeled.index)
df_nans_handeled_cat = pd.concat([df_nans_handeled.select_dtypes(include=['number']), tmp_cat], axis=1)
df_nans_handeled_cat

Unnamed: 0,Store,CW,Month,Year,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,CompetitionDistance,IsCompetition,Promo2,Promo2Member,Promo2Active,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec"
0,1,1,1,2013,19340,7.736000,4835.000000,2500,625.000000,4,0,0,1,6,1,1,1270.0,1,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,2,1,2013,32952,8.410413,5492.000000,3918,653.000000,6,5,1,0,5,1,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,3,1,2013,25978,7.602575,4329.666667,3417,569.500000,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,4,1,2013,33071,8.563180,5511.833333,3862,643.666667,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,5,2,2013,28693,8.057568,4782.166667,3561,593.500000,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150520,1115,27,7,2015,48130,16.140174,8021.666667,2982,497.000000,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
150521,1115,28,7,2015,36233,14.315685,6038.833333,2531,421.833333,6,0,0,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
150522,1115,29,7,2015,45927,15.023553,7654.500000,3057,509.500000,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
150523,1115,30,7,2015,35362,14.122204,5893.666667,2504,417.333333,6,0,0,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [479]:
# Lag-Features für diese Spalten erstellen
lag_columns = ['Sales', 'SalesPerCustomer', 'SalesPerOpenDay', 'Customers', 'CustomersPerOpenDay']
n_lags = 8  # Anzahl der zu erstellenden Lag-Features
n_periods_for_ma = 4  # Anzahl der Perioden für den gleitenden Durchschnitt

# Für jede Spalte und jeden Lag
for col in lag_columns:
    for lag in range(1, n_lags + 1):
        lag_col_name = f'{col}_Lag_{lag}'
        df_nans_handeled_cat[lag_col_name] = df_nans_handeled_cat.groupby('Store')[col].shift(lag)

        # Berechnen des gleitenden Durchschnitts für das Lag-Feature
        ma_col_name = f'{lag_col_name}_MA_{n_periods_for_ma}'
        df_nans_handeled_cat[ma_col_name] = df_nans_handeled_cat.groupby('Store')[lag_col_name].rolling(window=n_periods_for_ma).mean().reset_index(level=0, drop=True)

# Zielvariable um 8 Wochen in die Zukunft verschieben und hinter der sales spalte einfügen
df_nans_handeled_cat.insert(df_nans_handeled_cat.columns.get_loc('Sales') + 1, 'Future_Sales', df_nans_handeled_cat.groupby('Store')['Sales'].shift(-8))

#df_nans_handeled_cat['Future_Sales'] = df_nans_handeled_cat.groupby('Store')['Sales'].shift(-8)

# Entfernen von Zeilen mit NaN-Werten, die durch das Verschieben entstanden sind
df_nans_handeled_cat = df_nans_handeled_cat.dropna()

# remove of source columns
df_nans_handeled_cat = df_nans_handeled_cat.drop(columns=lag_columns)

df_nans_handeled_cat


Unnamed: 0,Store,CW,Month,Year,Future_Sales,Open,Promo,IsPromo,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,CompetitionDistance,IsCompetition,Promo2,Promo2Member,Promo2Active,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",Sales_Lag_1,Sales_Lag_1_MA_4,Sales_Lag_2,Sales_Lag_2_MA_4,Sales_Lag_3,Sales_Lag_3_MA_4,Sales_Lag_4,Sales_Lag_4_MA_4,Sales_Lag_5,Sales_Lag_5_MA_4,Sales_Lag_6,Sales_Lag_6_MA_4,Sales_Lag_7,Sales_Lag_7_MA_4,Sales_Lag_8,Sales_Lag_8_MA_4,SalesPerCustomer_Lag_1,SalesPerCustomer_Lag_1_MA_4,SalesPerCustomer_Lag_2,SalesPerCustomer_Lag_2_MA_4,SalesPerCustomer_Lag_3,SalesPerCustomer_Lag_3_MA_4,SalesPerCustomer_Lag_4,SalesPerCustomer_Lag_4_MA_4,SalesPerCustomer_Lag_5,SalesPerCustomer_Lag_5_MA_4,SalesPerCustomer_Lag_6,SalesPerCustomer_Lag_6_MA_4,SalesPerCustomer_Lag_7,SalesPerCustomer_Lag_7_MA_4,SalesPerCustomer_Lag_8,SalesPerCustomer_Lag_8_MA_4,SalesPerOpenDay_Lag_1,SalesPerOpenDay_Lag_1_MA_4,SalesPerOpenDay_Lag_2,SalesPerOpenDay_Lag_2_MA_4,SalesPerOpenDay_Lag_3,SalesPerOpenDay_Lag_3_MA_4,SalesPerOpenDay_Lag_4,SalesPerOpenDay_Lag_4_MA_4,SalesPerOpenDay_Lag_5,SalesPerOpenDay_Lag_5_MA_4,SalesPerOpenDay_Lag_6,SalesPerOpenDay_Lag_6_MA_4,SalesPerOpenDay_Lag_7,SalesPerOpenDay_Lag_7_MA_4,SalesPerOpenDay_Lag_8,SalesPerOpenDay_Lag_8_MA_4,Customers_Lag_1,Customers_Lag_1_MA_4,Customers_Lag_2,Customers_Lag_2_MA_4,Customers_Lag_3,Customers_Lag_3_MA_4,Customers_Lag_4,Customers_Lag_4_MA_4,Customers_Lag_5,Customers_Lag_5_MA_4,Customers_Lag_6,Customers_Lag_6_MA_4,Customers_Lag_7,Customers_Lag_7_MA_4,Customers_Lag_8,Customers_Lag_8_MA_4,CustomersPerOpenDay_Lag_1,CustomersPerOpenDay_Lag_1_MA_4,CustomersPerOpenDay_Lag_2,CustomersPerOpenDay_Lag_2_MA_4,CustomersPerOpenDay_Lag_3,CustomersPerOpenDay_Lag_3_MA_4,CustomersPerOpenDay_Lag_4,CustomersPerOpenDay_Lag_4_MA_4,CustomersPerOpenDay_Lag_5,CustomersPerOpenDay_Lag_5_MA_4,CustomersPerOpenDay_Lag_6,CustomersPerOpenDay_Lag_6_MA_4,CustomersPerOpenDay_Lag_7,CustomersPerOpenDay_Lag_7_MA_4,CustomersPerOpenDay_Lag_8,CustomersPerOpenDay_Lag_8_MA_4
11,1,12,3,2013,29696.0,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,27880.0,31353.75,35771.0,30878.25,28693.0,30173.50,33071.0,27835.25,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,7.665659,8.255957,8.737421,8.240186,8.057568,8.158434,8.563180,8.078042,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,4646.666667,5225.625000,5961.833333,5146.375000,4782.166667,5028.916667,5511.833333,5042.125000,3748.0,3868.00,4363.0,3840.25,3481.0,3773.00,3880.0,3793.00,3637.0,3788.50,4094.0,3733.50,3561.0,3689.50,3862.0,3424.25,624.666667,644.666667,727.166667,640.041667,580.166667,628.833333,646.666667,632.166667,606.166667,631.416667,682.333333,622.250000,593.500000,614.916667,643.666667,622.791667
12,1,13,3,2013,21018.0,5,5,1,1,5,1,1,1270.0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,35521.0,31935.75,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,27880.0,31353.75,35771.0,30878.25,28693.0,30173.50,8.411319,8.044487,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,7.665659,8.255957,8.737421,8.240186,8.057568,8.158434,5920.166667,5322.625000,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,4646.666667,5225.625000,5961.833333,5146.375000,4782.166667,5028.916667,4223.0,3953.75,3748.0,3868.00,4363.0,3840.25,3481.0,3773.00,3880.0,3793.00,3637.0,3788.50,4094.0,3733.50,3561.0,3689.50,703.833333,658.958333,624.666667,644.666667,727.166667,640.041667,580.166667,628.833333,646.666667,632.166667,606.166667,631.416667,682.333333,622.250000,593.500000,614.916667
13,1,14,4,2013,29884.0,5,0,0,1,5,1,1,1270.0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,34492.0,33802.00,35521.0,31935.75,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,27880.0,31353.75,35771.0,30878.25,8.670689,8.271122,8.411319,8.044487,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,7.665659,8.255957,8.737421,8.240186,6898.400000,5921.100000,5920.166667,5322.625000,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,4646.666667,5225.625000,5961.833333,5146.375000,3978.0,4078.00,4223.0,3953.75,3748.0,3868.00,4363.0,3840.25,3481.0,3773.00,3880.0,3793.00,3637.0,3788.50,4094.0,3733.50,795.600000,712.816667,703.833333,658.958333,624.666667,644.666667,727.166667,640.041667,580.166667,628.833333,646.666667,632.166667,606.166667,631.416667,682.333333,622.250000
14,1,15,4,2013,29112.0,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,23867.0,30514.75,34492.0,33802.00,35521.0,31935.75,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,27880.0,31353.75,7.910839,8.127814,8.670689,8.271122,8.411319,8.044487,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,7.665659,8.255957,4773.400000,5572.116667,6898.400000,5921.100000,5920.166667,5322.625000,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,4646.666667,5225.625000,3017.0,3741.50,3978.0,4078.00,4223.0,3953.75,3748.0,3868.00,4363.0,3840.25,3481.0,3773.00,3880.0,3793.00,3637.0,3788.50,603.400000,681.875000,795.600000,712.816667,703.833333,658.958333,624.666667,644.666667,727.166667,640.041667,580.166667,628.833333,646.666667,632.166667,606.166667,631.416667
15,1,16,4,2013,24215.0,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,30865.0,31186.25,23867.0,30514.75,34492.0,33802.00,35521.0,31935.75,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,8.509788,8.375659,7.910839,8.127814,8.670689,8.271122,8.411319,8.044487,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,5144.166667,5684.033333,4773.400000,5572.116667,6898.400000,5921.100000,5920.166667,5322.625000,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,3627.0,3711.25,3017.0,3741.50,3978.0,4078.00,4223.0,3953.75,3748.0,3868.00,4363.0,3840.25,3481.0,3773.00,3880.0,3793.00,604.500000,676.833333,603.400000,681.875000,795.600000,712.816667,703.833333,658.958333,624.666667,644.666667,727.166667,640.041667,580.166667,628.833333,646.666667,632.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150512,1115,19,5,2015,48130.0,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,44909.0,37810.25,32221.0,39762.50,46236.0,40150.25,27875.0,40626.00,52718.0,42956.25,33772.0,41586.75,48139.0,42621.75,37196.0,41559.25,15.779691,14.561868,13.521192,14.687204,15.473896,14.748814,13.472692,14.844355,16.281038,15.097288,13.767631,14.836706,15.856061,15.012356,14.484424,14.929575,8981.800000,6908.241667,5370.166667,7298.691667,7706.000000,7363.316667,5575.000000,7442.608333,10543.600000,7598.691667,5628.666667,6931.125000,8023.166667,7103.625000,6199.333333,6926.541667,2846.0,2571.50,2383.0,2669.50,2988.0,2687.00,2069.0,2699.00,3238.0,2823.75,2453.0,2789.25,3036.0,2831.00,2568.0,2778.75,569.200000,469.541667,397.166667,489.141667,498.000000,492.058333,413.800000,494.058333,647.600000,497.608333,408.833333,464.875000,506.000000,471.833333,428.000000,463.125000
150513,1115,20,5,2015,36233.0,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,47767.0,42783.25,44909.0,37810.25,32221.0,39762.50,46236.0,40150.25,27875.0,40626.00,52718.0,42956.25,33772.0,41586.75,48139.0,42621.75,15.188235,14.990753,15.779691,14.561868,13.521192,14.687204,15.473896,14.748814,13.472692,14.844355,16.281038,15.097288,13.767631,14.836706,15.856061,15.012356,7961.166667,7504.783333,8981.800000,6908.241667,5370.166667,7298.691667,7706.000000,7363.316667,5575.000000,7442.608333,10543.600000,7598.691667,5628.666667,6931.125000,8023.166667,7103.625000,3145.0,2840.50,2846.0,2571.50,2383.0,2669.50,2988.0,2687.00,2069.0,2699.00,3238.0,2823.75,2453.0,2789.25,3036.0,2831.00,524.166667,497.133333,569.200000,469.541667,397.166667,489.141667,498.000000,492.058333,413.800000,494.058333,647.600000,497.608333,408.833333,464.875000,506.000000,471.833333
150514,1115,21,5,2015,45927.0,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,33638.0,39633.75,47767.0,42783.25,44909.0,37810.25,32221.0,39762.50,46236.0,40150.25,27875.0,40626.00,52718.0,42956.25,33772.0,41586.75,14.098072,14.646797,15.188235,14.990753,15.779691,14.561868,13.521192,14.687204,15.473896,14.748814,13.472692,14.844355,16.281038,15.097288,13.767631,14.836706,6727.600000,7260.183333,7961.166667,7504.783333,8981.800000,6908.241667,5370.166667,7298.691667,7706.000000,7363.316667,5575.000000,7442.608333,10543.600000,7598.691667,5628.666667,6931.125000,2386.0,2690.00,3145.0,2840.50,2846.0,2571.50,2383.0,2669.50,2988.0,2687.00,2069.0,2699.00,3238.0,2823.75,2453.0,2789.25,477.200000,491.933333,524.166667,497.133333,569.200000,469.541667,397.166667,489.141667,498.000000,492.058333,413.800000,494.058333,647.600000,497.608333,408.833333,464.875000
150515,1115,22,5,2015,35362.0,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,46629.0,43235.75,33638.0,39633.75,47767.0,42783.25,44909.0,37810.25,32221.0,39762.50,46236.0,40150.25,27875.0,40626.00,52718.0,42956.25,15.085409,15.037852,14.098072,14.646797,15.188235,14.990753,15.779691,14.561868,13.521192,14.687204,15.473896,14.748814,13.472692,14.844355,16.281038,15.097288,7771.500000,7860.516667,6727.600000,7260.183333,7961.166667,7504.783333,8981.800000,6908.241667,5370.166667,7298.691667,7706.000000,7363.316667,5575.000000,7442.608333,10543.600000,7598.691667,3091.0,2867.00,2386.0,2690.00,3145.0,2840.50,2846.0,2571.50,2383.0,2669.50,2988.0,2687.00,2069.0,2699.00,3238.0,2823.75,515.166667,521.433333,477.200000,491.933333,524.166667,497.133333,569.200000,469.541667,397.166667,489.141667,498.000000,492.058333,413.800000,494.058333,647.600000,497.608333


### New additional Features

In [480]:
print(aaa)

NameError: name 'aaa' is not defined

## Skewness

In [None]:
#pd.set_option('display.max_rows', None)
num_columns_float = df_nans_handeled_cat.select_dtypes(include='float').columns
skew = df_nans_handeled_cat[num_columns_float].skew().sort_values(ascending=False)
skew[skew >= 3]

Assortment_b            10.995446
StoreType_b              7.912343
StateHoliday_c           7.753076
StateHoliday_b           4.048241
Customers_Lag_1_MA_4     3.057111
Customers_Lag_2_MA_4     3.051108
Customers_Lag_3_MA_4     3.046590
Customers_Lag_4_MA_4     3.041758
Customers_Lag_5_MA_4     3.039706
Customers_Lag_6_MA_4     3.034354
Customers_Lag_7_MA_4     3.028990
Customers_Lag_8_MA_4     3.024029
dtype: float64

In [481]:
for_log_transform = skew[skew >= 3].index
df_nans_handeled_cat[for_log_transform] = np.log(df_nans_handeled_cat[for_log_transform]+1)

In [429]:
# use PowerTransformer to transform the data
#for_log_transform = skew[skew >= 3].index
#from sklearn.preprocessing import PowerTransformer
#pt = PowerTransformer()
#df_nans_handeled_cat[for_log_transform] = pt.fit_transform(df_nans_handeled_cat[for_log_transform])


**Results:**
- Before: MAE:4793.96383	R2:0.844227
- After Power Transformation skew>=2: MAE:4615.821447	R2:	0.850871
- After Power Transformation skew>=3: MAE:4682.241185	R2: 0.847547
- After log transformation skew>=2: MAE: 4641.626858	R2: 0.851209
- After log transformation skew>=3: MAE: 4528.091135	R2: 0.854112

-> Log transformation with skew>=3 will be used

In [482]:
## Nutzt gesplitte daten in einmaligen test und train wobei test die letzten 8 wochen jedes stores beinhaltet
##
##


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score
from math import sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import RobustScaler


df = df_nans_handeled_cat

# Aufteilen in Features (X) und Zielvariable (y)
#feature_columns = df.columns.difference(['Future_Sales'])
#X = df[feature_columns]
#y = df['Future_Sales']

# Aufteilen in Trainings- und Testdaten

# Listen, um die Trainings- und Testdaten zu speichern
train_data = []
test_data = []

# Gruppieren nach Store und Aufteilen in Trainings- und Testdaten
amount_test_weeks = 8
for store_id, group in df_nans_handeled_cat.groupby('Store'):
    train_data.append(group[: -amount_test_weeks])
    test_data.append(group[-amount_test_weeks:])

# Kombinieren der Trainings- und Testdaten
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

X_train = train_df.drop(columns=['Future_Sales'])
y_train = train_df['Future_Sales']
X_test = test_df.drop(columns=['Future_Sales'])
y_test = test_df['Future_Sales']

# Scaling of the data
#scaler = MinMaxScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Funktion zur Berechnung des angepassten R2
def adj_r2_score(model, X, y):
    n = X.shape[0]
    p = X.shape[1]
    r2 = r2_score(y, model.predict(X))
    return 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Modelle definieren
models = [
    ('LinearRegression', LinearRegression(n_jobs=-1)),
    #('RidgeRegression', Ridge(random_state=42)),
    #('LassoRegression', Lasso(random_state=42)),
    #('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42)),
    #('RandomForestRegressor', RandomForestRegressor(n_jobs=-1, max_depth=10, random_state=42, n_estimators=100)),
    #('SVR', SVR()),
    #('KNN', KNeighborsRegressor())
]

# Ergebnis-DataFrame vorbereiten
results = []

# Modelle trainieren und Metriken auswerten
for name, model in models:
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    results.append({
        'Model': name,
        'RMSE_Train': sqrt(mse(y_train, y_train_pred)),
        'MAE_Train': mae(y_train, y_train_pred),
        'R2_Train': r2_score(y_train, y_train_pred),
        'Adj_R2_Train': adj_r2_score(model, X_train, y_train),
        'RMSE_Test': sqrt(mse(y_test, y_test_pred)),
        'MAE_Test': mae(y_test, y_test_pred),
        'R2_Test': r2_score(y_test, y_test_pred),
        'Adj_R2_Test': adj_r2_score(model, X_test, y_test)
    })
	#print last result
    print(results[-1])

# Konvertieren Sie die Liste von Dictionaries in einen DataFrame
results_df = pd.DataFrame(results)

# Ergebnisse anzeigen
results_df


{'Model': 'LinearRegression', 'RMSE_Train': 9336.133587273906, 'MAE_Train': 6469.108014669365, 'R2_Train': 0.7358540555781442, 'Adj_R2_Train': 0.7356103461005465, 'RMSE_Test': 6156.253648562417, 'MAE_Test': 4528.091135205937, 'R2_Test': 0.8541117726313912, 'Adj_R2_Test': 0.8522732629540619}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9336.133587,6469.108015,0.735854,0.73561,6156.253649,4528.091135,0.854112,0.852273


## Feature Scaling

In [29]:
df_nans_handeled_cat

Unnamed: 0,Store,CW,Month,Year,Future_Sales,Open,Promo,IsPromo,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,CompetitionDistance,IsCompetition,Promo2,Promo2Member,Promo2Active,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",Sales_Lag_1,Sales_Lag_1_MA_4,Sales_Lag_2,Sales_Lag_2_MA_4,Sales_Lag_3,Sales_Lag_3_MA_4,Sales_Lag_4,Sales_Lag_4_MA_4,Sales_Lag_5,Sales_Lag_5_MA_4,Sales_Lag_6,Sales_Lag_6_MA_4,Sales_Lag_7,Sales_Lag_7_MA_4,Sales_Lag_8,Sales_Lag_8_MA_4,SalesPerCustomer_Lag_1,SalesPerCustomer_Lag_1_MA_4,SalesPerCustomer_Lag_2,SalesPerCustomer_Lag_2_MA_4,SalesPerCustomer_Lag_3,SalesPerCustomer_Lag_3_MA_4,SalesPerCustomer_Lag_4,SalesPerCustomer_Lag_4_MA_4,SalesPerCustomer_Lag_5,SalesPerCustomer_Lag_5_MA_4,SalesPerCustomer_Lag_6,SalesPerCustomer_Lag_6_MA_4,SalesPerCustomer_Lag_7,SalesPerCustomer_Lag_7_MA_4,SalesPerCustomer_Lag_8,SalesPerCustomer_Lag_8_MA_4,SalesPerOpenDay_Lag_1,SalesPerOpenDay_Lag_1_MA_4,SalesPerOpenDay_Lag_2,SalesPerOpenDay_Lag_2_MA_4,SalesPerOpenDay_Lag_3,SalesPerOpenDay_Lag_3_MA_4,SalesPerOpenDay_Lag_4,SalesPerOpenDay_Lag_4_MA_4,SalesPerOpenDay_Lag_5,SalesPerOpenDay_Lag_5_MA_4,SalesPerOpenDay_Lag_6,SalesPerOpenDay_Lag_6_MA_4,SalesPerOpenDay_Lag_7,SalesPerOpenDay_Lag_7_MA_4,SalesPerOpenDay_Lag_8,SalesPerOpenDay_Lag_8_MA_4,Customers_Lag_1,Customers_Lag_1_MA_4,Customers_Lag_2,Customers_Lag_2_MA_4,Customers_Lag_3,Customers_Lag_3_MA_4,Customers_Lag_4,Customers_Lag_4_MA_4,Customers_Lag_5,Customers_Lag_5_MA_4,Customers_Lag_6,Customers_Lag_6_MA_4,Customers_Lag_7,Customers_Lag_7_MA_4,Customers_Lag_8,Customers_Lag_8_MA_4,CustomersPerOpenDay_Lag_1,CustomersPerOpenDay_Lag_1_MA_4,CustomersPerOpenDay_Lag_2,CustomersPerOpenDay_Lag_2_MA_4,CustomersPerOpenDay_Lag_3,CustomersPerOpenDay_Lag_3_MA_4,CustomersPerOpenDay_Lag_4,CustomersPerOpenDay_Lag_4_MA_4,CustomersPerOpenDay_Lag_5,CustomersPerOpenDay_Lag_5_MA_4,CustomersPerOpenDay_Lag_6,CustomersPerOpenDay_Lag_6_MA_4,CustomersPerOpenDay_Lag_7,CustomersPerOpenDay_Lag_7_MA_4,CustomersPerOpenDay_Lag_8,CustomersPerOpenDay_Lag_8_MA_4
11,1,12,3,2013,29696.0,6,5,1,0,0,0,0,7.147559,1,0,0,0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.693147,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.000000,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,27880.0,31353.75,35771.0,30878.25,28693.0,30173.50,33071.0,27835.25,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,7.665659,8.255957,8.737421,8.240186,8.057568,8.158434,8.563180,8.078042,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,4646.666667,5225.625000,5961.833333,5146.375000,4782.166667,5028.916667,5511.833333,5042.125000,8.229244,8.260751,8.381144,8.253553,8.155362,8.235891,8.263848,8.241176,8.199189,8.239989,8.317522,8.225369,8.178077,8.213517,8.259199,8.138930,6.438818,6.470283,6.590530,6.463094,6.365038,6.445455,6.473376,6.450734,6.408803,6.449548,6.526983,6.434948,6.387721,6.423112,6.468733,6.435816
12,1,13,3,2013,21018.0,5,5,1,1,5,1,1,7.147559,1,0,0,0,0.0,0.000000,0.693147,0.0,0.0,0.0,0.693147,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.000000,35521.0,31935.75,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,27880.0,31353.75,35771.0,30878.25,28693.0,30173.50,8.411319,8.044487,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,7.665659,8.255957,8.737421,8.240186,8.057568,8.158434,5920.166667,5322.625000,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,4646.666667,5225.625000,5961.833333,5146.375000,4782.166667,5028.916667,8.348538,8.282673,8.229244,8.260751,8.381144,8.253553,8.155362,8.235891,8.263848,8.241176,8.199189,8.239989,8.317522,8.225369,8.178077,8.213517,6.557961,6.492177,6.438818,6.470283,6.590530,6.463094,6.365038,6.445455,6.473376,6.450734,6.408803,6.449548,6.526983,6.434948,6.387721,6.423112
13,1,14,4,2013,29884.0,5,0,0,1,5,1,1,7.147559,1,0,0,0,0.0,0.000000,0.693147,0.0,0.0,0.0,0.693147,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.000000,34492.0,33802.00,35521.0,31935.75,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,27880.0,31353.75,35771.0,30878.25,8.670689,8.271122,8.411319,8.044487,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,7.665659,8.255957,8.737421,8.240186,6898.400000,5921.100000,5920.166667,5322.625000,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,4646.666667,5225.625000,5961.833333,5146.375000,8.288786,8.313607,8.348538,8.282673,8.229244,8.260751,8.381144,8.253553,8.155362,8.235891,8.263848,8.241176,8.199189,8.239989,8.317522,8.225369,6.680353,6.570626,6.557961,6.492177,6.438818,6.470283,6.590530,6.463094,6.365038,6.445455,6.473376,6.450734,6.408803,6.449548,6.526983,6.434948
14,1,15,4,2013,29112.0,6,5,1,0,0,0,0,7.147559,1,0,0,0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.693147,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.000000,23867.0,30514.75,34492.0,33802.00,35521.0,31935.75,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,27880.0,31353.75,7.910839,8.127814,8.670689,8.271122,8.411319,8.044487,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,7.665659,8.255957,4773.400000,5572.116667,6898.400000,5921.100000,5920.166667,5322.625000,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,4646.666667,5225.625000,8.012350,8.227509,8.288786,8.313607,8.348538,8.282673,8.229244,8.260751,8.381144,8.253553,8.155362,8.235891,8.263848,8.241176,8.199189,8.239989,6.404236,6.526312,6.680353,6.570626,6.557961,6.492177,6.438818,6.470283,6.590530,6.463094,6.365038,6.445455,6.473376,6.450734,6.408803,6.449548
15,1,16,4,2013,24215.0,6,0,0,0,0,0,0,7.147559,1,0,0,0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.693147,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.000000,30865.0,31186.25,23867.0,30514.75,34492.0,33802.00,35521.0,31935.75,28179.0,31293.25,37016.0,31218.50,27027.0,30907.25,32951.0,31323.75,8.509788,8.375659,7.910839,8.127814,8.670689,8.271122,8.411319,8.044487,7.518410,8.064789,8.484071,8.101601,7.764148,8.164938,8.492526,8.238293,5144.166667,5684.033333,4773.400000,5572.116667,6898.400000,5921.100000,5920.166667,5322.625000,4696.500000,5215.541667,6169.333333,5203.083333,4504.500000,5151.208333,5491.833333,5220.625000,8.196437,8.219393,8.012350,8.227509,8.288786,8.313607,8.348538,8.282673,8.229244,8.260751,8.381144,8.253553,8.155362,8.235891,8.263848,8.241176,6.406055,6.518901,6.404236,6.526312,6.680353,6.570626,6.557961,6.492177,6.438818,6.470283,6.590530,6.463094,6.365038,6.445455,6.473376,6.450734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150512,1115,19,5,2015,48130.0,6,5,1,0,0,0,0,8.585039,0,1,1,0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.693147,44909.0,37810.25,32221.0,39762.50,46236.0,40150.25,27875.0,40626.00,52718.0,42956.25,33772.0,41586.75,48139.0,42621.75,37196.0,41559.25,15.779691,14.561868,13.521192,14.687204,15.473896,14.748814,13.472692,14.844355,16.281038,15.097288,13.767631,14.836706,15.856061,15.012356,14.484424,14.929575,8981.800000,6908.241667,5370.166667,7298.691667,7706.000000,7363.316667,5575.000000,7442.608333,10543.600000,7598.691667,5628.666667,6931.125000,8023.166667,7103.625000,6199.333333,6926.541667,7.954021,7.852633,7.776535,7.890021,8.002694,7.896553,7.635304,7.901007,8.083020,7.946175,7.805475,7.933886,8.018625,7.948738,7.851272,7.930116,6.345987,6.153885,5.986871,6.194694,6.212606,6.200627,6.027796,6.204676,6.474816,6.211821,6.015751,6.143917,6.228511,6.158743,6.061457,6.140154
150513,1115,20,5,2015,36233.0,5,0,0,1,0,0,1,8.585039,0,1,1,0,0.0,0.693147,0.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.693147,47767.0,42783.25,44909.0,37810.25,32221.0,39762.50,46236.0,40150.25,27875.0,40626.00,52718.0,42956.25,33772.0,41586.75,48139.0,42621.75,15.188235,14.990753,15.779691,14.561868,13.521192,14.687204,15.473896,14.748814,13.472692,14.844355,16.281038,15.097288,13.767631,14.836706,15.856061,15.012356,7961.166667,7504.783333,8981.800000,6908.241667,5370.166667,7298.691667,7706.000000,7363.316667,5575.000000,7442.608333,10543.600000,7598.691667,5628.666667,6931.125000,8023.166667,7103.625000,8.053887,7.952087,7.954021,7.852633,7.776535,7.890021,8.002694,7.896553,7.635304,7.901007,8.083020,7.946175,7.805475,7.933886,8.018625,7.948738,6.263716,6.210868,6.345987,6.153885,5.986871,6.194694,6.212606,6.200627,6.027796,6.204676,6.474816,6.211821,6.015751,6.143917,6.228511,6.158743
150514,1115,21,5,2015,45927.0,6,5,1,0,0,0,0,8.585039,0,1,1,0,1.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.693147,33638.0,39633.75,47767.0,42783.25,44909.0,37810.25,32221.0,39762.50,46236.0,40150.25,27875.0,40626.00,52718.0,42956.25,33772.0,41586.75,14.098072,14.646797,15.188235,14.990753,15.779691,14.561868,13.521192,14.687204,15.473896,14.748814,13.472692,14.844355,16.281038,15.097288,13.767631,14.836706,6727.600000,7260.183333,7961.166667,7504.783333,8981.800000,6908.241667,5370.166667,7298.691667,7706.000000,7363.316667,5575.000000,7442.608333,10543.600000,7598.691667,5628.666667,6931.125000,7.777793,7.897668,8.053887,7.952087,7.954021,7.852633,7.776535,7.890021,8.002694,7.896553,7.635304,7.901007,8.083020,7.946175,7.805475,7.933886,6.170029,6.200374,6.263716,6.210868,6.345987,6.153885,5.986871,6.194694,6.212606,6.200627,6.027796,6.204676,6.474816,6.211821,6.015751,6.143917
150515,1115,22,5,2015,35362.0,5,0,0,1,0,0,1,8.585039,0,1,1,0,0.0,0.693147,0.000000,0.0,0.0,0.0,0.000000,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.693147,46629.0,43235.75,33638.0,39633.75,47767.0,42783.25,44909.0,37810.25,32221.0,39762.50,46236.0,40150.25,27875.0,40626.00,52718.0,42956.25,15.085409,15.037852,14.098072,14.646797,15.188235,14.990753,15.779691,14.561868,13.521192,14.687204,15.473896,14.748814,13.472692,14.844355,16.281038,15.097288,7771.500000,7860.516667,6727.600000,7260.183333,7961.166667,7504.783333,8981.800000,6908.241667,5370.166667,7298.691667,7706.000000,7363.316667,5575.000000,7442.608333,10543.600000,7598.691667,8.036573,7.961370,7.777793,7.897668,8.053887,7.952087,7.954021,7.852633,7.776535,7.890021,8.002694,7.896553,7.635304,7.901007,8.083020,7.946175,6.246430,6.258497,6.170029,6.200374,6.263716,6.210868,6.345987,6.153885,5.986871,6.194694,6.212606,6.200627,6.027796,6.204676,6.474816,6.211821


In [303]:
## Nutzt gesplitte daten in einmaligen test und train wobei test die letzten 8 wochen jedes stores beinhaltet
##
##


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score
from math import sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import RobustScaler


df = df_nans_handeled_cat

# Aufteilen in Features (X) und Zielvariable (y)
#feature_columns = df.columns.difference(['Future_Sales'])
#X = df[feature_columns]
#y = df['Future_Sales']

# Aufteilen in Trainings- und Testdaten

# Listen, um die Trainings- und Testdaten zu speichern
train_data = []
test_data = []

# Gruppieren nach Store und Aufteilen in Trainings- und Testdaten
amount_test_weeks = 8
for store_id, group in df_nans_handeled_cat.groupby('Store'):
    train_data.append(group[: -amount_test_weeks])
    test_data.append(group[-amount_test_weeks:])

# Kombinieren der Trainings- und Testdaten
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

X_train = train_df.drop(columns=['Future_Sales'])
y_train = train_df['Future_Sales']
X_test = test_df.drop(columns=['Future_Sales'])
y_test = test_df['Future_Sales']

# Scaling of the data
#scaler = MinMaxScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Funktion zur Berechnung des angepassten R2
def adj_r2_score(model, X, y):
    n = X.shape[0]
    p = X.shape[1]
    r2 = r2_score(y, model.predict(X))
    return 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Modelle definieren
models = [
    ('LinearRegression', LinearRegression(n_jobs=-1)),
    #('RidgeRegression', Ridge(random_state=42)),
    #('LassoRegression', Lasso(random_state=42)),
    #('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42)),
    #('RandomForestRegressor', RandomForestRegressor(n_jobs=-1, max_depth=10, random_state=42, n_estimators=100)),
    #('SVR', SVR()),
    #('KNN', KNeighborsRegressor())
]

# Ergebnis-DataFrame vorbereiten
results = []

# Modelle trainieren und Metriken auswerten
for name, model in models:
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    results.append({
        'Model': name,
        'RMSE_Train': sqrt(mse(y_train, y_train_pred)),
        'MAE_Train': mae(y_train, y_train_pred),
        'R2_Train': r2_score(y_train, y_train_pred),
        'Adj_R2_Train': adj_r2_score(model, X_train, y_train),
        'RMSE_Test': sqrt(mse(y_test, y_test_pred)),
        'MAE_Test': mae(y_test, y_test_pred),
        'R2_Test': r2_score(y_test, y_test_pred),
        'Adj_R2_Test': adj_r2_score(model, X_test, y_test)
    })
	#print last result
    print(results[-1])

# Konvertieren Sie die Liste von Dictionaries in einen DataFrame
results_df = pd.DataFrame(results)

# Ergebnisse anzeigen
results_df


{'Model': 'LinearRegression', 'RMSE_Train': 9336.133587273906, 'MAE_Train': 6469.108014669365, 'R2_Train': 0.7358540555781442, 'Adj_R2_Train': 0.7356103461005465, 'RMSE_Test': 6156.253648562417, 'MAE_Test': 4528.091135205937, 'R2_Test': 0.8541117726313912, 'Adj_R2_Test': 0.8522732629540619}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9336.133587,6469.108015,0.735854,0.73561,6156.253649,4528.091135,0.854112,0.852273


In [48]:
############### Erstellt x splits in test und train wobei im jeweiligen test split die letzten 8 wochen jedes stores beinhaltet werden
# und die splits mittels gap gleichmäßig verteilt werden
##
##


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score
from math import sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import RobustScaler


df = df_nans_handeled_cat

n_splits = 5
window_size = 8
total_weeks =109
train_size = window_size / 0.2
gap = int((total_weeks - window_size - train_size) // (n_splits))

# Ergebnis-DataFrame vorbereiten
results = []

for split in range(n_splits):
    train_data = []
    test_data = []

    for store_id, group in df.groupby('Store'):
        #group = group.sort_values(by=['Year', 'Month', 'CW'])
        # Berechnen des Startpunkts für das Testset

        if split == 0:
            test_start_index = -window_size
            test_df_store = group[test_start_index:]  # Kein Endindex für den ersten Split
        else:
            test_start_index = -(window_size + gap * split)
            test_end_index = test_start_index + window_size
            test_df_store = group[test_start_index:test_end_index]
            print("test:", test_df_store.shape, "Test Start Index:", test_start_index, "Test End Index:", test_end_index)
        train_start_index = -int(-test_start_index + gap + train_size)
        train_df_store = group[train_start_index:test_start_index]
        print("Train:", train_df_store.shape, "Train Start Index:", train_start_index, "Train End Index:", test_start_index)
        # Überprüfen Sie, ob das Testset Daten enthält
        if not test_df_store.empty:
            train_data.append(train_df_store)
            test_data.append(test_df_store)
        else:
            print(f"Store {store_id} hat nicht genügend Daten für Split {split}")
        #train_data.append(train_df_store)
        #test_data.append(test_df_store)

    # Kombinieren der Trainings- und Testdaten aller Filialen für diese Iteration
    train_df_combined = pd.concat(train_data)
    test_df_combined = pd.concat(test_data)

    # Erstellen von Feature- und Zielvariablen
    X_train = train_df_combined.drop(columns=['Future_Sales'])
    y_train = train_df_combined['Future_Sales']
    X_test = test_df_combined.drop(columns=['Future_Sales'])
    y_test = test_df_combined['Future_Sales']

    # Scaling of the data
    #scaler = RobustScaler()
    #X_train = scaler.fit_transform(X_train)
    #X_test = scaler.transform(X_test)

    # Trainieren und Bewerten des Modells für diese Iteration
	# Funktion zur Berechnung des angepassten R2
    def adj_r2_score(model, X, y):
        n = X.shape[0]
        p = X.shape[1]
        r2 = r2_score(y, model.predict(X))
        return 1 - (1 - r2) * ((n - 1) / (n - p - 1))

	# Modelle definieren
    models = [
		('LinearRegression', LinearRegression(n_jobs=-1)),
		#('RidgeRegression', Ridge(random_state=42)),
		#('LassoRegression', Lasso(random_state=42)),
		#('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42)),
		#('RandomForestRegressor', RandomForestRegressor(n_jobs=-1, max_depth=10, random_state=42, n_estimators=100)),
		#('SVR', SVR()),
		#('KNN', KNeighborsRegressor())
	]
    
    # Modelle trainieren und Metriken auswerten
    for name, model in models:
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
    
        results.append({
            'Model': name,
            'RMSE_Train': sqrt(mse(y_train, y_train_pred)),
            'MAE_Train': mae(y_train, y_train_pred),
            'R2_Train': r2_score(y_train, y_train_pred),
            'Adj_R2_Train': adj_r2_score(model, X_train, y_train),
            'RMSE_Test': sqrt(mse(y_test, y_test_pred)),
            'MAE_Test': mae(y_test, y_test_pred),
            'R2_Test': r2_score(y_test, y_test_pred),
            'Adj_R2_Test': adj_r2_score(model, X_test, y_test)
        })
    	#print last result
        print(results[-1])

# Konvertieren Sie die Liste von Dictionaries in einen DataFrame
results_df = pd.DataFrame(results)

# Ergebnisse anzeigen
results_df


Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 Train End Index: -8
Train: (52, 112) Train Start Index: -60 

Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,10049.992683,6771.11047,0.736154,0.735648,5771.529549,4273.896122,0.871776,0.87016
1,LinearRegression,10060.00188,6556.022863,0.733453,0.732941,8079.800324,5711.968364,0.763218,0.760234
2,LinearRegression,8651.236574,5393.90129,0.799549,0.799164,17171.406157,11161.618859,-0.084317,-0.097982
3,LinearRegression,9356.638965,6176.024774,0.734585,0.734076,10796.517694,8265.685592,0.73241,0.729038
4,LinearRegression,7738.216419,5495.273275,0.787493,0.787086,17536.068343,11252.379604,0.299461,0.290633


In [49]:
model_list = results_df['Model'].unique()
# create resulte_mean_df
resulte_mean_df = pd.DataFrame(columns=results_df.columns)
# iterate over model_list
for model in model_list:
	# get mean of each model
	mean = results_df[results_df['Model'] == model].mean(numeric_only=True)
	mean['Model'] = model
	# append mean to resulte_mean_df
	resulte_mean_df = pd.concat([resulte_mean_df, pd.DataFrame([mean], columns=results_df.columns)], ignore_index=True)

resulte_mean_df

  resulte_mean_df = pd.concat([resulte_mean_df, pd.DataFrame([mean], columns=results_df.columns)], ignore_index=True)


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9171.217304,6078.466534,0.758247,0.757783,11871.064413,8133.109708,0.51651,0.510417


In [None]:
model_list = results_df['Model'].unique()
# create resulte_mean_df
resulte_mean_df = pd.DataFrame(columns=results_df.columns)
# iterate over model_list
for model in model_list:
	# get mean of each model
	mean = results_df[results_df['Model'] == model].mean(numeric_only=True)
	mean['Model'] = model
	# append mean to resulte_mean_df
	resulte_mean_df = pd.concat([resulte_mean_df, pd.DataFrame([mean], columns=results_df.columns)], ignore_index=True)

resulte_mean_df

  resulte_mean_df = pd.concat([resulte_mean_df, pd.DataFrame([mean], columns=results_df.columns)], ignore_index=True)


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9659.199055,6611.188665,0.731806,0.731478,11899.476645,8314.87706,0.510512,0.506584
1,RidgeRegression,9659.206587,6611.138311,0.731806,0.731477,11903.20525,8317.065439,0.510285,0.506355


**Result:**
StandardScaler, MinMaxScaler, RobustScaler has no effect on the model performance. It stays at LinearRegression R2 = 0.833

TODO: Test Cyclical Feature Encoding

Cyclical encoding is a technique used to transform cyclical features like hours of the day, days of the week, months in a year, or calendar weeks, so that these features can be properly understood and utilized by machine learning models. The key challenge with cyclical data is that they wrap around: the highest value is succeeded by the lowest value, forming a cycle. Traditional numerical encoding of these values can mislead a model into thinking that the cycle is linear and that values at the end of the cycle are far apart when, in reality, they are close.

How Cyclical Encoding Works:
The most common way to encode cyclical data is by using sine and cosine transformations. This approach projects each cyclical feature onto a circle such that the beginning and end of the cycle meet. Here's how it's typically done:

Sine and Cosine Transformation: Each cyclical value is transformed into two features using sine and cosine functions. This maps the data points onto a unit circle and preserves their cyclical nature.

Example: For a month feature (with values from 1 to 12), the transformation would be:

python
Copy code
df['month_sin'] = np.sin((df['month']-1)*(2.*np.pi/12))
df['month_cos'] = np.cos((df['month']-1)*(2.*np.pi/12))
This creates two new columns, month_sin and month_cos, which represent the sine and cosine values of the month on the unit circle.

Advantages of Cyclical Encoding:
Preserves Cyclical Nature: It correctly represents the proximity of values at the end and start of the cycle. For example, December (12) is close to January (1).

Continuous Representation: It provides a smooth and continuous representation, which is useful for many machine learning models.

Efficient Use of Features: Unlike one-hot encoding, which would create many sparse columns for each possible value, sine and cosine transformations keep the feature space more compact.

When to Use Cyclical Encoding:
Cyclical encoding is particularly useful in time series analysis or when the cyclical aspect of a feature is important for the model. Common scenarios include:

Time of Day: Hours of the day for predicting traffic patterns.
Days of the Week: Days for weekly sales forecasts.
Seasons: Months or calendar weeks for seasonal effects in data.
Implementation Considerations:
Standardization: After applying sine and cosine transformations, you might need to standardize these features, especially if you're using models sensitive to feature scaling.

Model Compatibility: While cyclical encoding is beneficial for many models, it's always good practice to test and validate its impact on your specific model and dataset.

In summary, cyclical encoding is a powerful technique for handling cyclical features in machine learning, enabling models to better understand and utilize the patterns within such data.

# Performance Reference

In [45]:
## Nutzt gesplitte daten in einmaligen test und train wobei test die letzten 8 wochen jedes stores beinhaltet
##
##

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score
from math import sqrt

# Aufteilen in Trainings- und Testdaten
# Listen, um die Trainings- und Testdaten zu speichern
train_data = []
test_data = []

# Gruppieren nach Store und Aufteilen in Trainings- und Testdaten
amount_test_weeks = 8
for store_id, group in df.groupby('Store'):
    train_data.append(group[: -amount_test_weeks])
    test_data.append(group[-amount_test_weeks:])

# Kombinieren der Trainings- und Testdaten
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

X_train = train_df
y_train = train_df['Sales']
X_test = test_df
y_test = test_df['Sales']


# Funktion zur Berechnung des angepassten R2
def adj_r2_score(model, X, y):
    n = X.shape[0]
    p = X.shape[1]
    r2 = r2_score(y, model.predict(X))
    return 1 - (1 - r2) * ((n - 1) / (n - p - 1))


# Ergebnis-DataFrame vorbereiten
results = []

# Calculate the salces mean and using it as a prediction
# df for means of last x weeks
timeframeForMean = 12
last_day_in_train = X_train['Date'].max()
df_X_train_for_means = X_train[X_train['Date'] > last_day_in_train - pd.Timedelta(weeks=timeframeForMean)]
mean_sales_train = df_X_train_for_means.mean(numeric_only=True)['Sales']
#mean_sales_test = X_test['Sales'].mean()

y_train_pred = np.full(y_train.shape, mean_sales_train)
y_test_pred = np.full(y_test.shape, mean_sales_train)

results.append({
    'Model': "Mean reference",
    'RMSE_Train': sqrt(mse(y_train, y_train_pred)),
    'MAE_Train': mae(y_train, y_train_pred),
    'R2_Train': r2_score(y_train, y_train_pred),
    #'Adj_R2_Train': adj_r2_score(model, X_train, y_train),
    'RMSE_Test': sqrt(mse(y_test, y_test_pred)),
    'MAE_Test': mae(y_test, y_test_pred),
    'R2_Test': r2_score(y_test, y_test_pred),
    #'Adj_R2_Test': adj_r2_score(model, X_test, y_test)
})
#print last result
print(results[-1])

# Konvertieren Sie die Liste von Dictionaries in einen DataFrame
results_df = pd.DataFrame(results)

# Ergebnisse anzeigen
results_df

{'Model': 'Mean reference', 'RMSE_Train': 18049.073796005105, 'MAE_Train': 13313.040861216872, 'R2_Train': -0.018051765838443812, 'RMSE_Test': 16117.825576344818, 'MAE_Test': 11780.306415371311, 'R2_Test': -3.2271684458073935e-07}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,RMSE_Test,MAE_Test,R2_Test
0,Mean reference,18049.073796,13313.040861,-0.018052,16117.825576,11780.306415,-3.227168e-07
