# Capstone Projekt Rossmann

# Feature Engineering

## Feature Engineering

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from pandas.api.types import infer_dtype



pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('weekly_sales_with_store_info.csv')

In [3]:
print(df.info())
df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150525 entries, 0 to 150524
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Store                      150525 non-null  int64  
 1   Date                       150525 non-null  object 
 2   CW                         150525 non-null  int64  
 3   Month                      150525 non-null  int64  
 4   Year                       150525 non-null  int64  
 5   DayOfWeek                  150525 non-null  int64  
 6   Sales                      150525 non-null  int64  
 7   SalesPerCustomer           145809 non-null  float64
 8   SalesPerOpenDay            145815 non-null  float64
 9   Customers                  150525 non-null  int64  
 10  CustomersPerOpenDay        145815 non-null  float64
 11  Open                       150525 non-null  int64  
 12  Promo                      150525 non-null  int64  
 13  IsPromo                    15

Unnamed: 0,Store,Date,CW,Month,Year,DayOfWeek,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,IsCompetition,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Promo2Member,Promo2Active
9504,71,2014-01-19,3,1,2014,6,41357,10.883421,6892.833333,3800,633.333333,6,0,0,0,0,0,0,0,a,a,17500.0,8.0,2008.0,1,1,37.0,2009.0,"Mar,Jun,Sept,Dec",1,0
146009,1082,2014-06-08,23,6,2014,6,43886,7.096701,7314.333333,6184,1030.666667,6,5,1,0,0,0,0,0,c,a,440.0,4.0,2002.0,1,0,,,,0,0
76178,565,2013-09-29,39,9,2013,6,40914,8.36516,6819.0,4891,815.166667,6,5,1,0,0,0,0,0,a,c,160.0,7.0,2007.0,1,0,,,,0,0
139387,1033,2014-04-20,16,4,2014,6,83666,11.030455,16733.2,7585,1517.0,5,5,1,b,1,5,1,1,a,a,7680.0,3.0,2006.0,1,0,,,,0,0
54746,406,2014-05-18,20,5,2014,6,30704,12.059701,5117.333333,2546,424.333333,6,0,0,0,0,0,0,0,d,c,8240.0,3.0,2001.0,1,1,10.0,2013.0,"Feb,May,Aug,Nov",1,1


### Handle Missing Values

In [4]:
df.isna().sum()

Store                            0
Date                             0
CW                               0
Month                            0
Year                             0
DayOfWeek                        0
Sales                            0
SalesPerCustomer              4716
SalesPerOpenDay               4710
Customers                        0
CustomersPerOpenDay           4710
Open                             0
Promo                            0
IsPromo                          0
StateHoliday                     0
IsStateHoliday                   0
SchoolHoliday                    0
IsSchoolHoliday                  0
NumStateHoliday                  0
StoreType                        0
Assortment                       0
CompetitionDistance            405
CompetitionOpenSinceMonth    47790
CompetitionOpenSinceYear     47790
IsCompetition                    0
Promo2                           0
Promo2SinceWeek              73440
Promo2SinceYear              73440
PromoInterval       

#### SalesPerCustomer, SalesPerOpenday, CustomersPerOpenday

In [5]:
# As the store were closed, we can fill the nans with 0

# fill nans with 0 for listed columns
columns_to_fill = ['SalesPerCustomer', 'SalesPerOpenDay', 'CustomersPerOpenDay']
df_nans_handeled = df.fillna({col: 0 for col in columns_to_fill})
df_nans_handeled


Unnamed: 0,Store,Date,CW,Month,Year,DayOfWeek,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,IsCompetition,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Promo2Member,Promo2Active
0,1,2013-01-06,1,1,2013,6,19340,7.736000,4835.000000,2500,625.000000,4,0,0,a,1,6,1,1,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
1,1,2013-01-13,2,1,2013,6,32952,8.410413,5492.000000,3918,653.000000,6,5,1,0,0,5,1,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
2,1,2013-01-20,3,1,2013,6,25978,7.602575,4329.666667,3417,569.500000,6,0,0,0,0,0,0,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
3,1,2013-01-27,4,1,2013,6,33071,8.563180,5511.833333,3862,643.666667,6,5,1,0,0,0,0,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
4,1,2013-02-03,5,2,2013,6,28693,8.057568,4782.166667,3561,593.500000,6,0,0,0,0,0,0,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150520,1115,2015-07-05,27,7,2015,6,48130,16.140174,8021.666667,2982,497.000000,6,5,1,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0
150521,1115,2015-07-12,28,7,2015,6,36233,14.315685,6038.833333,2531,421.833333,6,0,0,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0
150522,1115,2015-07-19,29,7,2015,6,45927,15.023553,7654.500000,3057,509.500000,6,5,1,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0
150523,1115,2015-07-26,30,7,2015,6,35362,14.122204,5893.666667,2504,417.333333,6,0,0,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0


In [6]:
df_nans_handeled.isna().sum()

Store                            0
Date                             0
CW                               0
Month                            0
Year                             0
DayOfWeek                        0
Sales                            0
SalesPerCustomer                 0
SalesPerOpenDay                  0
Customers                        0
CustomersPerOpenDay              0
Open                             0
Promo                            0
IsPromo                          0
StateHoliday                     0
IsStateHoliday                   0
SchoolHoliday                    0
IsSchoolHoliday                  0
NumStateHoliday                  0
StoreType                        0
Assortment                       0
CompetitionDistance            405
CompetitionOpenSinceMonth    47790
CompetitionOpenSinceYear     47790
IsCompetition                    0
Promo2                           0
Promo2SinceWeek              73440
Promo2SinceYear              73440
PromoInterval       

#### CompetitionDistance

In [7]:
# Stores with no CompetitionDistance information
print("Stores with no CompetitionDistance information:", df_nans_handeled[(df_nans_handeled['CompetitionDistance'].isna())]['Store'].unique())

print("StoreType of store 291", df_nans_handeled[(df_nans_handeled['Store'] == 291)]['StoreType'].unique())
print("StoreType of store 622", df_nans_handeled[(df_nans_handeled['Store'] == 622)]['StoreType'].unique())
print("StoreType of store 879", df_nans_handeled[(df_nans_handeled['Store'] == 879)]['StoreType'].unique())

Stores with no CompetitionDistance information: [291 622 879]
StoreType of store 291 ['d']
StoreType of store 622 ['a']
StoreType of store 879 ['d']


In [8]:
# As store 291, 622 and 879 have no CompetitionDistance information, we can fill them with the median value of the column

# median competition distance for store type a
median_competition_distance_a = df_nans_handeled[(df_nans_handeled['StoreType'] == 'a')]['CompetitionDistance'].median()
# median competition distance for store type d
median_competition_distance_d = df_nans_handeled[(df_nans_handeled['StoreType'] == 'd')]['CompetitionDistance'].median()

# fill nans for storetype a with median_competition_distance_a
df_nans_handeled.loc[(df_nans_handeled['Store'] == 291), 'CompetitionDistance'] = median_competition_distance_a
# fill nans for storetype d with median_competition_distance_d
df_nans_handeled.loc[(df_nans_handeled['Store'] == 622), 'CompetitionDistance'] = median_competition_distance_d
# fill nans for storetype d with median_competition_distance_d
df_nans_handeled.loc[(df_nans_handeled['Store'] == 879), 'CompetitionDistance'] = median_competition_distance_d



In [9]:
df_nans_handeled.isna().sum()

Store                            0
Date                             0
CW                               0
Month                            0
Year                             0
DayOfWeek                        0
Sales                            0
SalesPerCustomer                 0
SalesPerOpenDay                  0
Customers                        0
CustomersPerOpenDay              0
Open                             0
Promo                            0
IsPromo                          0
StateHoliday                     0
IsStateHoliday                   0
SchoolHoliday                    0
IsSchoolHoliday                  0
NumStateHoliday                  0
StoreType                        0
Assortment                       0
CompetitionDistance              0
CompetitionOpenSinceMonth    47790
CompetitionOpenSinceYear     47790
IsCompetition                    0
Promo2                           0
Promo2SinceWeek              73440
Promo2SinceYear              73440
PromoInterval       

#### CompetitionOpenSinceMonth, CompetitionOpenSinceYear

In [10]:
# CompetitionOpenSinceMonth and CompetitionOpenSinceYear can be deleted as they are reflected in IsCompetition
df_nans_handeled = df_nans_handeled.drop(columns=['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'])

In [11]:
df_nans_handeled.isna().sum()

Store                      0
Date                       0
CW                         0
Month                      0
Year                       0
DayOfWeek                  0
Sales                      0
SalesPerCustomer           0
SalesPerOpenDay            0
Customers                  0
CustomersPerOpenDay        0
Open                       0
Promo                      0
IsPromo                    0
StateHoliday               0
IsStateHoliday             0
SchoolHoliday              0
IsSchoolHoliday            0
NumStateHoliday            0
StoreType                  0
Assortment                 0
CompetitionDistance        0
IsCompetition              0
Promo2                     0
Promo2SinceWeek        73440
Promo2SinceYear        73440
PromoInterval          73440
Promo2Member               0
Promo2Active               0
dtype: int64

#### Promo2SinceWeek, Promo2SinceYear

In [12]:
# Promo2SinceWeek and Promo2SinceYear can be deleted as they are reflected in Promo2Member
df_nans_handeled = df_nans_handeled.drop(columns=['Promo2SinceWeek', 'Promo2SinceYear'])

In [13]:
df_nans_handeled.isna().sum()

Store                      0
Date                       0
CW                         0
Month                      0
Year                       0
DayOfWeek                  0
Sales                      0
SalesPerCustomer           0
SalesPerOpenDay            0
Customers                  0
CustomersPerOpenDay        0
Open                       0
Promo                      0
IsPromo                    0
StateHoliday               0
IsStateHoliday             0
SchoolHoliday              0
IsSchoolHoliday            0
NumStateHoliday            0
StoreType                  0
Assortment                 0
CompetitionDistance        0
IsCompetition              0
Promo2                     0
PromoInterval          73440
Promo2Member               0
Promo2Active               0
dtype: int64

#### PromoInterval

In [14]:
df_nans_handeled[(df_nans_handeled['Promo2'] == 1) & (df_nans_handeled['PromoInterval'].isna())]

Unnamed: 0,Store,Date,CW,Month,Year,DayOfWeek,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,IsCompetition,Promo2,PromoInterval,Promo2Member,Promo2Active


In [15]:
# As if the store is not participating in Promo2, PromoInterval is 0, we can fill the nans with 0
df_nans_handeled['PromoInterval'] = df_nans_handeled['PromoInterval'].fillna(0)

In [16]:
df_nans_handeled.isna().sum()

Store                  0
Date                   0
CW                     0
Month                  0
Year                   0
DayOfWeek              0
Sales                  0
SalesPerCustomer       0
SalesPerOpenDay        0
Customers              0
CustomersPerOpenDay    0
Open                   0
Promo                  0
IsPromo                0
StateHoliday           0
IsStateHoliday         0
SchoolHoliday          0
IsSchoolHoliday        0
NumStateHoliday        0
StoreType              0
Assortment             0
CompetitionDistance    0
IsCompetition          0
Promo2                 0
PromoInterval          0
Promo2Member           0
Promo2Active           0
dtype: int64

### Remove not needed Features

In [17]:
# Store is just an ID
#df_nans_handeled = df_nans_handeled.drop(columns=['Store'])

# Date is an object and is reflected by CW, Month and Year
df_nans_handeled = df_nans_handeled.drop(columns=['Date'])

# DayOfWeek is not relevant in weekly data
df_nans_handeled = df_nans_handeled.drop(columns=['DayOfWeek'])

df_nans_handeled

Unnamed: 0,Store,CW,Month,Year,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,IsCompetition,Promo2,PromoInterval,Promo2Member,Promo2Active
0,1,1,1,2013,19340,7.736000,4835.000000,2500,625.000000,4,0,0,a,1,6,1,1,c,a,1270.0,1,0,0,0,0
1,1,2,1,2013,32952,8.410413,5492.000000,3918,653.000000,6,5,1,0,0,5,1,0,c,a,1270.0,1,0,0,0,0
2,1,3,1,2013,25978,7.602575,4329.666667,3417,569.500000,6,0,0,0,0,0,0,0,c,a,1270.0,1,0,0,0,0
3,1,4,1,2013,33071,8.563180,5511.833333,3862,643.666667,6,5,1,0,0,0,0,0,c,a,1270.0,1,0,0,0,0
4,1,5,2,2013,28693,8.057568,4782.166667,3561,593.500000,6,0,0,0,0,0,0,0,c,a,1270.0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150520,1115,27,7,2015,48130,16.140174,8021.666667,2982,497.000000,6,5,1,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0
150521,1115,28,7,2015,36233,14.315685,6038.833333,2531,421.833333,6,0,0,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0
150522,1115,29,7,2015,45927,15.023553,7654.500000,3057,509.500000,6,5,1,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0
150523,1115,30,7,2015,35362,14.122204,5893.666667,2504,417.333333,6,0,0,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0


In [18]:
#df_cleaned

### Categorical Feature Encoding

In [19]:
df_nans_handeled.select_dtypes(include='object').columns

Index(['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval'], dtype='object')

In [20]:
# check if a column contains mixed data typesrom pandas.api.types import infer_dtype

for col in ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']:
    dtype = infer_dtype(df_nans_handeled[col])
    print(f"Data type of {col}: {dtype}")


Data type of StateHoliday: string
Data type of StoreType: string
Data type of Assortment: string
Data type of PromoInterval: mixed-integer


In [21]:
# Convert mixed columns
cols_to_convert = ['PromoInterval']
df_nans_handeled[cols_to_convert] = df_nans_handeled[cols_to_convert].astype(str)

In [22]:
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore': um Fehler zu vermeiden, wenn die Trainingsdaten Klassen/Kategorien enthalten, die nicht in den Trainingsdaten vertreten sind
# sparse=False: stellt sicher, dass die kodierten Spalten als NumPy-Array zurückgegeben werden (anstelle einer Sparse-Matrix).
OneHotEnc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Es dürfen nur die Kategorialen Spalten üergeben werden, nicht das ganze df
encoded_array = OneHotEnc.fit_transform(df_nans_handeled[['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']])
tmp_cat = pd.DataFrame(encoded_array, columns=OneHotEnc.get_feature_names_out(), index=df_nans_handeled.index)
df_nans_handeled_cat = pd.concat([df_nans_handeled.select_dtypes(include=['number']), tmp_cat], axis=1)
df_nans_handeled_cat

Unnamed: 0,Store,CW,Month,Year,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,CompetitionDistance,IsCompetition,Promo2,Promo2Member,Promo2Active,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec"
0,1,1,1,2013,19340,7.736000,4835.000000,2500,625.000000,4,0,0,1,6,1,1,1270.0,1,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,2,1,2013,32952,8.410413,5492.000000,3918,653.000000,6,5,1,0,5,1,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,3,1,2013,25978,7.602575,4329.666667,3417,569.500000,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,4,1,2013,33071,8.563180,5511.833333,3862,643.666667,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,5,2,2013,28693,8.057568,4782.166667,3561,593.500000,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150520,1115,27,7,2015,48130,16.140174,8021.666667,2982,497.000000,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
150521,1115,28,7,2015,36233,14.315685,6038.833333,2531,421.833333,6,0,0,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
150522,1115,29,7,2015,45927,15.023553,7654.500000,3057,509.500000,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
150523,1115,30,7,2015,35362,14.122204,5893.666667,2504,417.333333,6,0,0,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [23]:
# Lag-Features für diese Spalten erstellen
lag_columns = ['Sales', 'SalesPerCustomer', 'SalesPerOpenDay', 'Customers', 'CustomersPerOpenDay']
n_lags = 8  # Anzahl der zu erstellenden Lag-Features

# Für jede Spalte und jeden Lag
for col in lag_columns:
    for lag in range(1, n_lags + 1):
        df_nans_handeled_cat[f'{col}_Lag_{lag}'] = df_nans_handeled_cat.groupby('Store')[col].shift(lag)

# Zielvariable um 8 Wochen in die Zukunft verschieben und hinter der sales spalte einfügen
df_nans_handeled_cat.insert(df_nans_handeled_cat.columns.get_loc('Sales') + 1, 'Future_Sales', df_nans_handeled_cat.groupby('Store')['Sales'].shift(-8))

#df_nans_handeled_cat['Future_Sales'] = df_nans_handeled_cat.groupby('Store')['Sales'].shift(-8)

# Entfernen von Zeilen mit NaN-Werten, die durch das Verschieben entstanden sind
df_nans_handeled_cat = df_nans_handeled_cat.dropna()

# remove of source columns
df_nans_handeled_cat = df_nans_handeled_cat.drop(columns=lag_columns)

df_nans_handeled_cat


Unnamed: 0,Store,CW,Month,Year,Future_Sales,Open,Promo,IsPromo,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,CompetitionDistance,IsCompetition,Promo2,Promo2Member,Promo2Active,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",Sales_Lag_1,Sales_Lag_2,Sales_Lag_3,Sales_Lag_4,Sales_Lag_5,Sales_Lag_6,Sales_Lag_7,Sales_Lag_8,SalesPerCustomer_Lag_1,SalesPerCustomer_Lag_2,SalesPerCustomer_Lag_3,SalesPerCustomer_Lag_4,SalesPerCustomer_Lag_5,SalesPerCustomer_Lag_6,SalesPerCustomer_Lag_7,SalesPerCustomer_Lag_8,SalesPerOpenDay_Lag_1,SalesPerOpenDay_Lag_2,SalesPerOpenDay_Lag_3,SalesPerOpenDay_Lag_4,SalesPerOpenDay_Lag_5,SalesPerOpenDay_Lag_6,SalesPerOpenDay_Lag_7,SalesPerOpenDay_Lag_8,Customers_Lag_1,Customers_Lag_2,Customers_Lag_3,Customers_Lag_4,Customers_Lag_5,Customers_Lag_6,Customers_Lag_7,Customers_Lag_8,CustomersPerOpenDay_Lag_1,CustomersPerOpenDay_Lag_2,CustomersPerOpenDay_Lag_3,CustomersPerOpenDay_Lag_4,CustomersPerOpenDay_Lag_5,CustomersPerOpenDay_Lag_6,CustomersPerOpenDay_Lag_7,CustomersPerOpenDay_Lag_8
8,1,9,3,2013,28979.0,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,32951.0,27880.0,35771.0,28693.0,33071.0,25978.0,32952.0,19340.0,8.492526,7.665659,8.737421,8.057568,8.563180,7.602575,8.410413,7.736000,5491.833333,4646.666667,5961.833333,4782.166667,5511.833333,4329.666667,5492.000000,4835.000000,3880.0,3637.0,4094.0,3561.0,3862.0,3417.0,3918.0,2500.0,646.666667,606.166667,682.333333,593.500000,643.666667,569.500000,653.000000,625.000000
9,1,10,3,2013,30171.0,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,27027.0,32951.0,27880.0,35771.0,28693.0,33071.0,25978.0,32952.0,7.764148,8.492526,7.665659,8.737421,8.057568,8.563180,7.602575,8.410413,4504.500000,5491.833333,4646.666667,5961.833333,4782.166667,5511.833333,4329.666667,5492.000000,3481.0,3880.0,3637.0,4094.0,3561.0,3862.0,3417.0,3918.0,580.166667,646.666667,606.166667,682.333333,593.500000,643.666667,569.500000,653.000000
10,1,11,3,2013,24895.0,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,37016.0,27027.0,32951.0,27880.0,35771.0,28693.0,33071.0,25978.0,8.484071,7.764148,8.492526,7.665659,8.737421,8.057568,8.563180,7.602575,6169.333333,4504.500000,5491.833333,4646.666667,5961.833333,4782.166667,5511.833333,4329.666667,4363.0,3481.0,3880.0,3637.0,4094.0,3561.0,3862.0,3417.0,727.166667,580.166667,646.666667,606.166667,682.333333,593.500000,643.666667,569.500000
11,1,12,3,2013,29696.0,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,28179.0,37016.0,27027.0,32951.0,27880.0,35771.0,28693.0,33071.0,7.518410,8.484071,7.764148,8.492526,7.665659,8.737421,8.057568,8.563180,4696.500000,6169.333333,4504.500000,5491.833333,4646.666667,5961.833333,4782.166667,5511.833333,3748.0,4363.0,3481.0,3880.0,3637.0,4094.0,3561.0,3862.0,624.666667,727.166667,580.166667,646.666667,606.166667,682.333333,593.500000,643.666667
12,1,13,3,2013,21018.0,5,5,1,1,5,1,1,1270.0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,35521.0,28179.0,37016.0,27027.0,32951.0,27880.0,35771.0,28693.0,8.411319,7.518410,8.484071,7.764148,8.492526,7.665659,8.737421,8.057568,5920.166667,4696.500000,6169.333333,4504.500000,5491.833333,4646.666667,5961.833333,4782.166667,4223.0,3748.0,4363.0,3481.0,3880.0,3637.0,4094.0,3561.0,703.833333,624.666667,727.166667,580.166667,646.666667,606.166667,682.333333,593.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150512,1115,19,5,2015,48130.0,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,44909.0,32221.0,46236.0,27875.0,52718.0,33772.0,48139.0,37196.0,15.779691,13.521192,15.473896,13.472692,16.281038,13.767631,15.856061,14.484424,8981.800000,5370.166667,7706.000000,5575.000000,10543.600000,5628.666667,8023.166667,6199.333333,2846.0,2383.0,2988.0,2069.0,3238.0,2453.0,3036.0,2568.0,569.200000,397.166667,498.000000,413.800000,647.600000,408.833333,506.000000,428.000000
150513,1115,20,5,2015,36233.0,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,47767.0,44909.0,32221.0,46236.0,27875.0,52718.0,33772.0,48139.0,15.188235,15.779691,13.521192,15.473896,13.472692,16.281038,13.767631,15.856061,7961.166667,8981.800000,5370.166667,7706.000000,5575.000000,10543.600000,5628.666667,8023.166667,3145.0,2846.0,2383.0,2988.0,2069.0,3238.0,2453.0,3036.0,524.166667,569.200000,397.166667,498.000000,413.800000,647.600000,408.833333,506.000000
150514,1115,21,5,2015,45927.0,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,33638.0,47767.0,44909.0,32221.0,46236.0,27875.0,52718.0,33772.0,14.098072,15.188235,15.779691,13.521192,15.473896,13.472692,16.281038,13.767631,6727.600000,7961.166667,8981.800000,5370.166667,7706.000000,5575.000000,10543.600000,5628.666667,2386.0,3145.0,2846.0,2383.0,2988.0,2069.0,3238.0,2453.0,477.200000,524.166667,569.200000,397.166667,498.000000,413.800000,647.600000,408.833333
150515,1115,22,5,2015,35362.0,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,46629.0,33638.0,47767.0,44909.0,32221.0,46236.0,27875.0,52718.0,15.085409,14.098072,15.188235,15.779691,13.521192,15.473896,13.472692,16.281038,7771.500000,6727.600000,7961.166667,8981.800000,5370.166667,7706.000000,5575.000000,10543.600000,3091.0,2386.0,3145.0,2846.0,2383.0,2988.0,2069.0,3238.0,515.166667,477.200000,524.166667,569.200000,397.166667,498.000000,413.800000,647.600000


In [24]:
print(aaa)

NameError: name 'aaa' is not defined

## Feature Scaling

In [25]:
df_nans_handeled_cat

Unnamed: 0,Store,CW,Month,Year,Future_Sales,Open,Promo,IsPromo,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,CompetitionDistance,IsCompetition,Promo2,Promo2Member,Promo2Active,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",Sales_Lag_1,Sales_Lag_2,Sales_Lag_3,Sales_Lag_4,Sales_Lag_5,Sales_Lag_6,Sales_Lag_7,Sales_Lag_8,SalesPerCustomer_Lag_1,SalesPerCustomer_Lag_2,SalesPerCustomer_Lag_3,SalesPerCustomer_Lag_4,SalesPerCustomer_Lag_5,SalesPerCustomer_Lag_6,SalesPerCustomer_Lag_7,SalesPerCustomer_Lag_8,SalesPerOpenDay_Lag_1,SalesPerOpenDay_Lag_2,SalesPerOpenDay_Lag_3,SalesPerOpenDay_Lag_4,SalesPerOpenDay_Lag_5,SalesPerOpenDay_Lag_6,SalesPerOpenDay_Lag_7,SalesPerOpenDay_Lag_8,Customers_Lag_1,Customers_Lag_2,Customers_Lag_3,Customers_Lag_4,Customers_Lag_5,Customers_Lag_6,Customers_Lag_7,Customers_Lag_8,CustomersPerOpenDay_Lag_1,CustomersPerOpenDay_Lag_2,CustomersPerOpenDay_Lag_3,CustomersPerOpenDay_Lag_4,CustomersPerOpenDay_Lag_5,CustomersPerOpenDay_Lag_6,CustomersPerOpenDay_Lag_7,CustomersPerOpenDay_Lag_8
8,1,9,3,2013,28979.0,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,32951.0,27880.0,35771.0,28693.0,33071.0,25978.0,32952.0,19340.0,8.492526,7.665659,8.737421,8.057568,8.563180,7.602575,8.410413,7.736000,5491.833333,4646.666667,5961.833333,4782.166667,5511.833333,4329.666667,5492.000000,4835.000000,3880.0,3637.0,4094.0,3561.0,3862.0,3417.0,3918.0,2500.0,646.666667,606.166667,682.333333,593.500000,643.666667,569.500000,653.000000,625.000000
9,1,10,3,2013,30171.0,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,27027.0,32951.0,27880.0,35771.0,28693.0,33071.0,25978.0,32952.0,7.764148,8.492526,7.665659,8.737421,8.057568,8.563180,7.602575,8.410413,4504.500000,5491.833333,4646.666667,5961.833333,4782.166667,5511.833333,4329.666667,5492.000000,3481.0,3880.0,3637.0,4094.0,3561.0,3862.0,3417.0,3918.0,580.166667,646.666667,606.166667,682.333333,593.500000,643.666667,569.500000,653.000000
10,1,11,3,2013,24895.0,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,37016.0,27027.0,32951.0,27880.0,35771.0,28693.0,33071.0,25978.0,8.484071,7.764148,8.492526,7.665659,8.737421,8.057568,8.563180,7.602575,6169.333333,4504.500000,5491.833333,4646.666667,5961.833333,4782.166667,5511.833333,4329.666667,4363.0,3481.0,3880.0,3637.0,4094.0,3561.0,3862.0,3417.0,727.166667,580.166667,646.666667,606.166667,682.333333,593.500000,643.666667,569.500000
11,1,12,3,2013,29696.0,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,28179.0,37016.0,27027.0,32951.0,27880.0,35771.0,28693.0,33071.0,7.518410,8.484071,7.764148,8.492526,7.665659,8.737421,8.057568,8.563180,4696.500000,6169.333333,4504.500000,5491.833333,4646.666667,5961.833333,4782.166667,5511.833333,3748.0,4363.0,3481.0,3880.0,3637.0,4094.0,3561.0,3862.0,624.666667,727.166667,580.166667,646.666667,606.166667,682.333333,593.500000,643.666667
12,1,13,3,2013,21018.0,5,5,1,1,5,1,1,1270.0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,35521.0,28179.0,37016.0,27027.0,32951.0,27880.0,35771.0,28693.0,8.411319,7.518410,8.484071,7.764148,8.492526,7.665659,8.737421,8.057568,5920.166667,4696.500000,6169.333333,4504.500000,5491.833333,4646.666667,5961.833333,4782.166667,4223.0,3748.0,4363.0,3481.0,3880.0,3637.0,4094.0,3561.0,703.833333,624.666667,727.166667,580.166667,646.666667,606.166667,682.333333,593.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150512,1115,19,5,2015,48130.0,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,44909.0,32221.0,46236.0,27875.0,52718.0,33772.0,48139.0,37196.0,15.779691,13.521192,15.473896,13.472692,16.281038,13.767631,15.856061,14.484424,8981.800000,5370.166667,7706.000000,5575.000000,10543.600000,5628.666667,8023.166667,6199.333333,2846.0,2383.0,2988.0,2069.0,3238.0,2453.0,3036.0,2568.0,569.200000,397.166667,498.000000,413.800000,647.600000,408.833333,506.000000,428.000000
150513,1115,20,5,2015,36233.0,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,47767.0,44909.0,32221.0,46236.0,27875.0,52718.0,33772.0,48139.0,15.188235,15.779691,13.521192,15.473896,13.472692,16.281038,13.767631,15.856061,7961.166667,8981.800000,5370.166667,7706.000000,5575.000000,10543.600000,5628.666667,8023.166667,3145.0,2846.0,2383.0,2988.0,2069.0,3238.0,2453.0,3036.0,524.166667,569.200000,397.166667,498.000000,413.800000,647.600000,408.833333,506.000000
150514,1115,21,5,2015,45927.0,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,33638.0,47767.0,44909.0,32221.0,46236.0,27875.0,52718.0,33772.0,14.098072,15.188235,15.779691,13.521192,15.473896,13.472692,16.281038,13.767631,6727.600000,7961.166667,8981.800000,5370.166667,7706.000000,5575.000000,10543.600000,5628.666667,2386.0,3145.0,2846.0,2383.0,2988.0,2069.0,3238.0,2453.0,477.200000,524.166667,569.200000,397.166667,498.000000,413.800000,647.600000,408.833333
150515,1115,22,5,2015,35362.0,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,46629.0,33638.0,47767.0,44909.0,32221.0,46236.0,27875.0,52718.0,15.085409,14.098072,15.188235,15.779691,13.521192,15.473896,13.472692,16.281038,7771.500000,6727.600000,7961.166667,8981.800000,5370.166667,7706.000000,5575.000000,10543.600000,3091.0,2386.0,3145.0,2846.0,2383.0,2988.0,2069.0,3238.0,515.166667,477.200000,524.166667,569.200000,397.166667,498.000000,413.800000,647.600000


In [26]:
## Nutzt gesplitte daten in einmaligen test und train wobei test die letzten 8 wochen jedes stores beinhaltet
##
##


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score
from math import sqrt


df = df_nans_handeled_cat

# Aufteilen in Features (X) und Zielvariable (y)
#feature_columns = df.columns.difference(['Future_Sales'])
#X = df[feature_columns]
#y = df['Future_Sales']

# Aufteilen in Trainings- und Testdaten

# Listen, um die Trainings- und Testdaten zu speichern
train_data = []
test_data = []

# Gruppieren nach Store und Aufteilen in Trainings- und Testdaten
amount_test_weeks = 8
for store_id, group in df_nans_handeled_cat.groupby('Store'):
    train_data.append(group[: -amount_test_weeks])
    test_data.append(group[-amount_test_weeks:])

# Kombinieren der Trainings- und Testdaten
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

X_train = train_df.drop(columns=['Future_Sales'])
y_train = train_df['Future_Sales']
X_test = test_df.drop(columns=['Future_Sales'])
y_test = test_df['Future_Sales']

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Funktion zur Berechnung des angepassten R2
def adj_r2_score(model, X, y):
    n = X.shape[0]
    p = X.shape[1]
    r2 = r2_score(y, model.predict(X))
    return 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Modelle definieren
models = [
    ('LinearRegression', LinearRegression(n_jobs=-1)),
    #('RidgeRegression', Ridge(random_state=42)),
    #('LassoRegression', Lasso(random_state=42)),
    #('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42)),
    #('RandomForestRegressor', RandomForestRegressor(n_jobs=-1, max_depth=10, random_state=42, n_estimators=100)),
    #('SVR', SVR()),
    #('KNN', KNeighborsRegressor())
]

# Ergebnis-DataFrame vorbereiten
results = []

# Modelle trainieren und Metriken auswerten
for name, model in models:
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    results.append({
        'Model': name,
        'RMSE_Train': sqrt(mse(y_train, y_train_pred)),
        'MAE_Train': mae(y_train, y_train_pred),
        'R2_Train': r2_score(y_train, y_train_pred),
        'Adj_R2_Train': adj_r2_score(model, X_train, y_train),
        'RMSE_Test': sqrt(mse(y_test, y_test_pred)),
        'MAE_Test': mae(y_test, y_test_pred),
        'R2_Test': r2_score(y_test, y_test_pred),
        'Adj_R2_Test': adj_r2_score(model, X_test, y_test)
    })
	#print last result
    print(results[-1])

# Konvertieren Sie die Liste von Dictionaries in einen DataFrame
results_df = pd.DataFrame(results)

# Ergebnisse anzeigen
results_df


{'Model': 'LinearRegression', 'RMSE_Train': 9796.355596686768, 'MAE_Train': 6976.6687356943, 'R2_Train': 0.7070232141300387, 'Adj_R2_Train': 0.7068550449385989, 'RMSE_Test': 6585.029230005763, 'MAE_Test': 5012.718658803254, 'R2_Test': 0.833082200366611, 'Adj_R2_Test': 0.8317427831227173}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9796.355597,6976.668736,0.707023,0.706855,6585.02923,5012.718659,0.833082,0.831743


In [51]:
############### Erstellt x splits in test und train wobei im jeweiligen test split die letzten 8 wochen jedes stores beinhaltet werden
# und die splits mittels gap gleichmäßig verteilt werden
##
##


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score
from math import sqrt


df = df_nans_handeled_cat

n_splits = 5
window_size = 8
total_weeks =109
train_size = window_size / 0.2
gap = int((total_weeks - window_size - train_size) // (n_splits))

# Ergebnis-DataFrame vorbereiten
results = []

for split in range(n_splits):
    train_data = []
    test_data = []

    for store_id, group in df.groupby('Store'):
        #group = group.sort_values(by=['Year', 'Month', 'CW'])
        # Berechnen des Startpunkts für das Testset

        if split == 0:
            test_start_index = -window_size
            test_df_store = group[test_start_index:]  # Kein Endindex für den ersten Split
        else:
            test_start_index = -(window_size + gap * split)
            test_end_index = test_start_index + window_size
            test_df_store = group[test_start_index:test_end_index]
            print("test:", test_df_store.shape, "Test Start Index:", test_start_index, "Test End Index:", test_end_index)
        train_start_index = -int(-test_start_index + gap + train_size)
        train_df_store = group[train_start_index:test_start_index]
        print("Train:", train_df_store.shape, "Train Start Index:", train_start_index, "Train End Index:", test_start_index)
        # Überprüfen Sie, ob das Testset Daten enthält
        if not test_df_store.empty:
            train_data.append(train_df_store)
            test_data.append(test_df_store)
        else:
            print(f"Store {store_id} hat nicht genügend Daten für Split {split}")
        #train_data.append(train_df_store)
        #test_data.append(test_df_store)

    # Kombinieren der Trainings- und Testdaten aller Filialen für diese Iteration
    train_df_combined = pd.concat(train_data)
    test_df_combined = pd.concat(test_data)

    # Erstellen von Feature- und Zielvariablen
    X_train = train_df_combined.drop(columns=['Future_Sales'])
    y_train = train_df_combined['Future_Sales']
    X_test = test_df_combined.drop(columns=['Future_Sales'])
    y_test = test_df_combined['Future_Sales']

    # Trainieren und Bewerten des Modells für diese Iteration
	# Funktion zur Berechnung des angepassten R2
    def adj_r2_score(model, X, y):
        n = X.shape[0]
        p = X.shape[1]
        r2 = r2_score(y, model.predict(X))
        return 1 - (1 - r2) * ((n - 1) / (n - p - 1))

	# Modelle definieren
    models = [
		('LinearRegression', LinearRegression(n_jobs=-1)),
		('RidgeRegression', Ridge(random_state=42)),
		#('LassoRegression', Lasso(random_state=42)),
		#('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42)),
		#('RandomForestRegressor', RandomForestRegressor(n_jobs=-1, max_depth=10, random_state=42, n_estimators=100)),
		#('SVR', SVR()),
		#('KNN', KNeighborsRegressor())
	]
    
    # Modelle trainieren und Metriken auswerten
    for name, model in models:
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
    
        results.append({
            'Model': name,
            'RMSE_Train': sqrt(mse(y_train, y_train_pred)),
            'MAE_Train': mae(y_train, y_train_pred),
            'R2_Train': r2_score(y_train, y_train_pred),
            'Adj_R2_Train': adj_r2_score(model, X_train, y_train),
            'RMSE_Test': sqrt(mse(y_test, y_test_pred)),
            'MAE_Test': mae(y_test, y_test_pred),
            'R2_Test': r2_score(y_test, y_test_pred),
            'Adj_R2_Test': adj_r2_score(model, X_test, y_test)
        })
    	#print last result
        print(results[-1])

# Konvertieren Sie die Liste von Dictionaries in einen DataFrame
results_df = pd.DataFrame(results)


# Hinzufügen einer Zeile mit den Mittelwerten
#mean_row = results_df.mean(numeric_only=True)
#mean_row_df = pd.DataFrame([mean_row])
#mean_row_df['Model'] = 'Average'
#results_df = pd.concat([results_df, mean_row_df], ignore_index=True)

# Ergebnisse anzeigen
results_df


Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: -8
Train: (52, 72) Train Start Index: -60 Train End Index: 

Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,10360.789689,7107.335678,0.719583,0.719239,5615.587649,4168.895229,0.878611,0.877637
1,RidgeRegression,10360.80006,7107.410855,0.719583,0.719239,5615.163812,4168.610475,0.87863,0.877656
2,LinearRegression,10728.99793,7321.032831,0.696823,0.696451,7963.269649,6067.102231,0.769999,0.768154
3,RidgeRegression,10729.009931,7321.038553,0.696822,0.69645,7963.87141,6067.541031,0.769964,0.768118
4,LinearRegression,9140.563643,5872.763556,0.776232,0.775957,17300.512938,11994.485215,-0.100684,-0.109516
5,RidgeRegression,9140.573493,5872.652997,0.776231,0.775957,17303.89687,11996.286873,-0.101115,-0.10995
6,LinearRegression,9754.990442,6656.622018,0.711504,0.71115,10664.747831,8358.784288,0.738902,0.736807
7,RidgeRegression,9754.995668,6656.408226,0.711504,0.71115,10681.380562,8369.128781,0.738087,0.735985
8,LinearRegression,8310.653572,6098.189243,0.75489,0.75459,17953.265159,10985.11834,0.265732,0.25984
9,RidgeRegression,8310.653784,6098.180926,0.75489,0.754589,17951.713595,10983.760037,0.265859,0.259968


In [54]:
model_list = results_df['Model'].unique()
# create resulte_mean_df
resulte_mean_df = pd.DataFrame(columns=results_df.columns)
# iterate over model_list
for model in model_list:
	# get mean of each model
	mean = results_df[results_df['Model'] == model].mean(numeric_only=True)
	mean['Model'] = model
	# append mean to resulte_mean_df
	resulte_mean_df = pd.concat([resulte_mean_df, pd.DataFrame([mean], columns=results_df.columns)], ignore_index=True)

resulte_mean_df

Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9659.199055,6611.188665,0.731806,0.731478,11899.476645,8314.87706,0.510512,0.506584
1,RidgeRegression,9659.206587,6611.138311,0.731806,0.731477,11903.20525,8317.065439,0.510285,0.506355


TODO: Test Cyclical Feature Encoding

Cyclical encoding is a technique used to transform cyclical features like hours of the day, days of the week, months in a year, or calendar weeks, so that these features can be properly understood and utilized by machine learning models. The key challenge with cyclical data is that they wrap around: the highest value is succeeded by the lowest value, forming a cycle. Traditional numerical encoding of these values can mislead a model into thinking that the cycle is linear and that values at the end of the cycle are far apart when, in reality, they are close.

How Cyclical Encoding Works:
The most common way to encode cyclical data is by using sine and cosine transformations. This approach projects each cyclical feature onto a circle such that the beginning and end of the cycle meet. Here's how it's typically done:

Sine and Cosine Transformation: Each cyclical value is transformed into two features using sine and cosine functions. This maps the data points onto a unit circle and preserves their cyclical nature.

Example: For a month feature (with values from 1 to 12), the transformation would be:

python
Copy code
df['month_sin'] = np.sin((df['month']-1)*(2.*np.pi/12))
df['month_cos'] = np.cos((df['month']-1)*(2.*np.pi/12))
This creates two new columns, month_sin and month_cos, which represent the sine and cosine values of the month on the unit circle.

Advantages of Cyclical Encoding:
Preserves Cyclical Nature: It correctly represents the proximity of values at the end and start of the cycle. For example, December (12) is close to January (1).

Continuous Representation: It provides a smooth and continuous representation, which is useful for many machine learning models.

Efficient Use of Features: Unlike one-hot encoding, which would create many sparse columns for each possible value, sine and cosine transformations keep the feature space more compact.

When to Use Cyclical Encoding:
Cyclical encoding is particularly useful in time series analysis or when the cyclical aspect of a feature is important for the model. Common scenarios include:

Time of Day: Hours of the day for predicting traffic patterns.
Days of the Week: Days for weekly sales forecasts.
Seasons: Months or calendar weeks for seasonal effects in data.
Implementation Considerations:
Standardization: After applying sine and cosine transformations, you might need to standardize these features, especially if you're using models sensitive to feature scaling.

Model Compatibility: While cyclical encoding is beneficial for many models, it's always good practice to test and validate its impact on your specific model and dataset.

In summary, cyclical encoding is a powerful technique for handling cyclical features in machine learning, enabling models to better understand and utilize the patterns within such data.