# Capstone Projekt Rossmann

# Feature Engineering

## Functions needed for testing

### Test models with test and tain data. Test includes the last 8 weeks from each store

In [42]:
import pandas as pd
import numpy as np
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from pandas.api.types import infer_dtype

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from scikeras.wrappers import KerasRegressor

from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score
from math import sqrt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

pd.set_option('display.max_columns', None)

In [48]:
## Test models with test and tain data. Test includes the last 8 weeks from each store

def build_neural_network(X_train):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model





def testModelsTestSplit8W(df, scaler):
	train_data = []
	test_data = []

	# Group by store and split into training and test data
	amount_test_weeks = 8
	for store_id, group in df.groupby('Store'):
		train_data.append(group[: -amount_test_weeks])
		test_data.append(group[-amount_test_weeks:])

	# Combine the list entries to one dataframe
	train_df = pd.concat(train_data)
	test_df = pd.concat(test_data)

	X_train = train_df.drop(columns=['Future_Sales'])
	y_train = train_df['Future_Sales']
	X_test = test_df.drop(columns=['Future_Sales'])
	y_test = test_df['Future_Sales']

	# Scaling of the data
	if scaler:
		X_train = scaler.fit_transform(X_train)
		X_test = scaler.transform(X_test)

	def adj_r2_score(model, X, y):
		n = X.shape[0]
		p = X.shape[1]
		r2 = r2_score(y, model.predict(X))
		return 1 - (1 - r2) * ((n - 1) / (n - p - 1))

	# Defining the models to test
	models = [
		('LinearRegression', LinearRegression(n_jobs=-1)),
		('XGBRegressor', XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=3, learning_rate=0.1, n_jobs=-1, random_state=42, device="cuda")),
		#('GradientBoostingRegressor', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
  		('NeuralNetwork', KerasRegressor(build_fn=build_neural_network(X_train), epochs=100, batch_size=10, verbose=0)),
  		('MLPRegressor', MLPRegressor(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=200, shuffle=False, random_state=42))
		#('RidgeRegression', Ridge(random_state=42)),
		#('LassoRegression', Lasso(random_state=42)),
		#('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42)),
		#('RandomForestRegressor', RandomForestRegressor(n_jobs=-1, max_depth=10, random_state=42, n_estimators=100)),
		#('SVR', SVR()),
		#('KNN', KNeighborsRegressor())
	]

	results = []
	# Train models and calculate metrics
	for name, model in models:
		model.fit(X_train, y_train)
		y_train_pred = model.predict(X_train)
		y_test_pred = model.predict(X_test)

		results.append({
			'Model': name,
			'RMSE_Train': sqrt(mse(y_train, y_train_pred)),
			'MAE_Train': mae(y_train, y_train_pred),
			'R2_Train': r2_score(y_train, y_train_pred),
			'Adj_R2_Train': adj_r2_score(model, X_train, y_train),
			'RMSE_Test': sqrt(mse(y_test, y_test_pred)),
			'MAE_Test': mae(y_test, y_test_pred),
			'R2_Test': r2_score(y_test, y_test_pred),
			'Adj_R2_Test': adj_r2_score(model, X_test, y_test)
		})
		#print last result
		print(results[-1])

	results_df = pd.DataFrame(results)
	return results_df

In [8]:
#!pip install scikeras


### Creates x splits in test and train where the last 8 weeks of each store are included in the respective test split and the splits are distributed evenly using gap

In [9]:
#Creates x splits in test and train where the last 8 weeks of each store are included in the respective test split and the splits are distributed
# evenly using gap

def testModelsCV8W(df, scaler):

    n_splits = 5
    window_size = 8
    total_weeks =109
    train_size = window_size / 0.2
    gap = int((total_weeks - window_size - train_size) // (n_splits))

    results = []

    for split in range(n_splits):
        train_data = []
        test_data = []

        for store_id, group in df.groupby('Store'):
            # calculate start and end index for test data
            if split == 0:
                test_start_index = -window_size
                test_df_store = group[test_start_index:] # No end index for the first split
            else:
                test_start_index = -(window_size + gap * split)
                test_end_index = test_start_index + window_size
                test_df_store = group[test_start_index:test_end_index]
                print("test:", test_df_store.shape, "Test Start Index:", test_start_index, "Test End Index:", test_end_index)

            train_start_index = -int(-test_start_index + gap + train_size)
            train_df_store = group[train_start_index:test_start_index]
            print("Train:", train_df_store.shape, "Train Start Index:", train_start_index, "Train End Index:", test_start_index)
            # Check if test set contains data
            if not test_df_store.empty:
                train_data.append(train_df_store)
                test_data.append(test_df_store)
            else:
                print(f"Store {store_id} has not enough data for splitting {split}")

        # Combine the list entries to one dataframe
        train_df_combined = pd.concat(train_data)
        test_df_combined = pd.concat(test_data)

        # Create feature and target data frames
        X_train = train_df_combined.drop(columns=['Future_Sales'])
        y_train = train_df_combined['Future_Sales']
        X_test = test_df_combined.drop(columns=['Future_Sales'])
        y_test = test_df_combined['Future_Sales']

        # Scaling of the data
        if scaler:
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

        def adj_r2_score(model, X, y):
            n = X.shape[0]
            p = X.shape[1]
            r2 = r2_score(y, model.predict(X))
            return 1 - (1 - r2) * ((n - 1) / (n - p - 1))

    	# Defining the models to test
        models = [
            ('LinearRegression', LinearRegression(n_jobs=-1)),
            #('RidgeRegression', Ridge(random_state=42)),
            #('LassoRegression', Lasso(random_state=42)),
            #('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42)),
            #('RandomForestRegressor', RandomForestRegressor(n_jobs=-1, max_depth=10, random_state=42, n_estimators=100)),
            #('SVR', SVR()),
            #('KNN', KNeighborsRegressor())
        ]

        # Train models and calculate metrics
        for name, model in models:
            model.fit(X_train, y_train)
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            results.append({
                'Model': name,
                'RMSE_Train': sqrt(mse(y_train, y_train_pred)),
                'MAE_Train': mae(y_train, y_train_pred),
                'R2_Train': r2_score(y_train, y_train_pred),
                'Adj_R2_Train': adj_r2_score(model, X_train, y_train),
                'RMSE_Test': sqrt(mse(y_test, y_test_pred)),
                'MAE_Test': mae(y_test, y_test_pred),
                'R2_Test': r2_score(y_test, y_test_pred),
                'Adj_R2_Test': adj_r2_score(model, X_test, y_test)
            })
            #print last result
            print(results[-1])

    results_df = pd.DataFrame(results)

    # calculate mean of all splits
    model_list = results_df['Model'].unique()
    # create resulte_mean_df
    resulte_mean_df = pd.DataFrame(columns=results_df.columns)
    # iterate over model_list
    for model in model_list:
        # get mean of each model
        mean = results_df[results_df['Model'] == model].mean(numeric_only=True)
        mean['Model'] = model
        # append mean to resulte_mean_df
        resulte_mean_df = pd.concat([resulte_mean_df, pd.DataFrame([mean], columns=results_df.columns)], ignore_index=True)

    return results_df, resulte_mean_df


## Feature Engineering

In [11]:
df = pd.read_csv('weekly_sales_with_store_info.csv')
df['Date'] = pd.to_datetime(df['Date'])

In [12]:
print(df.info())
df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150525 entries, 0 to 150524
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Store                      150525 non-null  int64         
 1   Date                       150525 non-null  datetime64[ns]
 2   CW                         150525 non-null  int64         
 3   Month                      150525 non-null  int64         
 4   Year                       150525 non-null  int64         
 5   DayOfWeek                  150525 non-null  int64         
 6   Sales                      150525 non-null  int64         
 7   SalesPerCustomer           145809 non-null  float64       
 8   SalesPerOpenDay            145815 non-null  float64       
 9   Customers                  150525 non-null  int64         
 10  CustomersPerOpenDay        145815 non-null  float64       
 11  Open                       150525 non-null  int64   

Unnamed: 0,Store,Date,CW,Month,Year,DayOfWeek,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,IsCompetition,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Promo2Member,Promo2Active
99138,735,2013-12-08,49,12,2013,6,49876,12.003851,8312.666667,4155,692.5,6,5,1,0,0,0,0,0,d,c,1920.0,4.0,2005.0,1,0,,,,0,0
18684,139,2014-01-19,3,1,2014,6,29864,11.277946,4977.333333,2648,441.333333,6,0,0,0,0,0,0,0,a,a,1700.0,1.0,2008.0,1,1,14.0,2011.0,"Jan,Apr,Jul,Oct",1,1
6796,51,2013-11-24,47,11,2013,6,38791,13.58704,6465.166667,2855,475.833333,6,5,1,0,0,0,0,0,a,c,10570.0,7.0,2013.0,1,1,9.0,2011.0,"Jan,Apr,Jul,Oct",1,0
22660,168,2015-03-22,12,3,2015,6,50900,10.492682,8483.333333,4851,808.5,6,5,1,0,0,0,0,0,a,a,12540.0,,,0,0,,,,0,0
41591,309,2013-03-24,12,3,2013,6,47129,12.146649,7854.833333,3880,646.666667,6,5,1,0,0,0,0,0,d,a,8740.0,,,0,1,37.0,2009.0,"Feb,May,Aug,Nov",1,0


### Handle Missing Values

In [13]:
df.isna().sum()

Store                            0
Date                             0
CW                               0
Month                            0
Year                             0
DayOfWeek                        0
Sales                            0
SalesPerCustomer              4716
SalesPerOpenDay               4710
Customers                        0
CustomersPerOpenDay           4710
Open                             0
Promo                            0
IsPromo                          0
StateHoliday                     0
IsStateHoliday                   0
SchoolHoliday                    0
IsSchoolHoliday                  0
NumStateHoliday                  0
StoreType                        0
Assortment                       0
CompetitionDistance            405
CompetitionOpenSinceMonth    47790
CompetitionOpenSinceYear     47790
IsCompetition                    0
Promo2                           0
Promo2SinceWeek              73440
Promo2SinceYear              73440
PromoInterval       

#### SalesPerCustomer, SalesPerOpenday, CustomersPerOpenday

In [14]:
# As the store were closed, we can fill the nans with 0

# fill nans with 0 for listed columns
columns_to_fill = ['SalesPerCustomer', 'SalesPerOpenDay', 'CustomersPerOpenDay']
df_nans_handeled = df.fillna({col: 0 for col in columns_to_fill})
df_nans_handeled


Unnamed: 0,Store,Date,CW,Month,Year,DayOfWeek,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,IsCompetition,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Promo2Member,Promo2Active
0,1,2013-01-06,1,1,2013,6,19340,7.736000,4835.000000,2500,625.000000,4,0,0,a,1,6,1,1,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
1,1,2013-01-13,2,1,2013,6,32952,8.410413,5492.000000,3918,653.000000,6,5,1,0,0,5,1,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
2,1,2013-01-20,3,1,2013,6,25978,7.602575,4329.666667,3417,569.500000,6,0,0,0,0,0,0,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
3,1,2013-01-27,4,1,2013,6,33071,8.563180,5511.833333,3862,643.666667,6,5,1,0,0,0,0,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
4,1,2013-02-03,5,2,2013,6,28693,8.057568,4782.166667,3561,593.500000,6,0,0,0,0,0,0,0,c,a,1270.0,9.0,2008.0,1,0,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150520,1115,2015-07-05,27,7,2015,6,48130,16.140174,8021.666667,2982,497.000000,6,5,1,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0
150521,1115,2015-07-12,28,7,2015,6,36233,14.315685,6038.833333,2531,421.833333,6,0,0,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0
150522,1115,2015-07-19,29,7,2015,6,45927,15.023553,7654.500000,3057,509.500000,6,5,1,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0
150523,1115,2015-07-26,30,7,2015,6,35362,14.122204,5893.666667,2504,417.333333,6,0,0,0,0,0,0,0,d,c,5350.0,,,0,1,22.0,2012.0,"Mar,Jun,Sept,Dec",1,0


In [15]:
df_nans_handeled.isna().sum()

Store                            0
Date                             0
CW                               0
Month                            0
Year                             0
DayOfWeek                        0
Sales                            0
SalesPerCustomer                 0
SalesPerOpenDay                  0
Customers                        0
CustomersPerOpenDay              0
Open                             0
Promo                            0
IsPromo                          0
StateHoliday                     0
IsStateHoliday                   0
SchoolHoliday                    0
IsSchoolHoliday                  0
NumStateHoliday                  0
StoreType                        0
Assortment                       0
CompetitionDistance            405
CompetitionOpenSinceMonth    47790
CompetitionOpenSinceYear     47790
IsCompetition                    0
Promo2                           0
Promo2SinceWeek              73440
Promo2SinceYear              73440
PromoInterval       

#### CompetitionDistance

In [16]:
# Stores with no CompetitionDistance information
print("Stores with no CompetitionDistance information:", df_nans_handeled[(df_nans_handeled['CompetitionDistance'].isna())]['Store'].unique())

print("StoreType of store 291", df_nans_handeled[(df_nans_handeled['Store'] == 291)]['StoreType'].unique())
print("StoreType of store 622", df_nans_handeled[(df_nans_handeled['Store'] == 622)]['StoreType'].unique())
print("StoreType of store 879", df_nans_handeled[(df_nans_handeled['Store'] == 879)]['StoreType'].unique())

Stores with no CompetitionDistance information: [291 622 879]
StoreType of store 291 ['d']
StoreType of store 622 ['a']
StoreType of store 879 ['d']


In [17]:
# As store 291, 622 and 879 have no CompetitionDistance information, we can fill them with the median value of the column

# median competition distance for store type a
median_competition_distance_a = df_nans_handeled[(df_nans_handeled['StoreType'] == 'a')]['CompetitionDistance'].median()
# median competition distance for store type d
median_competition_distance_d = df_nans_handeled[(df_nans_handeled['StoreType'] == 'd')]['CompetitionDistance'].median()

# fill nans for storetype a with median_competition_distance_a
df_nans_handeled.loc[(df_nans_handeled['Store'] == 291), 'CompetitionDistance'] = median_competition_distance_a
# fill nans for storetype d with median_competition_distance_d
df_nans_handeled.loc[(df_nans_handeled['Store'] == 622), 'CompetitionDistance'] = median_competition_distance_d
# fill nans for storetype d with median_competition_distance_d
df_nans_handeled.loc[(df_nans_handeled['Store'] == 879), 'CompetitionDistance'] = median_competition_distance_d



In [18]:
df_nans_handeled.isna().sum()

Store                            0
Date                             0
CW                               0
Month                            0
Year                             0
DayOfWeek                        0
Sales                            0
SalesPerCustomer                 0
SalesPerOpenDay                  0
Customers                        0
CustomersPerOpenDay              0
Open                             0
Promo                            0
IsPromo                          0
StateHoliday                     0
IsStateHoliday                   0
SchoolHoliday                    0
IsSchoolHoliday                  0
NumStateHoliday                  0
StoreType                        0
Assortment                       0
CompetitionDistance              0
CompetitionOpenSinceMonth    47790
CompetitionOpenSinceYear     47790
IsCompetition                    0
Promo2                           0
Promo2SinceWeek              73440
Promo2SinceYear              73440
PromoInterval       

#### CompetitionOpenSinceMonth, CompetitionOpenSinceYear

In [19]:
# CompetitionOpenSinceMonth and CompetitionOpenSinceYear can be deleted as they are reflected in IsCompetition
df_nans_handeled = df_nans_handeled.drop(columns=['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'])

In [20]:
df_nans_handeled.isna().sum()

Store                      0
Date                       0
CW                         0
Month                      0
Year                       0
DayOfWeek                  0
Sales                      0
SalesPerCustomer           0
SalesPerOpenDay            0
Customers                  0
CustomersPerOpenDay        0
Open                       0
Promo                      0
IsPromo                    0
StateHoliday               0
IsStateHoliday             0
SchoolHoliday              0
IsSchoolHoliday            0
NumStateHoliday            0
StoreType                  0
Assortment                 0
CompetitionDistance        0
IsCompetition              0
Promo2                     0
Promo2SinceWeek        73440
Promo2SinceYear        73440
PromoInterval          73440
Promo2Member               0
Promo2Active               0
dtype: int64

#### Promo2SinceWeek, Promo2SinceYear

In [21]:
# Promo2SinceWeek and Promo2SinceYear can be deleted as they are reflected in Promo2Member
df_nans_handeled = df_nans_handeled.drop(columns=['Promo2SinceWeek', 'Promo2SinceYear'])

In [22]:
df_nans_handeled.isna().sum()

Store                      0
Date                       0
CW                         0
Month                      0
Year                       0
DayOfWeek                  0
Sales                      0
SalesPerCustomer           0
SalesPerOpenDay            0
Customers                  0
CustomersPerOpenDay        0
Open                       0
Promo                      0
IsPromo                    0
StateHoliday               0
IsStateHoliday             0
SchoolHoliday              0
IsSchoolHoliday            0
NumStateHoliday            0
StoreType                  0
Assortment                 0
CompetitionDistance        0
IsCompetition              0
Promo2                     0
PromoInterval          73440
Promo2Member               0
Promo2Active               0
dtype: int64

#### PromoInterval

In [23]:
df_nans_handeled[(df_nans_handeled['Promo2'] == 1) & (df_nans_handeled['PromoInterval'].isna())]

Unnamed: 0,Store,Date,CW,Month,Year,DayOfWeek,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,IsCompetition,Promo2,PromoInterval,Promo2Member,Promo2Active


In [24]:
# As if the store is not participating in Promo2, PromoInterval is 0, we can fill the nans with 0
df_nans_handeled['PromoInterval'] = df_nans_handeled['PromoInterval'].fillna(0)

In [25]:
df_nans_handeled.isna().sum()

Store                  0
Date                   0
CW                     0
Month                  0
Year                   0
DayOfWeek              0
Sales                  0
SalesPerCustomer       0
SalesPerOpenDay        0
Customers              0
CustomersPerOpenDay    0
Open                   0
Promo                  0
IsPromo                0
StateHoliday           0
IsStateHoliday         0
SchoolHoliday          0
IsSchoolHoliday        0
NumStateHoliday        0
StoreType              0
Assortment             0
CompetitionDistance    0
IsCompetition          0
Promo2                 0
PromoInterval          0
Promo2Member           0
Promo2Active           0
dtype: int64

### Remove not needed Features

In [26]:
# Date is an object and is reflected by CW, Month and Year
df_nans_handeled = df_nans_handeled.drop(columns=['Date'])

# DayOfWeek is not relevant in weekly data
df_nans_handeled = df_nans_handeled.drop(columns=['DayOfWeek'])

df_nans_handeled

Unnamed: 0,Store,CW,Month,Year,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,StateHoliday,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,StoreType,Assortment,CompetitionDistance,IsCompetition,Promo2,PromoInterval,Promo2Member,Promo2Active
0,1,1,1,2013,19340,7.736000,4835.000000,2500,625.000000,4,0,0,a,1,6,1,1,c,a,1270.0,1,0,0,0,0
1,1,2,1,2013,32952,8.410413,5492.000000,3918,653.000000,6,5,1,0,0,5,1,0,c,a,1270.0,1,0,0,0,0
2,1,3,1,2013,25978,7.602575,4329.666667,3417,569.500000,6,0,0,0,0,0,0,0,c,a,1270.0,1,0,0,0,0
3,1,4,1,2013,33071,8.563180,5511.833333,3862,643.666667,6,5,1,0,0,0,0,0,c,a,1270.0,1,0,0,0,0
4,1,5,2,2013,28693,8.057568,4782.166667,3561,593.500000,6,0,0,0,0,0,0,0,c,a,1270.0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150520,1115,27,7,2015,48130,16.140174,8021.666667,2982,497.000000,6,5,1,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0
150521,1115,28,7,2015,36233,14.315685,6038.833333,2531,421.833333,6,0,0,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0
150522,1115,29,7,2015,45927,15.023553,7654.500000,3057,509.500000,6,5,1,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0
150523,1115,30,7,2015,35362,14.122204,5893.666667,2504,417.333333,6,0,0,0,0,0,0,0,d,c,5350.0,0,1,"Mar,Jun,Sept,Dec",1,0


### Categorical Feature Encoding

In [28]:
df_nans_handeled.select_dtypes(include='object').columns

Index(['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval'], dtype='object')

In [29]:
# check if a column contains mixed data typesrom pandas.api.types import infer_dtype

for col in ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']:
    dtype = infer_dtype(df_nans_handeled[col])
    print(f"Data type of {col}: {dtype}")


Data type of StateHoliday: string
Data type of StoreType: string
Data type of Assortment: string
Data type of PromoInterval: mixed-integer


In [30]:
# Convert mixed columns
cols_to_convert = ['PromoInterval']
df_nans_handeled[cols_to_convert] = df_nans_handeled[cols_to_convert].astype(str)

In [31]:
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore': to avoid error if the training data contains classes/categories that are not represented in the training data
# sparse=False: ensures that the encoded columns are returned as a NumPy array (instead of a sparse matrix).
OneHotEnc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# it is important to pass only the categorical columns, not the whole dataframe
encoded_array = OneHotEnc.fit_transform(df_nans_handeled[['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']])
tmp_cat = pd.DataFrame(encoded_array, columns=OneHotEnc.get_feature_names_out(), index=df_nans_handeled.index)
df_nans_handeled_cat = pd.concat([df_nans_handeled.select_dtypes(include=['number']), tmp_cat], axis=1)
df_nans_handeled_cat

Unnamed: 0,Store,CW,Month,Year,Sales,SalesPerCustomer,SalesPerOpenDay,Customers,CustomersPerOpenDay,Open,Promo,IsPromo,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,CompetitionDistance,IsCompetition,Promo2,Promo2Member,Promo2Active,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec"
0,1,1,1,2013,19340,7.736000,4835.000000,2500,625.000000,4,0,0,1,6,1,1,1270.0,1,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,2,1,2013,32952,8.410413,5492.000000,3918,653.000000,6,5,1,0,5,1,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,3,1,2013,25978,7.602575,4329.666667,3417,569.500000,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1,4,1,2013,33071,8.563180,5511.833333,3862,643.666667,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1,5,2,2013,28693,8.057568,4782.166667,3561,593.500000,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150520,1115,27,7,2015,48130,16.140174,8021.666667,2982,497.000000,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
150521,1115,28,7,2015,36233,14.315685,6038.833333,2531,421.833333,6,0,0,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
150522,1115,29,7,2015,45927,15.023553,7654.500000,3057,509.500000,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
150523,1115,30,7,2015,35362,14.122204,5893.666667,2504,417.333333,6,0,0,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [32]:
# Creating Lag-Features for these columns
lag_columns = ['Sales', 'SalesPerCustomer', 'SalesPerOpenDay', 'Customers', 'CustomersPerOpenDay']
n_lags = 8  # Anzahl der zu erstellenden Lag-Features
n_periods_for_ma = 4  # amount of periods for moving average
n_periods_for_ma2 = 6
n_periods_for_ma3 = 8

lag_features = []
for col in lag_columns:
    store_groups = df_nans_handeled_cat.groupby('Store')[col]
    for lag in range(1, n_lags + 1):
        lag_col_name = f'{col}_Lag_{lag}'
        # create the lag feature
        lag_feature = store_groups.shift(lag).rename(lag_col_name)
        lag_features.append(lag_feature)

        # create the moving average feature
        ma_col_name = f'{lag_col_name}_MA_{n_periods_for_ma}'
        ma_feature = lag_feature.rolling(window=n_periods_for_ma).mean().rename(ma_col_name)
        lag_features.append(ma_feature)

        ma2_col_name = f'{lag_col_name}_MA_{n_periods_for_ma2}'
        ma2_feature = lag_feature.rolling(window=n_periods_for_ma2).mean().rename(ma2_col_name)
        lag_features.append(ma2_feature)

        ma3_col_name = f'{lag_col_name}_MA_{n_periods_for_ma3}'
        ma3_feature = lag_feature.rolling(window=n_periods_for_ma3).mean().rename(ma3_col_name)
        lag_features.append(ma3_feature)

# Concatenate the lag features
features_df = pd.concat(lag_features, axis=1)
df_nans_handeled_cat = pd.concat([df_nans_handeled_cat, features_df], axis=1)

# Add the future sales
future_sales = df_nans_handeled_cat.groupby('Store')['Sales'].shift(-8).rename('Future_Sales')
df_nans_handeled_cat = pd.concat([df_nans_handeled_cat, future_sales], axis=1)

# Remove rows with NaN values that were created by the shifting and remove the original columns
df_nans_handeled_cat = df_nans_handeled_cat.dropna().drop(columns=lag_columns)
df_nans_handeled_cat


Unnamed: 0,Store,CW,Month,Year,Open,Promo,IsPromo,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,CompetitionDistance,IsCompetition,Promo2,Promo2Member,Promo2Active,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",Sales_Lag_1,Sales_Lag_1_MA_4,Sales_Lag_1_MA_6,Sales_Lag_1_MA_8,Sales_Lag_2,Sales_Lag_2_MA_4,Sales_Lag_2_MA_6,Sales_Lag_2_MA_8,Sales_Lag_3,Sales_Lag_3_MA_4,Sales_Lag_3_MA_6,Sales_Lag_3_MA_8,Sales_Lag_4,Sales_Lag_4_MA_4,Sales_Lag_4_MA_6,Sales_Lag_4_MA_8,Sales_Lag_5,Sales_Lag_5_MA_4,Sales_Lag_5_MA_6,Sales_Lag_5_MA_8,Sales_Lag_6,Sales_Lag_6_MA_4,Sales_Lag_6_MA_6,Sales_Lag_6_MA_8,Sales_Lag_7,Sales_Lag_7_MA_4,Sales_Lag_7_MA_6,Sales_Lag_7_MA_8,Sales_Lag_8,Sales_Lag_8_MA_4,Sales_Lag_8_MA_6,Sales_Lag_8_MA_8,SalesPerCustomer_Lag_1,SalesPerCustomer_Lag_1_MA_4,SalesPerCustomer_Lag_1_MA_6,SalesPerCustomer_Lag_1_MA_8,SalesPerCustomer_Lag_2,SalesPerCustomer_Lag_2_MA_4,SalesPerCustomer_Lag_2_MA_6,SalesPerCustomer_Lag_2_MA_8,SalesPerCustomer_Lag_3,SalesPerCustomer_Lag_3_MA_4,SalesPerCustomer_Lag_3_MA_6,SalesPerCustomer_Lag_3_MA_8,SalesPerCustomer_Lag_4,SalesPerCustomer_Lag_4_MA_4,SalesPerCustomer_Lag_4_MA_6,SalesPerCustomer_Lag_4_MA_8,SalesPerCustomer_Lag_5,SalesPerCustomer_Lag_5_MA_4,SalesPerCustomer_Lag_5_MA_6,SalesPerCustomer_Lag_5_MA_8,SalesPerCustomer_Lag_6,SalesPerCustomer_Lag_6_MA_4,SalesPerCustomer_Lag_6_MA_6,SalesPerCustomer_Lag_6_MA_8,SalesPerCustomer_Lag_7,SalesPerCustomer_Lag_7_MA_4,SalesPerCustomer_Lag_7_MA_6,SalesPerCustomer_Lag_7_MA_8,SalesPerCustomer_Lag_8,SalesPerCustomer_Lag_8_MA_4,SalesPerCustomer_Lag_8_MA_6,SalesPerCustomer_Lag_8_MA_8,SalesPerOpenDay_Lag_1,SalesPerOpenDay_Lag_1_MA_4,SalesPerOpenDay_Lag_1_MA_6,SalesPerOpenDay_Lag_1_MA_8,SalesPerOpenDay_Lag_2,SalesPerOpenDay_Lag_2_MA_4,SalesPerOpenDay_Lag_2_MA_6,SalesPerOpenDay_Lag_2_MA_8,SalesPerOpenDay_Lag_3,SalesPerOpenDay_Lag_3_MA_4,SalesPerOpenDay_Lag_3_MA_6,SalesPerOpenDay_Lag_3_MA_8,SalesPerOpenDay_Lag_4,SalesPerOpenDay_Lag_4_MA_4,SalesPerOpenDay_Lag_4_MA_6,SalesPerOpenDay_Lag_4_MA_8,SalesPerOpenDay_Lag_5,SalesPerOpenDay_Lag_5_MA_4,SalesPerOpenDay_Lag_5_MA_6,SalesPerOpenDay_Lag_5_MA_8,SalesPerOpenDay_Lag_6,SalesPerOpenDay_Lag_6_MA_4,SalesPerOpenDay_Lag_6_MA_6,SalesPerOpenDay_Lag_6_MA_8,SalesPerOpenDay_Lag_7,SalesPerOpenDay_Lag_7_MA_4,SalesPerOpenDay_Lag_7_MA_6,SalesPerOpenDay_Lag_7_MA_8,SalesPerOpenDay_Lag_8,SalesPerOpenDay_Lag_8_MA_4,SalesPerOpenDay_Lag_8_MA_6,SalesPerOpenDay_Lag_8_MA_8,Customers_Lag_1,Customers_Lag_1_MA_4,Customers_Lag_1_MA_6,Customers_Lag_1_MA_8,Customers_Lag_2,Customers_Lag_2_MA_4,Customers_Lag_2_MA_6,Customers_Lag_2_MA_8,Customers_Lag_3,Customers_Lag_3_MA_4,Customers_Lag_3_MA_6,Customers_Lag_3_MA_8,Customers_Lag_4,Customers_Lag_4_MA_4,Customers_Lag_4_MA_6,Customers_Lag_4_MA_8,Customers_Lag_5,Customers_Lag_5_MA_4,Customers_Lag_5_MA_6,Customers_Lag_5_MA_8,Customers_Lag_6,Customers_Lag_6_MA_4,Customers_Lag_6_MA_6,Customers_Lag_6_MA_8,Customers_Lag_7,Customers_Lag_7_MA_4,Customers_Lag_7_MA_6,Customers_Lag_7_MA_8,Customers_Lag_8,Customers_Lag_8_MA_4,Customers_Lag_8_MA_6,Customers_Lag_8_MA_8,CustomersPerOpenDay_Lag_1,CustomersPerOpenDay_Lag_1_MA_4,CustomersPerOpenDay_Lag_1_MA_6,CustomersPerOpenDay_Lag_1_MA_8,CustomersPerOpenDay_Lag_2,CustomersPerOpenDay_Lag_2_MA_4,CustomersPerOpenDay_Lag_2_MA_6,CustomersPerOpenDay_Lag_2_MA_8,CustomersPerOpenDay_Lag_3,CustomersPerOpenDay_Lag_3_MA_4,CustomersPerOpenDay_Lag_3_MA_6,CustomersPerOpenDay_Lag_3_MA_8,CustomersPerOpenDay_Lag_4,CustomersPerOpenDay_Lag_4_MA_4,CustomersPerOpenDay_Lag_4_MA_6,CustomersPerOpenDay_Lag_4_MA_8,CustomersPerOpenDay_Lag_5,CustomersPerOpenDay_Lag_5_MA_4,CustomersPerOpenDay_Lag_5_MA_6,CustomersPerOpenDay_Lag_5_MA_8,CustomersPerOpenDay_Lag_6,CustomersPerOpenDay_Lag_6_MA_4,CustomersPerOpenDay_Lag_6_MA_6,CustomersPerOpenDay_Lag_6_MA_8,CustomersPerOpenDay_Lag_7,CustomersPerOpenDay_Lag_7_MA_4,CustomersPerOpenDay_Lag_7_MA_6,CustomersPerOpenDay_Lag_7_MA_8,CustomersPerOpenDay_Lag_8,CustomersPerOpenDay_Lag_8_MA_4,CustomersPerOpenDay_Lag_8_MA_6,CustomersPerOpenDay_Lag_8_MA_8,Future_Sales
15,1,16,4,2013,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,28179.0,31293.25,31470.666667,31323.500,37016.0,31218.50,31556.333333,31048.375,27027.0,30907.25,30898.833333,30540.375,32951.0,31323.75,30724.000000,29579.500,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,7.518410,8.064789,8.110372,8.160373,8.484071,8.101601,8.200232,8.170893,7.764148,8.164938,8.213417,8.161686,8.492526,8.238293,8.186488,8.158168,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,4696.500000,5215.541667,5245.111111,5220.583333,6169.333333,5203.083333,5259.388889,5174.729167,4504.500000,5151.208333,5149.805556,5090.062500,5491.833333,5220.625000,5120.666667,5131.375000,3627.0,3711.25,3826.000000,3789.625,3017.0,3741.50,3801.666667,3790.875,3978.0,4078.00,3945.500000,3925.500,4223.0,3953.75,3888.666667,3873.375,3748.0,3868.00,3867.166667,3828.250,4363.0,3840.25,3836.000000,3786.875,3481.0,3773.00,3752.500000,3731.250,3880.0,3793.00,3741.833333,3608.625,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,624.666667,644.666667,644.527778,638.041667,727.166667,640.041667,639.333333,631.145833,580.166667,628.833333,625.416667,621.875000,646.666667,632.166667,623.638889,627.479167,24215.0
16,1,17,4,2013,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,22552.0,27944.00,29246.000000,29939.875,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,28179.0,31293.25,31470.666667,31323.500,37016.0,31218.50,31556.333333,31048.375,27027.0,30907.25,30898.833333,30540.375,7.560174,8.162872,8.096870,8.103680,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,7.518410,8.064789,8.110372,8.160373,8.484071,8.101601,8.200232,8.170893,7.764148,8.164938,8.213417,8.161686,3758.666667,5143.658333,5198.550000,5233.141667,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,4696.500000,5215.541667,5245.111111,5220.583333,6169.333333,5203.083333,5259.388889,5174.729167,4504.500000,5151.208333,5149.805556,5090.062500,2983.0,3401.25,3596.000000,3677.500,3627.0,3711.25,3826.000000,3789.625,3017.0,3741.50,3801.666667,3790.875,3978.0,4078.00,3945.500000,3925.500,4223.0,3953.75,3888.666667,3873.375,3748.0,3868.00,3867.166667,3828.250,4363.0,3840.25,3836.000000,3786.875,3481.0,3773.00,3752.500000,3731.250,497.166667,625.166667,638.194444,642.062500,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,624.666667,644.666667,644.527778,638.041667,727.166667,640.041667,639.333333,631.145833,580.166667,628.833333,625.416667,621.875000,28675.0
17,1,18,5,2013,5,5,1,1,0,0,1,1270.0,1,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,28979.0,26565.75,29379.333333,30183.875,22552.0,27944.00,29246.000000,29939.875,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,28179.0,31293.25,31470.666667,31323.500,37016.0,31218.50,31556.333333,31048.375,8.402146,8.095737,8.244159,8.183429,7.560174,8.162872,8.096870,8.103680,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,7.518410,8.064789,8.110372,8.160373,8.484071,8.101601,8.200232,8.170893,4829.833333,4626.516667,5220.772222,5273.808333,3758.666667,5143.658333,5198.550000,5233.141667,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,4696.500000,5215.541667,5245.111111,5220.583333,6169.333333,5203.083333,5259.388889,5174.729167,3449.0,3269.00,3546.166667,3673.500,2983.0,3401.25,3596.000000,3677.500,3627.0,3711.25,3826.000000,3789.625,3017.0,3741.50,3801.666667,3790.875,3978.0,4078.00,3945.500000,3925.500,4223.0,3953.75,3888.666667,3873.375,3748.0,3868.00,3867.166667,3828.250,4363.0,3840.25,3836.000000,3786.875,574.833333,569.975000,629.888889,641.395833,497.166667,625.166667,638.194444,642.062500,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,624.666667,644.666667,644.527778,638.041667,727.166667,640.041667,639.333333,631.145833,25716.0
18,1,19,5,2013,5,0,0,1,0,0,1,1270.0,1,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,30171.0,28141.75,28487.666667,29328.250,28979.0,26565.75,29379.333333,30183.875,22552.0,27944.00,29246.000000,29939.875,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,28179.0,31293.25,31470.666667,31323.500,8.735090,8.301799,8.298121,8.214807,8.402146,8.095737,8.244159,8.183429,7.560174,8.162872,8.096870,8.103680,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,7.518410,8.064789,8.110372,8.160373,6034.200000,4941.716667,5239.777778,5256.916667,4829.833333,4626.516667,5220.772222,5273.808333,3758.666667,5143.658333,5198.550000,5233.141667,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,4696.500000,5215.541667,5245.111111,5220.583333,3454.0,3378.25,3418.000000,3559.875,3449.0,3269.00,3546.166667,3673.500,2983.0,3401.25,3596.000000,3677.500,3627.0,3711.25,3826.000000,3789.625,3017.0,3741.50,3801.666667,3790.875,3978.0,4078.00,3945.500000,3925.500,4223.0,3953.75,3888.666667,3873.375,3748.0,3868.00,3867.166667,3828.250,690.800000,591.825000,627.716667,636.850000,574.833333,569.975000,629.888889,641.395833,497.166667,625.166667,638.194444,642.062500,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,624.666667,644.666667,644.527778,638.041667,32134.0
19,1,20,5,2013,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,24895.0,26649.25,26888.166667,28917.750,30171.0,28141.75,28487.666667,29328.250,28979.0,26565.75,29379.333333,30183.875,22552.0,27944.00,29246.000000,29939.875,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,8.251574,8.237246,8.228268,8.306452,8.735090,8.301799,8.298121,8.214807,8.402146,8.095737,8.244159,8.183429,7.560174,8.162872,8.096870,8.103680,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,4979.000000,4900.425000,4919.877778,5292.229167,6034.200000,4941.716667,5239.777778,5256.916667,4829.833333,4626.516667,5220.772222,5273.808333,3758.666667,5143.658333,5198.550000,5233.141667,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,3017.0,3225.75,3257.833333,3468.500,3454.0,3378.25,3418.000000,3559.875,3449.0,3269.00,3546.166667,3673.500,2983.0,3401.25,3596.000000,3677.500,3627.0,3711.25,3826.000000,3789.625,3017.0,3741.50,3801.666667,3790.875,3978.0,4078.00,3945.500000,3925.500,4223.0,3953.75,3888.666667,3873.375,603.400000,591.550000,595.683333,634.191667,690.800000,591.825000,627.716667,636.850000,574.833333,569.975000,629.888889,641.395833,497.166667,625.166667,638.194444,642.062500,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,24687.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150512,1115,19,5,2015,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,44909.0,37810.25,39621.833333,40383.250,32221.0,39762.50,40160.166667,40674.625,46236.0,40150.25,40989.333333,41386.000,27875.0,40626.00,41156.666667,41092.625,52718.0,42956.25,42829.500000,41771.375,33772.0,41586.75,41358.000000,40577.750,48139.0,42621.75,41280.166667,42283.875,37196.0,41559.25,40451.833333,40031.750,15.779691,14.561868,14.716023,14.829578,13.521192,14.687204,14.728752,14.761955,15.473896,14.748814,14.889290,14.880585,13.472692,14.844355,14.850093,14.886965,16.281038,15.097288,15.016349,14.880206,13.767631,14.836706,14.890332,14.750487,15.856061,15.012356,14.832163,15.010029,14.484424,14.929575,14.730034,14.725610,8981.800000,6908.241667,7300.872222,7253.466667,5370.166667,7298.691667,7141.100000,7114.908333,7706.000000,7363.316667,7279.294444,7233.470833,5575.000000,7442.608333,7307.183333,7184.575000,10543.600000,7598.691667,7431.127778,7181.554167,5628.666667,6931.125000,6893.000000,6762.958333,8023.166667,7103.625000,6880.027778,7047.312500,6199.333333,6926.541667,6741.972222,6671.958333,2846.0,2571.50,2662.833333,2697.625,2383.0,2669.50,2694.500000,2729.375,2988.0,2687.00,2725.333333,2759.000,2069.0,2699.00,2744.000000,2738.875,3238.0,2823.75,2835.833333,2790.500,2453.0,2789.25,2767.333333,2739.750,3036.0,2831.00,2772.166667,2807.250,2568.0,2778.75,2738.166667,2705.000,569.200000,469.541667,489.100000,483.575000,397.166667,489.141667,478.566667,477.008333,498.000000,492.058333,483.705556,481.945833,413.800000,494.058333,486.816667,478.591667,647.600000,497.608333,490.627778,478.575000,408.833333,464.875000,461.222222,456.625000,506.000000,471.833333,462.027778,467.875000,428.000000,463.125000,456.361111,450.833333,48130.0
150513,1115,20,5,2015,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,47767.0,42783.25,41954.333333,41704.625,44909.0,37810.25,39621.833333,40383.250,32221.0,39762.50,40160.166667,40674.625,46236.0,40150.25,40989.333333,41386.000,27875.0,40626.00,41156.666667,41092.625,52718.0,42956.25,42829.500000,41771.375,33772.0,41586.75,41358.000000,40577.750,48139.0,42621.75,41280.166667,42283.875,15.188235,14.990753,14.952791,14.917554,15.779691,14.561868,14.716023,14.829578,13.521192,14.687204,14.728752,14.761955,15.473896,14.748814,14.889290,14.880585,13.472692,14.844355,14.850093,14.886965,16.281038,15.097288,15.016349,14.880206,13.767631,14.836706,14.890332,14.750487,15.856061,15.012356,14.832163,15.010029,7961.166667,7504.783333,7689.622222,7473.695833,8981.800000,6908.241667,7300.872222,7253.466667,5370.166667,7298.691667,7141.100000,7114.908333,7706.000000,7363.316667,7279.294444,7233.470833,5575.000000,7442.608333,7307.183333,7184.575000,10543.600000,7598.691667,7431.127778,7181.554167,5628.666667,6931.125000,6893.000000,6762.958333,8023.166667,7103.625000,6880.027778,7047.312500,3145.0,2840.50,2778.166667,2769.750,2846.0,2571.50,2662.833333,2697.625,2383.0,2669.50,2694.500000,2729.375,2988.0,2687.00,2725.333333,2759.000,2069.0,2699.00,2744.000000,2738.875,3238.0,2823.75,2835.833333,2790.500,2453.0,2789.25,2767.333333,2739.750,3036.0,2831.00,2772.166667,2807.250,524.166667,497.133333,508.322222,495.595833,569.200000,469.541667,489.100000,483.575000,397.166667,489.141667,478.566667,477.008333,498.000000,492.058333,483.705556,481.945833,413.800000,494.058333,486.816667,478.591667,647.600000,497.608333,490.627778,478.575000,408.833333,464.875000,461.222222,456.625000,506.000000,471.833333,462.027778,467.875000,36233.0
150514,1115,21,5,2015,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,33638.0,39633.75,38774.333333,39892.000,47767.0,42783.25,41954.333333,41704.625,44909.0,37810.25,39621.833333,40383.250,32221.0,39762.50,40160.166667,40674.625,46236.0,40150.25,40989.333333,41386.000,27875.0,40626.00,41156.666667,41092.625,52718.0,42956.25,42829.500000,41771.375,33772.0,41586.75,41358.000000,40577.750,14.098072,14.646797,14.588963,14.697806,15.188235,14.990753,14.952791,14.917554,15.779691,14.561868,14.716023,14.829578,13.521192,14.687204,14.728752,14.761955,15.473896,14.748814,14.889290,14.880585,13.472692,14.844355,14.850093,14.886965,16.281038,15.097288,15.016349,14.880206,13.767631,14.836706,14.890332,14.750487,6727.600000,7260.183333,7053.622222,7311.750000,7961.166667,7504.783333,7689.622222,7473.695833,8981.800000,6908.241667,7300.872222,7253.466667,5370.166667,7298.691667,7141.100000,7114.908333,7706.000000,7363.316667,7279.294444,7233.470833,5575.000000,7442.608333,7307.183333,7184.575000,10543.600000,7598.691667,7431.127778,7181.554167,5628.666667,6931.125000,6893.000000,6762.958333,2386.0,2690.00,2636.166667,2688.500,3145.0,2840.50,2778.166667,2769.750,2846.0,2571.50,2662.833333,2697.625,2383.0,2669.50,2694.500000,2729.375,2988.0,2687.00,2725.333333,2759.000,2069.0,2699.00,2744.000000,2738.875,3238.0,2823.75,2835.833333,2790.500,2453.0,2789.25,2767.333333,2739.750,477.200000,491.933333,479.922222,491.995833,524.166667,497.133333,508.322222,495.595833,569.200000,469.541667,489.100000,483.575000,397.166667,489.141667,478.566667,477.008333,498.000000,492.058333,483.705556,481.945833,413.800000,494.058333,486.816667,478.591667,647.600000,497.608333,490.627778,478.575000,408.833333,464.875000,461.222222,456.625000,45927.0
150515,1115,22,5,2015,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,46629.0,43235.75,41900.000000,41499.125,33638.0,39633.75,38774.333333,39892.000,47767.0,42783.25,41954.333333,41704.625,44909.0,37810.25,39621.833333,40383.250,32221.0,39762.50,40160.166667,40674.625,46236.0,40150.25,40989.333333,41386.000,27875.0,40626.00,41156.666667,41092.625,52718.0,42956.25,42829.500000,41771.375,15.085409,15.037852,14.857749,14.862528,14.098072,14.646797,14.588963,14.697806,15.188235,14.990753,14.952791,14.917554,15.779691,14.561868,14.716023,14.829578,13.521192,14.687204,14.728752,14.761955,15.473896,14.748814,14.889290,14.880585,13.472692,14.844355,14.850093,14.886965,16.281038,15.097288,15.016349,14.880206,7771.500000,7860.516667,7419.705556,7579.604167,6727.600000,7260.183333,7053.622222,7311.750000,7961.166667,7504.783333,7689.622222,7473.695833,8981.800000,6908.241667,7300.872222,7253.466667,5370.166667,7298.691667,7141.100000,7114.908333,7706.000000,7363.316667,7279.294444,7233.470833,5575.000000,7442.608333,7307.183333,7184.575000,10543.600000,7598.691667,7431.127778,7181.554167,3091.0,2867.00,2806.500000,2768.250,2386.0,2690.00,2636.166667,2688.500,3145.0,2840.50,2778.166667,2769.750,2846.0,2571.50,2662.833333,2697.625,2383.0,2669.50,2694.500000,2729.375,2988.0,2687.00,2725.333333,2759.000,2069.0,2699.00,2744.000000,2738.875,3238.0,2823.75,2835.833333,2790.500,515.166667,521.433333,496.816667,505.287500,477.200000,491.933333,479.922222,491.995833,524.166667,497.133333,508.322222,495.595833,569.200000,469.541667,489.100000,483.575000,397.166667,489.141667,478.566667,477.008333,498.000000,492.058333,483.705556,481.945833,413.800000,494.058333,486.816667,478.591667,647.600000,497.608333,490.627778,478.575000,35362.0


### New additional Features

#### Ploynominal Features

In [82]:
# Before
testModelsTestSplit8W(df_nans_handeled_cat, None)

{'Model': 'LinearRegression', 'RMSE_Train': 9349.951020349768, 'MAE_Train': 6453.03660119806, 'R2_Train': 0.7366461886920278, 'Adj_R2_Train': 0.7362116940306376, 'RMSE_Test': 6031.943678923188, 'MAE_Test': 4500.710893128057, 'R2_Test': 0.8599439761981675, 'Adj_R2_Test': 0.8568790471713401}
{'Model': 'XGBRegressor', 'RMSE_Train': 7410.980693987589, 'MAE_Train': 5030.000077158206, 'R2_Train': 0.8345478937990809, 'Adj_R2_Train': 0.8342749224055666, 'RMSE_Test': 6058.149998380618, 'MAE_Test': 4515.475634765625, 'R2_Test': 0.8587243607024304, 'Adj_R2_Test': 0.8556327421064364}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9349.95102,6453.036601,0.736646,0.736212,6031.943679,4500.710893,0.859944,0.856879
1,XGBRegressor,7410.980694,5030.000077,0.834548,0.834275,6058.149998,4515.475635,0.858724,0.855633


In [None]:
from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures(degree=2)
features = ['Open', 'Promo', 'IsPromo', 'IsStateHoliday', 'SchoolHoliday', 'IsSchoolHoliday', 'CompetitionDistance', 'IsCompetition', 'Promo2Member', 'Assortment_a', 'Assortment_b', 'Assortment_c']

feat_array = pf.fit_transform(df_nans_handeled_cat[features])
# generate names for the new features
feature_names = pf.get_feature_names_out(input_features=features)
poly_features_df = pd.DataFrame(feat_array, columns=feature_names, index=df_nans_handeled_cat.index)
# remove the '1' column
poly_features_df = poly_features_df.drop('1', axis=1)
# Concatenate the polynomial features with the original DataFrame
df_nans_handeled_cat_poly = pd.concat([df_nans_handeled_cat, poly_features_df], axis=1)
testModelsTestSplit8W(df_nans_handeled_cat_poly, None)

{'Model': 'LinearRegression', 'RMSE_Train': 9196.055248192568, 'MAE_Train': 6324.875376361263, 'R2_Train': 0.745244200308042, 'Adj_R2_Train': 0.7446253585255644, 'RMSE_Test': 6753.577811387857, 'MAE_Test': 5219.836299709872, 'R2_Test': 0.8244280822823356, 'Adj_R2_Test': 0.8187166086913812}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9196.055248,6324.875376,0.745244,0.744625,6753.577811,5219.8363,0.824428,0.818717


**Results:**
- Before Polynominal Features: MAE: 4495.405879	R2: 0.860075
- After Polynominal Features: MAE: 5219.8363	R2: 0.824428

-> Polynominal Features did not improve the model and will not be used

## Skewness

In [33]:
#pd.set_option('display.max_rows', None)
num_columns_float = df_nans_handeled_cat.select_dtypes(include='number').columns
skew = df_nans_handeled_cat[num_columns_float].skew().sort_values(ascending=False)
skew[skew >= 3]

Assortment_b            10.995450
StoreType_b              7.912346
StateHoliday_c           7.611515
StateHoliday_b           5.003762
Customers_Lag_1_MA_8     3.079217
Customers_Lag_2_MA_8     3.073890
Customers_Lag_3_MA_8     3.068401
Customers_Lag_1_MA_6     3.065142
Customers_Lag_4_MA_8     3.063038
Customers_Lag_2_MA_6     3.060226
Customers_Lag_5_MA_8     3.059181
Customers_Lag_3_MA_6     3.055243
Customers_Lag_6_MA_8     3.054436
Customers_Lag_7_MA_8     3.050417
Customers_Lag_1_MA_4     3.049608
Customers_Lag_4_MA_6     3.049238
Customers_Lag_8_MA_8     3.046789
Customers_Lag_2_MA_4     3.046094
Customers_Lag_5_MA_6     3.044501
Customers_Lag_3_MA_4     3.040599
Customers_Lag_6_MA_6     3.038744
Customers_Lag_4_MA_4     3.034761
Customers_Lag_7_MA_6     3.034684
Customers_Lag_5_MA_4     3.031263
Customers_Lag_8_MA_6     3.030890
Customers_Lag_6_MA_4     3.024336
Customers_Lag_7_MA_4     3.018899
Customers_Lag_8_MA_4     3.014113
dtype: float64

In [35]:
# Before
testModelsTestSplit8W(df_nans_handeled_cat, None)

{'Model': 'LinearRegression', 'RMSE_Train': 9348.624262728845, 'MAE_Train': 6448.18666177155, 'R2_Train': 0.7367209231767318, 'Adj_R2_Train': 0.7362865518161379, 'RMSE_Test': 6029.131809472689, 'MAE_Test': 4495.405879127806, 'R2_Test': 0.8600745236580398, 'Adj_R2_Test': 0.8570124514786959}




{'Model': 'XGBRegressor', 'RMSE_Train': 7410.980873242441, 'MAE_Train': 5030.000189809052, 'R2_Train': 0.8345478857952568, 'Adj_R2_Train': 0.8342749143885373, 'RMSE_Test': 6058.148984876124, 'MAE_Test': 4515.474892052621, 'R2_Test': 0.858724407972136, 'Adj_R2_Test': 0.8556327904105729}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9348.624263,6448.186662,0.736721,0.736287,6029.131809,4495.405879,0.860075,0.857012
1,XGBRegressor,7410.980873,5030.00019,0.834548,0.834275,6058.148985,4515.474892,0.858724,0.855633


In [85]:
# Log transformation
df_nans_handeled_cat_log = df_nans_handeled_cat.copy()
for_log_transform = skew[skew >= 3].index
df_nans_handeled_cat_log[for_log_transform] = np.log(df_nans_handeled_cat_log[for_log_transform]+1)
testModelsTestSplit8W(df_nans_handeled_cat_log, None)

{'Model': 'LinearRegression', 'RMSE_Train': 8996.267242713275, 'MAE_Train': 6127.220069480263, 'R2_Train': 0.7561933021058949, 'Adj_R2_Train': 0.7557910572774641, 'RMSE_Test': 5895.625910323417, 'MAE_Test': 4370.480975307157, 'R2_Test': 0.8662027847087163, 'Adj_R2_Test': 0.8632748209002108}
{'Model': 'XGBRegressor', 'RMSE_Train': 7410.980693987589, 'MAE_Train': 5030.000077158206, 'R2_Train': 0.8345478937990809, 'Adj_R2_Train': 0.8342749224055666, 'RMSE_Test': 6058.149998380618, 'MAE_Test': 4515.475634765625, 'R2_Test': 0.8587243607024304, 'Adj_R2_Test': 0.8556327421064364}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,8996.267243,6127.220069,0.756193,0.755791,5895.62591,4370.480975,0.866203,0.863275
1,XGBRegressor,7410.980694,5030.000077,0.834548,0.834275,6058.149998,4515.475635,0.858724,0.855633


In [36]:
# use PowerTransformer to transform the data
df_nans_handeled_cat_power = df_nans_handeled_cat.copy()
for_log_transform = skew[skew >= 3].index
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer()
df_nans_handeled_cat_power[for_log_transform] = pt.fit_transform(df_nans_handeled_cat_power[for_log_transform])
testModelsTestSplit8W(df_nans_handeled_cat_power, None)


{'Model': 'LinearRegression', 'RMSE_Train': 9080.95770607921, 'MAE_Train': 6230.562682192093, 'R2_Train': 0.7515813243113636, 'Adj_R2_Train': 0.7511714704047873, 'RMSE_Test': 5959.227147139775, 'MAE_Test': 4414.332558622488, 'R2_Test': 0.8633004401241766, 'Adj_R2_Test': 0.860308962587939}




{'Model': 'XGBRegressor', 'RMSE_Train': 7410.980873242441, 'MAE_Train': 5030.000189809052, 'R2_Train': 0.8345478857952568, 'Adj_R2_Train': 0.8342749143885373, 'RMSE_Test': 6058.148984876124, 'MAE_Test': 4515.474892052621, 'R2_Test': 0.858724407972136, 'Adj_R2_Test': 0.8556327904105729}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9080.957706,6230.562682,0.751581,0.751171,5959.227147,4414.332559,0.8633,0.860309
1,XGBRegressor,7410.980873,5030.00019,0.834548,0.834275,6058.148985,4515.474892,0.858724,0.855633


**Results:**
- Before: MAE:4793.96383	R2:0.844227
- After Power Transformation skew>=2: MAE:4615.821447	R2:	0.850871
- After Power Transformation skew>=3: MAE:4682.241185	R2: 0.847547
- After log transformation skew>=2: MAE: 4641.626858	R2: 0.851209
- After log transformation skew>=3: MAE: 4528.091135	R2: 0.854112

-> Log transformation with skew>=3 will be used

After adding ma3:
- Before: MAE: 4495.405879	R2: 0.860075
- After log transformation skew>=3: MAE: 4512.055025	R2:0.857954  
- After Power transformation skew>=3: MAE: 4414.332559	R2: 0.8633

--> Power transformation with skew>=3 will be used

## Feature Scaling

In [87]:
# Before
testModelsTestSplit8W(df_nans_handeled_cat_power, None)

{'Model': 'LinearRegression', 'RMSE_Train': 9080.722238731532, 'MAE_Train': 6231.216114108374, 'R2_Train': 0.7515942070336924, 'Adj_R2_Train': 0.7511843743816939, 'RMSE_Test': 5962.597616137577, 'MAE_Test': 4418.725909494953, 'R2_Test': 0.8631457650590293, 'Adj_R2_Test': 0.8601509026766134}
{'Model': 'XGBRegressor', 'RMSE_Train': 7410.980693987589, 'MAE_Train': 5030.000077158206, 'R2_Train': 0.8345478937990809, 'Adj_R2_Train': 0.8342749224055666, 'RMSE_Test': 6058.149998380618, 'MAE_Test': 4515.475634765625, 'R2_Test': 0.8587243607024304, 'Adj_R2_Test': 0.8556327421064364}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9080.722239,6231.216114,0.751594,0.751184,5962.597616,4418.725909,0.863146,0.860151
1,XGBRegressor,7410.980694,5030.000077,0.834548,0.834275,6058.149998,4515.475635,0.858724,0.855633


### StandardScaler

In [88]:
testModelsTestSplit8W(df_nans_handeled_cat_power, StandardScaler())

{'Model': 'LinearRegression', 'RMSE_Train': 9080.673704354314, 'MAE_Train': 6230.754311109099, 'R2_Train': 0.7515968623705528, 'Adj_R2_Train': 0.7511870340994656, 'RMSE_Test': 5960.074585783354, 'MAE_Test': 4417.419570876309, 'R2_Test': 0.8632615583283878, 'Adj_R2_Test': 0.8602692299187547}
{'Model': 'XGBRegressor', 'RMSE_Train': 7410.980693987589, 'MAE_Train': 5030.000077158206, 'R2_Train': 0.8345478937990809, 'Adj_R2_Train': 0.8342749224055666, 'RMSE_Test': 6058.149998380618, 'MAE_Test': 4515.475634765625, 'R2_Test': 0.8587243607024304, 'Adj_R2_Test': 0.8556327421064364}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9080.673704,6230.754311,0.751597,0.751187,5960.074586,4417.419571,0.863262,0.860269
1,XGBRegressor,7410.980694,5030.000077,0.834548,0.834275,6058.149998,4515.475635,0.858724,0.855633


### MinMaxScaler

In [89]:
testModelsTestSplit8W(df_nans_handeled_cat_power, MinMaxScaler())

{'Model': 'LinearRegression', 'RMSE_Train': 9080.879286303141, 'MAE_Train': 6231.961943773715, 'R2_Train': 0.751585614795569, 'Adj_R2_Train': 0.7511757679676542, 'RMSE_Test': 5965.482629551052, 'MAE_Test': 4419.235538116592, 'R2_Test': 0.8630132986919783, 'Adj_R2_Test': 0.8600155374694952}
{'Model': 'XGBRegressor', 'RMSE_Train': 7410.980693987589, 'MAE_Train': 5030.000077158206, 'R2_Train': 0.8345478937990809, 'Adj_R2_Train': 0.8342749224055666, 'RMSE_Test': 6058.149998380618, 'MAE_Test': 4515.475634765625, 'R2_Test': 0.8587243607024304, 'Adj_R2_Test': 0.8556327421064364}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9080.879286,6231.961944,0.751586,0.751176,5965.48263,4419.235538,0.863013,0.860016
1,XGBRegressor,7410.980694,5030.000077,0.834548,0.834275,6058.149998,4515.475635,0.858724,0.855633


### RobustScaler

In [90]:
testModelsTestSplit8W(df_nans_handeled_cat_power, RobustScaler())

{'Model': 'LinearRegression', 'RMSE_Train': 9080.67444153205, 'MAE_Train': 6230.54950333089, 'R2_Train': 0.7515968220393449, 'Adj_R2_Train': 0.7511869937017172, 'RMSE_Test': 5959.1882336524495, 'MAE_Test': 4416.791974670684, 'R2_Test': 0.8633022254023986, 'Adj_R2_Test': 0.8603107869344631}
{'Model': 'XGBRegressor', 'RMSE_Train': 7410.980693987589, 'MAE_Train': 5030.000077158206, 'R2_Train': 0.8345478937990809, 'Adj_R2_Train': 0.8342749224055666, 'RMSE_Test': 6058.149998380618, 'MAE_Test': 4515.475634765625, 'R2_Test': 0.8587243607024304, 'Adj_R2_Test': 0.8556327421064364}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9080.674442,6230.549503,0.751597,0.751187,5959.188234,4416.791975,0.863302,0.860311
1,XGBRegressor,7410.980694,5030.000077,0.834548,0.834275,6058.149998,4515.475635,0.858724,0.855633


**Result: StandardScaler, MinMaxScaler, RobustScaler have just a small effect on the model performance.**
- Before at LinearRegression: MAE: 4414.332559	R2: 0.8633
- After MinMaxScaler at LinearRegression: MAE: 4398.551738	R2: 0.864094

-> MinMaxScaler will be used

In [None]:
# Save df as csv
df_nans_handeled_cat_power.to_csv('df_nans_handeled_cat_power.csv', index=False)

In [43]:
df_nans_handeled_cat_power

Unnamed: 0,Store,CW,Month,Year,Open,Promo,IsPromo,IsStateHoliday,SchoolHoliday,IsSchoolHoliday,NumStateHoliday,CompetitionDistance,IsCompetition,Promo2,Promo2Member,Promo2Active,StateHoliday_0,StateHoliday_a,StateHoliday_b,StateHoliday_c,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,PromoInterval_0,"PromoInterval_Feb,May,Aug,Nov","PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Mar,Jun,Sept,Dec",Sales_Lag_1,Sales_Lag_1_MA_4,Sales_Lag_1_MA_6,Sales_Lag_1_MA_8,Sales_Lag_2,Sales_Lag_2_MA_4,Sales_Lag_2_MA_6,Sales_Lag_2_MA_8,Sales_Lag_3,Sales_Lag_3_MA_4,Sales_Lag_3_MA_6,Sales_Lag_3_MA_8,Sales_Lag_4,Sales_Lag_4_MA_4,Sales_Lag_4_MA_6,Sales_Lag_4_MA_8,Sales_Lag_5,Sales_Lag_5_MA_4,Sales_Lag_5_MA_6,Sales_Lag_5_MA_8,Sales_Lag_6,Sales_Lag_6_MA_4,Sales_Lag_6_MA_6,Sales_Lag_6_MA_8,Sales_Lag_7,Sales_Lag_7_MA_4,Sales_Lag_7_MA_6,Sales_Lag_7_MA_8,Sales_Lag_8,Sales_Lag_8_MA_4,Sales_Lag_8_MA_6,Sales_Lag_8_MA_8,SalesPerCustomer_Lag_1,SalesPerCustomer_Lag_1_MA_4,SalesPerCustomer_Lag_1_MA_6,SalesPerCustomer_Lag_1_MA_8,SalesPerCustomer_Lag_2,SalesPerCustomer_Lag_2_MA_4,SalesPerCustomer_Lag_2_MA_6,SalesPerCustomer_Lag_2_MA_8,SalesPerCustomer_Lag_3,SalesPerCustomer_Lag_3_MA_4,SalesPerCustomer_Lag_3_MA_6,SalesPerCustomer_Lag_3_MA_8,SalesPerCustomer_Lag_4,SalesPerCustomer_Lag_4_MA_4,SalesPerCustomer_Lag_4_MA_6,SalesPerCustomer_Lag_4_MA_8,SalesPerCustomer_Lag_5,SalesPerCustomer_Lag_5_MA_4,SalesPerCustomer_Lag_5_MA_6,SalesPerCustomer_Lag_5_MA_8,SalesPerCustomer_Lag_6,SalesPerCustomer_Lag_6_MA_4,SalesPerCustomer_Lag_6_MA_6,SalesPerCustomer_Lag_6_MA_8,SalesPerCustomer_Lag_7,SalesPerCustomer_Lag_7_MA_4,SalesPerCustomer_Lag_7_MA_6,SalesPerCustomer_Lag_7_MA_8,SalesPerCustomer_Lag_8,SalesPerCustomer_Lag_8_MA_4,SalesPerCustomer_Lag_8_MA_6,SalesPerCustomer_Lag_8_MA_8,SalesPerOpenDay_Lag_1,SalesPerOpenDay_Lag_1_MA_4,SalesPerOpenDay_Lag_1_MA_6,SalesPerOpenDay_Lag_1_MA_8,SalesPerOpenDay_Lag_2,SalesPerOpenDay_Lag_2_MA_4,SalesPerOpenDay_Lag_2_MA_6,SalesPerOpenDay_Lag_2_MA_8,SalesPerOpenDay_Lag_3,SalesPerOpenDay_Lag_3_MA_4,SalesPerOpenDay_Lag_3_MA_6,SalesPerOpenDay_Lag_3_MA_8,SalesPerOpenDay_Lag_4,SalesPerOpenDay_Lag_4_MA_4,SalesPerOpenDay_Lag_4_MA_6,SalesPerOpenDay_Lag_4_MA_8,SalesPerOpenDay_Lag_5,SalesPerOpenDay_Lag_5_MA_4,SalesPerOpenDay_Lag_5_MA_6,SalesPerOpenDay_Lag_5_MA_8,SalesPerOpenDay_Lag_6,SalesPerOpenDay_Lag_6_MA_4,SalesPerOpenDay_Lag_6_MA_6,SalesPerOpenDay_Lag_6_MA_8,SalesPerOpenDay_Lag_7,SalesPerOpenDay_Lag_7_MA_4,SalesPerOpenDay_Lag_7_MA_6,SalesPerOpenDay_Lag_7_MA_8,SalesPerOpenDay_Lag_8,SalesPerOpenDay_Lag_8_MA_4,SalesPerOpenDay_Lag_8_MA_6,SalesPerOpenDay_Lag_8_MA_8,Customers_Lag_1,Customers_Lag_1_MA_4,Customers_Lag_1_MA_6,Customers_Lag_1_MA_8,Customers_Lag_2,Customers_Lag_2_MA_4,Customers_Lag_2_MA_6,Customers_Lag_2_MA_8,Customers_Lag_3,Customers_Lag_3_MA_4,Customers_Lag_3_MA_6,Customers_Lag_3_MA_8,Customers_Lag_4,Customers_Lag_4_MA_4,Customers_Lag_4_MA_6,Customers_Lag_4_MA_8,Customers_Lag_5,Customers_Lag_5_MA_4,Customers_Lag_5_MA_6,Customers_Lag_5_MA_8,Customers_Lag_6,Customers_Lag_6_MA_4,Customers_Lag_6_MA_6,Customers_Lag_6_MA_8,Customers_Lag_7,Customers_Lag_7_MA_4,Customers_Lag_7_MA_6,Customers_Lag_7_MA_8,Customers_Lag_8,Customers_Lag_8_MA_4,Customers_Lag_8_MA_6,Customers_Lag_8_MA_8,CustomersPerOpenDay_Lag_1,CustomersPerOpenDay_Lag_1_MA_4,CustomersPerOpenDay_Lag_1_MA_6,CustomersPerOpenDay_Lag_1_MA_8,CustomersPerOpenDay_Lag_2,CustomersPerOpenDay_Lag_2_MA_4,CustomersPerOpenDay_Lag_2_MA_6,CustomersPerOpenDay_Lag_2_MA_8,CustomersPerOpenDay_Lag_3,CustomersPerOpenDay_Lag_3_MA_4,CustomersPerOpenDay_Lag_3_MA_6,CustomersPerOpenDay_Lag_3_MA_8,CustomersPerOpenDay_Lag_4,CustomersPerOpenDay_Lag_4_MA_4,CustomersPerOpenDay_Lag_4_MA_6,CustomersPerOpenDay_Lag_4_MA_8,CustomersPerOpenDay_Lag_5,CustomersPerOpenDay_Lag_5_MA_4,CustomersPerOpenDay_Lag_5_MA_6,CustomersPerOpenDay_Lag_5_MA_8,CustomersPerOpenDay_Lag_6,CustomersPerOpenDay_Lag_6_MA_4,CustomersPerOpenDay_Lag_6_MA_6,CustomersPerOpenDay_Lag_6_MA_8,CustomersPerOpenDay_Lag_7,CustomersPerOpenDay_Lag_7_MA_4,CustomersPerOpenDay_Lag_7_MA_6,CustomersPerOpenDay_Lag_7_MA_8,CustomersPerOpenDay_Lag_8,CustomersPerOpenDay_Lag_8_MA_4,CustomersPerOpenDay_Lag_8_MA_6,CustomersPerOpenDay_Lag_8_MA_8,Future_Sales
15,1,16,4,2013,6,0,0,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,-0.19245,-0.129189,0.0,-0.124429,1.0,0.0,1.0,-0.090208,0.0,1.0,0.0,0.0,0.0,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,28179.0,31293.25,31470.666667,31323.500,37016.0,31218.50,31556.333333,31048.375,27027.0,30907.25,30898.833333,30540.375,32951.0,31323.75,30724.000000,29579.500,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,7.518410,8.064789,8.110372,8.160373,8.484071,8.101601,8.200232,8.170893,7.764148,8.164938,8.213417,8.161686,8.492526,8.238293,8.186488,8.158168,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,4696.500000,5215.541667,5245.111111,5220.583333,6169.333333,5203.083333,5259.388889,5174.729167,4504.500000,5151.208333,5149.805556,5090.062500,5491.833333,5220.625000,5120.666667,5131.375000,3627.0,-0.133593,-0.082972,-0.101458,3017.0,-0.119424,-0.094299,-0.101028,3978.0,0.031804,-0.028801,-0.039229,4223.0,-0.023905,-0.054915,-0.063314,3748.0,-0.061795,-0.064418,-0.083690,4363.0,-0.074907,-0.078952,-0.102739,3481.0,-0.105598,-0.117138,-0.128250,3880.0,-0.096421,-0.121592,-0.184843,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,624.666667,644.666667,644.527778,638.041667,727.166667,640.041667,639.333333,631.145833,580.166667,628.833333,625.416667,621.875000,646.666667,632.166667,623.638889,627.479167,24215.0
16,1,17,4,2013,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,-0.19245,-0.129189,0.0,-0.124429,1.0,0.0,1.0,-0.090208,0.0,1.0,0.0,0.0,0.0,22552.0,27944.00,29246.000000,29939.875,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,28179.0,31293.25,31470.666667,31323.500,37016.0,31218.50,31556.333333,31048.375,27027.0,30907.25,30898.833333,30540.375,7.560174,8.162872,8.096870,8.103680,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,7.518410,8.064789,8.110372,8.160373,8.484071,8.101601,8.200232,8.170893,7.764148,8.164938,8.213417,8.161686,3758.666667,5143.658333,5198.550000,5233.141667,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,4696.500000,5215.541667,5245.111111,5220.583333,6169.333333,5203.083333,5259.388889,5174.729167,4504.500000,5151.208333,5149.805556,5090.062500,2983.0,-0.279007,-0.189855,-0.153859,3627.0,-0.133326,-0.083129,-0.101608,3017.0,-0.119671,-0.094374,-0.101217,3978.0,0.031362,-0.029136,-0.039432,4223.0,-0.023222,-0.054626,-0.062905,3748.0,-0.062348,-0.064718,-0.083590,4363.0,-0.075008,-0.078762,-0.102366,3481.0,-0.105540,-0.116664,-0.127183,497.166667,625.166667,638.194444,642.062500,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,624.666667,644.666667,644.527778,638.041667,727.166667,640.041667,639.333333,631.145833,580.166667,628.833333,625.416667,621.875000,28675.0
17,1,18,5,2013,5,5,1,1,0,0,1,1270.0,1,0,0,0,0.0,1.0,-0.19245,-0.129189,0.0,-0.124429,1.0,0.0,1.0,-0.090208,0.0,1.0,0.0,0.0,0.0,28979.0,26565.75,29379.333333,30183.875,22552.0,27944.00,29246.000000,29939.875,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,28179.0,31293.25,31470.666667,31323.500,37016.0,31218.50,31556.333333,31048.375,8.402146,8.095737,8.244159,8.183429,7.560174,8.162872,8.096870,8.103680,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,7.518410,8.064789,8.110372,8.160373,8.484071,8.101601,8.200232,8.170893,4829.833333,4626.516667,5220.772222,5273.808333,3758.666667,5143.658333,5198.550000,5233.141667,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,4696.500000,5215.541667,5245.111111,5220.583333,6169.333333,5203.083333,5259.388889,5174.729167,3449.0,-0.342767,-0.213400,-0.155741,2983.0,-0.278714,-0.189985,-0.153998,3627.0,-0.133569,-0.083206,-0.101797,3017.0,-0.120071,-0.094696,-0.101404,3978.0,0.032038,-0.028852,-0.039027,4223.0,-0.023784,-0.054928,-0.062810,3748.0,-0.062452,-0.064532,-0.083222,4363.0,-0.074957,-0.078296,-0.101301,574.833333,569.975000,629.888889,641.395833,497.166667,625.166667,638.194444,642.062500,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,624.666667,644.666667,644.527778,638.041667,727.166667,640.041667,639.333333,631.145833,25716.0
18,1,19,5,2013,5,0,0,1,0,0,1,1270.0,1,0,0,0,0.0,1.0,-0.19245,-0.129189,0.0,-0.124429,1.0,0.0,1.0,-0.090208,0.0,1.0,0.0,0.0,0.0,30171.0,28141.75,28487.666667,29328.250,28979.0,26565.75,29379.333333,30183.875,22552.0,27944.00,29246.000000,29939.875,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,28179.0,31293.25,31470.666667,31323.500,8.735090,8.301799,8.298121,8.214807,8.402146,8.095737,8.244159,8.183429,7.560174,8.162872,8.096870,8.103680,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,7.518410,8.064789,8.110372,8.160373,6034.200000,4941.716667,5239.777778,5256.916667,4829.833333,4626.516667,5220.772222,5273.808333,3758.666667,5143.658333,5198.550000,5233.141667,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,4696.500000,5215.541667,5245.111111,5220.583333,3454.0,-0.290018,-0.274624,-0.209589,3449.0,-0.342462,-0.213524,-0.155880,2983.0,-0.278910,-0.190036,-0.154173,3627.0,-0.133965,-0.083530,-0.101984,3017.0,-0.119376,-0.094399,-0.100987,3978.0,0.031465,-0.029160,-0.038937,4223.0,-0.023896,-0.054744,-0.062445,3748.0,-0.062404,-0.064068,-0.082158,690.800000,591.825000,627.716667,636.850000,574.833333,569.975000,629.888889,641.395833,497.166667,625.166667,638.194444,642.062500,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,624.666667,644.666667,644.527778,638.041667,32134.0
19,1,20,5,2013,6,5,1,0,0,0,0,1270.0,1,0,0,0,1.0,0.0,-0.19245,-0.129189,0.0,-0.124429,1.0,0.0,1.0,-0.090208,0.0,1.0,0.0,0.0,0.0,24895.0,26649.25,26888.166667,28917.750,30171.0,28141.75,28487.666667,29328.250,28979.0,26565.75,29379.333333,30183.875,22552.0,27944.00,29246.000000,29939.875,30865.0,31186.25,31656.666667,31239.750,23867.0,30514.75,31017.000000,30866.625,34492.0,33802.00,32531.000000,32354.625,35521.0,31935.75,31429.000000,31629.750,8.251574,8.237246,8.228268,8.306452,8.735090,8.301799,8.298121,8.214807,8.402146,8.095737,8.244159,8.183429,7.560174,8.162872,8.096870,8.103680,8.509788,8.375659,8.250852,8.220224,7.910839,8.127814,8.126579,8.114707,8.670689,8.271122,8.223527,8.218030,8.411319,8.044487,8.056022,8.141390,4979.000000,4900.425000,4919.877778,5292.229167,6034.200000,4941.716667,5239.777778,5256.916667,4829.833333,4626.516667,5220.772222,5273.808333,3758.666667,5143.658333,5198.550000,5233.141667,5144.166667,5684.033333,5600.327778,5449.787500,4773.400000,5572.116667,5493.716667,5387.600000,6898.400000,5921.100000,5613.455556,5536.154167,5920.166667,5322.625000,5238.166667,5271.625000,3017.0,-0.363858,-0.352552,-0.253442,3454.0,-0.289723,-0.274732,-0.209716,3449.0,-0.342638,-0.213569,-0.156054,2983.0,-0.279264,-0.190337,-0.154347,3627.0,-0.133269,-0.083235,-0.101567,3017.0,-0.119916,-0.094690,-0.100883,3978.0,0.031341,-0.028982,-0.038577,4223.0,-0.023856,-0.054282,-0.061383,603.400000,591.550000,595.683333,634.191667,690.800000,591.825000,627.716667,636.850000,574.833333,569.975000,629.888889,641.395833,497.166667,625.166667,638.194444,642.062500,604.500000,676.833333,676.527778,660.750000,603.400000,681.875000,672.472222,660.958333,795.600000,712.816667,679.683333,670.825000,703.833333,658.958333,648.111111,645.562500,24687.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150512,1115,19,5,2015,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,-0.19245,-0.129189,0.0,-0.124429,0.0,1.0,0.0,-0.090208,1.0,0.0,0.0,0.0,1.0,44909.0,37810.25,39621.833333,40383.250,32221.0,39762.50,40160.166667,40674.625,46236.0,40150.25,40989.333333,41386.000,27875.0,40626.00,41156.666667,41092.625,52718.0,42956.25,42829.500000,41771.375,33772.0,41586.75,41358.000000,40577.750,48139.0,42621.75,41280.166667,42283.875,37196.0,41559.25,40451.833333,40031.750,15.779691,14.561868,14.716023,14.829578,13.521192,14.687204,14.728752,14.761955,15.473896,14.748814,14.889290,14.880585,13.472692,14.844355,14.850093,14.886965,16.281038,15.097288,15.016349,14.880206,13.767631,14.836706,14.890332,14.750487,15.856061,15.012356,14.832163,15.010029,14.484424,14.929575,14.730034,14.725610,8981.800000,6908.241667,7300.872222,7253.466667,5370.166667,7298.691667,7141.100000,7114.908333,7706.000000,7363.316667,7279.294444,7233.470833,5575.000000,7442.608333,7307.183333,7184.575000,10543.600000,7598.691667,7431.127778,7181.554167,5628.666667,6931.125000,6893.000000,6762.958333,8023.166667,7103.625000,6880.027778,7047.312500,6199.333333,6926.541667,6741.972222,6671.958333,2846.0,-0.699299,-0.657816,-0.645918,2383.0,-0.646530,-0.640875,-0.628875,2988.0,-0.637339,-0.624393,-0.613075,2069.0,-0.631244,-0.614693,-0.623872,3238.0,-0.565326,-0.566050,-0.595852,2453.0,-0.583637,-0.602110,-0.622646,3036.0,-0.561986,-0.599253,-0.586282,2568.0,-0.588906,-0.616630,-0.639688,569.200000,469.541667,489.100000,483.575000,397.166667,489.141667,478.566667,477.008333,498.000000,492.058333,483.705556,481.945833,413.800000,494.058333,486.816667,478.591667,647.600000,497.608333,490.627778,478.575000,408.833333,464.875000,461.222222,456.625000,506.000000,471.833333,462.027778,467.875000,428.000000,463.125000,456.361111,450.833333,48130.0
150513,1115,20,5,2015,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,-0.19245,-0.129189,0.0,-0.124429,0.0,1.0,0.0,-0.090208,1.0,0.0,0.0,0.0,1.0,47767.0,42783.25,41954.333333,41704.625,44909.0,37810.25,39621.833333,40383.250,32221.0,39762.50,40160.166667,40674.625,46236.0,40150.25,40989.333333,41386.000,27875.0,40626.00,41156.666667,41092.625,52718.0,42956.25,42829.500000,41771.375,33772.0,41586.75,41358.000000,40577.750,48139.0,42621.75,41280.166667,42283.875,15.188235,14.990753,14.952791,14.917554,15.779691,14.561868,14.716023,14.829578,13.521192,14.687204,14.728752,14.761955,15.473896,14.748814,14.889290,14.880585,13.472692,14.844355,14.850093,14.886965,16.281038,15.097288,15.016349,14.880206,13.767631,14.836706,14.890332,14.750487,15.856061,15.012356,14.832163,15.010029,7961.166667,7504.783333,7689.622222,7473.695833,8981.800000,6908.241667,7300.872222,7253.466667,5370.166667,7298.691667,7141.100000,7114.908333,7706.000000,7363.316667,7279.294444,7233.470833,5575.000000,7442.608333,7307.183333,7184.575000,10543.600000,7598.691667,7431.127778,7181.554167,5628.666667,6931.125000,6893.000000,6762.958333,8023.166667,7103.625000,6880.027778,7047.312500,3145.0,-0.557396,-0.596498,-0.607270,2846.0,-0.698923,-0.657819,-0.645938,2383.0,-0.646605,-0.640803,-0.628914,2988.0,-0.637580,-0.624586,-0.613124,2069.0,-0.630486,-0.614291,-0.623348,3238.0,-0.565748,-0.566202,-0.595625,2453.0,-0.583617,-0.601798,-0.622167,3036.0,-0.561820,-0.598686,-0.585183,524.166667,497.133333,508.322222,495.595833,569.200000,469.541667,489.100000,483.575000,397.166667,489.141667,478.566667,477.008333,498.000000,492.058333,483.705556,481.945833,413.800000,494.058333,486.816667,478.591667,647.600000,497.608333,490.627778,478.575000,408.833333,464.875000,461.222222,456.625000,506.000000,471.833333,462.027778,467.875000,36233.0
150514,1115,21,5,2015,6,5,1,0,0,0,0,5350.0,0,1,1,0,1.0,0.0,-0.19245,-0.129189,0.0,-0.124429,0.0,1.0,0.0,-0.090208,1.0,0.0,0.0,0.0,1.0,33638.0,39633.75,38774.333333,39892.000,47767.0,42783.25,41954.333333,41704.625,44909.0,37810.25,39621.833333,40383.250,32221.0,39762.50,40160.166667,40674.625,46236.0,40150.25,40989.333333,41386.000,27875.0,40626.00,41156.666667,41092.625,52718.0,42956.25,42829.500000,41771.375,33772.0,41586.75,41358.000000,40577.750,14.098072,14.646797,14.588963,14.697806,15.188235,14.990753,14.952791,14.917554,15.779691,14.561868,14.716023,14.829578,13.521192,14.687204,14.728752,14.761955,15.473896,14.748814,14.889290,14.880585,13.472692,14.844355,14.850093,14.886965,16.281038,15.097288,15.016349,14.880206,13.767631,14.836706,14.890332,14.750487,6727.600000,7260.183333,7053.622222,7311.750000,7961.166667,7504.783333,7689.622222,7473.695833,8981.800000,6908.241667,7300.872222,7253.466667,5370.166667,7298.691667,7141.100000,7114.908333,7706.000000,7363.316667,7279.294444,7233.470833,5575.000000,7442.608333,7307.183333,7184.575000,10543.600000,7598.691667,7431.127778,7181.554167,5628.666667,6931.125000,6893.000000,6762.958333,2386.0,-0.636039,-0.672157,-0.650839,3145.0,-0.557049,-0.596518,-0.607301,2846.0,-0.698979,-0.657742,-0.645972,2383.0,-0.646842,-0.640992,-0.628957,2988.0,-0.636821,-0.624182,-0.612602,2069.0,-0.630889,-0.614428,-0.623114,3238.0,-0.565733,-0.565900,-0.595151,2453.0,-0.583446,-0.601231,-0.621065,477.200000,491.933333,479.922222,491.995833,524.166667,497.133333,508.322222,495.595833,569.200000,469.541667,489.100000,483.575000,397.166667,489.141667,478.566667,477.008333,498.000000,492.058333,483.705556,481.945833,413.800000,494.058333,486.816667,478.591667,647.600000,497.608333,490.627778,478.575000,408.833333,464.875000,461.222222,456.625000,45927.0
150515,1115,22,5,2015,5,0,0,1,0,0,1,5350.0,0,1,1,0,0.0,1.0,-0.19245,-0.129189,0.0,-0.124429,0.0,1.0,0.0,-0.090208,1.0,0.0,0.0,0.0,1.0,46629.0,43235.75,41900.000000,41499.125,33638.0,39633.75,38774.333333,39892.000,47767.0,42783.25,41954.333333,41704.625,44909.0,37810.25,39621.833333,40383.250,32221.0,39762.50,40160.166667,40674.625,46236.0,40150.25,40989.333333,41386.000,27875.0,40626.00,41156.666667,41092.625,52718.0,42956.25,42829.500000,41771.375,15.085409,15.037852,14.857749,14.862528,14.098072,14.646797,14.588963,14.697806,15.188235,14.990753,14.952791,14.917554,15.779691,14.561868,14.716023,14.829578,13.521192,14.687204,14.728752,14.761955,15.473896,14.748814,14.889290,14.880585,13.472692,14.844355,14.850093,14.886965,16.281038,15.097288,15.016349,14.880206,7771.500000,7860.516667,7419.705556,7579.604167,6727.600000,7260.183333,7053.622222,7311.750000,7961.166667,7504.783333,7689.622222,7473.695833,8981.800000,6908.241667,7300.872222,7253.466667,5370.166667,7298.691667,7141.100000,7114.908333,7706.000000,7363.316667,7279.294444,7233.470833,5575.000000,7442.608333,7307.183333,7184.575000,10543.600000,7598.691667,7431.127778,7181.554167,3091.0,-0.543735,-0.581604,-0.608069,2386.0,-0.635676,-0.672155,-0.650859,3145.0,-0.557154,-0.596459,-0.607346,2846.0,-0.699199,-0.657926,-0.646011,2383.0,-0.646082,-0.640585,-0.628433,2988.0,-0.637222,-0.624316,-0.612371,2069.0,-0.630856,-0.614114,-0.622634,3238.0,-0.565566,-0.565340,-0.594052,515.166667,521.433333,496.816667,505.287500,477.200000,491.933333,479.922222,491.995833,524.166667,497.133333,508.322222,495.595833,569.200000,469.541667,489.100000,483.575000,397.166667,489.141667,478.566667,477.008333,498.000000,492.058333,483.705556,481.945833,413.800000,494.058333,486.816667,478.591667,647.600000,497.608333,490.627778,478.575000,35362.0


## Feature reduction

In [47]:
train_data = []
test_data = []
# Group by store and split into training and test data
amount_test_weeks = 8
for store_id, group in df_nans_handeled_cat_power.groupby('Store'):
	train_data.append(group[: -amount_test_weeks])
	test_data.append(group[-amount_test_weeks:])
# Combine the list entries to one dataframe
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)
X_train = train_df.drop(columns=['Future_Sales'])
y_train = train_df['Future_Sales']
X_test = test_df.drop(columns=['Future_Sales'])
y_test = test_df['Future_Sales']

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit

model = LinearRegression(n_jobs=-1)
tscv = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(model, n_features_to_select=40, direction='forward', cv=tscv)

# perform feature selection
sfs.fit(X_train, y_train)

# get the selected feature indices
selected_features_boolean = sfs.get_support()

# extract selected column names
selected_columns = X_train.columns[selected_features_boolean]

print("Ausgewählte Spaltennamen:", selected_columns)

Ausgewählte Spaltennamen: Index(['CW', 'Month', 'Open', 'Promo', 'IsPromo', 'IsSchoolHoliday',
       'StateHoliday_a', 'StateHoliday_b', 'Assortment_b', 'Assortment_c',
       'Sales_Lag_1', 'Sales_Lag_1_MA_6', 'Sales_Lag_1_MA_8',
       'Sales_Lag_2_MA_8', 'Sales_Lag_3', 'Sales_Lag_3_MA_8',
       'Sales_Lag_4_MA_4', 'Sales_Lag_5_MA_8', 'Sales_Lag_7_MA_8',
       'Sales_Lag_8_MA_8', 'SalesPerCustomer_Lag_3_MA_6',
       'SalesPerCustomer_Lag_4_MA_6', 'SalesPerCustomer_Lag_7',
       'SalesPerCustomer_Lag_8', 'SalesPerOpenDay_Lag_1',
       'SalesPerOpenDay_Lag_4_MA_6', 'SalesPerOpenDay_Lag_5_MA_8',
       'SalesPerOpenDay_Lag_7', 'Customers_Lag_1_MA_4', 'Customers_Lag_1_MA_6',
       'Customers_Lag_3_MA_4', 'Customers_Lag_4', 'Customers_Lag_4_MA_8',
       'Customers_Lag_6', 'Customers_Lag_7_MA_8', 'Customers_Lag_8_MA_8',
       'CustomersPerOpenDay_Lag_6', 'CustomersPerOpenDay_Lag_6_MA_4',
       'CustomersPerOpenDay_Lag_7_MA_4', 'CustomersPerOpenDay_Lag_8'],
      dtype='object')


In [53]:
train_data = []
test_data = []
# Group by store and split into training and test data
amount_test_weeks = 8
df_to_reduce = df_nans_handeled_cat_power[['Store', 'Future_Sales', 'CW', 'Month', 'Open', 'Promo', 'IsPromo', 'IsSchoolHoliday',
       'StateHoliday_a', 'StateHoliday_b', 'Assortment_b', 'Assortment_c',
       'Sales_Lag_1', 'Sales_Lag_1_MA_6', 'Sales_Lag_1_MA_8',
       'Sales_Lag_2_MA_8', 'Sales_Lag_3', 'Sales_Lag_3_MA_8',
       'Sales_Lag_4_MA_4', 'Sales_Lag_5_MA_8', 'Sales_Lag_7_MA_8',
       'Sales_Lag_8_MA_8', 'SalesPerCustomer_Lag_3_MA_6',
       'SalesPerCustomer_Lag_4_MA_6', 'SalesPerCustomer_Lag_7',
       'SalesPerCustomer_Lag_8', 'SalesPerOpenDay_Lag_1',
       'SalesPerOpenDay_Lag_4_MA_6', 'SalesPerOpenDay_Lag_5_MA_8',
       'SalesPerOpenDay_Lag_7', 'Customers_Lag_1_MA_4', 'Customers_Lag_1_MA_6',
       'Customers_Lag_3_MA_4', 'Customers_Lag_4', 'Customers_Lag_4_MA_8',
       'Customers_Lag_6', 'Customers_Lag_7_MA_8', 'Customers_Lag_8_MA_8',
       'CustomersPerOpenDay_Lag_6', 'CustomersPerOpenDay_Lag_6_MA_4',
       'CustomersPerOpenDay_Lag_7_MA_4', 'CustomersPerOpenDay_Lag_8']]
for store_id, group in df_to_reduce.groupby('Store'):
	train_data.append(group[: -amount_test_weeks])
	test_data.append(group[-amount_test_weeks:])
# Combine the list entries to one dataframe
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)
X_train = train_df.drop(columns=['Future_Sales'])
y_train = train_df['Future_Sales']
X_test = test_df.drop(columns=['Future_Sales'])
y_test = test_df['Future_Sales']

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit

model = LinearRegression(n_jobs=-1)
tscv = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(model, n_features_to_select=20, direction='forward', cv=tscv)

# perform feature selection
sfs.fit(X_train, y_train)

# get the selected feature indices
selected_features_boolean = sfs.get_support()

# extract selected column names
selected_columns = X_train.columns[selected_features_boolean]

print("Ausgewählte Spaltennamen:", selected_columns)

Ausgewählte Spaltennamen: Index(['Open', 'Promo', 'IsPromo', 'StateHoliday_b', 'Sales_Lag_1',
       'Sales_Lag_1_MA_6', 'Sales_Lag_1_MA_8', 'Sales_Lag_2_MA_8',
       'Sales_Lag_3_MA_8', 'Sales_Lag_5_MA_8', 'Sales_Lag_8_MA_8',
       'SalesPerCustomer_Lag_3_MA_6', 'SalesPerCustomer_Lag_8',
       'SalesPerOpenDay_Lag_1', 'SalesPerOpenDay_Lag_4_MA_6',
       'Customers_Lag_1_MA_6', 'Customers_Lag_3_MA_4', 'Customers_Lag_6',
       'Customers_Lag_8_MA_8', 'CustomersPerOpenDay_Lag_6'],
      dtype='object')


In [63]:
# with 40 selected features
df_test = df_nans_handeled_cat_power[['Store', 'Future_Sales', 'CW', 'Month', 'Open', 'Promo', 'IsPromo', 'IsSchoolHoliday',
       'StateHoliday_a', 'StateHoliday_b', 'Assortment_b', 'Assortment_c',
       'Sales_Lag_1', 'Sales_Lag_1_MA_6', 'Sales_Lag_1_MA_8',
       'Sales_Lag_2_MA_8', 'Sales_Lag_3', 'Sales_Lag_3_MA_8',
       'Sales_Lag_4_MA_4', 'Sales_Lag_5_MA_8', 'Sales_Lag_7_MA_8',
       'Sales_Lag_8_MA_8', 'SalesPerCustomer_Lag_3_MA_6',
       'SalesPerCustomer_Lag_4_MA_6', 'SalesPerCustomer_Lag_7',
       'SalesPerCustomer_Lag_8', 'SalesPerOpenDay_Lag_1',
       'SalesPerOpenDay_Lag_4_MA_6', 'SalesPerOpenDay_Lag_5_MA_8',
       'SalesPerOpenDay_Lag_7', 'Customers_Lag_1_MA_4', 'Customers_Lag_1_MA_6',
       'Customers_Lag_3_MA_4', 'Customers_Lag_4', 'Customers_Lag_4_MA_8',
       'Customers_Lag_6', 'Customers_Lag_7_MA_8', 'Customers_Lag_8_MA_8',
       'CustomersPerOpenDay_Lag_6', 'CustomersPerOpenDay_Lag_6_MA_4',
       'CustomersPerOpenDay_Lag_7_MA_4', 'CustomersPerOpenDay_Lag_8']]
testModelsTestSplit8W(df_test, None)

{'Model': 'LinearRegression', 'RMSE_Train': 9243.960853576737, 'MAE_Train': 6346.849079059969, 'R2_Train': 0.7425830551488228, 'Adj_R2_Train': 0.7424920072120149, 'RMSE_Test': 6076.866293181645, 'MAE_Test': 4604.182223657593, 'R2_Test': 0.8578500868741831, 'Adj_R2_Test': 0.8571936162233429}




{'Model': 'XGBRegressor', 'RMSE_Train': 7586.747425730487, 'MAE_Train': 5128.276978852914, 'R2_Train': 0.8266067494562147, 'Adj_R2_Train': 0.8265454205575768, 'RMSE_Test': 6139.361351649675, 'MAE_Test': 4697.845359576443, 'R2_Test': 0.8549112868641222, 'Adj_R2_Test': 0.854241244372731}
{'Model': 'RandomForestRegressor', 'RMSE_Train': 6833.118635588801, 'MAE_Train': 4610.654046070799, 'R2_Train': 0.8593438071493607, 'Adj_R2_Train': 0.859294057292506, 'RMSE_Test': 6633.261060700181, 'MAE_Test': 5053.971024294538, 'R2_Test': 0.8306280776890549, 'Adj_R2_Test': 0.8298458915193377}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9243.960854,6346.849079,0.742583,0.742492,6076.866293,4604.182224,0.85785,0.857194
1,XGBRegressor,7586.747426,5128.276979,0.826607,0.826545,6139.361352,4697.84536,0.854911,0.854241
2,RandomForestRegressor,6833.118636,4610.654046,0.859344,0.859294,6633.261061,5053.971024,0.830628,0.829846


In [49]:
# with 20 selected features

df_test = df_nans_handeled_cat_power[['Store', 'Future_Sales', 'Open', 'Promo', 'IsPromo', 'StateHoliday_b', 'Sales_Lag_1',
       'Sales_Lag_1_MA_6', 'Sales_Lag_1_MA_8', 'Sales_Lag_2_MA_8',
       'Sales_Lag_3_MA_8', 'Sales_Lag_5_MA_8', 'Sales_Lag_8_MA_8',
       'SalesPerCustomer_Lag_3_MA_6', 'SalesPerCustomer_Lag_8',
       'SalesPerOpenDay_Lag_1', 'SalesPerOpenDay_Lag_4_MA_6',
       'Customers_Lag_1_MA_6', 'Customers_Lag_3_MA_4', 'Customers_Lag_6',
       'Customers_Lag_8_MA_8', 'CustomersPerOpenDay_Lag_6']]
testModelsTestSplit8W(df_test, None)

{'Model': 'LinearRegression', 'RMSE_Train': 9536.329179581322, 'MAE_Train': 6673.629827187907, 'R2_Train': 0.7260423678114407, 'Adj_R2_Train': 0.7259927455109356, 'RMSE_Test': 5775.47628316968, 'MAE_Test': 4295.672175582573, 'R2_Test': 0.8716006437113271, 'Adj_R2_Test': 0.8712976108407874}




{'Model': 'XGBRegressor', 'RMSE_Train': 9457.068947317517, 'MAE_Train': 6669.648039198498, 'R2_Train': 0.7305773851169427, 'Adj_R2_Train': 0.7305285842499919, 'RMSE_Test': 6298.190214583256, 'MAE_Test': 4920.265377574972, 'R2_Test': 0.8473071214064962, 'Adj_R2_Test': 0.8469467538575567}


  X, y = self._initialize(X, y)


{'Model': 'NeuralNetwork', 'RMSE_Train': 12397.468610666763, 'MAE_Train': 9263.2136661244, 'R2_Train': 0.5369936247054163, 'Adj_R2_Train': 0.5369097597613843, 'RMSE_Test': 11164.801701721253, 'MAE_Test': 9001.246323877172, 'R2_Test': 0.5201679368692144, 'Adj_R2_Test': 0.5190354943736258}




{'Model': 'MLPRegressor', 'RMSE_Train': 8739.366559683718, 'MAE_Train': 5797.459515852285, 'R2_Train': 0.7699189517258402, 'Adj_R2_Train': 0.7698772768477695, 'RMSE_Test': 6647.838726860592, 'MAE_Test': 5149.058839719626, 'R2_Test': 0.8298828151753054, 'Adj_R2_Test': 0.8294813248537366}


Unnamed: 0,Model,RMSE_Train,MAE_Train,R2_Train,Adj_R2_Train,RMSE_Test,MAE_Test,R2_Test,Adj_R2_Test
0,LinearRegression,9536.32918,6673.629827,0.726042,0.725993,5775.476283,4295.672176,0.871601,0.871298
1,XGBRegressor,9457.068947,6669.648039,0.730577,0.730529,6298.190215,4920.265378,0.847307,0.846947
2,NeuralNetwork,12397.468611,9263.213666,0.536994,0.53691,11164.801702,9001.246324,0.520168,0.519035
3,MLPRegressor,8739.36656,5797.459516,0.769919,0.769877,6647.838727,5149.05884,0.829883,0.829481


**Result:**
- LinearRegression increased slightly from R2:0,86 to R2:0,87
- XGBRegressor decreased slightly from R2: 0,859 to R2:0,847

-> The Top 20 will be used