In [1]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore") #, category=DeprecationWarning) # to avoid deprecation warnings
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots

# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
pio.renderers.default = "vscode"
#pio.renderers.default = "iframe" # to be replaced by "iframe" if working on JULIE

import datetime

# Part 1 : Preprocessings & EDA

In [2]:
df = pd.read_csv("Walmart_Store_sales.csv")

In [3]:
print(df.shape)
df.head()

(150, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


## 1.1 Preprocessings

In [4]:
#Since the goal is to analyse the weekly sales of the stores, columns without this data won't be useful
#Same thing for columns without a Date, this is one 

In [5]:
mask = ~df.Date.isnull()
df = df.loc[mask, :]
df = df.reset_index(drop=True)
print(df.shape)

mask = ~df.Weekly_Sales.isnull()
df = df.loc[mask, :]
df = df.reset_index(drop=True)
print(df.shape)

(132, 8)
(118, 8)


In [6]:
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092
3,4.0,28-05-2010,1857533.7,0.0,,2.756,126.160226,7.896
4,15.0,03-06-2011,695396.19,0.0,69.8,4.069,134.855161,7.658


In [7]:
#df.sort_values('Unemployment', ascending=False)

In [8]:
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
df['Year'], df['Month'] = df['Date'].dt.year, df['Date'].dt.month
df['Day'], df['Day_of_week'] = df['Date'].dt.day, df['Date'].dt.dayofweek
#df.set_index('Date', inplace=True)

In [9]:
# Basic stats
print("Number of rows : {}".format(df.shape[0]))
print()

print("Display of dataset: ")
display(df.head())
print()

print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])

Number of rows : 118

Display of dataset: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_of_week
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011,3,25,4
2,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
3,4.0,2010-05-28,1857533.7,0.0,,2.756,126.160226,7.896,2010,5,28,4
4,15.0,2011-06-03,695396.19,0.0,69.8,4.069,134.855161,7.658,2011,6,3,4



Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_of_week
count,118.0,118,118.0,109.0,107.0,107.0,109.0,107.0,118.0,118.0,118.0,118.0
unique,,79,,,,,,,,,,
top,,2012-10-19 00:00:00,,,,,,,,,,
freq,,3,,,,,,,,,,
first,,2010-02-05 00:00:00,,,,,,,,,,
last,,2012-10-19 00:00:00,,,,,,,,,,
mean,9.949153,,1257271.0,0.073394,60.711589,3.291542,177.718009,7.681299,2010.822034,6.338983,16.440678,4.0
std,6.066367,,662505.6,0.261987,17.961676,0.486784,39.816657,1.678155,0.812628,3.173664,8.209378,0.0
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143,2010.0,1.0,1.0,4.0
25%,4.25,,577427.6,0.0,45.465,2.824,131.637,6.664,2010.0,4.0,10.0,4.0



Percentage of missing values: 


Store           0.000000
Date            0.000000
Weekly_Sales    0.000000
Holiday_Flag    7.627119
Temperature     9.322034
Fuel_Price      9.322034
CPI             7.627119
Unemployment    9.322034
Year            0.000000
Month           0.000000
Day             0.000000
Day_of_week     0.000000
dtype: float64

In [10]:
df['Day_of_week'].value_counts()

4    118
Name: Day_of_week, dtype: int64

In [11]:
#Day_of_week is always 4, one report is made per week always on the same day, this column can be dropped 

In [12]:
df = df.drop('Day_of_week', axis=1)
df.head()
print(df.shape)

(118, 11)


In [13]:
#dataset = df[df['Weekly_Sales'].isnull() == False]
display(100*df.isnull().sum()/df.shape[0])

Store           0.000000
Date            0.000000
Weekly_Sales    0.000000
Holiday_Flag    7.627119
Temperature     9.322034
Fuel_Price      9.322034
CPI             7.627119
Unemployment    9.322034
Year            0.000000
Month           0.000000
Day             0.000000
dtype: float64

In [14]:
#Let's make sure all the data is numerical

In [15]:
numeric_features = []

for i,t in df.dtypes.iteritems():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
        
print(numeric_features)

['Store', 'Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day']


## 1.2 Creating a function to check outliers

In [16]:
def check_outliers(feature):
    feat_std = round(df[feature].std(), 2)
    feat_avg = round(df[feature].mean(), 2)
    feat_min = df[feature].min()
    feat_max = df[feature].max()
    feat_out_bot = round((feat_avg - 3*feat_std), 2)
    feat_out_top = round((feat_avg + 3*feat_std), 2)
    print(f'Mean {feature} is : {feat_avg}')
    print(f'{feature} std is : {feat_std}')
    print(f'Outliers for {feature} are : {feat_out_bot} and {feat_out_top}')
    print(f'Min {feature} is : {feat_min}, Max {feature} is: {feat_max}')
    print(f'There are {df[feature].isnull().sum()} missing values for {feature}')
    if feat_min < feat_out_bot:
        out_bot_count = df[feature][df[feature] < feat_out_bot].count()
        print(f'There are {out_bot_count} minimal outliers for {feature}')

    else :
        print(f'No minimal outliers for {feature}')

    if feat_max > feat_out_top:
        out_top_count = df[feature][df[feature] > feat_out_top].count()
        print(f'There are {out_top_count} maximal outliers for {feature}')

    else :
        print(f'No maximal outliers for {feature}')


In [17]:
check_outliers('Temperature')
print('')
check_outliers('Fuel_Price')
print('')
check_outliers('CPI')
print('')
check_outliers('Unemployment')

Mean Temperature is : 60.71
Temperature std is : 17.96
Outliers for Temperature are : 6.83 and 114.59
Min Temperature is : 18.79, Max Temperature is: 91.65
There are 11 missing values for Temperature
No minimal outliers for Temperature
No maximal outliers for Temperature

Mean Fuel_Price is : 3.29
Fuel_Price std is : 0.49
Outliers for Fuel_Price are : 1.82 and 4.76
Min Fuel_Price is : 2.514, Max Fuel_Price is: 4.1930000000000005
There are 11 missing values for Fuel_Price
No minimal outliers for Fuel_Price
No maximal outliers for Fuel_Price

Mean CPI is : 177.72
CPI std is : 39.82
Outliers for CPI are : 58.26 and 297.18
Min CPI is : 126.1119032, Max CPI is: 226.9688442
There are 9 missing values for CPI
No minimal outliers for CPI
No maximal outliers for CPI

Mean Unemployment is : 7.68
Unemployment std is : 1.68
Outliers for Unemployment are : 2.64 and 12.72
Min Unemployment is : 5.143, Max Unemployment is: 14.313
There are 11 missing values for Unemployment
No minimal outliers for Une

In [18]:
#There are 5 maximal outliers for the Unemployment feature
#Let's create a function to pop outliers, it will also drop rows with missing values for Temperature, Fuel_Price, CPI and Unemployment

In [19]:
def pop_outliers(feature):
    feat_std = round(df[feature].std(), 2)
    feat_avg = round(df[feature].mean(), 2)
    feat_out_bot = round((feat_avg - 3*feat_std), 2)
    feat_out_top = round((feat_avg + 3*feat_std), 2)

    df2 = df.loc[df[feature] < feat_out_top]
    df2 = df2.loc[df[feature] > feat_out_bot]

    return df2

In [20]:
df

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011,2,18
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.470,2011,3,25
2,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28
3,4.0,2010-05-28,1857533.70,0.0,,2.756,126.160226,7.896,2010,5,28
4,15.0,2011-06-03,695396.19,0.0,69.80,4.069,134.855161,7.658,2011,6,3
...,...,...,...,...,...,...,...,...,...,...,...
113,3.0,2012-10-19,424513.08,0.0,73.44,3.594,226.968844,6.034,2012,10,19
114,14.0,2010-06-18,2248645.59,0.0,72.62,2.780,182.442420,8.899,2010,6,18
115,17.0,2010-06-11,845252.21,0.0,57.14,2.841,126.111903,,2010,6,11
116,8.0,2011-08-12,856796.10,0.0,86.05,3.638,219.007525,,2011,8,12


In [21]:
num_list = ['Temperature', 'Fuel_Price', 'CPI','Unemployment']

for i in num_list:
    df = pop_outliers(i)

In [22]:
df

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011,2,18
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.470,2011,3,25
2,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28
4,15.0,2011-06-03,695396.19,0.0,69.80,4.069,134.855161,7.658,2011,6,3
5,20.0,2012-02-03,2203523.20,0.0,39.93,3.617,213.023622,6.961,2012,2,3
...,...,...,...,...,...,...,...,...,...,...,...
109,7.0,2012-05-25,532739.77,0.0,50.60,3.804,197.588605,8.090,2012,5,25
112,3.0,2010-06-04,396968.80,0.0,78.53,2.705,214.495838,7.343,2010,6,4
113,3.0,2012-10-19,424513.08,0.0,73.44,3.594,226.968844,6.034,2012,10,19
114,14.0,2010-06-18,2248645.59,0.0,72.62,2.780,182.442420,8.899,2010,6,18


In [23]:
#Let's check for outliers again

In [24]:
check_outliers('Temperature')
print('')
check_outliers('Fuel_Price')
print('')
check_outliers('CPI')
print('')
check_outliers('Unemployment')

Mean Temperature is : 61.13
Temperature std is : 17.45
Outliers for Temperature are : 8.78 and 113.48
Min Temperature is : 18.79, Max Temperature is: 91.65
There are 0 missing values for Temperature
No minimal outliers for Temperature
No maximal outliers for Temperature

Mean Fuel_Price is : 3.29
Fuel_Price std is : 0.49
Outliers for Fuel_Price are : 1.82 and 4.76
Min Fuel_Price is : 2.548, Max Fuel_Price is: 4.17
There are 0 missing values for Fuel_Price
No minimal outliers for Fuel_Price
No maximal outliers for Fuel_Price

Mean CPI is : 181.08
CPI std is : 38.85
Outliers for CPI are : 64.53 and 297.63
Min CPI is : 126.1392, Max CPI is: 226.9688442
There are 0 missing values for CPI
No minimal outliers for CPI
No maximal outliers for CPI

Mean Unemployment is : 7.3
Unemployment std is : 0.96
Outliers for Unemployment are : 4.42 and 10.18
Min Unemployment is : 5.143, Max Unemployment is: 9.342
There are 0 missing values for Unemployment
No minimal outliers for Unemployment
No maximal o

In [25]:
df.shape

(80, 11)

In [26]:
#This leaves us with 80 rows

In [27]:
fig_vis= make_subplots(rows = 11, cols = 2, subplot_titles = ("Feature distribution", "Target in function of the feature"), row_heights= [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500])
n_row = 1
for col in df.columns:

    fig = (px.scatter(x = df.loc[:,col], 
                        y = (df.loc[:,'Weekly_Sales']) ))
    fig_vis.add_trace(fig.data[0], 
                    row = n_row,
                    col = 2,
                     )

    fig_vis.add_trace(go.Histogram(x = df[col], nbinsx = df[col].nunique(), marker=dict(color='#1f77b4')), 
                row = n_row,
                col = 1,
                 )

    n_row +=1

fig_vis.update_layout(
    height=3000, 
    width=1200,
    yaxis1_title = 'Count',
    yaxis2_title = 'Weekly Sales',
    yaxis3_title = 'Count',
    yaxis4_title = 'Weekly Sales',
    yaxis5_title = 'Count',
    yaxis6_title = 'Weekly Sales',
    yaxis7_title = 'Count',
    yaxis8_title = 'Weekly Sales',
    yaxis9_title = 'Count',
    yaxis10_title = 'Weekly Sales',
    yaxis11_title = 'Count',
    yaxis12_title = 'Weekly Sales',
    yaxis13_title = 'Count',
    yaxis14_title = 'Weekly Sales',
    yaxis15_title = 'Count',
    yaxis16_title = 'Weekly Sales',
    yaxis17_title = 'Count',
    yaxis18_title = 'Weekly Sales',
    yaxis19_title = 'Count',
    yaxis20_title = 'Weekly Sales',
    yaxis21_title = 'Count',
    yaxis22_title = 'Weekly Sales',
    xaxis1_title = 'Store',
    xaxis2_title = 'Store',
    xaxis3_title = 'Date',
    xaxis4_title = 'Date',
    xaxis5_title = 'Weekly_Sales',
    xaxis6_title = 'Weekly_Sales',
    xaxis7_title = 'Holiday_Flag',
    xaxis8_title = 'Holiday_Flag',
    xaxis9_title = 'Temperature',
    xaxis10_title = 'Temperature',
    xaxis11_title = 'Fuel_Price',
    xaxis12_title = 'Fuel_Price',
    xaxis13_title = 'CPI',
    xaxis14_title = 'CPI',
    xaxis15_title = 'Unemployment',
    xaxis16_title = 'Unemployment',
    xaxis17_title = 'Year',
    xaxis18_title = 'Year',
    xaxis19_title = 'Month',
    xaxis20_title = 'Month',
    xaxis21_title = 'Day',
    xaxis22_title = 'Day',
    
    title={
        'y': 1,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    showlegend = False,
    margin=go.layout.Margin(l=150,
                                                r=70,
                                                b=50,
                                                t=80,
                                                pad = 5
                                                ),
) 
fig_vis.update_annotations(yshift=30)

In [30]:
df = df.drop('Date', axis = 1)

# Part 2 : Machine Learnining

## 2.1 Baseline model

In [52]:
target_variable = 'Weekly_Sales'

X = df.drop(target_variable, axis = 1)
Y = df.loc[:,target_variable]

In [53]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [54]:
#numeric_features = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'year', 'month', 'day']
#categorical_features = ['Store', 'Holiday_Flag']

#numeric_transformer = StandardScaler()
#categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

#preprocessor = ColumnTransformer(
#    transformers=[
#        ('num', numeric_transformer, numeric_features),
#        ('cat', categorical_transformer, categorical_features)
#    ])

In [55]:
numeric_features = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day']
categorical_features = ['Store', 'Holiday_Flag']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), 
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('encoder', OneHotEncoder(drop='first', handle_unknown = 'ignore')) 
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [56]:
#X_train_s = preprocessor.fit_transform(X_train)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [57]:
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

LinearRegression()

In [58]:
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Y_train_pred = regressor.predict(X_train) 
Y_test_pred = regressor.predict(X_test)

In [59]:
# Print R^2 scores
print("R2 score on training set : ", regressor.score(X_train, Y_train))
print("R2 score on test set : ", regressor.score(X_test, Y_test))

R2 score on training set :  0.9836190433886791
R2 score on test set :  0.9584738281103978


In [60]:
#The baseline model, although very strong, shows some overfitting, let's analyse the coeffs

In [61]:
column_names = []
for name, pipeline, features_list in preprocessor.transformers_: 
    if name == 'num': 
        features = features_list 
    else: 
        features = pipeline.named_steps['encoder'].get_feature_names_out() 
    column_names.extend(features)

In [73]:
print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'x0_2.0', 'x0_3.0', 'x0_4.0', 'x0_5.0', 'x0_6.0', 'x0_7.0', 'x0_8.0', 'x0_9.0', 'x0_10.0', 'x0_11.0', 'x0_13.0', 'x0_14.0', 'x0_15.0', 'x0_16.0', 'x0_17.0', 'x0_18.0', 'x0_19.0', 'x0_20.0', 'x1_1.0']


In [62]:
coefs = pd.DataFrame(index = (column_names), data = regressor.coef_.transpose(), columns=['coefficients'])
feature_importance = abs(coefs).sort_values(by='coefficients')
feature_importance.head()

Unnamed: 0,coefficients
Month,38889.755625
Day,46730.726243
Temperature,57618.832434
x0_11.0,59141.378505
x0_6.0,66594.50088


In [64]:
coefs

Unnamed: 0,coefficients
Temperature,-57618.83
Fuel_Price,-83708.49
CPI,1635968.0
Unemployment,-93577.07
Year,-97008.21
Month,38889.76
Day,-46730.73
x0_2.0,178266.8
x0_3.0,-1457641.0
x0_4.0,4032686.0


In [65]:
fig = px.line(coefs, x = coefs.index, y = 'coefficients')
fig.show()

## 2.2 : Tackle overfitting

### 2.2.1 : Ridge

In [79]:
# Perform grid search
print("Grid search...")
ridge = Ridge()
# Grid of values to be tested
params = {
    'alpha': np.arange(0, 0.50, 1.00)
}
best_ridge = GridSearchCV(ridge, param_grid = params, cv = 10, verbose = 1)
best_ridge.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", best_ridge.best_params_)
print("Best R2 score : ", best_ridge.best_score_)

Grid search...
Fitting 10 folds for each of 1 candidates, totalling 10 fits
...Done.
Best hyperparameters :  {'alpha': 0.0}
Best R2 score :  0.9360990883852244


In [86]:
#The best value for alpha being 0, no regularisation is required. This Ridge will give the same result as our baseline model.

In [80]:
# Print R^2 scores
print("R2 score on training set : ", best_ridge.score(X_train, Y_train))
print("R2 score on test set : ", best_ridge.score(X_test, Y_test))

R2 score on training set :  0.983619043388679
R2 score on test set :  0.958473828110418


In [81]:
scores = cross_val_score(best_ridge.best_estimator_, X_train, Y_train, cv = 10)

print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

The cross-validated R2-score is :  0.9360990883852244
The standard deviation is :  0.02706700912223818


### 2.2.2 : Lasso

In [87]:
# Perform grid search
print("Grid search...")
lasso = Lasso()
# Grid of values to be tested
params = {
    'alpha': [0, 1, 10, 35, 50, 75, 100, 300, 500, 600, 650, 700, 750, 1000]
}
best_lasso = GridSearchCV(lasso, param_grid = params, cv = 10) # cv : the number of folds to be used for CV
best_lasso.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", best_lasso.best_params_)
print("Best R2 score : ", best_lasso.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 50}
Best R2 score :  0.9279550993436751


In [88]:
print("LASSO / R2 score on training set : ", best_lasso.score(X_train, Y_train))
print("LASSO / R2 score on test set : ", best_lasso.score(X_test, Y_test))

LASSO / R2 score on training set :  0.9818541399925064
LASSO / R2 score on test set :  0.9745506844605815


In [89]:
scores = cross_val_score(best_lasso.best_estimator_, X_train, Y_train, cv = 10)

print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

The cross-validated R2-score is :  0.9279550993436751
The standard deviation is :  0.024725865661505878


In [92]:
data_dict = {
    'Feature': column_names,
    'Best_Ridge': best_ridge.best_estimator_.coef_,
    'Best_Lasso': best_lasso.best_estimator_.coef_
            }

coefficients = pd.DataFrame(data = data_dict)
coefficients.head()

Unnamed: 0,Feature,Best_Ridge,Best_Lasso
0,Temperature,-57618.83,-53035.36331
1,Fuel_Price,-83708.49,-97031.767561
2,CPI,1635968.0,278283.457046
3,Unemployment,-93577.07,-59981.579319
4,Year,-97008.21,35772.871528


In [94]:
fig = px.line(coefficients, x = 'Feature', y = ['Best_Ridge', 'Best_Lasso'])
fig.show()

# Conclusion

Although a linear regression already gives a very precise estimation of our test data, regularizing overfitting with Lasso pushes this precision to another level by altering the overfitting observed mostly on the Store feature.
In the end the most relevant Feature to determine weekly sales is the Store number and on a lower level the Consumer Price Index. This makes sense as "macro variables" such as Unemployment, Temperature and the Date will have a much lower impact on a 
store's weekly sales than local variables such as the store size, the number of products it distributes, it's consumer base and other more relevant variables related to each store taken individually.