In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder 
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn import metrics

In [2]:
df = pd.read_csv("F:\Document\JEDHA\Projet_ML\Projet_Walmart\Walmart_Store_sales.csv")

#Création de colonnes de type datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
df.info()

df['dayofweek'] = df['Date'].dt.weekday
df['dayofmonth'] = df['Date'].dt.day
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month

#Statistique basique
print("Number of rows : {}".format(df.shape[0]))
print()

print("Display of dataset: ")
display(df.head())
print()

print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         150 non-null    float64       
 1   Date          132 non-null    datetime64[ns]
 2   Weekly_Sales  136 non-null    float64       
 3   Holiday_Flag  138 non-null    float64       
 4   Temperature   132 non-null    float64       
 5   Fuel_Price    136 non-null    float64       
 6   CPI           138 non-null    float64       
 7   Unemployment  135 non-null    float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 9.5 KB
Number of rows : 150

Display of dataset: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,dayofweek,dayofmonth,year,month
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,4.0,18.0,2011.0,2.0
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.47,4.0,25.0,2011.0,3.0
2,17.0,2012-07-27,,0.0,,,130.719581,5.936,4.0,27.0,2012.0,7.0
3,11.0,NaT,1244390.03,0.0,84.57,,214.556497,7.346,,,,
4,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,4.0,28.0,2010.0,5.0



Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,dayofweek,dayofmonth,year,month
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0,132.0,132.0,132.0,132.0
mean,9.866667,2011-05-07 09:05:27.272727296,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843,4.0,16.522727,2010.856061,6.393939
min,1.0,2010-02-05 00:00:00,268929.0,0.0,18.79,2.514,126.111903,5.143,4.0,1.0,2010.0,1.0
25%,4.0,2010-08-16 12:00:00,605075.7,0.0,45.5875,2.85225,131.970831,6.5975,4.0,10.0,2010.0,4.0
50%,9.0,2011-05-09 12:00:00,1261424.0,0.0,62.985,3.451,197.908893,7.47,4.0,17.0,2011.0,6.0
75%,15.75,2012-01-14 18:00:00,1806386.0,0.0,76.345,3.70625,214.934616,8.15,4.0,24.0,2012.0,9.0
max,20.0,2012-10-19 00:00:00,2771397.0,1.0,91.65,4.193,226.968844,14.313,4.0,31.0,2012.0,12.0
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173,0.0,8.307511,0.811488,3.21437



Percentage of missing values: 


Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dayofweek       12.000000
dayofmonth      12.000000
year            12.000000
month           12.000000
dtype: float64

In [3]:
#Enlèvement des lignes où Weekly_sales est NaN
df1= df.copy(deep=True)
df1=df1.dropna(subset=['Weekly_Sales'])

#Enlèvement de la colonne Data
useless_column= ['Date']
df1 = df1.drop(useless_column, axis=1)

display(100*df1.isnull().sum()/df1.shape[0])

Store            0.000000
Weekly_Sales     0.000000
Holiday_Flag     8.088235
Temperature     11.029412
Fuel_Price       8.823529
CPI              8.088235
Unemployment    10.294118
dayofweek       13.235294
dayofmonth      13.235294
year            13.235294
month           13.235294
dtype: float64

In [4]:
# Remplacement des valeurs NaN dans les colonnes
col_name = ['Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'dayofweek', 'dayofmonth', 'year', 'month']
for name in col_name:
    df1[name] = df1[name].fillna(df1[name].mean())
    
#Changement de type en integer
col_name = ['Store', 'Holiday_Flag', 'dayofmonth', 'dayofmonth', 'year', 'month']
for name in col_name:
    df1[name] = df1[name].astype('int')

display(df1.head())
print("Percentage of missing values: ")
display(100*df1.isnull().sum()/df1.shape[0])

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,dayofweek,dayofmonth,year,month
0,6,1572117.54,0,59.61,3.045,214.777523,6.858,4.0,18,2011,2
1,13,1807545.43,0,42.38,3.435,128.616064,7.47,4.0,25,2011,3
3,11,1244390.03,0,84.57,3.316992,214.556497,7.346,4.0,16,2010,6
4,6,1644470.66,0,78.89,2.759,212.412888,7.092,4.0,28,2010,5
5,4,1857533.7,0,60.853967,2.756,126.160226,7.896,4.0,28,2010,5


Percentage of missing values: 


Store           0.0
Weekly_Sales    0.0
Holiday_Flag    0.0
Temperature     0.0
Fuel_Price      0.0
CPI             0.0
Unemployment    0.0
dayofweek       0.0
dayofmonth      0.0
year            0.0
month           0.0
dtype: float64

Visualisation des outliers

In [5]:
fig = px.box(df1, y="Temperature")
fig.show()

In [6]:
fig = px.box(df1, y="Fuel_Price")
fig.show()

In [7]:
fig = px.box(df1, y="CPI")
fig.show()

In [8]:
fig = px.box(df1, y="Unemployment")
fig.show()

In [9]:
#Enlèvement des 
remove_outlier=['Temperature','Fuel_Price','CPI','Unemployment']

for col in remove_outlier:
    mean = df1[col].mean()
    std= df1[col].std()

    mask = np.abs((df1[col] - mean) <= 3* std) & ((df1[col] - mean) >= - 3* std)
    df1 = df1[mask]

display(df1.head())
display(df1.describe(include='all'))

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,dayofweek,dayofmonth,year,month
0,6,1572117.54,0,59.61,3.045,214.777523,6.858,4.0,18,2011,2
1,13,1807545.43,0,42.38,3.435,128.616064,7.47,4.0,25,2011,3
3,11,1244390.03,0,84.57,3.316992,214.556497,7.346,4.0,16,2010,6
4,6,1644470.66,0,78.89,2.759,212.412888,7.092,4.0,28,2010,5
5,4,1857533.7,0,60.853967,2.756,126.160226,7.896,4.0,28,2010,5


Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,dayofweek,dayofmonth,year,month
count,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0,131.0
mean,9.938931,1257990.0,0.061069,60.453783,3.304198,180.000711,7.427871,4.0,16.458015,2010.717557,6.236641
std,6.228663,657746.3,0.240376,17.444613,0.452979,38.009862,0.942684,0.0,7.649293,0.815945,2.953048
min,1.0,268929.0,0.0,18.79,2.514,126.111903,5.143,4.0,1.0,2010.0,1.0
25%,4.0,584243.9,0.0,47.085,2.8515,133.361048,6.833,4.0,12.0,2010.0,4.0
50%,9.0,1366396.0,0.0,60.853967,3.316992,192.826069,7.567,4.0,16.0,2010.0,6.0
75%,16.0,1809576.0,0.0,75.045,3.683,214.781675,8.059,4.0,22.0,2011.0,8.0
max,20.0,2771397.0,1.0,91.65,4.17,226.968844,9.524,4.0,31.0,2012.0,12.0


## 1-EDA sur le dataset

In [10]:
weekly_sales_mean = df1.groupby(['Store']).agg({'Weekly_Sales':'mean'})

fig = px.bar(weekly_sales_mean, x=weekly_sales_mean.index, y='Weekly_Sales', title="Moyenne des ventes par magasin", color='Weekly_Sales')
fig.update_xaxes(title="Magasin")
fig.update_yaxes(title="Moyenne des Ventes par magasin")
fig.show()

Les magasins 4 et 14 ont des dépenses moyennes au dessus de 2 millions.


In [11]:
weekly_sales_mean= df1.groupby(['Holiday_Flag']).agg({'Weekly_Sales':'mean'})
fig = px.bar(weekly_sales_mean, x=weekly_sales_mean.index, y='Weekly_Sales', title="Vente moyenne entre jour férié ou non", color='Weekly_Sales')
fig.update_xaxes(title="Holiday_flag")
fig.update_yaxes(title="vente moyenne")
fig.show()

Il n'y a pas de différence majeur entre les jours férié et les autres jours.

In [12]:
weekly_sales_mean= df1.groupby(['dayofmonth']).agg({'Weekly_Sales':'mean'})
fig = px.bar(weekly_sales_mean, x=weekly_sales_mean.index, y='Weekly_Sales', title="Vente moyenne pour chaque jour du mois", color='Weekly_Sales')
fig.update_xaxes(title="Jour du mois")
fig.update_yaxes(title="Magasin")
fig.show()

Le 8 du mois se démarque avec une moyenne à plus de 2 millions.

In [13]:
weekly_sales_mean= df1.groupby(['month']).agg({'Weekly_Sales':'mean'})
fig = px.bar(weekly_sales_mean, x=weekly_sales_mean.index, y='Weekly_Sales', title="Vente moyenne par mois", color='Weekly_Sales')
fig.update_xaxes(title="Mois")
fig.update_yaxes(title="Vente moyenne")
fig.show()

Le mois de décembre est sans surprise, le mois où les ventes moyennes sont à plus de 2 millions.

In [14]:
weekly_sales_mean= df1.groupby(['year']).agg({'Weekly_Sales':'mean'})
fig = px.bar(weekly_sales_mean, x=weekly_sales_mean.index, y='Weekly_Sales', title="Vente moyenne par année", color='Weekly_Sales')
fig.update_xaxes(title="Année")
fig.update_yaxes(title="Vente moyenne")
fig.show()

In [15]:
fig = px.scatter(df1, x='Temperature', y = 'Weekly_Sales', trendline="ols")
fig.show()

Lorsque les températures sont élevées, les ventes sont faibles.

In [16]:
fig = px.scatter(df1, x='Fuel_Price', y = 'Weekly_Sales', trendline="ols")
fig.show()

Le prix de l'essence n'a pas l'air d'avoir un impact sur les ventes.

In [17]:
fig = px.scatter(df1, x='CPI', y = 'Weekly_Sales', trendline="ols")
fig.show()

Les ventes baissent lorsque le CPI augmente.

In [18]:
fig = px.scatter(df1, x='Unemployment', y = 'Weekly_Sales', trendline="ols")
fig.show()

In [19]:
#Matrice de corrélation
corr_matrix = df1.corr().round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())
fig.show()

Nous pouvons remarquer que la colonne dayofweek ne donne aucune valeur. En cherchant dans le dataset, nous remarquons qu'il n'y a qu'une valeur, le 4 (vendredi). Nous pouvons supposer que cette colonne n'aura pas d'impact sur le modèle.

Fuel_Price et year, CPI et Store, sont fortement corrélées.

L'indice des prix à la consommation est très élevé dans le Store 1

In [21]:
fig = px.scatter(df1, x='year', y = 'Fuel_Price', trendline="ols")
fig.show()

On peut observer l'augmentation du prix de l'essence par année.

## 2-Modèle

In [22]:
# Separate target variable Y from features X
target_name = 'Weekly_Sales'

print("Separating labels from features...")
Y = df1.loc[:,target_name]
X = df1.drop(target_name, axis = 1)
print("...Done.")
print(Y.head())
print()
print(X.head())
print()

Separating labels from features...
...Done.
0    1572117.54
1    1807545.43
3    1244390.03
4    1644470.66
5    1857533.70
Name: Weekly_Sales, dtype: float64

   Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
0      6             0    59.610000    3.045000  214.777523         6.858   
1     13             0    42.380000    3.435000  128.616064         7.470   
3     11             0    84.570000    3.316992  214.556497         7.346   
4      6             0    78.890000    2.759000  212.412888         7.092   
5      4             0    60.853967    2.756000  126.160226         7.896   

   dayofweek  dayofmonth  year  month  
0        4.0          18  2011      2  
1        4.0          25  2011      3  
3        4.0          16  2010      6  
4        4.0          28  2010      5  
5        4.0          28  2010      5  



In [23]:
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [24]:
# Create pipeline for numeric features
numeric_features = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'dayofweek', 'dayofmonth', 'year', 'month'] 
numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_features = ['Store', 'Holiday_Flag']
categorical_transformer = Pipeline(steps=[
        ('encoder', OneHotEncoder(drop='first',))
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5])
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[0:5,:])
print()

Performing preprocessings on train set...
     Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
136      4             0        84.59    3.469000  129.112500      5.644000   
78       1             0        62.25    3.308000  218.220509      7.866000   
17      18             0        21.33    2.788000  131.527903      9.202000   
108     18             0        69.12    2.906000  132.293936      7.665582   
141      5             0        62.37    3.316992  212.560411      6.768000   

     dayofweek  dayofmonth  year  month  
136        4.0           8  2011      7  
78         4.0          18  2011     11  
17         4.0          16  2010      6  
108        4.0          28  2010      5  
141        4.0          12  2010     11  
...Done.
[[ 1.46357146  0.34189815 -1.33109628 -1.85331771  0.         -1.14448369
   0.28442728  0.22259605  0.          0.          1.          0.
   0.          0.          0.          0.          0.          0.
   0.          0

In [25]:
# Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")

Y_train_pred = regressor.predict(X_train)
Y_test_pred = regressor.predict(X_test)

Train model...
...Done.


In [26]:
# Print scores dans le cas où on n'a pas fait de prédictions.
print("R2 score on training set : ", regressor.score(X_train, Y_train))
print("R2 score on test set : ", regressor.score(X_test, Y_test))

R2 score on training set :  0.9731303495600531
R2 score on test set :  0.9316280242865035


Le modèle présente de bonne performance.

In [27]:
#Recherche des features les plus performantes
print(regressor.coef_)

[-3.84735763e+04 -3.78984176e+04  7.21891151e+04 -9.51393346e+04
 -7.47968443e-09 -3.26656122e+04 -2.41683510e+04  6.58526425e+04
  4.05765581e+05 -1.25323732e+06  6.00476303e+05 -1.40279710e+06
 -2.38819507e+04 -8.96103679e+05 -7.47265442e+05 -1.27162465e+06
  6.48294941e+05  2.38450731e+03  5.31968843e+05  7.13270344e+05
 -6.86826157e+05 -1.11100322e+06 -6.80007990e+05 -1.69759458e+05
  6.93304583e+04  3.69154450e+05 -4.37418671e+04]


In [28]:
column_names = []
for name, pipeline, features_list in preprocessor.transformers_:
    if name == 'num': 
        features = features_list 
    else: 
        features = pipeline.named_steps['encoder'].get_feature_names_out() 
    column_names.extend(features) 
        
print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'dayofweek', 'dayofmonth', 'year', 'month', 'Store_2', 'Store_3', 'Store_4', 'Store_5', 'Store_6', 'Store_7', 'Store_8', 'Store_9', 'Store_10', 'Store_11', 'Store_13', 'Store_14', 'Store_15', 'Store_16', 'Store_17', 'Store_18', 'Store_19', 'Store_20', 'Holiday_Flag_1']


In [29]:
coefs = pd.DataFrame(index = column_names, data = regressor.coef_.transpose(), columns=["coefficients"])
coefs

Unnamed: 0,coefficients
Temperature,-38473.58
Fuel_Price,-37898.42
CPI,72189.12
Unemployment,-95139.33
dayofweek,-7.479684e-09
dayofmonth,-32665.61
year,-24168.35
month,65852.64
Store_2,405765.6
Store_3,-1253237.0


## 3-Régularisation par Ridge et Lasso, suivi d'un gridsearch

In [30]:
#Ridge
print("3-fold cross-validation...")
regressor = Ridge()
scores = cross_val_score(regressor, X_train, Y_train, cv=3)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

3-fold cross-validation...
The cross-validated R2-score is :  0.8104152944047004
The standard deviation is :  0.03333932739565629


In [31]:
print("Grid search...")
regressor = Ridge()

params = {
    'alpha': [0.0, 0.1, 0.5, 1.0]
}
linreg_cv = GridSearchCV(regressor, param_grid = params, cv = 3)
linreg_cv.fit(X_train, Y_train)
print("...Done.")
print("")
print("Tuned Linear Regression Parameters: {}".format(linreg_cv.best_params_))
print("Best score is {}".format(linreg_cv.best_score_))
print("")
print("R2 score on training set : ", linreg_cv.score(X_train, Y_train))
print("R2 score on test set : ", linreg_cv.score(X_test, Y_test))

Grid search...
...Done.

Tuned Linear Regression Parameters: {'alpha': 0.0}
Best score is 0.9283701658158442

R2 score on training set :  0.963753497916666
R2 score on test set :  0.9004613588555153


In [32]:
#Lasso
print("3-fold cross-validation...")
regressor = Lasso()
scores = cross_val_score(regressor, X_train, Y_train, cv=3)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

3-fold cross-validation...
The cross-validated R2-score is :  0.9284151341693047
The standard deviation is :  0.014711791196219046


In [33]:
print("Grid search...")
regressor = Lasso()

params = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.5, 5.0]
}
linreg_cv = GridSearchCV(regressor, param_grid = params, cv = 3)
linreg_cv.fit(X_train, Y_train)
print("...Done.")
print("")
print("Tuned Linear Regression Parameters: {}".format(linreg_cv.best_params_))
print("Best score is {}".format(linreg_cv.best_score_))
print("")
print("R2 score on training set : ", linreg_cv.score(X_train, Y_train))
print("R2 score on test set : ", linreg_cv.score(X_test, Y_test))

Grid search...
...Done.

Tuned Linear Regression Parameters: {'alpha': 5.0}
Best score is 0.9284538345104072

R2 score on training set :  0.9731302891438096
R2 score on test set :  0.9317441661364763


La régularisation apporte les mêmes résultats que le modèle de base.