# Sales Prediction with Walmart Data (All Stores)


In [1]:
# Import the Dependencies
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

from config import db_password

# Data Selection:

The data was taken from Kaggle. The link to source is: https://www.kaggle.com/aditya6196/retail-analysis-with-walmart-data

# Data Cleaning:

The data was cleaned using an ETL function described in the Walmart_Wkly_Sales_ETL.ipynb file of the ETL_Analysis branch. The cleaned data was then stored in PostgreSQL as the 'Weekly_Sales', 'Features' and 'Holidays' tables.
The data was then stored in an Amazon Web Services (AWS) RDS database so that it can be easily imported from other remote files.

# Importing the Data:

Here, the data is imported from the AWS RDS database.

In [2]:
# Create the SQLAlchemy engine for the PostgreSQL database hosted on AWS RDS.
# The dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts the
# legacy "postgres://" alias and fails to load a dialect for it.
# NOTE(review): if db_password contains URL-reserved characters (e.g. "@", "/", ":")
# it would need percent-encoding before being interpolated here — confirm.
db_string = f"postgresql://postgres:{db_password}@walmartsales.ctixdh2hiprk.us-east-2.rds.amazonaws.com:5432/WMT_SALEs"
engine = create_engine(db_string)

In [3]:
# Load the whole "Weekly_Sales" table through the engine into a DataFrame,
# then preview its first ten rows.
sales_df = pd.read_sql_table(table_name="Weekly_Sales", con=engine)
sales_df.head(n=10)

Unnamed: 0,index_id,Rev_Date,Date,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Month,Year,Week
0,1-2010-05-02,2010-05-02,2010-05-02,1,1643690.9,0,42.31,2.572,211.096358,8.106,5,2010,17
1,1-2010-12-02,2010-12-02,2010-12-02,1,1641957.44,1,38.51,2.548,211.24217,8.106,12,2010,48
2,1-2010-02-19,2010-02-19,2010-02-19,1,1611968.17,0,39.93,2.514,211.289143,8.106,2,2010,7
3,1-2010-02-26,2010-02-26,2010-02-26,1,1409727.59,0,46.63,2.561,211.319643,8.106,2,2010,8
4,1-2010-05-03,2010-05-03,2010-05-03,1,1554806.68,0,46.5,2.625,211.350143,8.106,5,2010,18
5,1-2010-12-03,2010-12-03,2010-12-03,1,1439541.59,0,57.79,2.667,211.380643,8.106,12,2010,48
6,1-2010-03-19,2010-03-19,2010-03-19,1,1472515.79,0,54.58,2.72,211.215635,8.106,3,2010,11
7,1-2010-03-26,2010-03-26,2010-03-26,1,1404429.92,0,51.45,2.732,211.018042,8.106,3,2010,12
8,1-2010-02-04,2010-02-04,2010-02-04,1,1594968.28,0,62.27,2.719,210.82045,7.808,2,2010,5
9,1-2010-09-04,2010-09-04,2010-09-04,1,1545418.53,0,65.86,2.77,210.622857,7.808,9,2010,35


In [4]:
# Checking the data types 
# Displays the dtype of every column in sales_df so non-numeric columns can be spotted.
sales_df.dtypes

index_id                object
Rev_Date        datetime64[ns]
Date            datetime64[ns]
Store                    int64
Weekly_Sales           float64
Holiday_Flag             int64
Temperature            float64
Fuel_Price             float64
CPI                    float64
Unemployment           float64
Month                    int64
Year                     int64
Week                     int64
dtype: object

In [49]:
# Summary statistics (count, mean, std, min, quartiles, max) of sales_df's columns.
sales_df.describe()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Month,Year,Week
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,23.0,1046965.0,0.06993,60.663782,3.358607,171.578394,7.999151,6.475524,2010.965035,26.0
std,12.988182,564366.6,0.255049,18.444933,0.45902,39.356712,1.875885,3.321797,0.797019,14.511794
min,1.0,209986.2,0.0,-2.06,2.472,126.064,3.879,1.0,2010.0,1.0
25%,12.0,553350.1,0.0,47.46,2.933,131.735,6.891,4.0,2010.0,14.0
50%,23.0,960746.0,0.0,62.67,3.445,182.616521,7.874,6.0,2011.0,26.0
75%,34.0,1420159.0,0.0,74.94,3.735,212.743293,8.622,9.0,2012.0,38.0
max,45.0,3818686.0,1.0,100.14,4.468,227.232807,14.313,12.0,2012.0,52.0


# Data Preprocessing:

The cleaned data was then divided into the input (X) and the target/output (y) features. Non-relevant columns were also dropped from the data. All columns used in the model must have a numerical data type.

In [5]:
# Separate the target (Weekly_Sales) from the predictors. The target itself plus
# the two date columns and the identifier column are excluded from the inputs.
y = sales_df.loc[:, "Weekly_Sales"]
X = sales_df.drop(["Weekly_Sales", "Date", "Rev_Date", "index_id"], axis=1)

In [6]:
 X[:5]

Unnamed: 0,Store,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Month,Year,Week
0,1,0,42.31,2.572,211.096358,8.106,5,2010,17
1,1,1,38.51,2.548,211.24217,8.106,12,2010,48
2,1,0,39.93,2.514,211.289143,8.106,2,2010,7
3,1,0,46.63,2.561,211.319643,8.106,2,2010,8
4,1,0,46.5,2.625,211.350143,8.106,5,2010,18


In [7]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(6435, 9)

In [8]:
# Data type of each remaining feature column in X.
X.dtypes

Store             int64
Holiday_Flag      int64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
Month             int64
Year              int64
Week              int64
dtype: object

In [9]:
# 2x2 correlation-coefficient matrix between Temperature and Weekly_Sales;
# the off-diagonal entry is the correlation between the two.
r_temp = np.corrcoef(X["Temperature"], y)
r_temp

array([[ 1.        , -0.06381001],
       [-0.06381001,  1.        ]])

In [10]:
# 2x2 correlation-coefficient matrix between Holiday_Flag and Weekly_Sales;
# the off-diagonal entry is the correlation between the two.
r_holi = np.corrcoef(X["Holiday_Flag"], y)
r_holi

array([[1.        , 0.03689097],
       [0.03689097, 1.        ]])

In [11]:
# 2x2 correlation-coefficient matrix between Fuel_Price and Weekly_Sales;
# the off-diagonal entry is the correlation between the two.
r_fuel = np.corrcoef(X["Fuel_Price"], y)
r_fuel

array([[1.        , 0.00946379],
       [0.00946379, 1.        ]])

In [12]:
# 2x2 correlation-coefficient matrix between CPI and Weekly_Sales;
# the off-diagonal entry is the correlation between the two.
r_cpi = np.corrcoef(X["CPI"], y)
r_cpi

array([[ 1.        , -0.07263416],
       [-0.07263416,  1.        ]])

In [13]:
# 2x2 correlation-coefficient matrix between Unemployment and Weekly_Sales;
# the off-diagonal entry is the correlation between the two.
r_unemp = np.corrcoef(X["Unemployment"], y)
r_unemp

array([[ 1.        , -0.10617609],
       [-0.10617609,  1.        ]])

In [14]:
# 2x2 correlation-coefficient matrix between Month and Weekly_Sales;
# the off-diagonal entry is the correlation between the two.
r_month = np.corrcoef(X["Month"], y)
r_month

array([[1.        , 0.06753523],
       [0.06753523, 1.        ]])

In [15]:
# 2x2 correlation-coefficient matrix between Week and Weekly_Sales;
# the off-diagonal entry is the correlation between the two.
r_week = np.corrcoef(X["Week"], y)
r_week

array([[1.        , 0.06610484],
       [0.06610484, 1.        ]])

In [16]:
# 2x2 correlation-coefficient matrix between Year and Weekly_Sales;
# the off-diagonal entry is the correlation between the two.
r_year = np.corrcoef(X["Year"], y)
r_year

array([[ 1.        , -0.01837754],
       [-0.01837754,  1.        ]])

In [17]:
# 2x2 correlation-coefficient matrix between Store and Weekly_Sales;
# the off-diagonal entry is the correlation between the two.
r_store = np.corrcoef(X["Store"], y)
r_store

array([[ 1.        , -0.33533201],
       [-0.33533201,  1.        ]])

## Splitting the Data into training and testing datasets:

The data needs to be split into training and testing datasets in a 75%/25% ratio before the StandardScaler instance is fitted. This prevents the testing data from influencing the standardization.

In [18]:
# Split the data into training (75%) and testing (25%) datasets.
# test_size is spelled out so the ratio no longer depends on the library default,
# and random_state is fixed so the split is reproducible.
# train_test_split and StandardScaler are imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [19]:
# Dimensions of the training feature matrix.
X_train.shape

(4826, 9)

In [20]:
# Dimensions of the testing feature matrix.
X_test.shape

(1609, 9)

In [21]:
# Length of the training target vector.
y_train.shape

(4826,)

In [22]:
# Length of the testing target vector.
y_test.shape

(1609,)

## Scale the Data:

In [23]:
# Fit the standardizer on the training split only, then apply that same fitted
# transformation to both the training and the testing features.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Linear Regression Model:

Linear regression is a statistical model that is used to predict a continuous dependent variable based on one or more independent variables fitted to the equation of a line.
Multiple linear regression builds a linear regression model with two or more independent variables.
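In this case, the dependent variable (the target, y) depends on several independent variables (the features in X). A regression model involving multiple variables can be represented as:

$$y = b_0 + b_1 x_1 + b_2 x_2 + b_3 x_3 + \dots + b_n x_n$$

where $b_0$ is the intercept and $b_1, \dots, b_n$ are the coefficients of the features $x_1, \dots, x_n$. This is the equation of a hyperplane.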

In [24]:
# Create the linear regression estimator with its default settings
model = LinearRegression()

## Train the model:

In [25]:
# Train the linear model on the standardized training features
model.fit(X=X_train_scaled, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [41]:
# One fitted coefficient per feature, labelled with the feature name.
# The model was trained on standardized inputs, so each value is the change in
# predicted sales per one standard deviation of that feature.
coeff_df = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)
coeff_df

Unnamed: 0,Coefficient
Store,-198780.473731
Holiday_Flag,8169.432349
Temperature,-20945.112668
Fuel_Price,36588.136604
CPI,-80746.386678
Unemployment,-50538.877582
Month,136458.224166
Year,-40971.926335
Week,-109371.483382


In [26]:
# Predict weekly sales for the standardized test features and show them
y_pred_01 = model.predict(X=X_test_scaled)
print(y_pred_01)

[1454090.00958145 1102274.96429005 1176495.51447722 ...  984368.72838373
 1302571.30357656  802214.13594909]


In [27]:
# Put the linear-model predictions next to the true values; the frame takes
# its row labels from y_test.
predictions_01 = pd.DataFrame(data=dict(Prediction=y_pred_01, Actual=y_test))
predictions_01.head(n=10)

Unnamed: 0,Prediction,Actual
447,1454090.0,1870619.23
2196,1102275.0,448391.99
3253,1176496.0,1272948.27
2873,1000270.0,744969.42
712,1285005.0,325345.41
2852,1029710.0,2080529.06
1165,1158633.0,528832.54
2202,1230308.0,457504.35
3140,1136820.0,921612.53
2804,1073955.0,2135982.79


## Calculation of Metrics:

### Root Mean Squared Error

Root Mean Square Error (RMSE) is the standard deviation of the residuals (prediction errors). Residuals are a measure of how far from the regression line data points are; RMSE is a measure of how spread out these residuals are. In other words, it tells you how concentrated the data is around the line of best fit.

In [50]:
# RMSE of the linear model: square root of the mean squared error on the test set
rmse = np.sqrt(
    mean_squared_error(
        y_true=predictions_01["Actual"],
        y_pred=predictions_01["Prediction"],
    )
)
rmse

529802.0649517294

### Mean Absolute Error

In statistics, mean absolute error (MAE) is a measure of errors between paired observations expressing the same phenomenon. 

In [30]:
# MAE of the linear model on the test set
mae = mean_absolute_error(
    y_true=predictions_01["Actual"],
    y_pred=predictions_01["Prediction"],
)
mae

440285.20259857585

### R-squared 

R-squared (R2) is a statistical measure that represents the proportion of the variance for a dependent variable that's explained by an independent variable or variables in a regression model.
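R-squared usually lies between 0 and 100% (when scored on held-out data it can also be negative, meaning the model does worse than always predicting the mean):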
- 0% indicates that the model explains none of the variability of the response data around its mean.
- 100% indicates that the model explains all the variability of the response data around its mean.

In [31]:
# R-squared of the linear model on the test set
r2 = r2_score(
    y_true=predictions_01["Actual"],
    y_pred=predictions_01["Prediction"],
)
r2

0.1447931750333451

Since R-squared is only about 14%, this Linear Regression Model explains little of the variance in weekly sales and needs improvement.

This Linear Regression Model can be improved by using a "lag". A "lag" is a fixed amount of passing time: one set of observations in a time series is plotted (lagged) against a second, later set of data. The kth lag is the time period that happened “k” time points before time i.
The "lag" has been implemented in Store-1 data. The link to the file is:
https://github.com/Franceskling/final_project/blob/machine_learning/machine_learning/sales_forecast_store1.ipynb

# Data Transformation:

Transforming the data into a simpler format for storage and future use, such as a CSV, spreadsheet, or database file.

In [44]:
# Export the linear-model predictions to CSV; the row index is not written
predictions_01.to_csv(
    path_or_buf="../Resources/predictions_LinearRegression.csv",
    index=False,
)

# Random Forest Regressor Model:

A random forest is an ensemble model that consists of many decision trees. Predictions are made by averaging the predictions of each decision tree.

In [32]:
# Random forest of 100 trees; the seed is fixed so results are repeatable
regr = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

## Train the model:

In [33]:
# Train the forest on the standardized training features.
# fit() returns the estimator itself, so no re-assignment is needed.
regr.fit(X_train_scaled, y_train)
regr

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [34]:
# Predict weekly sales for the test split with the fitted random forest.
# The predictions are computed once and reused for printing, rather than
# calling predict() a second time on the same data.
y_pred_02 = regr.predict(X_test_scaled)
print(y_pred_02)

[1848556.341   473399.4062 1317183.1342 ...  646535.0265  953298.1766
  452513.6974]


In [35]:
# Put the random-forest predictions next to the true values; the frame takes
# its row labels from y_test.
predictions_02 = pd.DataFrame(data=dict(Prediction=y_pred_02, Actual=y_test))
predictions_02.head(n=10)

Unnamed: 0,Prediction,Actual
447,1848556.0,1870619.23
2196,473399.4,448391.99
3253,1317183.0,1272948.27
2873,753312.2,744969.42
712,392744.8,325345.41
2852,2113131.0,2080529.06
1165,498645.0,528832.54
2202,487595.2,457504.35
3140,955419.0,921612.53
2804,2126777.0,2135982.79


## Calculation of Metrics:

In [37]:
# RMSE of the random forest: square root of the mean squared error on the test set
rmse = np.sqrt(
    mean_squared_error(
        y_true=predictions_02["Actual"],
        y_pred=predictions_02["Prediction"],
    )
)
rmse

118809.7423062449

In [38]:
# MAE of the random forest on the test set
mae = mean_absolute_error(
    y_true=predictions_02["Actual"],
    y_pred=predictions_02["Prediction"],
)
mae

65539.96687047854

In [39]:
# R-squared of the random forest on the test set
r2 = r2_score(
    y_true=predictions_02["Actual"],
    y_pred=predictions_02["Prediction"],
)
r2

0.9569921262077471

Since R-squared is about 96%, this Random Forest Regression Model predicts much better than the Linear Regression Model.

# Data Transformation:

Transforming the data into a simpler format for storage and future use, such as a CSV, spreadsheet, or database file.

In [45]:
# Export the random-forest predictions to CSV; the row index is not written
predictions_02.to_csv(
    path_or_buf="../Resources/predictions_RandomForest.csv",
    index=False,
)

Saving the data from pandas to the PostgreSQL database.

In [40]:
# Connecting pandas and postgres
# NOTE(review): the write-back below is commented out, so this cell does nothing when run.
# NOTE(review): `predictions` is not defined in any cell shown here (only
# predictions_01 / predictions_02 are) — confirm the intended frame before enabling.
# predictions.to_sql(name='Prediction', con=engine, if_exists='append',index=False)