In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from dateutil import parser

# Team
- Group Name: Lone Wolf
- Name: Gao Mo
- Email: david113mo@gmail.com 
- Country: United States
- College: Carnegie Mellon University 
- Specialization: Data Science


# Project Objective:

1. Build atleast 4-5 multivariate forecasting model which included ML or Deep Learning based Model in PySpark leveraging parallel computing techniques(You can develop models without Pyspark if you are not comfortable with pyspark and parallel computing).
2. Demonstrate best in class forecast accuracy (Forecast Accuracy = 1 - Wt. MAPE where Wt. MAPE = sum(Error)/sum(Actual)
3. Write a code in such a way you run the model in least time
4. Demonstrate explainability in the form of contribution of each variables

### Note:

- Leveage Feature Engineering concepts to derive more variables to gain accuracy improvement
- You can build model and demostrate accuracy for Q3-Q4 of 2020

# Pre-Processing and Feature Engineering

In [2]:
data = pd.read_csv('forecasting.csv')
data['Price Discount (%)'] = data['Price Discount (%)'].apply(lambda x: int(x[:-1]))
data.head()

Unnamed: 0,Product,date,Sales,Price Discount (%),In-Store Promo,Catalogue Promo,Store End Promo,Google_Mobility,Covid_Flag,V_DAY,EASTER,CHRISTMAS
0,SKU1,2/5/2017,27750,0,0,0,0,0.0,0,0,0,0
1,SKU1,2/12/2017,29023,0,1,0,1,0.0,0,1,0,0
2,SKU1,2/19/2017,45630,17,0,0,0,0.0,0,0,0,0
3,SKU1,2/26/2017,26789,0,1,0,1,0.0,0,0,0,0
4,SKU1,3/5/2017,41999,17,0,0,0,0.0,0,0,0,0


In [3]:
data['Google_Mobility'] = data['Google_Mobility'].apply(lambda x: int(x))
data.head()

Unnamed: 0,Product,date,Sales,Price Discount (%),In-Store Promo,Catalogue Promo,Store End Promo,Google_Mobility,Covid_Flag,V_DAY,EASTER,CHRISTMAS
0,SKU1,2/5/2017,27750,0,0,0,0,0,0,0,0,0
1,SKU1,2/12/2017,29023,0,1,0,1,0,0,1,0,0
2,SKU1,2/19/2017,45630,17,0,0,0,0,0,0,0,0
3,SKU1,2/26/2017,26789,0,1,0,1,0,0,0,0,0
4,SKU1,3/5/2017,41999,17,0,0,0,0,0,0,0,0


In [4]:
data.date = data.date.apply(lambda x: parser.parse(x))
data.head()

Unnamed: 0,Product,date,Sales,Price Discount (%),In-Store Promo,Catalogue Promo,Store End Promo,Google_Mobility,Covid_Flag,V_DAY,EASTER,CHRISTMAS
0,SKU1,2017-02-05,27750,0,0,0,0,0,0,0,0,0
1,SKU1,2017-02-12,29023,0,1,0,1,0,0,1,0,0
2,SKU1,2017-02-19,45630,17,0,0,0,0,0,0,0,0
3,SKU1,2017-02-26,26789,0,1,0,1,0,0,0,0,0
4,SKU1,2017-03-05,41999,17,0,0,0,0,0,0,0,0


In [5]:
mean_sales = data.Sales.describe().mean()
std_sales = data.Sales.describe().std()
outliers = mean_sales + 1.5*std_sales
data = data[data.Sales <= outliers]
data.head()

Unnamed: 0,Product,date,Sales,Price Discount (%),In-Store Promo,Catalogue Promo,Store End Promo,Google_Mobility,Covid_Flag,V_DAY,EASTER,CHRISTMAS
0,SKU1,2017-02-05,27750,0,0,0,0,0,0,0,0,0
1,SKU1,2017-02-12,29023,0,1,0,1,0,0,1,0,0
2,SKU1,2017-02-19,45630,17,0,0,0,0,0,0,0,0
3,SKU1,2017-02-26,26789,0,1,0,1,0,0,0,0,0
4,SKU1,2017-03-05,41999,17,0,0,0,0,0,0,0,0


In [6]:
q3q4_2020 = parser.parse('2020-07-01')
train_df = data[data.date <= q3q4_2020]
test_df = data[data.date > q3q4_2020]

In [7]:
train_prod1 = train_df[train_df.Product == 'SKU1']
train_prod2 = train_df[train_df.Product == 'SKU2']
train_prod3 = train_df[train_df.Product == 'SKU3']
train_prod4 = train_df[train_df.Product == 'SKU4']
train_prod5 = train_df[train_df.Product == 'SKU5']
train_prod6 = train_df[train_df.Product == 'SKU6']

test_prod1 = test_df[test_df.Product == 'SKU1']
test_prod2 = test_df[test_df.Product == 'SKU2']
test_prod3 = test_df[test_df.Product == 'SKU3']
test_prod4 = test_df[test_df.Product == 'SKU4']
test_prod5 = test_df[test_df.Product == 'SKU5']
test_prod6 = test_df[test_df.Product == 'SKU6']

In [8]:
train_prod6.head()

Unnamed: 0,Product,date,Sales,Price Discount (%),In-Store Promo,Catalogue Promo,Store End Promo,Google_Mobility,Covid_Flag,V_DAY,EASTER,CHRISTMAS
1020,SKU6,2017-02-05,32138,28,0,0,0,0,0,0,0,0
1021,SKU6,2017-02-12,11659,5,0,0,0,0,0,1,0,0
1022,SKU6,2017-02-19,12140,5,1,0,1,0,0,0,0,0
1023,SKU6,2017-02-26,29635,28,0,0,0,0,0,0,0,0
1024,SKU6,2017-03-05,11666,5,0,1,1,0,0,0,0,0


# Model building requirements:
Select your base model and then explore 1 model of each family if its classification problem then 1 model for Linear models, 1- Model for Ensemble, 1-Model for boosting and other models if you have time (like stacking)

In [9]:
train_df.columns

Index(['Product', 'date', 'Sales', 'Price Discount (%)', 'In-Store Promo',
       'Catalogue Promo', 'Store End Promo', 'Google_Mobility', 'Covid_Flag',
       'V_DAY', 'EASTER', 'CHRISTMAS'],
      dtype='object')

In [10]:
feat_lst = ['Price Discount (%)','In-Store Promo',
       'Catalogue Promo', 'Store End Promo', 'Google_Mobility', 'Covid_Flag',
       'V_DAY', 'EASTER', 'CHRISTMAS']

In [11]:
X = train_df[feat_lst].to_numpy()

y = train_df['Sales'].to_numpy()

In [12]:
test_x1 = test_prod1[feat_lst].to_numpy()
test_x2 = test_prod2[feat_lst].to_numpy()
test_x3 = test_prod3[feat_lst].to_numpy()
test_x4 = test_prod4[feat_lst].to_numpy()
test_x5 = test_prod5[feat_lst].to_numpy()
test_x6 = test_prod6[feat_lst].to_numpy()

test_y1 = test_prod1['Sales'].to_numpy()
test_y2 = test_prod2['Sales'].to_numpy()
test_y3 = test_prod3['Sales'].to_numpy()
test_y4 = test_prod4['Sales'].to_numpy()
test_y5 = test_prod5['Sales'].to_numpy()
test_y6 = test_prod6['Sales'].to_numpy()

# 1. Base Model

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

base_model = Pipeline([('lin', LinearRegression())])
base_model.fit(X,y)

Pipeline(steps=[('lin', LinearRegression())])

In [14]:
def forecast_accuracy(pred, label):
    error = [i - j for i,j in zip(pred,label)]
    mape = np.absolute(sum(error)) / sum(label)
    acc = 1 - mape
    return np.absolute(acc)

In [15]:
feat_vals = base_model.named_steps['lin'].coef_
df_importances = pd.DataFrame(data = list(zip(feat_lst,feat_vals)), columns=['feat', 'importance'])
df_importances.importance = df_importances.importance.apply(lambda x: np.absolute(x))
sum_val = sum(list(df_importances.importance))
df_importances.importance = df_importances.importance.apply(lambda x: x/sum_val)
df_importances

Unnamed: 0,feat,importance
0,Price Discount (%),0.01968
1,In-Store Promo,0.083361
2,Catalogue Promo,0.222547
3,Store End Promo,0.265528
4,Google_Mobility,0.002988
5,Covid_Flag,0.343029
6,V_DAY,0.050993
7,EASTER,0.000558
8,CHRISTMAS,0.011315


In [16]:
prediction = base_model.predict(test_x6)
forecast_accuracy(prediction, test_y6)

0.7699930582071464

In [17]:
test_Xs = [test_x1,test_x2,test_x3,test_x4,test_x5,test_x6]
test_ys = [test_y1,test_y2,test_y3,test_y4,test_y5,test_y6]

accs = []
for i,j in zip(test_Xs, test_ys):
    prediction = base_model.predict(i)
    accs.append(forecast_accuracy(prediction, j))

d = list(zip(['SKU1','SKU2','SKU3','SKU4','SKU5','SKU6'],accs))
df_acc = pd.DataFrame(data = d, columns=['Product', 'Forecast_Accuracy'])
df_acc

Unnamed: 0,Product,Forecast_Accuracy
0,SKU1,0.428766
1,SKU2,0.073225
2,SKU3,0.749226
3,SKU4,0.56245
4,SKU5,0.895168
5,SKU6,0.769993


# 2. Linear Model

In [18]:
from sklearn.svm import SVC

lin_model = Pipeline([('svc', SVC(kernel='linear'))])
lin_model.fit(X,y)

Pipeline(steps=[('svc', SVC(kernel='linear'))])

In [19]:
accs = []
for i,j in zip(test_Xs, test_ys):
    prediction = lin_model.predict(i)
    accs.append(forecast_accuracy(prediction, j))

d = list(zip(['SKU1','SKU2','SKU3','SKU4','SKU5','SKU6'],accs))
df_acc = pd.DataFrame(data = d, columns=['Product', 'Forecast_Accuracy'])
df_acc

Unnamed: 0,Product,Forecast_Accuracy
0,SKU1,0.51594
1,SKU2,0.040987
2,SKU3,0.737935
3,SKU4,0.42222
4,SKU5,0.741711
5,SKU6,0.894752


# 3. Ensemble Model

In [20]:
from sklearn.ensemble import RandomForestClassifier

ensemble_model = Pipeline([('RF', RandomForestClassifier(n_estimators=50, 
                                      max_depth=20,  
                                      random_state=10, 
                                      verbose=0))])
ensemble_model.fit(X,y)

Pipeline(steps=[('RF',
                 RandomForestClassifier(max_depth=20, n_estimators=50,
                                        random_state=10))])

In [21]:
feat_vals = ensemble_model.named_steps['RF'].feature_importances_
df_importances = pd.DataFrame(data = list(zip(feat_lst,feat_vals)), columns=['feat', 'importance'])
df_importances

Unnamed: 0,feat,importance
0,Price Discount (%),0.653999
1,In-Store Promo,0.040013
2,Catalogue Promo,0.026686
3,Store End Promo,0.045055
4,Google_Mobility,0.133397
5,Covid_Flag,0.030393
6,V_DAY,0.02522
7,EASTER,0.023251
8,CHRISTMAS,0.021986


In [22]:
accs = []
for i,j in zip(test_Xs, test_ys):
    prediction = ensemble_model.predict(i)
    accs.append(forecast_accuracy(prediction, j))

d = list(zip(['SKU1','SKU2','SKU3','SKU4','SKU5','SKU6'],accs))
df_acc = pd.DataFrame(data = d, columns=['Product', 'Forecast_Accuracy'])
df_acc

Unnamed: 0,Product,Forecast_Accuracy
0,SKU1,0.510794
1,SKU2,0.226012
2,SKU3,0.969402
3,SKU4,0.873024
4,SKU5,0.956232
5,SKU6,0.677644


# 4. Boosting Model

In [23]:
from sklearn.ensemble import GradientBoostingRegressor

boost_model = Pipeline([('boost', GradientBoostingRegressor(random_state=10))])
boost_model.fit(X,y)

Pipeline(steps=[('boost', GradientBoostingRegressor(random_state=10))])

In [24]:
feat_vals = boost_model.named_steps['boost'].feature_importances_
df_importances = pd.DataFrame(data = list(zip(feat_lst,feat_vals)), columns=['feat', 'importance'])
df_importances

Unnamed: 0,feat,importance
0,Price Discount (%),0.835192
1,In-Store Promo,0.006514
2,Catalogue Promo,0.009072
3,Store End Promo,0.030644
4,Google_Mobility,0.013762
5,Covid_Flag,0.087619
6,V_DAY,0.016612
7,EASTER,0.000434
8,CHRISTMAS,0.000151


In [25]:
accs = []
for i,j in zip(test_Xs, test_ys):
    prediction = boost_model.predict(i)
    accs.append(forecast_accuracy(prediction, j))

d = list(zip(['SKU1','SKU2','SKU3','SKU4','SKU5','SKU6'],accs))
df_acc = pd.DataFrame(data = d, columns=['Product', 'Forecast_Accuracy'])
df_acc

Unnamed: 0,Product,Forecast_Accuracy
0,SKU1,0.908625
1,SKU2,0.294121
2,SKU3,0.77293
3,SKU4,0.379625
4,SKU5,0.721423
5,SKU6,0.808857
