## Task 1 - Assess How Competitors and Promotions Influence Sales and Customer Count for the Mass-market Retailer

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import pandas_profiling as pp

import pickle
from scipy import stats as st
import statsmodels.formula.api as smf

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')

### Task1.1 Read Data

In [2]:
sales_cust = pd.read_csv("../dataset/task1_data/sales_cust.csv", index_col="Store", parse_dates=["Date"])
sales_cust = sales_cust[sales_cust.Open == 1] # we don't need to learn "Open"'s effect on Sales or Customers.
del sales_cust["Open"]
sales_cust["StateHoliday"] = sales_cust.StateHoliday.astype("str").str.strip()
sales_cust["SchoolHoliday"] = sales_cust.SchoolHoliday.astype("str").str\
                            .replace("nan","0").replace("1.0", "1").replace("0.0", "0").str.strip()
# sales_cust = pd.get_dummies(sales_cust, drop_first=True)

sales_cust.head(2)

Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Promo,StateHoliday,SchoolHoliday
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,5,2015-07-31,5263,555,1,0,1
2,5,2015-07-31,6064,625,1,0,1


In [3]:
store = pd.read_csv("../dataset/task1_data/store.csv", index_col="Store")

store["CompetitionOpenSinceDate"] = pd.to_datetime("")
competetion_non_null = ~((store.CompetitionOpenSinceYear.isnull())|(store.CompetitionOpenSinceMonth.isnull()))
s1 = store.loc[competetion_non_null, "CompetitionOpenSinceYear"].astype(int)
s2 = store.loc[competetion_non_null, "CompetitionOpenSinceMonth"].astype(int)
store.loc[competetion_non_null, "CompetitionOpenSinceDate"] = \
pd.to_datetime(["{}-{}".format(x,y) for x,y in zip(s1,s2)])

store["Promo2SinceDate"] = pd.to_datetime("")
promo2_non_null = ~((store.Promo2SinceWeek.isnull())|(store.Promo2SinceYear.isnull()))
s1 = store.loc[promo2_non_null, "Promo2SinceYear"].astype(int)
s2 = store.loc[promo2_non_null, "Promo2SinceWeek"].astype(int)
store.loc[promo2_non_null, "Promo2SinceDate"] = \
pd.to_datetime([datetime.strptime("{}-W{}-1".format(x,y), "%Y-W%W-%w") for x,y in zip(s1,s2)])

In [4]:
del store["CompetitionOpenSinceMonth"], store["CompetitionOpenSinceYear"]
del store["Promo2SinceWeek"], store["Promo2SinceYear"]
store.head(2)

Unnamed: 0_level_0,StoreType,Assortment,CompetitionDistance,Promo2,PromoInterval,CompetitionOpenSinceDate,Promo2SinceDate
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,c,a,1270.0,0,,2008-09-01,NaT
2,a,a,570.0,1,"Jan,Apr,Jul,Oct",2007-11-01,2010-03-29


### Task1.2 Exploratory Data Analyses (EDA)

In [5]:
sales_cust.columns

Index(['DayOfWeek', 'Date', 'Sales', 'Customers', 'Promo', 'StateHoliday',
       'SchoolHoliday'],
      dtype='object')

In [6]:
pp.ProfileReport(sales_cust)

0,1
Number of variables,8
Number of observations,844347
Total Missing (%),0.0%
Total size in memory,51.5 MiB
Average record size in memory,64.0 B

0,1
Numeric,4
Categorical,2
Boolean,1
Date,1
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,1115
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,558.43
Minimum,1
Maximum,1115
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,56
Q1,280
Median,558
Q3,837
95-th percentile,1060
Maximum,1115
Range,1114
Interquartile range,557

0,1
Standard deviation,321.74
Coef of variation,0.57615
Kurtosis,-1.1989
Mean,558.43
MAD,278.64
Skewness,0.00033808
Sum,471510251
Variance,103520
Memory size,6.4 MiB

Value,Count,Frequency (%),Unnamed: 3
1097,942,0.1%,
85,942,0.1%,
562,942,0.1%,
335,942,0.1%,
733,942,0.1%,
262,942,0.1%,
769,942,0.1%,
682,942,0.1%,
494,942,0.1%,
423,942,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1,781,0.1%,
2,784,0.1%,
3,779,0.1%,
4,784,0.1%,
5,779,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1111,779,0.1%,
1112,779,0.1%,
1113,784,0.1%,
1114,784,0.1%,
1115,781,0.1%,

0,1
Distinct count,7
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.5203
Minimum,1
Maximum,7
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,1
Q1,2
Median,3
Q3,5
95-th percentile,6
Maximum,7
Range,6
Interquartile range,3

0,1
Standard deviation,1.7237
Coef of variation,0.48963
Kurtosis,-1.2593
Mean,3.5203
MAD,1.5145
Skewness,0.019337
Sum,2972385
Variance,2.971
Memory size,6.4 MiB

Value,Count,Frequency (%),Unnamed: 3
6,144043,17.1%,
2,143961,17.0%,
3,141936,16.8%,
5,138625,16.4%,
1,137545,16.3%,
4,134644,15.9%,
7,3593,0.4%,

Value,Count,Frequency (%),Unnamed: 3
1,137545,16.3%,
2,143961,17.0%,
3,141936,16.8%,
4,134644,15.9%,
5,138625,16.4%,

Value,Count,Frequency (%),Unnamed: 3
3,141936,16.8%,
4,134644,15.9%,
5,138625,16.4%,
6,144043,17.1%,
7,3593,0.4%,

0,1
Distinct count,942
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,2013-01-01 00:00:00
Maximum,2015-07-31 00:00:00

0,1
Distinct count,21734
Unique (%),2.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6955.5
Minimum,0
Maximum,41551
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,3173
Q1,4859
Median,6369
Q3,8360
95-th percentile,12668
Maximum,41551
Range,41551
Interquartile range,3501

0,1
Standard deviation,3104.2
Coef of variation,0.4463
Kurtosis,4.8521
Mean,6955.5
MAD,2291.7
Skewness,1.594
Sum,5872854480
Variance,9636300
Memory size,6.4 MiB

Value,Count,Frequency (%),Unnamed: 3
5674,215,0.0%,
5558,197,0.0%,
5483,196,0.0%,
6214,195,0.0%,
6049,195,0.0%,
5723,194,0.0%,
5449,192,0.0%,
5489,191,0.0%,
5140,191,0.0%,
5041,190,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,54,0.0%,
46,1,0.0%,
124,1,0.0%,
133,1,0.0%,
286,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
38037,1,0.0%,
38367,1,0.0%,
38484,1,0.0%,
38722,1,0.0%,
41551,1,0.0%,

0,1
Distinct count,4086
Unique (%),0.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,762.73
Minimum,0
Maximum,7388
Zeros (%),0.0%

0,1
Minimum,0
5-th percentile,349
Q1,519
Median,676
Q3,893
95-th percentile,1440
Maximum,7388
Range,7388
Interquartile range,374

0,1
Standard deviation,401.23
Coef of variation,0.52605
Kurtosis,13.314
Mean,762.73
MAD,268.14
Skewness,2.7881
Sum,644006223
Variance,160990
Memory size,6.4 MiB

Value,Count,Frequency (%),Unnamed: 3
560,2414,0.3%,
576,2363,0.3%,
603,2337,0.3%,
571,2330,0.3%,
555,2328,0.3%,
566,2327,0.3%,
517,2326,0.3%,
539,2309,0.3%,
651,2299,0.3%,
533,2297,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0,52,0.0%,
3,1,0.0%,
5,1,0.0%,
8,1,0.0%,
13,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
5297,1,0.0%,
5387,1,0.0%,
5458,1,0.0%,
5494,1,0.0%,
7388,1,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.44636

0,1
0,467466
1,376881

Value,Count,Frequency (%),Unnamed: 3
0,467466,55.4%,
1,376881,44.6%,

0,1
Distinct count,4
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
0,843437
a,694
b,145

Value,Count,Frequency (%),Unnamed: 3
0,843437,99.9%,
a,694,0.1%,
b,145,0.0%,
c,71,0.0%,

0,1
Distinct count,2
Unique (%),0.0%
Missing (%),0.0%
Missing (n),0

0,1
0,680971
1,163376

Value,Count,Frequency (%),Unnamed: 3
0,680971,80.7%,
1,163376,19.3%,

Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Promo,StateHoliday,SchoolHoliday
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,5,2015-07-31,5263,555,1,0,1
2,5,2015-07-31,6064,625,1,0,1
3,5,2015-07-31,8314,821,1,0,1
4,5,2015-07-31,13995,1498,1,0,1
5,5,2015-07-31,4822,559,1,0,1


* The two dependent variable "Sales" and "Customers" are positively correlationed, and both are positively correlated with the variable "Promo", and negatively correlated with "DayOfWeek". 
* Between the predictor variables, variable "Promo" is negatively correlated with "DayOfWeek". 

In [7]:
pp.ProfileReport(store)

0,1
Number of variables,8
Number of observations,1115
Total Missing (%),16.2%
Total size in memory,69.8 KiB
Average record size in memory,64.1 B

0,1
Numeric,2
Categorical,3
Boolean,1
Date,2
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,1115
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,558
Minimum,1
Maximum,1115
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,56.7
Q1,279.5
Median,558.0
Q3,836.5
95-th percentile,1059.3
Maximum,1115.0
Range,1114.0
Interquartile range,557.0

0,1
Standard deviation,322.02
Coef of variation,0.57709
Kurtosis,-1.2
Mean,558
MAD,278.75
Skewness,0
Sum,622170
Variance,103700
Memory size,8.8 KiB

Value,Count,Frequency (%),Unnamed: 3
1115,1,0.1%,
374,1,0.1%,
368,1,0.1%,
369,1,0.1%,
370,1,0.1%,
371,1,0.1%,
372,1,0.1%,
373,1,0.1%,
375,1,0.1%,
383,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1,1,0.1%,
2,1,0.1%,
3,1,0.1%,
4,1,0.1%,
5,1,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1111,1,0.1%,
1112,1,0.1%,
1113,1,0.1%,
1114,1,0.1%,
1115,1,0.1%,

0,1
Distinct count,4
Unique (%),0.4%
Missing (%),0.0%
Missing (n),0

0,1
a,602
d,348
c,148

Value,Count,Frequency (%),Unnamed: 3
a,602,54.0%,
d,348,31.2%,
c,148,13.3%,
b,17,1.5%,

0,1
Distinct count,3
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0

0,1
a,593
c,513
b,9

Value,Count,Frequency (%),Unnamed: 3
a,593,53.2%,
c,513,46.0%,
b,9,0.8%,

0,1
Distinct count,655
Unique (%),58.7%
Missing (%),0.3%
Missing (n),3
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5404.9
Minimum,20
Maximum,75860
Zeros (%),0.0%

0,1
Minimum,20.0
5-th percentile,140.0
Q1,717.5
Median,2325.0
Q3,6882.5
95-th percentile,20145.0
Maximum,75860.0
Range,75840.0
Interquartile range,6165.0

0,1
Standard deviation,7663.2
Coef of variation,1.4178
Kurtosis,13.098
Mean,5404.9
MAD,5240.7
Skewness,2.9293
Sum,6010200
Variance,58724000
Memory size,8.8 KiB

Value,Count,Frequency (%),Unnamed: 3
250.0,12,1.1%,
1200.0,9,0.8%,
50.0,8,0.7%,
350.0,8,0.7%,
190.0,8,0.7%,
330.0,7,0.6%,
90.0,7,0.6%,
180.0,7,0.6%,
150.0,7,0.6%,
1070.0,6,0.5%,

Value,Count,Frequency (%),Unnamed: 3
20.0,1,0.1%,
30.0,4,0.4%,
40.0,5,0.4%,
50.0,8,0.7%,
60.0,3,0.3%,

Value,Count,Frequency (%),Unnamed: 3
45740.0,1,0.1%,
46590.0,1,0.1%,
48330.0,1,0.1%,
58260.0,1,0.1%,
75860.0,1,0.1%,

0,1
Distinct count,2
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Mean,0.51211

0,1
1,571
0,544

Value,Count,Frequency (%),Unnamed: 3
1,571,51.2%,
0,544,48.8%,

0,1
Distinct count,4
Unique (%),0.4%
Missing (%),48.8%
Missing (n),544

0,1
"Jan,Apr,Jul,Oct",335
"Feb,May,Aug,Nov",130
"Mar,Jun,Sept,Dec",106
(Missing),544

Value,Count,Frequency (%),Unnamed: 3
"Jan,Apr,Jul,Oct",335,30.0%,
"Feb,May,Aug,Nov",130,11.7%,
"Mar,Jun,Sept,Dec",106,9.5%,
(Missing),544,48.8%,

0,1
Distinct count,172
Unique (%),15.4%
Missing (%),31.7%
Missing (n),354
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,1900-01-01 00:00:00
Maximum,2015-08-01 00:00:00

0,1
Distinct count,56
Unique (%),5.0%
Missing (%),48.8%
Missing (n),544
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,2009-08-03 00:00:00
Maximum,2015-06-08 00:00:00

Unnamed: 0_level_0,StoreType,Assortment,CompetitionDistance,Promo2,PromoInterval,CompetitionOpenSinceDate,Promo2SinceDate
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,c,a,1270.0,0,,2008-09-01,NaT
2,a,a,570.0,1,"Jan,Apr,Jul,Oct",2007-11-01,2010-03-29
3,a,a,14130.0,1,"Jan,Apr,Jul,Oct",2006-12-01,2011-04-04
4,c,c,620.0,0,,2009-09-01,NaT
5,a,a,29910.0,0,,2015-04-01,NaT


The above shows the Store Type and Store Inventory Assortment Strategies are not well balanced, whereas there are almost equal number of stores with and without promotions.

In [14]:
# creation of the modeling data
sales_cust_expd = sales_cust.join(store, how="left")
sales_cust_expd["YearsOfCompetition"] = \
(sales_cust_expd.Date- sales_cust_expd.CompetitionOpenSinceDate)/np.timedelta64(1, 'Y')
sales_cust_expd["YearsOfPromo2"] = \
(sales_cust_expd.Date- sales_cust_expd.Promo2SinceDate)/np.timedelta64(1, 'Y')
# cond = (sales_cust_expd.CompetitionDistance.isnull()) & (sales_cust_expd.YearsOfCompetition.isnull())
# sales_cust_expd["Competition"] = 0
# sales_cust_expd.loc[~cond, "Competition"] = 1
del sales_cust_expd["CompetitionOpenSinceDate"], sales_cust_expd["Promo2SinceDate"]
sales_cust_expd[~sales_cust_expd.YearsOfPromo2.isnull()].head(2)

Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,PromoInterval,YearsOfCompetition,YearsOfPromo2
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,5,2015-07-31,6064,625,1,0,1,a,a,570.0,1,"Jan,Apr,Jul,Oct",7.745539,5.338919
2,4,2015-07-30,5567,601,1,0,1,a,a,570.0,1,"Jan,Apr,Jul,Oct",7.742801,5.336181


In [15]:
sales_cust_expd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 844347 entries, 1 to 1115
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   DayOfWeek            844347 non-null  int64         
 1   Date                 844347 non-null  datetime64[ns]
 2   Sales                844347 non-null  int64         
 3   Customers            844347 non-null  int64         
 4   Promo                844347 non-null  int64         
 5   StateHoliday         844347 non-null  object        
 6   SchoolHoliday        844347 non-null  object        
 7   StoreType            844347 non-null  object        
 8   Assortment           844347 non-null  object        
 9   CompetitionDistance  842161 non-null  float64       
 10  Promo2               844347 non-null  int64         
 11  PromoInterval        421067 non-null  object        
 12  YearsOfCompetition   575752 non-null  float64       
 13  YearsOfPromo2   

The joined dataframe has 844347 records and 14 columns. Among them, columns "CompetitionDistance",
"PromoInterval", "YearsOfCompetition", "YearsOfPromo2" have missing values.

In [102]:
pp.ProfileReport(sales_cust_expd[["CompetitionDistance", "YearsOfCompetition", "YearsOfPromo2"]])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

0,1
Number of variables,5
Number of observations,844347
Total Missing (%),16.4%
Total size in memory,32.2 MiB
Average record size in memory,40.0 B

0,1
Numeric,4
Categorical,0
Boolean,0
Date,1
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,1115
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,558.43
Minimum,1
Maximum,1115
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,56
Q1,280
Median,558
Q3,837
95-th percentile,1060
Maximum,1115
Range,1114
Interquartile range,557

0,1
Standard deviation,321.74
Coef of variation,0.57615
Kurtosis,-1.1989
Mean,558.43
MAD,278.64
Skewness,0.00033808
Sum,471510251
Variance,103520
Memory size,6.4 MiB

Value,Count,Frequency (%),Unnamed: 3
1097,942,0.1%,
85,942,0.1%,
562,942,0.1%,
335,942,0.1%,
733,942,0.1%,
262,942,0.1%,
769,942,0.1%,
682,942,0.1%,
494,942,0.1%,
423,942,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1,781,0.1%,
2,784,0.1%,
3,779,0.1%,
4,784,0.1%,
5,779,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1111,779,0.1%,
1112,779,0.1%,
1113,784,0.1%,
1114,784,0.1%,
1115,781,0.1%,

0,1
Distinct count,942
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Minimum,2013-01-01 00:00:00
Maximum,2015-07-31 00:00:00

0,1
Distinct count,655
Unique (%),0.1%
Missing (%),0.3%
Missing (n),2186
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5458
Minimum,20
Maximum,75860
Zeros (%),0.0%

0,1
Minimum,20
5-th percentile,130
Q1,710
Median,2320
Q3,6890
95-th percentile,20390
Maximum,75860
Range,75840
Interquartile range,6180

0,1
Standard deviation,7809.5
Coef of variation,1.4308
Kurtosis,13.413
Mean,5458
MAD,5315
Skewness,2.9751
Sum,4596600000
Variance,60989000
Memory size,6.4 MiB

Value,Count,Frequency (%),Unnamed: 3
250.0,9210,1.1%,
50.0,6249,0.7%,
350.0,6239,0.7%,
1200.0,6072,0.7%,
190.0,6066,0.7%,
90.0,5609,0.7%,
180.0,5422,0.6%,
330.0,5294,0.6%,
150.0,5294,0.6%,
140.0,4684,0.6%,

Value,Count,Frequency (%),Unnamed: 3
20.0,779,0.1%,
30.0,3116,0.4%,
40.0,3890,0.5%,
50.0,6249,0.7%,
60.0,2342,0.3%,

Value,Count,Frequency (%),Unnamed: 3
45740.0,780,0.1%,
46590.0,784,0.1%,
48330.0,784,0.1%,
58260.0,885,0.1%,
75860.0,887,0.1%,

0,1
Distinct count,10714
Unique (%),1.3%
Missing (%),31.8%
Missing (n),268595
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5.0596
Minimum,-2.5764
Maximum,115.58
Zeros (%),0.0%

0,1
Minimum,-2.5764
5-th percentile,-0.90625
Q1,1.2238
Median,4.2492
Q3,7.8687
95-th percentile,12.844
Maximum,115.58
Range,118.15
Interquartile range,6.6449

0,1
Standard deviation,6.0131
Coef of variation,1.1884
Kurtosis,120.74
Mean,5.0596
MAD,3.8394
Skewness,7.4526
Sum,2913100
Variance,36.157
Memory size,6.4 MiB

Value,Count,Frequency (%),Unnamed: 3
0.8925576842782534,199,0.0%,
0.8733923352293339,199,0.0%,
0.6242427975933798,197,0.0%,
0.6434081466422993,197,0.0%,
0.9117230333271731,196,0.0%,
0.6050774485444602,196,0.0%,
0.37509325995742554,194,0.0%,
0.528416052348782,194,0.0%,
0.355927910908506,194,0.0%,
0.5092507032998624,194,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-2.576370493576185,3,0.0%,
-2.573632586569197,3,0.0%,
-2.5708946795622087,3,0.0%,
-2.5681567725552203,3,0.0%,
-2.562680958541243,3,0.0%,

Value,Count,Frequency (%),Unnamed: 3
115.5670547649849,1,0.0%,
115.56979267199188,1,0.0%,
115.57253057899888,1,0.0%,
115.57526848600588,1,0.0%,
115.57800639301286,1,0.0%,

0,1
Distinct count,2865
Unique (%),0.3%
Missing (%),50.1%
Missing (n),423280
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2.0722
Minimum,-2.4285
Maximum,5.9905
Zeros (%),0.0%

0,1
Minimum,-2.4285
5-th percentile,-0.92541
Q1,0.69817
Median,2.0808
Q3,3.4771
95-th percentile,5.0049
Maximum,5.9905
Range,8.4191
Interquartile range,2.779

0,1
Standard deviation,1.791
Coef of variation,0.8643
Kurtosis,-0.77108
Mean,2.0722
MAD,1.4945
Skewness,-0.052874
Sum,872520
Variance,3.2076
Memory size,6.4 MiB

Value,Count,Frequency (%),Unnamed: 3
1.8617767647521852,312,0.0%,
1.8836800208080933,312,0.0%,
1.8699904857731506,312,0.0%,
1.8453493227102542,312,0.0%,
1.8809421138011047,312,0.0%,
1.8645146717591736,312,0.0%,
1.8535630437312196,312,0.0%,
1.8727283927801393,312,0.0%,
1.8672525787661622,310,0.0%,
1.8508251367242312,310,0.0%,

Value,Count,Frequency (%),Unnamed: 3
-2.4285235151988065,3,0.0%,
-2.4257856081918177,3,0.0%,
-2.4230477011848293,3,0.0%,
-2.420309794177841,3,0.0%,
-2.414833980163864,3,0.0%,

Value,Count,Frequency (%),Unnamed: 3
5.979588903262901,7,0.0%,
5.982326810269889,7,0.0%,
5.985064717276877,7,0.0%,
5.987802624283866,7,0.0%,
5.990540531290855,7,0.0%,

Unnamed: 0_level_0,Unnamed: 1_level_0,CompetitionDistance,YearsOfCompetition,YearsOfPromo2
Store,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2013-01-02,1270.0,4.336845,
1,2013-01-03,1270.0,4.339583,
1,2013-01-04,1270.0,4.342321,
1,2013-01-05,1270.0,4.345058,
1,2013-01-07,1270.0,4.350534,


The correlations between variables "CompetitionDistance", "YearsOfCompetition" and "YearsOfPromo2" are very small.

In [18]:
store[["StoreType", "Assortment"]].drop_duplicates().sort_values(["StoreType", "Assortment"])

Unnamed: 0_level_0,StoreType,Assortment
Store,Unnamed: 1_level_1,Unnamed: 2_level_1
2,a,a
7,a,c
85,b,a
259,b,b
562,b,c
1,c,a
4,c,c
13,d,a
15,d,c


There are only 9 StoreType-Assortment value combinations.

### Task1.3 Data Transformation

In [19]:
sales_cust_expd.tail(1)

Unnamed: 0_level_0,DayOfWeek,Date,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,PromoInterval,YearsOfCompetition,YearsOfPromo2
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1115,3,2013-01-02,3697,305,0,0,1,d,c,5350.0,1,"Mar,Jun,Sept,Dec",,0.599602


* Transformation of the dependent variables: The Profiling Report of the sales_cust data shows that the histograms of both the "Sales" and the "Customers" variables are heavily right skewed. Especially, "Customers" is a count data given store and time interval (1 day). As a result, we should take logrithm of both columns before building the two regressins.

* Transformation of the explanatory variables: The Profiling Report of the sales_cust_expd data shows that the histogram of "CompetitionDistance" and "YearsOfCompetition" are highly right skewed. These variables should be transformed with logrithm.

Summary of explanatory variables: 
* Core variables: DayOfWeek, trend variable (customer preference), seasonality variable, StateHoliday,	SchoolHoliday
* Promo variables: Promo, Promo2, YearsOfPromo2, PromoInterval (this is to be dropped due to 44% records are missing), 
* Competitor variables: Competetion, ln(CompetitionDistance), ln(YearsOfCompetition)

New features: 
* seasonality is intended to explain the regular annual seasonal pattern of sales, exclusive of holidays, we can transform the month of the year from "Date" into a categorical variable and use it to represent seasonality.
* Trend represents the change in sales due-to long term changes in consumer preferences. This can be represented by 3 new variables basing on store's historical sales data, namely (average lnSales during 1-6 months before Date's month), (average lnSales during 7-12 months before Date's month), (average lnSales during 13-24 months before Date's month). However, these three variables are found to be highly correlated rho>=0.9, I will only include one variable, which is (average lnSales during 1 year before Date's month).

In [20]:
sales_cust_expd["Month"] = sales_cust_expd.Date.dt.month.astype(str)

In [21]:
sales_cust_expd["lnSales"] = np.log(sales_cust_expd.Sales.map(lambda x: 0.0001 if x==0 else x))
sales1 = sales_cust_expd[["Date", "lnSales"]]
sales2 = sales1.copy()

def sum_count(s):
    s1 = s.set_index("Date").lnSales.resample('1M').sum()
    s2 = s.set_index("Date").lnSales.resample('1M').count()
    return pd.DataFrame({"Sum":s1, "Count":s2})

monthly_lnsales = sales2.reset_index().groupby("Store")\
.apply(sum_count)\
.reset_index("Date")
monthly_lnsales.columns = ["Month", "lnSalesSum", "lnSalesCount"]
monthly_lnsales.head(2)

Unnamed: 0_level_0,Month,lnSalesSum,lnSalesCount
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2013-01-31,220.838275,26
1,2013-02-28,205.131315,24


In [22]:
cross_join = sales1.join(monthly_lnsales)
cross_join.head(1)

Unnamed: 0_level_0,Date,lnSales,Month,lnSalesSum,lnSalesCount
Store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2015-07-31,8.568456,2013-01-31,220.838275,26


In [23]:
def calc_meanlnSales(df):
    return df.lnSalesSum.sum() / df.lnSalesCount.sum()
    
prevyearMeanLnSales = cross_join[(cross_join.Month < cross_join.Date) & 
           (cross_join.Month >= cross_join.Date - pd.tseries.offsets.DateOffset(months=12) )].reset_index()\
.groupby(["Store", "Date"])[["lnSalesSum","lnSalesCount"]].apply(calc_meanlnSales)

prevyearMeanLnSales.shape[0]

815412

In [24]:
sales_cust_expd = sales_cust_expd.reset_index().set_index(["Store","Date"])
sales_cust_expd = pd.concat([sales_cust_expd, 
                             prevyearMeanLnSales.to_frame(name="prevyearMeanLnSales")], axis=1)
sales_cust_expd.tail(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,PromoInterval,YearsOfCompetition,YearsOfPromo2,Month,lnSales,prevyearMeanLnSales
Store,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1115,2015-07-30,4,8405,502,1,0,1,d,c,5350.0,1,"Mar,Jun,Sept,Dec",,3.170496,7,9.036582,8.817564
1115,2015-07-31,5,8680,538,1,0,1,d,c,5350.0,1,"Mar,Jun,Sept,Dec",,3.173234,7,9.068777,8.817564


In [25]:
# sales_cust_expd["lnCustomers"] = np.log(sales_cust_expd.Customers)

In [26]:
del sales_cust_expd["lnCustomers"]

### Task1.4 Modeling

In the EDA session, we find there are 9 StoreType-Assortment value combinations. I would like to build 1 model for each of the 9 StoreType-Assortment combination.

In [68]:
class StoreType_Assortment_Model(object):
    def __init__(self, data, store_type, assortment, test_size=0.2, seed=0):
        self._store_type = store_type
        self._assortment = assortment
        self._data = data[(data.StoreType == store_type) & (data.Assortment == assortment)]
        self._formula_id = None
        
        # missing value control
        self._data.loc[self._data.Promo2 == 0,"PromoInterval"] = "0No_Promo"
        self._data.loc[self._data.Promo2 == 0,"YearsOfPromo2"] = 0
        self._data.loc[self._data.YearsOfCompetition.isnull(),"YearsOfCompetition"] = self._data.YearsOfCompetition.mean()
        self._data = self._data.loc[~self._data.prevyearMeanLnSales.isnull()]
        self._data["lnCompetitionDistance"] = np.log(self._data.CompetitionDistance.map(lambda x: 0.0001 if x == 0 else x))
        self._data["lnYearsOfCompetition"] = np.log(self._data.YearsOfCompetition.map(lambda x: 0.0001 if x == 0 else x))
        
        self._n = self._data.shape[0]
        np.random.seed(seed)
        self.train_indices = np.random.choice(range(self._n), size=int(self._n * (1 - test_size)), replace=False)
        self.test_indices = [i for i in range(self._n) if i not in self.train_indices]          
        
    def fit(self, depvar="lnSales", seed=0, formula_id=1):
        
        self._formula_id = formula_id
        train = self._data.iloc[self.train_indices,:].reset_index().dropna()
        
        if self._formula_id == 1:
            formula = """
            {} ~ C(Promo) + C(Promo2) + YearsOfPromo2 + C(PromoInterval) + C(PromoInterval):C(Month)
            + C(DayOfWeek) + C(Month) + C(SchoolHoliday) + C(StateHoliday) + lnCompetitionDistance + YearsOfCompetition
            """.format(depvar)
        elif self._formula_id == 2:
            formula = """
             {} ~ C(Promo) + YearsOfPromo2 + C(PromoInterval) + C(PromoInterval):C(Month)
            + C(DayOfWeek) + C(Month) + C(SchoolHoliday) + C(StateHoliday) + lnCompetitionDistance + YearsOfCompetition
            """.format(depvar)
        elif self._formula_id == 3:
            formula = """
             {} ~ C(Promo) + YearsOfPromo2 + C(PromoInterval) + C(PromoInterval):C(Month)
            + C(DayOfWeek) + C(Month) + C(SchoolHoliday) + C(StateHoliday) + lnCompetitionDistance + lnYearsOfCompetition
            """.format(depvar)
        elif self._formula_id == 4:
            formula = """
             {} ~ C(Promo) + YearsOfPromo2 + C(PromoInterval)
            + C(DayOfWeek) + C(Month) + C(SchoolHoliday) + C(StateHoliday) + lnCompetitionDistance + YearsOfCompetition
            """.format(depvar)       
        elif self._formula_id == 5:
            formula = """
             {} ~ C(Promo) + YearsOfPromo2 + C(PromoInterval)
            + C(DayOfWeek) + C(Month) + C(SchoolHoliday) + C(StateHoliday) + YearsOfCompetition
            """.format(depvar)       
        elif self._formula_id == 6:
            formula = """
             {} ~ C(Promo) + YearsOfPromo2 
            + C(DayOfWeek) + C(Month) + C(SchoolHoliday) + C(StateHoliday) + YearsOfCompetition
            """.format(depvar)              
        elif self._formula_id == 7:
            formula = """
             {} ~ C(Promo) + YearsOfPromo2 
            + C(DayOfWeek) + C(Month) + C(SchoolHoliday) + C(StateHoliday) + YearsOfCompetition
            + C(Promo):YearsOfCompetition
            """.format(depvar)  

        elif self._formula_id == 8:
            formula = """
             {} ~ C(Promo) + YearsOfPromo2 
            + C(DayOfWeek) + C(Month) + C(SchoolHoliday) + C(StateHoliday) + YearsOfCompetition
            + YearsOfPromo2:YearsOfCompetition
            """.format(depvar) 
        elif self._formula_id == 9:
            formula = """
             {} ~ C(Promo) + YearsOfPromo2 
            + C(DayOfWeek) + C(Month) + C(SchoolHoliday) + C(StateHoliday) + YearsOfCompetition
            + YearsOfPromo2:YearsOfCompetition + C(Promo):YearsOfCompetition
            """.format(depvar) 
            
        # the following is fitting a mixed effect model with random intercept for each Store
        results = smf.mixedlm(formula, data=train, groups=train["Store"]).fit(method=["lbfgs"])
        
        return results

In [44]:
sales_cust_expd.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,DayOfWeek,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,PromoInterval,YearsOfCompetition,YearsOfPromo2,Month,lnSales,prevyearMeanLnSales
Store,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,2013-01-02,3,5530,668,0,0,1,c,a,1270.0,0,,4.336845,,1,8.617943,


In [45]:
sales_cust_expd.columns

Index(['DayOfWeek', 'Sales', 'Customers', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'Promo2', 'PromoInterval', 'YearsOfCompetition', 'YearsOfPromo2',
       'Month', 'lnSales', 'prevyearMeanLnSales'],
      dtype='object')

In [74]:
model_results = {}
fitted_models = {}
for s, a in sales_cust_expd[["StoreType", "Assortment"]].drop_duplicates().values.tolist():
    model_results[(s,a)] = []
    fitted_models[(s,a)] = []
    sam_obj = StoreType_Assortment_Model(sales_cust_expd, store_type=s, assortment=a, test_size=0)
    
    for i in range(1, 8):
        try:
            fitted = sam_obj.fit(formula_id=i)
            llf = fitted.llf
            fitted_models[(s,a)].append(fitted)
            model_results[(s,a)].append(llf)
        except:
            fitted_models[(s,a)].append(None)
            model_results[(s,a)].append(None)
    print((s,a))
    print(model_results[(s,a)])

('c', 'a')
[None, None, None, None, None, -8614.151444936404, -8614.829903761856]
('a', 'a')
[-3781.018084328389, -3795.9777766554616, -3839.13803251856, -3782.9374363734387, -3782.2607177896425, -3779.1691944894847, -3768.858218682697]
('c', 'c')
[5452.735056206991, 5435.0268176744285, 5316.966755430913, 5520.710325742868, 5522.71805672528, 5515.179980426939, 5613.362943804299]
('a', 'c')
[5344.998250169214, 5329.310783795663, 5190.519112684415, 5376.461212933995, 5375.547457066597, 5370.293876291951, 5385.977842317545]
('d', 'a')
[16109.188381007058, 16093.291292121517, 16081.031787774176, 16156.098602381186, 16158.147841153492, 16155.55855016614, 16157.46249320492]
('d', 'c')
[35867.93152807758, 35853.18230128195, 35633.439429096295, 35892.92491552129, 35895.929251519265, 35881.75847598375, 35902.312036684016]
('b', 'a')
[182.95489780456774, 170.31101224828308, 168.87556047188264, 184.38963327744204, 184.87812725156618, 184.51841149682878, 474.7643611357162]
('b', 'b')
[None, None, 

In [75]:
sales_cust_expd[(sales_cust_expd.StoreType=="b") & (sales_cust_expd.Assortment=="c")].Promo2.unique()
# the last combination doesn't have promotion at all

array([0])

In [76]:
#store the fitted models and their llf results using pickle
filename1 = 'fitted_models'
outfile1 = open(filename1,'wb')
pickle.dump(fitted_models, outfile1)
outfile1.close()

In [77]:
filename2 = 'model_results_llf'
outfile2 = open(filename2,'wb')
pickle.dump(model_results, outfile2)
outfile2.close()

In [3]:
filename1 = 'fitted_models'
infile = open(filename1,'rb')
new_dict = pickle.load(infile)
infile.close()

In [80]:
sam = StoreType_Assortment_Model(sales_cust_expd, store_type='c', assortment='a', test_size=0)
fitted = sam.fit(formula_id=7)
fitted.summary().tables[1].loc[['C(Promo)[T.1]','YearsOfPromo2','YearsOfCompetition','C(Promo)[T.1]:YearsOfCompetition'],:]

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
C(Promo)[T.1],0.366,0.005,69.728,0.0,0.356,0.377
YearsOfPromo2,0.026,0.003,9.389,0.0,0.021,0.032
YearsOfCompetition,-0.001,0.002,-0.492,0.623,-0.006,0.004
C(Promo)[T.1]:YearsOfCompetition,-0.002,0.001,-3.36,0.001,-0.004,-0.001


In [81]:
for s, a in sales_cust_expd[["StoreType", "Assortment"]].drop_duplicates().values.tolist()[1:-1]:
    md1 = fitted_models[(s, a)][4]
    md2 = fitted_models[(s, a)][5]

    chi2 = -2*(md1.llf - md2.llf) 
    df = md1.df_modelwc - md2.df_modelwc
    chi2_test_pval = 1 - st.chi2.cdf(chi2, df)
    print(md1.converged, md2.converged)
    print(s,a, df, "chi2-test pval:",chi2_test_pval)

True True
a a 3 chi2-test pval: 0.1030363939284934
True True
c c 3 chi2-test pval: 1.0
True True
a c 3 chi2-test pval: 1.0
True True
d a 3 chi2-test pval: 1.0
True True
d c 3 chi2-test pval: 1.0
True True
b a 1 chi2-test pval: 1.0
True True
b b 3 chi2-test pval: 1.0


The above likelihood ratio tests show: the model 5, including C(PromoInterval) is not significantly better than model 6. The following are using model 7.

In [82]:
for s, a in sales_cust_expd[["StoreType", "Assortment"]].drop_duplicates().values.tolist()[:-1]:
    var = "YearsOfPromo2:YearsOfCompetition" # 7 'C(Promo)[T.1]:YearsOfCompetition'
    tbl = fitted_models[(s, a)][-1].summary().tables[1].loc[['C(Promo)[T.1]',
                                                             'YearsOfPromo2',
                                                             'YearsOfCompetition',
                                                             'C(Promo)[T.1]:YearsOfCompetition'],:]
    tb0 = fitted_models[(s, a)][-1].summary().tables[0]
    print(s,a)
    print(tb0)
    print(tbl)
    print("====================================================================")

c a
                   0        1                    2           3
0             Model:  MixedLM  Dependent Variable:     lnSales
1  No. Observations:    50113              Method:        REML
2        No. Groups:       77               Scale:      0.0815
3   Min. group size:       27      Log-Likelihood:  -8614.8299
4   Max. group size:      758           Converged:         Yes
5   Mean group size:    650.8                                 
                                   Coef. Std.Err.       z  P>|z|  [0.025  \
C(Promo)[T.1]                      0.366    0.005  69.728  0.000   0.356   
YearsOfPromo2                      0.026    0.003   9.389  0.000   0.021   
YearsOfCompetition                -0.001    0.002  -0.492  0.623  -0.006   
C(Promo)[T.1]:YearsOfCompetition  -0.002    0.001  -3.360  0.001  -0.004   

                                  0.975]  
C(Promo)[T.1]                      0.377  
YearsOfPromo2                      0.032  
YearsOfCompetition                 0.004  
C(

From above, we see the both Promo and YearsOfPromo2 have statistically siginificant (sginificant level = 0.1) positive effect on sales, and this is true for most StoreType:Assortment combinations ("c a", "a a", "c c", "a c", "d a", "d c", "b a"). Given the coefficient values of Promo and YearsOfPromo2. And being currently in promo period has bigger positive effect on sales than the effect of an incremental year of promo participation. For "b b" combination, the estimated coefficient for Promo is not statistically significant, and only the YearsOfPromo2 is showing statistically significantly postive effect on sales. This means in most cases, the store being in a promo or the more years of participating the promo will lead to higher sales. 

YearsOfCompetition has statistically significant effect on sales on most cases (except the "c a" case). Other than that, only the "b a" case is showing negative effect on sales, and all others are showing positive effect on sales. This means in most one incremental year of competition has a positive effect on sales.

Promo:YearsOfCompetition interaction are statistically significant in most cases (except the "b b" case). Among these majority cases where the interaction are significant, only one case ("c a") show negative interaction effect. This means in most cases effect of promo on sales increase with the increase of years of competition.

In Summary, whether there is a significant positive / negative effect of Promo, YearsOfPromo2, YearsOfCompetition and Promo:YearsOfCompetition interaction depends on the StoreType and Assortment values of the store. But in most cases (StoreType-Assortment combinations), all these 4 factors show statistically significant positive effect on sales.

In [99]:
invest2 = sales_cust_expd[(sales_cust_expd.StoreType=="b") & (sales_cust_expd.Assortment=="c")]
invest2.Promo2.value_counts()

0    942
Name: Promo2, dtype: int64

Above shows there's no promo in the "b c" case.

## Task 2 - Strategies for Store Promotions & Incentivization

### Task 2.0 - Read Data

In [10]:
store_promos = pd.read_csv("../dataset/task2_data/store_promotions.csv")

In [11]:
store_promos.head()

Unnamed: 0,storeid,StoreSize,StoreLayout,customerID,gender,WhoShoppingFor,Vegetarian,ShoppingStyle,CouponUser,week,CouponValue,AmountSpent,store_membership,StoreType
0,1,Medium,No emphasis,6193-QDKCN,Male,Self and spouse,No,Often; what's on sale,From newspaper,1,05 percent,135.24,0,Neighborhood_Market
1,1,Medium,No emphasis,6193-QDKCN,Male,Self and spouse,No,Often; what's on sale,From newspaper,2,25 percent,128.65,0,Neighborhood_Market
2,1,Medium,No emphasis,6193-QDKCN,Male,Self and spouse,No,Often; what's on sale,From newspaper,3,00 No value,112.86,0,Neighborhood_Market
3,1,Medium,No emphasis,6193-QDKCN,Male,Self and spouse,No,Often; what's on sale,From newspaper,4,15 percent,130.14,0,Neighborhood_Market
4,1,Medium,No emphasis,9245-DEZSB,Male,Self and family,No,Weekly; similar items,From mailings,1,25 percent,155.61,1,Neighborhood_Market


### Task 2.1 - EDA
#### Find the overall data insights and patterns [customer, channel and coupons]

In [12]:
# customer info
customers = store_promos[["customerID","gender","WhoShoppingFor", "Vegetarian", "ShoppingStyle", "CouponUser"]]\
.drop_duplicates()
customers.shape

(374, 6)

In [13]:
pp.ProfileReport(customers[["gender","WhoShoppingFor", "Vegetarian", "ShoppingStyle", "CouponUser"]])

0,1
Number of variables,6
Number of observations,374
Total Missing (%),0.0%
Total size in memory,17.7 KiB
Average record size in memory,48.3 B

0,1
Numeric,1
Categorical,5
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,374
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,746
Minimum,0
Maximum,1492
Zeros (%),0.3%

0,1
Minimum,0.0
5-th percentile,74.6
Q1,373.0
Median,746.0
Q3,1119.0
95-th percentile,1417.4
Maximum,1492.0
Range,1492.0
Interquartile range,746.0

0,1
Standard deviation,432.43
Coef of variation,0.57967
Kurtosis,-1.2
Mean,746
MAD,374
Skewness,0
Sum,279004
Variance,187000
Memory size,3.0 KiB

Value,Count,Frequency (%),Unnamed: 3
508,1,0.3%,
980,1,0.3%,
164,1,0.3%,
1028,1,0.3%,
1412,1,0.3%,
168,1,0.3%,
1316,1,0.3%,
1108,1,0.3%,
692,1,0.3%,
172,1,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.3%,
4,1,0.3%,
8,1,0.3%,
12,1,0.3%,
16,1,0.3%,

Value,Count,Frequency (%),Unnamed: 3
1476,1,0.3%,
1480,1,0.3%,
1484,1,0.3%,
1488,1,0.3%,
1492,1,0.3%,

0,1
Distinct count,2
Unique (%),0.5%
Missing (%),0.0%
Missing (n),0

0,1
Male,212
Female,162

Value,Count,Frequency (%),Unnamed: 3
Male,212,56.7%,
Female,162,43.3%,

0,1
Distinct count,3
Unique (%),0.8%
Missing (%),0.0%
Missing (n),0

0,1
Self,152
Self and spouse,135
Self and family,87

Value,Count,Frequency (%),Unnamed: 3
Self,152,40.6%,
Self and spouse,135,36.1%,
Self and family,87,23.3%,

0,1
Distinct count,2
Unique (%),0.5%
Missing (%),0.0%
Missing (n),0

0,1
No,312
Yes,62

Value,Count,Frequency (%),Unnamed: 3
No,312,83.4%,
Yes,62,16.6%,

0,1
Distinct count,3
Unique (%),0.8%
Missing (%),0.0%
Missing (n),0

0,1
Weekly; similar items,227
Often; what's on sale,74
Biweekly; in bulk,73

Value,Count,Frequency (%),Unnamed: 3
Weekly; similar items,227,60.7%,
Often; what's on sale,74,19.8%,
Biweekly; in bulk,73,19.5%,

0,1
Distinct count,4
Unique (%),1.1%
Missing (%),0.0%
Missing (n),0

0,1
From mailings,117
From both,100
No,85

Value,Count,Frequency (%),Unnamed: 3
From mailings,117,31.3%,
From both,100,26.7%,
No,85,22.7%,
From newspaper,72,19.3%,

Unnamed: 0,gender,WhoShoppingFor,Vegetarian,ShoppingStyle,CouponUser
0,Male,Self and spouse,No,Often; what's on sale,From newspaper
4,Male,Self and family,No,Weekly; similar items,From mailings
8,Male,Self and family,No,Biweekly; in bulk,From newspaper
12,Female,Self,No,Weekly; similar items,No
16,Female,Self,Yes,Weekly; similar items,From mailings


From above report we find there are 374 customers included in the store_promotions.csv dataset. Among them:
* 56.7% are males whereas 43.3% are females. 
* 40.6% are shopping for self, 36.1% are shopping for self and spouse, and 23.3% are shopping for self and family.
* 16.6% are vegetarians and 83.4% are not.
* 60.7% shop weekly and buy similar items, 19.8% shop often and buy what's on sale, and 19.5% buy biweekly and in bulk.
* 31.3% got coupons from mailings, 19.3% got coupons from newspaper, 26.7% got coupons from both and 22.7% use no coupon.

In [14]:
# channel info
stores = store_promos[["storeid", "StoreSize", "StoreType"]].drop_duplicates()
pp.ProfileReport(stores[["StoreSize", "StoreType"]])

0,1
Number of variables,3
Number of observations,60
Total Missing (%),0.0%
Total size in memory,1.5 KiB
Average record size in memory,26.1 B

0,1
Numeric,1
Categorical,2
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,60
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,777.8
Minimum,0
Maximum,1476
Zeros (%),1.7%

0,1
Minimum,0.0
5-th percentile,63.0
Q1,409.0
Median,818.0
Q3,1151.0
95-th percentile,1421.2
Maximum,1476.0
Range,1476.0
Interquartile range,742.0

0,1
Standard deviation,448.44
Coef of variation,0.57654
Kurtosis,-1.2034
Mean,777.8
MAD,386.08
Skewness,-0.16132
Sum,46668
Variance,201090
Memory size,608.0 B

Value,Count,Frequency (%),Unnamed: 3
112,1,1.7%,
1148,1,1.7%,
1388,1,1.7%,
1080,1,1.7%,
20,1,1.7%,
308,1,1.7%,
188,1,1.7%,
1408,1,1.7%,
432,1,1.7%,
940,1,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0,1,1.7%,
20,1,1.7%,
44,1,1.7%,
64,1,1.7%,
92,1,1.7%,

Value,Count,Frequency (%),Unnamed: 3
1408,1,1.7%,
1420,1,1.7%,
1444,1,1.7%,
1460,1,1.7%,
1476,1,1.7%,

0,1
Distinct count,3
Unique (%),5.0%
Missing (%),0.0%
Missing (n),0

0,1
Large,28
Medium,18
Small,14

Value,Count,Frequency (%),Unnamed: 3
Large,28,46.7%,
Medium,18,30.0%,
Small,14,23.3%,

0,1
Distinct count,2
Unique (%),3.3%
Missing (%),0.0%
Missing (n),0

0,1
Neighborhood_Market,55
Supercenter,5

Value,Count,Frequency (%),Unnamed: 3
Neighborhood_Market,55,91.7%,
Supercenter,5,8.3%,

Unnamed: 0,StoreSize,StoreType
0,Medium,Neighborhood_Market
20,Large,Neighborhood_Market
44,Large,Neighborhood_Market
64,Large,Neighborhood_Market
92,Medium,Neighborhood_Market


From above report, we find there are 60 stores included in the store_promotions.csv dataset. Among them:
* 46.7% are large stores, 30.0% are medium size stores and 23.3% are small stores.
* 91.7% are neighborhood market stores and 8.3% are supercenters.

In [15]:
customer_counts_by_channel = store_promos.groupby(["StoreSize", "StoreType"])\
.apply(lambda x: x.customerID.drop_duplicates().count())
customer_counts_by_channel

StoreSize  StoreType          
Large      Neighborhood_Market    202
Medium     Neighborhood_Market     98
           Supercenter             11
Small      Neighborhood_Market     52
           Supercenter             11
dtype: int64

It is found there are more Large Neighborhood_Market stores included in this data and less medium and small Supercenter stores included in this data. 

In [16]:
202 + 98+11+52+11

374

The above shows the number StoreSize-StoreType combination equals the number of distinct custermerID, meaning each customer only (repeatedly) go to once single store.

In [17]:
# coupon info
store_promos.CouponValue.value_counts()

15 percent     374
25 percent     374
05 percent     374
00 No value    374
Name: CouponValue, dtype: int64

In [18]:
store_promos.groupby(["StoreSize", "StoreType"]).apply(lambda x: x.CouponValue.value_counts())

StoreSize  StoreType                       
Large      Neighborhood_Market  05 percent     202
                                25 percent     202
                                15 percent     202
                                00 No value    202
Medium     Neighborhood_Market  25 percent      98
                                05 percent      98
                                00 No value     98
                                15 percent      98
           Supercenter          00 No value     11
                                05 percent      11
                                15 percent      11
                                25 percent      11
Small      Neighborhood_Market  05 percent      52
                                00 No value     52
                                15 percent      52
                                25 percent      52
           Supercenter          05 percent      11
                                15 percent      11
                                00 No 

The above analysis shows the store_promotions.csv samples each customer's trip one time for each of the 4 discount types. 

In [19]:
store_promos[store_promos.CouponUser=="No"].head()

Unnamed: 0,storeid,StoreSize,StoreLayout,customerID,gender,WhoShoppingFor,Vegetarian,ShoppingStyle,CouponUser,week,CouponValue,AmountSpent,store_membership,StoreType
12,1,Medium,No emphasis,7862-AMKML,Female,Self,No,Weekly; similar items,No,1,00 No value,80.11,1,Neighborhood_Market
13,1,Medium,No emphasis,7862-AMKML,Female,Self,No,Weekly; similar items,No,2,05 percent,76.88,1,Neighborhood_Market
14,1,Medium,No emphasis,7862-AMKML,Female,Self,No,Weekly; similar items,No,3,15 percent,83.84,1,Neighborhood_Market
15,1,Medium,No emphasis,7862-AMKML,Female,Self,No,Weekly; similar items,No,4,25 percent,77.31,1,Neighborhood_Market
56,3,Large,Emphasizes deli,6510-AAHBU,Male,Self and family,No,Weekly; similar items,No,1,25 percent,119.97,1,Neighborhood_Market


In addition, all the CouponUser actually has non-"00 No value" values, this means this data is fake data and should be fixed for the non Coupon Users. Here I reset their CouponValue all to be "00 No value"

In [20]:
store_promos.loc[store_promos.CouponUser=="No","CouponValue"] = "00 No value"

### Task 2.2 - Strategies for Store Promotions & Incentivization, Part I
#### The retailer is planning to roll-out an online channel to provide discount coupons to the audience. You can only give discounts to certain number of customers. Which customers will you reach out to improve your sales & footprint and Why?

In [21]:
store_promos.head()

Unnamed: 0,storeid,StoreSize,StoreLayout,customerID,gender,WhoShoppingFor,Vegetarian,ShoppingStyle,CouponUser,week,CouponValue,AmountSpent,store_membership,StoreType
0,1,Medium,No emphasis,6193-QDKCN,Male,Self and spouse,No,Often; what's on sale,From newspaper,1,05 percent,135.24,0,Neighborhood_Market
1,1,Medium,No emphasis,6193-QDKCN,Male,Self and spouse,No,Often; what's on sale,From newspaper,2,25 percent,128.65,0,Neighborhood_Market
2,1,Medium,No emphasis,6193-QDKCN,Male,Self and spouse,No,Often; what's on sale,From newspaper,3,00 No value,112.86,0,Neighborhood_Market
3,1,Medium,No emphasis,6193-QDKCN,Male,Self and spouse,No,Often; what's on sale,From newspaper,4,15 percent,130.14,0,Neighborhood_Market
4,1,Medium,No emphasis,9245-DEZSB,Male,Self and family,No,Weekly; similar items,From mailings,1,25 percent,155.61,1,Neighborhood_Market


Since above EDA tells us that each customer only go to one store, and has only one record for each Coupon Value, we can essentially calculate the elasticity of AmountSpent on CouponValue for each customer and do the recommendation given his/her elasticity. 

In [22]:
def elasticity(df):
    AS = df.sort_values("CouponValue")["AmountSpent"].values
    AS += 0.0001
    nums = [(x - AS[0])/AS[0] for x in AS[1:]]
    dens = [-x/100 for x in [5, 15, 25]]
    s = pd.Series([x/y for x,y in zip(nums, dens)])
    s.index = ["5%", "15%", "25%"]
    return s

In [23]:
def customer_recommendation_for_coupons(elasticity_thrld=-5):
    elasticity_res = store_promos.groupby("customerID").apply(elasticity)
    # the first criteria is to see if the negative elacity is big enough, the second criteria is 
    # to avoid the cases where AmountSpent of "00 No value" is smaller than 0.01 USD.
    cond1 = ((elasticity_res["5%"] < elasticity_thrld) & (elasticity_res["5%"]>-10**5))
    cond2 = ((elasticity_res["15%"] < elasticity_thrld) & (elasticity_res["15%"]>-10**5))
    cond3 = ((elasticity_res["25%"] < elasticity_thrld) & (elasticity_res["25%"]>-10**5))
    res = elasticity_res[cond1 | cond2 | cond3 ]
    print(res)
    print(res < elasticity_thrld)
    return res

In [30]:
res = customer_recommendation_for_coupons(-4)

                    5%         15%         25%
customerID                                    
0155-IZVLG   -4.213593   -0.692044   -0.711988
0826-YWEHA   -4.774079   -2.506137   -0.982621
1292-DMLIO -618.764690    6.666569 -129.376213
1665-DFZQY   -4.094206   -0.964238   -1.077816
1709-ICFXL -363.093628   -1.467648  -68.995625
1821-NHKHV -496.300589    6.666577 -108.063234
1827-LWWCD -241.517425   -1.639920  -50.870316
2244-AYZUM   -4.604616   -2.558546   -1.896289
2496-IMEIQ   -7.700515   -3.369287   -1.806257
2657-EVEQJ   -5.418285   -2.219744   -1.805947
3118-UAIXT   -5.007184   -2.150226   -0.903287
3614-ATKGN   -5.457732   -1.146491   -1.494262
3760-PJZMF   -4.020952   -2.011931   -0.768390
4588-TPMPP   -4.476689   -2.323934   -1.534466
4710-MHJUS   -4.866661   -1.527597   -1.040859
5069-ISCFT -684.906147    6.666566 -159.287416
5169-RLQMD   -4.734992   -1.198087   -0.684439
5226-DQMNK   -7.271333   -3.420073   -1.849335
5409-OUCAM -428.250965    6.666600  -92.708946
5511-JOUNL   

According to the threshold of elasticity (maximum value of elasticity for which we would like engage), I recommend to give coupons to above customerIDs, and offer coupon types ("5%", "15%", and "25%") whenever the value in the above table is True. If there are multiple coupon values identified to be True, I recommend to offer the coupon value with largest absolute value of elasticity.

Similar recommendation tables can be produced if the threshold of elasticity is set to another value.

### Task 3.3 - Strategies for Store Promotions & Incentivization, Part II
#### The business is launching a new product in its stores and are looking for those stores in which they should first launch. In order to promote this new item, can you recommend them which promotion medium they should use, which stores to target and which customers to choose to promote this new item? What factors will you look at?

* Promotion medium: We can count the number of elastic customer's coupon median types, the most used median types can be used for new product promotion.
* Store selection: Stores that have effective promotions (having elastic customers), and that sells the same category with steady and large enough demands. Since the category info is not available, I will try find stores with elastic customers where these elastic customers spent the most.
* Customer selection: Category buyer with low brand preference and high elasticity. Since the available data does not offer the category and brand preference information, we can now just focus on customer elasticity.

In [35]:
elastic_customers = store_promos[store_promos.customerID.isin(res.index)]
elastic_customers.CouponUser.value_counts()

From mailings     64
From both         48
No                24
From newspaper    20
Name: CouponUser, dtype: int64

From above, we learn promotion through mailing is prefered, since more elastic customers purchased products after being offered the product promotion through mailing.

In [42]:
elastic_customers.groupby(["storeid"])[["AmountSpent"]].sum("AmountSpent")\
.sort_values("AmountSpent", ascending=False).head(10)

Unnamed: 0_level_0,AmountSpent
storeid,Unnamed: 1_level_1
8,1592.39
6,1070.94
24,998.72
10,997.41
27,997.08
46,985.21
19,930.32
13,907.51
11,837.12
14,705.86


Above 10 stores are the stores (storeid: 8, 6, 24, 10, 27, 46, 19, 13, 11, 14) where elastic customers spent the most, I recommend to launch the new product in these 10 stores first. (The total number stores can be different according to budget for product launching.) In future, if the product category information and the store assortment details are available, I will recommend launching the product to stores where its category is sold.

In [44]:
res.index

Index(['0155-IZVLG', '0826-YWEHA', '1292-DMLIO', '1665-DFZQY', '1709-ICFXL',
       '1821-NHKHV', '1827-LWWCD', '2244-AYZUM', '2496-IMEIQ', '2657-EVEQJ',
       '3118-UAIXT', '3614-ATKGN', '3760-PJZMF', '4588-TPMPP', '4710-MHJUS',
       '5069-ISCFT', '5169-RLQMD', '5226-DQMNK', '5409-OUCAM', '5511-JOUNL',
       '5541-NMAHA', '5849-RTOXV', '5880-FHHWI', '5978-OQWCJ', '6126-QGKXG',
       '6184-IEHQV', '6338-EMHAS', '6454-TKWCQ', '6588-HKAAB', '7150-IMBEL',
       '7666-CYQPW', '7863-NSDDD', '8069-JWRYL', '8751-BHXVJ', '9074-YQGRY',
       '9096-DXYNN', '9439-GTZQH', '9785-KCBYU', '9939-EDLJX'],
      dtype='object', name='customerID')

Given the customer elasticity I recommend to promote the elastic customers as obatined from Task 2.2. In future, if given the purchase category information and brand preference (i.e. prefered_brand_annual_spent/category_annual_spent of a customer) information. I would also leverage the latter two factors in customer selection. 