## 5.2: what promo items per day/store/time?

### Import libraries 

In [1]:
#Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
#NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Matplotlib is a plotting library for python and pyplot gives us a MatLab like plotting framework. We will use this in our plotter function to plot data.
import matplotlib.pyplot as plt
#Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
import dataframe_image as dfi

from matplotlib.ticker import StrMethodFormatter

# importing the required function for correlations
from scipy.stats import chi2_contingency

### Load and view data 

In [2]:
df = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2.csv")
# df.describe(include='all')

In [3]:
# use the corr function to display the correlation between all the features
data_corr = df.corr()
# data_corr

In [4]:
# aggregate afternoon and evening
df['time'] = df['time'].replace(['afternoon', 'evening'], 'afternoon')

# 1.1. item type vs promo

In [5]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['promo_num'])

CrosstabResult

promo_num,0,1
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1
alcoholic drinks,1,0
almond milk,1,0
andalouse sauce,1,0
apple sauce,4,0
apples,2,0
...,...,...
veal,1,0
vegetable box,0,2
vegetable mix,1,2
yoghurt,7,1


In [6]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 0.0003625346131603951


We could separate items that are more often bought in promo and exclude them from regular groceries

#### (if for promo #1>#0)

In [7]:
CrosstabResult.drop(CrosstabResult[CrosstabResult[1] <= CrosstabResult[0]].index, inplace = True)

CrosstabResult = CrosstabResult.reset_index()
CrosstabResult

promo_num,item_type,0,1
0,beef,1,2
1,blueberries,0,2
2,chicken wrap,0,1
3,chocolates,0,1
4,egg wrap,0,1
5,fish,1,2
6,fruit salad,0,1
7,lunch,0,1
8,pork/veal,0,1
9,vegetable box,0,2


In [8]:
options = CrosstabResult.item_type.values.tolist()

In [9]:
options

['beef',
 'blueberries',
 'chicken wrap',
 'chocolates',
 'egg wrap',
 'fish',
 'fruit salad',
 'lunch',
 'pork/veal',
 'vegetable box',
 'vegetable mix']

In [10]:
# df with only the promo items
df_promo = df[df['item_type'].isin(options)]

# Now to delete these item types from the current dataframe
df = df[-df['item_type'].isin(options)]

# 1. GENERAL CORRELATIONS: FOR PROMO ITEMS

## 1.1. CORRELATION 1: item type vs day

In [11]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df_promo['item_type'],columns=df_promo['day'])

CrosstabResult

day,Friday,Monday,Saturday,Sunday,Thursday,Wednesday
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
beef,0,1,0,2,0,0
blueberries,1,0,0,1,0,0
chicken wrap,0,0,0,0,0,1
chocolates,0,0,1,0,0,0
egg wrap,0,0,0,0,0,1
fish,0,0,1,2,0,0
fruit salad,0,0,0,0,1,0
lunch,0,1,0,0,0,0
pork/veal,0,0,1,0,0,0
vegetable box,0,2,0,0,0,0


In [12]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 0.026302854895702676


Significant (strong), we could limit options to what's bought per day:
> 1. Certain cat. are bought on specific days only/mostly
2. Could be by chance, but could be on purpose for some categories (e.g. bakery)
3. >  Let's check if this is also true for only the supermarket/times...

## 1.2. CORRELATION 2: item type vs time

In [13]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df_promo['item_type'],columns=df_promo['time'])

CrosstabResult

time,afternoon,morning,noon
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
beef,1,2,0
blueberries,1,1,0
chicken wrap,0,0,1
chocolates,1,0,0
egg wrap,0,0,1
fish,1,2,0
fruit salad,0,0,1
lunch,0,0,1
pork/veal,0,1,0
vegetable box,2,0,0


In [14]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 0.10194708324604537


No correlation between time and promo item

## 1.3.1 CORRELATION 3: item type vs store

### 1.3.1 item type vs store type

In [15]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df_promo['item_type'],columns=df_promo['store_type'])

CrosstabResult

store_type,supermarket
item_type,Unnamed: 1_level_1
beef,3
blueberries,2
chicken wrap,1
chocolates,1
egg wrap,1
fish,3
fruit salad,1
lunch,1
pork/veal,1
vegetable box,2


In [16]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 1.0


No correlation between store type and promo item

### 1.3.2. item type vs store name

In [17]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df_promo['item_type'],columns=df_promo['store_name'])

CrosstabResult

store_name,Albert Heijn,Carrefour,Delhaize,Okay
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
beef,1,0,2,0
blueberries,0,0,2,0
chicken wrap,1,0,0,0
chocolates,0,1,0,0
egg wrap,1,0,0,0
fish,0,1,2,0
fruit salad,1,0,0,0
lunch,1,0,0,0
pork/veal,0,0,0,1
vegetable box,2,0,0,0


In [18]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 0.041726906528480184


Correlation between store NAME and promo item

# 2. Narrow down dataframe

## 2.1 Per DAY

In [19]:
# define data
Monday = 'Monday'
Tuesday = 'Tuesday'
Wednesday = 'Wednesday'
Thursday = 'Thursday'
Friday = 'Friday'
Saturday = 'Saturday'
Sunday = 'Sunday'

# select only these data (for day, store, store name)
df_Monday = df_promo[df_promo["day"] == Monday]
df_Tuesday = df_promo[df_promo["day"] == Tuesday]
df_Wednesday = df_promo[df_promo["day"] == Wednesday]
df_Thursday = df_promo[df_promo["day"] == Thursday]
df_Friday = df_promo[df_promo["day"] == Friday]
df_Saturday = df_promo[df_promo["day"] == Saturday]
df_Sunday = df_promo[df_promo["day"] == Sunday]

In [20]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['time'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['time'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['time'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['time'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['time'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['time'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['time'], dropna=False)

# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
# Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 0.13533528323661273
The P-Value of the ChiSq Test is, for Tuesdays: u
The P-Value of the ChiSq Test is, for Wednesdays: 1.0
The P-Value of the ChiSq Test is, for Thursdays: 0.5049850750938457
The P-Value of the ChiSq Test is, for Fridays: 1.0
The P-Value of the ChiSq Test is, for Saturdays: 0.22313016014842982
The P-Value of the ChiSq Test is, for Sundays: 1.0


In [21]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['store_type'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['store_type'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['store_type'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['store_type'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['store_type'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['store_type'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['store_type'], dropna=False)

# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
# Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 1.0
The P-Value of the ChiSq Test is, for Tuesdays: u
The P-Value of the ChiSq Test is, for Wednesdays: 1.0
The P-Value of the ChiSq Test is, for Thursdays: 1.0
The P-Value of the ChiSq Test is, for Fridays: 1.0
The P-Value of the ChiSq Test is, for Saturdays: 1.0
The P-Value of the ChiSq Test is, for Sundays: 1.0


In [22]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['store_name'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['store_name'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['store_name'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['store_name'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['store_name'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['store_name'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['store_name'], dropna=False)

# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
# Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 1.0
The P-Value of the ChiSq Test is, for Tuesdays: u
The P-Value of the ChiSq Test is, for Wednesdays: 1.0
The P-Value of the ChiSq Test is, for Thursdays: 0.5049850750938457
The P-Value of the ChiSq Test is, for Fridays: 1.0
The P-Value of the ChiSq Test is, for Saturdays: 0.22313016014842982
The P-Value of the ChiSq Test is, for Sundays: 1.0


## 2.4 Per STORE NAME

In [23]:
# define data
store1 = 'Albert Heijn'
store2 = 'Carrefour'
store3 = 'Okay'
store4 = 'Delhaize'
store5 = 'Versavel Poelman'
store6 = 'Kruidvat'
store7 = 'Brabo'
store8 = 'Ikea'
store9 = 'Sys'

# select only these data for the df
df_store1 = df_promo[df_promo['store_name'] == store1]
df_store2 = df_promo[df_promo['store_name'] == store2]
df_store3 = df_promo[df_promo['store_name'] == store3]
df_store4 = df_promo[df_promo['store_name'] == store4]
df_store5 = df_promo[df_promo['store_name'] == store5]
df_store6 = df_promo[df_promo['store_name'] == store6]
df_store7 = df_promo[df_promo['store_name'] == store7]
df_store8 = df_promo[df_promo['store_name'] == store8]
df_store9 = df_promo[df_promo['store_name'] == store9] 

In [26]:
# Cross tabulation between category and day
CrosstabResult1=pd.crosstab(index=df_store1['item_type'],columns=df_store1['time'])
CrosstabResult2=pd.crosstab(index=df_store2['item_type'],columns=df_store2['time'])
CrosstabResult3=pd.crosstab(index=df_store3['item_type'],columns=df_store3['time'])
CrosstabResult4=pd.crosstab(index=df_store4['item_type'],columns=df_store4['time'])
CrosstabResult5=pd.crosstab(index=df_store5['item_type'],columns=df_store5['time'])
CrosstabResult6=pd.crosstab(index=df_store6['item_type'],columns=df_store6['time'])
CrosstabResult7=pd.crosstab(index=df_store7['item_type'],columns=df_store7['time'])
CrosstabResult8=pd.crosstab(index=df_store8['item_type'],columns=df_store8['time'])
CrosstabResult9=pd.crosstab(index=df_store9['item_type'],columns=df_store9['time'])

# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)
ChiSqResult4 = chi2_contingency(CrosstabResult4)
# ChiSqResult5 = chi2_contingency(CrosstabResult5)
# ChiSqResult6 = chi2_contingency(CrosstabResult6)
# ChiSqResult7 = chi2_contingency(CrosstabResult7)
# ChiSqResult8 = chi2_contingency(CrosstabResult8)
# ChiSqResult9 = chi2_contingency(CrosstabResult9)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test AH is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test Carrefour is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test Okay is:', ChiSqResult3[1])
print('The P-Value of the ChiSq Test Delhaize is:', ChiSqResult4[1])
# print('The P-Value of the ChiSq Test Versavel Poelman is:', ChiSqResult5[1])
# print('The P-Value of the ChiSq Test Kruidvat is:', ChiSqResult6[1])
# print('The P-Value of the ChiSq Test Brabo is:', ChiSqResult7[1])
# print('The P-Value of the ChiSq Test Ikea is:', ChiSqResult8[1])
# print('The P-Value of the ChiSq Test Sys is:', ChiSqResult9[1])

The P-Value of the ChiSq Test AH is: 0.22064030793671066
The P-Value of the ChiSq Test Carrefour is: 1.0
The P-Value of the ChiSq Test Okay is: 0.5049850750938457
The P-Value of the ChiSq Test Delhaize is: 0.3011942119122021


In [27]:
# Cross tabulation between category and day
CrosstabResult1=pd.crosstab(index=df_store1['item_type'],columns=df_store1['day'])
CrosstabResult2=pd.crosstab(index=df_store2['item_type'],columns=df_store2['day'])
CrosstabResult3=pd.crosstab(index=df_store3['item_type'],columns=df_store3['day'])
CrosstabResult4=pd.crosstab(index=df_store4['item_type'],columns=df_store4['day'])
CrosstabResult5=pd.crosstab(index=df_store5['item_type'],columns=df_store5['day'])
CrosstabResult6=pd.crosstab(index=df_store6['item_type'],columns=df_store6['day'])
CrosstabResult7=pd.crosstab(index=df_store7['item_type'],columns=df_store7['day'])
CrosstabResult8=pd.crosstab(index=df_store8['item_type'],columns=df_store8['day'])
CrosstabResult9=pd.crosstab(index=df_store9['item_type'],columns=df_store9['day'])

# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)
ChiSqResult4 = chi2_contingency(CrosstabResult4)
# ChiSqResult5 = chi2_contingency(CrosstabResult5)
# ChiSqResult6 = chi2_contingency(CrosstabResult6)
# ChiSqResult7 = chi2_contingency(CrosstabResult7)
# ChiSqResult8 = chi2_contingency(CrosstabResult8)
# ChiSqResult9 = chi2_contingency(CrosstabResult9)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test AH is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test Carrefour is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test Okay is:', ChiSqResult3[1])
print('The P-Value of the ChiSq Test Delhaize is:', ChiSqResult4[1])
# print('The P-Value of the ChiSq Test Versavel Poelman is:', ChiSqResult5[1])
# print('The P-Value of the ChiSq Test Kruidvat is:', ChiSqResult6[1])
# print('The P-Value of the ChiSq Test Brabo is:', ChiSqResult7[1])
# print('The P-Value of the ChiSq Test Ikea is:', ChiSqResult8[1])
# print('The P-Value of the ChiSq Test Sys is:', ChiSqResult9[1])

The P-Value of the ChiSq Test AH is: 0.17299160788207146
The P-Value of the ChiSq Test Carrefour is: 1.0
The P-Value of the ChiSq Test Okay is: 0.5049850750938457
The P-Value of the ChiSq Test Delhaize is: 0.3011942119122021
