## 5: CORRELATIONS - WHAT ITEM TYPES?

### Import libraries 

In [3]:
#Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
#NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Matplotlib is a plotting library for python and pyplot gives us a MatLab like plotting framework. We will use this in our plotter function to plot data.
import matplotlib.pyplot as plt
#Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
import dataframe_image as dfi

from matplotlib.ticker import StrMethodFormatter

# importing the required function for correlations
from scipy.stats import chi2_contingency

### Load and view data 

In [4]:
# Absolute path to df_HH2.csv on the author's machine.
# NOTE(review): machine-specific location; the notebook will not run elsewhere
# until this is pointed at a local copy of the file.
DATA_PATH = r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2.csv"

# Read the CSV into the working DataFrame used by every cell below.
df = pd.read_csv(DATA_PATH)
# df.describe(include='all')

In [5]:
# Pairwise correlation between the numeric features of df.
# df also contains text columns (item_type, day, time, store_type, store_name).
# Older pandas dropped those silently inside corr(); since pandas 2.0 the call
# raises on them instead, so the numeric/boolean columns are selected
# explicitly before correlating.
data_corr = df.select_dtypes(include=['number', 'bool']).corr()
# data_corr

In [6]:
# Fold the 'evening' label into 'afternoon' so both are counted as one level
# of the 'time' column ('afternoon' maps onto itself, so re-running is harmless).
labels_to_merge = ['afternoon', 'evening']
df['time'] = df['time'].replace(to_replace=labels_to_merge, value='afternoon')

# 1. GENERAL CORRELATIONS: time/day/storetype-name

## 1.1. CORRELATION 1: item type vs day

In [6]:
# Contingency table of row counts: one row per item type, one column per day.
CrosstabResult = pd.crosstab(df['item_type'], df['day'])

# Show the table as the cell output.
CrosstabResult

day,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
alcoholic drinks,1,0,0,0,0,0,0
almond milk,0,0,1,0,0,0,0
andalouse sauce,0,0,0,0,1,0,0
apple sauce,1,0,2,0,1,0,0
apples,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...
veal,0,0,1,0,0,0,0
vegetable box,0,2,0,0,0,0,0
vegetable mix,0,0,0,0,3,0,0
yoghurt,1,3,2,1,1,0,0


In [7]:
# Chi-square test of independence on the item type x day contingency table.
# H0: the two crossed variables are independent.
# The p-value is the probability of a test statistic at least this extreme
# when H0 holds; it is not the probability that H0 is true.
# p > 0.05 -> fail to reject H0;  p <= 0.05 -> reject H0.
# NOTE(review): most cells of the table hold counts of 0 or 1, so the
# chi-square approximation may be unreliable here - confirm expected counts.
ChiSqResult = chi2_contingency(CrosstabResult)
p_value = ChiSqResult[1]  # index 1 of the result holds the p-value
print('The P-Value of the ChiSq Test is:', p_value)

The P-Value of the ChiSq Test is: 0.007328794340601195


Significant association (p ≈ 0.007), so we could limit the options to what is bought per day:
1. Certain categories are bought only/mostly on specific days.
2. This could be by chance, but could be on purpose for some categories (e.g. bakery).
3. Let's check whether this also holds for the supermarket only / per time of day...

## 1.2. CORRELATION 2: item type vs time

In [7]:
# Contingency table of row counts: one row per item type, one column per
# time of day.
CrosstabResult = pd.crosstab(df['item_type'], df['time'])

# Show the table as the cell output.
CrosstabResult

time,afternoon,morning,noon
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alcoholic drinks,1,0,0
almond milk,1,0,0
andalouse sauce,1,0,0
apple sauce,3,1,0
apples,2,0,0
...,...,...,...
veal,0,1,0
vegetable box,2,0,0
vegetable mix,3,0,0
yoghurt,6,2,0


In [8]:
# Chi-square test of independence on the item type x time contingency table.
# H0: the two crossed variables are independent.
# The p-value is the probability of a test statistic at least this extreme
# when H0 holds; it is not the probability that H0 is true.
# p > 0.05 -> fail to reject H0;  p <= 0.05 -> reject H0.
# NOTE(review): most cells of the table hold very small counts, so the
# chi-square approximation may be unreliable here - confirm expected counts.
ChiSqResult = chi2_contingency(CrosstabResult)
p_value = ChiSqResult[1]  # index 1 of the result holds the p-value
print('The P-Value of the ChiSq Test is:', p_value)

The P-Value of the ChiSq Test is: 0.0004258260117925643


Significant association (p ≈ 0.0004), so we could limit the options to what is bought per time of day [in a supermarket].

## 1.3. CORRELATION 3: item type vs store

### 1.3.1. item type vs store type

In [8]:
# Contingency table of row counts: one row per item type, one column per
# store type.
CrosstabResult = pd.crosstab(df['item_type'], df['store_type'])

# Show the table as the cell output.
CrosstabResult

store_type,bakery,butcher,drugstore,furniture store,supermarket
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
alcoholic drinks,0,0,0,0,1
almond milk,0,0,0,0,1
andalouse sauce,0,0,0,0,1
apple sauce,0,0,0,0,4
apples,0,0,0,0,2
...,...,...,...,...,...
veal,0,0,0,0,1
vegetable box,0,0,0,0,2
vegetable mix,0,0,0,0,3
yoghurt,0,0,0,0,8


In [9]:
# Chi-square test of independence on the item type x store type table.
# H0: the two crossed variables are independent.
# The p-value is the probability of a test statistic at least this extreme
# when H0 holds; it is not the probability that H0 is true.
# p > 0.05 -> fail to reject H0;  p <= 0.05 -> reject H0.
# NOTE(review): most of the displayed rows are non-zero only in the
# supermarket column, so the chi-square approximation may be unreliable
# here - confirm expected counts.
ChiSqResult = chi2_contingency(CrosstabResult)
p_value = ChiSqResult[1]  # index 1 of the result holds the p-value
print('The P-Value of the ChiSq Test is:', p_value)

The P-Value of the ChiSq Test is: 7.298136204606577e-19


### 1.3.2. item type vs store name

In [10]:
# Contingency table of row counts: one row per item type, one column per
# store name.
CrosstabResult = pd.crosstab(df['item_type'], df['store_name'])

# Show the table as the cell output.
CrosstabResult

store_name,Albert Heijn,Brabo,Carrefour,Delhaize,Ikea,Kruidvat,Okay,Sys,Versavel Poelman
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
alcoholic drinks,0,0,0,1,0,0,0,0,0
almond milk,0,0,1,0,0,0,0,0,0
andalouse sauce,0,0,0,0,0,0,1,0,0
apple sauce,0,0,1,1,0,0,2,0,0
apples,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
veal,0,0,0,0,0,0,1,0,0
vegetable box,2,0,0,0,0,0,0,0,0
vegetable mix,0,0,0,0,0,0,3,0,0
yoghurt,3,0,1,2,0,0,2,0,0


In [11]:
# Chi-square test of independence on the item type x store name table.
# H0: the two crossed variables are independent.
# The p-value is the probability of a test statistic at least this extreme
# when H0 holds; it is not the probability that H0 is true.
# p > 0.05 -> fail to reject H0;  p <= 0.05 -> reject H0.
# NOTE(review): most cells of the table hold counts of 0 or 1, so the
# chi-square approximation may be unreliable here - confirm expected counts.
ChiSqResult = chi2_contingency(CrosstabResult)
p_value = ChiSqResult[1]  # index 1 of the result holds the p-value
print('The P-Value of the ChiSq Test is:', p_value)

The P-Value of the ChiSq Test is: 4.60371070098832e-15


## 1.4. item type vs promo

In [36]:
# Contingency table of row counts: one row per item type, one column per
# value of promo_num (0 or 1 in the recorded output).
CrosstabResult = pd.crosstab(df['item_type'], df['promo_num'])

# Show the table as the cell output.
CrosstabResult

promo_num,0,1
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1
alcoholic drinks,1,0
almond milk,1,0
andalouse sauce,1,0
apple sauce,4,0
apples,2,0
...,...,...
veal,1,0
vegetable box,0,2
vegetable mix,1,2
yoghurt,7,1


In [37]:
# Chi-square test of independence on the item type x promo_num table.
# H0: the two crossed variables are independent.
# The p-value is the probability of a test statistic at least this extreme
# when H0 holds; it is not the probability that H0 is true.
# p > 0.05 -> fail to reject H0;  p <= 0.05 -> reject H0.
# NOTE(review): many cells of the table hold very small counts, so the
# chi-square approximation may be unreliable here - confirm expected counts.
ChiSqResult = chi2_contingency(CrosstabResult)
p_value = ChiSqResult[1]  # index 1 of the result holds the p-value
print('The P-Value of the ChiSq Test is:', p_value)

The P-Value of the ChiSq Test is: 0.0003625346131603951


We could separate items that are primarily/only bought in promo and exclude them from regular groceries

# 2. Narrow down dataframe

## 2.4 Per STORE NAME

In [9]:
# Store names to split on (the values seen in the 'store_name' column).
store1, store2, store3 = 'Albert Heijn', 'Carrefour', 'Okay'
store4, store5, store6 = 'Delhaize', 'Versavel Poelman', 'Kruidvat'
store7, store8, store9 = 'Brabo', 'Ikea', 'Sys'

# One sub-frame per store: keep only the rows of df whose store_name matches.
store_names = df['store_name']
df_store1 = df[store_names == store1]
df_store2 = df[store_names == store2]
df_store3 = df[store_names == store3]
df_store4 = df[store_names == store4]
df_store5 = df[store_names == store5]
df_store6 = df[store_names == store6]
df_store7 = df[store_names == store7]
df_store8 = df[store_names == store8]
df_store9 = df[store_names == store9]

In [10]:
# Per-store analysis: item type vs time of day.
# The per-store sub-frames, in the same order as the report labels below.
store_frames = [df_store1, df_store2, df_store3, df_store4, df_store5,
                df_store6, df_store7, df_store8, df_store9]
store_labels = ['AH', 'Carrefour', 'Okay', 'Delhaize', 'Versavel Poelman',
                'Kruidvat', 'Brabo', 'Ikea', 'Sys']

# Contingency table (item type x time) for every store.
time_tables = [
    pd.crosstab(index=frame['item_type'], columns=frame['time'])
    for frame in store_frames
]
(CrosstabResult1, CrosstabResult2, CrosstabResult3,
 CrosstabResult4, CrosstabResult5, CrosstabResult6,
 CrosstabResult7, CrosstabResult8, CrosstabResult9) = time_tables

# Chi-square test of independence on every table (all tests run before
# anything is printed).
time_tests = [chi2_contingency(table) for table in time_tables]
(ChiSqResult1, ChiSqResult2, ChiSqResult3,
 ChiSqResult4, ChiSqResult5, ChiSqResult6,
 ChiSqResult7, ChiSqResult8, ChiSqResult9) = time_tests

# H0: item type and time are independent within the store.
# The p-value is the probability of a test statistic at least this extreme
# when H0 holds; it is not the probability that H0 is true.
# p > 0.05 -> fail to reject H0;  p <= 0.05 -> reject H0.
# NOTE(review): the p-values of exactly 1.0 in the recorded output presumably
# come from stores whose table has a single row or column (nothing to test) -
# verify before interpreting them.
for label, test in zip(store_labels, time_tests):
    print(f'The P-Value of the ChiSq Test {label} is:', test[1])

The P-Value of the ChiSq Test AH is: 0.2553069052181278
The P-Value of the ChiSq Test Carrefour is: 1.0
The P-Value of the ChiSq Test Okay is: 0.7453955042975469
The P-Value of the ChiSq Test Delhaize is: 0.48913159160139275
The P-Value of the ChiSq Test Versavel Poelman is: 1.0
The P-Value of the ChiSq Test Kruidvat is: 1.0
The P-Value of the ChiSq Test Brabo is: 1.0
The P-Value of the ChiSq Test Ikea is: 1.0
The P-Value of the ChiSq Test Sys is: 0.9604070567886343


In [11]:
# Per-store analysis: item type vs day of the week.
# The per-store sub-frames, in the same order as the report labels below.
store_frames = [df_store1, df_store2, df_store3, df_store4, df_store5,
                df_store6, df_store7, df_store8, df_store9]
store_labels = ['AH', 'Carrefour', 'Okay', 'Delhaize', 'Versavel Poelman',
                'Kruidvat', 'Brabo', 'Ikea', 'Sys']

# Contingency table (item type x day) for every store.
day_tables = [
    pd.crosstab(index=frame['item_type'], columns=frame['day'])
    for frame in store_frames
]
(CrosstabResult1, CrosstabResult2, CrosstabResult3,
 CrosstabResult4, CrosstabResult5, CrosstabResult6,
 CrosstabResult7, CrosstabResult8, CrosstabResult9) = day_tables

# Chi-square test of independence on every table (all tests run before
# anything is printed).
day_tests = [chi2_contingency(table) for table in day_tables]
(ChiSqResult1, ChiSqResult2, ChiSqResult3,
 ChiSqResult4, ChiSqResult5, ChiSqResult6,
 ChiSqResult7, ChiSqResult8, ChiSqResult9) = day_tests

# H0: item type and day are independent within the store.
# The p-value is the probability of a test statistic at least this extreme
# when H0 holds; it is not the probability that H0 is true.
# p > 0.05 -> fail to reject H0;  p <= 0.05 -> reject H0.
# NOTE(review): the p-values of exactly 1.0 in the recorded output presumably
# come from stores whose table has a single row or column (nothing to test) -
# verify before interpreting them.
for label, test in zip(store_labels, day_tests):
    print(f'The P-Value of the ChiSq Test {label} is:', test[1])

The P-Value of the ChiSq Test AH is: 0.9916297197883326
The P-Value of the ChiSq Test Carrefour is: 1.0
The P-Value of the ChiSq Test Okay is: 0.4407906202626398
The P-Value of the ChiSq Test Delhaize is: 0.48913159160139275
The P-Value of the ChiSq Test Versavel Poelman is: 0.5905214502087732
The P-Value of the ChiSq Test Kruidvat is: 1.0
The P-Value of the ChiSq Test Brabo is: 1.0
The P-Value of the ChiSq Test Ikea is: 1.0
The P-Value of the ChiSq Test Sys is: 0.865542455654743
