## 5: CORRELATIONS - WHAT ITEM TYPES?

### Import libraries 

In [1]:
#Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
#NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Matplotlib is a plotting library for python and pyplot gives us a MatLab like plotting framework. We will use this in our plotter function to plot data.
import matplotlib.pyplot as plt
#Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
import dataframe_image as dfi

from matplotlib.ticker import StrMethodFormatter

# importing the required function for correlations
from scipy.stats import chi2_contingency

### Load and view data 

In [2]:
df = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH3\df\df_HH3.csv")
# df.describe(include='all')

In [3]:
# use the corr function to display the correlation between all the features
data_corr = df.corr()
# data_corr

# 1. GENERAL CORRELATIONS: time/day/storetype-name

## 1.1. CORRELATION 1: item type vs day

In [4]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['day'])

CrosstabResult

# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 2.4437756405492703e-12


## 1.2. CORRELATION 2: item type vs time

In [5]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['time'])

display(CrosstabResult)

# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

time,afternoon,evening,noon
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5-spice powder,1,0,0
all-purpose cleaner,0,1,0
apple sauce,0,1,0
apples,2,3,0
bacon,0,3,0
...,...,...,...
toothbrush,1,1,0
vegetable mix,1,2,0
wasabi,1,0,0
water filters,1,1,0


The P-Value of the ChiSq Test is: 0.00039157204398085


Significant correlation, we could limit options to what's bought per time [in a supermarket]

## 1.3.1 CORRELATION 3: item type vs store

### 1.3.1 item type vs store type

In [6]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['store_type'])

CrosstabResult

# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 3.5776106579743055e-18


### 1.3.2. item type vs store name

In [7]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['store_name'])

CrosstabResult

# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 1.2178092538093965e-14


## 1.4. item type vs promo

In [8]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['promo_num'])

CrosstabResult

# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 3.7949776778212043e-06


We could separate items that are more often bought in promo and exclude them from regular groceries

#### (if for promo #1>#0)

In [9]:
CrosstabResult.drop(CrosstabResult[CrosstabResult[1] <= CrosstabResult[0]].index, inplace = True)

CrosstabResult = CrosstabResult.reset_index()
CrosstabResult

options = CrosstabResult.item_type.values.tolist()

options

['bag', 'pie', 'seseame oil']

In [10]:
# df with only the promo items
df_promo = df[df['item_type'].isin(options)]

# Now to delete these item types from the current dataframe
df = df[-df['item_type'].isin(options)]

# 2. Narrow down dataframe

## 2.2 Per TIME

### Aggregate time

In [15]:
# Find unique values for store names
print(df['time'].unique())

['evening' 'afternoon' 'noon']


In [17]:
# define data
noon = 'noon'
afternoon = 'afternoon'
evening = 'evening'

# select only these data (for time)
df_noon = df[df["time"] == noon]
df_afternoon = df[df["time"] == afternoon]
df_evening = df[df["time"] == evening]

In [18]:
# Cross tabulation between category and store type
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['day'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['day'])
CrosstabResult_evening =pd.crosstab(index=df_evening['item_type'],columns=df_evening['day'])

# Performing Chi-sq test
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)
ChiSqResult_evening = chi2_contingency(CrosstabResult_evening)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])
print('The P-Value of the ChiSq Test is, for evening:', ChiSqResult_evening[1])

The P-Value of the ChiSq Test is, for noons: 0.7667395507581166
The P-Value of the ChiSq Test is, for afternoons: 0.16913332985620702
The P-Value of the ChiSq Test is, for evening: 0.9183952937794617


In [19]:
# Cross tabulation between category and store type
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['store_type'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['store_type'])
CrosstabResult_evening =pd.crosstab(index=df_evening['item_type'],columns=df_evening['store_type'])

# Performing Chi-sq test
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)
ChiSqResult_evening = chi2_contingency(CrosstabResult_evening)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])
print('The P-Value of the ChiSq Test is, for evening:', ChiSqResult_evening[1])

The P-Value of the ChiSq Test is, for noons: 1.0
The P-Value of the ChiSq Test is, for afternoons: 0.012828542014759478
The P-Value of the ChiSq Test is, for evening: 1.0


In [20]:
# Cross tabulation between category and store type
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['store_name'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['store_name'])
CrosstabResult_evening =pd.crosstab(index=df_evening['item_type'],columns=df_evening['store_name'])

# Performing Chi-sq test
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)
ChiSqResult_evening = chi2_contingency(CrosstabResult_evening)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])
print('The P-Value of the ChiSq Test is, for evening:', ChiSqResult_evening[1])

The P-Value of the ChiSq Test is, for noons: 0.34978239680619466
The P-Value of the ChiSq Test is, for afternoons: 0.0028453074454443715
The P-Value of the ChiSq Test is, for evening: 0.6465053189668006


## 2.3 Per STORE TYPE

In [22]:
# to select only one
storet1 = 'supermarket'
storet2 = 'bakery'
storet3 = 'pet supplies'

# select only these data for the df
df_storet1 = df[df['store_type'] == storet1]
df_storet2 = df[df['store_type'] == storet2]
df_storet3 = df[df['store_type'] == storet3]

# Cross tabulation between category and day
CrosstabResult1=pd.crosstab(index=df_storet1['item_type'],columns=df_storet1['time'])
CrosstabResult2=pd.crosstab(index=df_storet2['item_type'],columns=df_storet2['time'])
CrosstabResult3=pd.crosstab(index=df_storet3['item_type'],columns=df_storet3['time'])

# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test supermarket is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test bakery is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test pet supplies is:', ChiSqResult3[1])

The P-Value of the ChiSq Test supermarket is: 0.3378889070849209
The P-Value of the ChiSq Test bakery is: 1.0
The P-Value of the ChiSq Test pet supplies is: 1.0


In [24]:
# Cross tabulation between category and day
CrosstabResult1=pd.crosstab(index=df_storet1['item_type'],columns=df_storet1['day'])
CrosstabResult2=pd.crosstab(index=df_storet2['item_type'],columns=df_storet2['day'])
CrosstabResult3=pd.crosstab(index=df_storet3['item_type'],columns=df_storet3['day'])

# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test supermarket is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test bakery is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test pet supplies is:', ChiSqResult3[1])

The P-Value of the ChiSq Test supermarket is: 0.0499483069585892
The P-Value of the ChiSq Test bakery is: 0.7667395507581166
The P-Value of the ChiSq Test pet supplies is: 0.2872974951836456


In [25]:
# Cross tabulation between category and day
CrosstabResult1=pd.crosstab(index=df_storet1['item_type'],columns=df_storet1['store_name'])
CrosstabResult2=pd.crosstab(index=df_storet2['item_type'],columns=df_storet2['store_name'])
CrosstabResult3=pd.crosstab(index=df_storet3['item_type'],columns=df_storet3['store_name'])

# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test supermarket is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test bakery is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test pet supplies is:', ChiSqResult3[1])

The P-Value of the ChiSq Test supermarket is: 4.678415181323773e-05
The P-Value of the ChiSq Test bakery is: 0.34978239680619466
The P-Value of the ChiSq Test pet supplies is: 0.2872974951836456


## 2.4.  Per DAY

In [55]:
# define data
Monday = 'Monday'
Tuesday = 'Tuesday'
Wednesday = 'Wednesday'
Thursday = 'Thursday'
Friday = 'Friday'
Saturday = 'Saturday'
Sunday = 'Sunday'

# select only these data (for day, store, store name)
df_Monday = df[df["day"] == Monday]
df_Tuesday = df[df["day"] == Tuesday]
df_Wednesday = df[df["day"] == Wednesday]
df_Thursday = df[df["day"] == Thursday]
df_Friday = df[df["day"] == Friday]
df_Saturday = df[df["day"] == Saturday]
df_Sunday = df[df["day"] == Sunday]

In [63]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['store_name'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['store_name'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['store_name'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['store_name'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['store_name'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['store_name'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['store_name'], dropna=False)

# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
# Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
# print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 0.43133509838175615
The P-Value of the ChiSq Test is, for Tuesdays: 0.012274396812279185
The P-Value of the ChiSq Test is, for Wednesdays: 1.0
The P-Value of the ChiSq Test is, for Thursdays: 0.23771371108535994
The P-Value of the ChiSq Test is, for Fridays: 1.0
The P-Value of the ChiSq Test is, for Sundays: 0.17553294879732945


In [65]:
CrosstabResult_Tuesday

store_name,Aldi,Tine
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1
all-purpose cleaner,1,0
apples,1,0
bacon,1,0
basil,1,0
bell peppers,1,0
biscuits,1,0
bread,0,1
butter,1,0
candy,4,0
charcuterie,2,0


## 2.5. Per STORE NAME

In [30]:
# Find unique values for store names
print(df['store_name'].unique())

['Aldi' 'Tom&Co' 'Wouters' 'Colruyt' 'Vanessa' 'Tine' 'Albert Heijn'
 'Food Factory' 'Maxi Zoo' 'Delhaize']


In [31]:
# define data
store1 = 'Aldi'
store2 = 'Colruyt'
store3 = 'Albert Heijn'
store4 = 'Delhaize'
store5 = 'Food Factory'
store6 = 'Wouters'
store7 = 'Vanessa'
store8 = 'Tine'
store9 = 'Tom&CO'
store10 = 'Maxi Zoo'

# select only these data for the df
df_store1 = df[df['store_name'] == store1]
df_store2 = df[df['store_name'] == store2]
df_store3 = df[df['store_name'] == store3] 
df_store4 = df[df['store_name'] == store4] 
df_store5 = df[df['store_name'] == store5] 
df_store6 = df[df['store_name'] == store6] 
df_store7 = df[df['store_name'] == store7] 
df_store8 = df[df['store_name'] == store8] 
df_store9 = df[df['store_name'] == store9]
df_store10 = df[df['store_name'] == store10] 

In [36]:
# Cross tabulation between category and day
CrosstabResult1=pd.crosstab(index=df_store1['item_type'],columns=df_store1['time'])
CrosstabResult2=pd.crosstab(index=df_store2['item_type'],columns=df_store2['time'])
CrosstabResult3=pd.crosstab(index=df_store3['item_type'],columns=df_store3['time'])
CrosstabResult4=pd.crosstab(index=df_store4['item_type'],columns=df_store4['time'])
CrosstabResult5=pd.crosstab(index=df_store5['item_type'],columns=df_store5['time'])
CrosstabResult6=pd.crosstab(index=df_store6['item_type'],columns=df_store6['time'])
CrosstabResult7=pd.crosstab(index=df_store7['item_type'],columns=df_store7['time'])
CrosstabResult8=pd.crosstab(index=df_store8['item_type'],columns=df_store8['time'])
CrosstabResult9=pd.crosstab(index=df_store9['item_type'],columns=df_store9['time'])
CrosstabResult10=pd.crosstab(index=df_store10['item_type'],columns=df_store10['time'])

# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)
ChiSqResult4 = chi2_contingency(CrosstabResult4)
ChiSqResult5 = chi2_contingency(CrosstabResult5)
ChiSqResult6 = chi2_contingency(CrosstabResult6)
ChiSqResult7 = chi2_contingency(CrosstabResult7)
ChiSqResult8 = chi2_contingency(CrosstabResult8)
# ChiSqResult9 = chi2_contingency(CrosstabResult9)
ChiSqResult10 = chi2_contingency(CrosstabResult10)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test Aldi is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test Colruyt is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test Albert Heijn is:', ChiSqResult3[1])
print('The P-Value of the ChiSq Test Delhaize is:', ChiSqResult4[1])
print('The P-Value of the ChiSq Test Food Factory is:', ChiSqResult5[1])
print('The P-Value of the ChiSq Test Wouters is:', ChiSqResult6[1])
print('The P-Value of the ChiSq Test Vanessa is:', ChiSqResult7[1])
print('The P-Value of the ChiSq Test Tine is:', ChiSqResult8[1])
# print('The P-Value of the ChiSq Test Tom&Co is:', ChiSqResult9[1])
print('The P-Value of the ChiSq Test Maxi Zoo is:', ChiSqResult10[1])

The P-Value of the ChiSq Test Aldi is: 0.7980730584593592
The P-Value of the ChiSq Test Colruyt is: 0.4285718302281244
The P-Value of the ChiSq Test Albert Heijn is: 1.0
The P-Value of the ChiSq Test Delhaize is: 1.0
The P-Value of the ChiSq Test Food Factory is: 1.0
The P-Value of the ChiSq Test Wouters is: 1.0
The P-Value of the ChiSq Test Vanessa is: 1.0
The P-Value of the ChiSq Test Tine is: 1.0
The P-Value of the ChiSq Test Maxi Zoo is: 1.0


In [35]:
# Cross tabulation between category and day
CrosstabResult1=pd.crosstab(index=df_store1['item_type'],columns=df_store1['day'])
CrosstabResult2=pd.crosstab(index=df_store2['item_type'],columns=df_store2['day'])
CrosstabResult3=pd.crosstab(index=df_store3['item_type'],columns=df_store3['day'])
CrosstabResult4=pd.crosstab(index=df_store4['item_type'],columns=df_store4['day'])
CrosstabResult5=pd.crosstab(index=df_store5['item_type'],columns=df_store5['day'])
CrosstabResult6=pd.crosstab(index=df_store6['item_type'],columns=df_store6['day'])
CrosstabResult7=pd.crosstab(index=df_store7['item_type'],columns=df_store7['day'])
CrosstabResult8=pd.crosstab(index=df_store8['item_type'],columns=df_store8['day'])
CrosstabResult9=pd.crosstab(index=df_store9['item_type'],columns=df_store9['day'])
CrosstabResult10=pd.crosstab(index=df_store10['item_type'],columns=df_store10['day'])

# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)
ChiSqResult4 = chi2_contingency(CrosstabResult4)
ChiSqResult5 = chi2_contingency(CrosstabResult5)
ChiSqResult6 = chi2_contingency(CrosstabResult6)
ChiSqResult7 = chi2_contingency(CrosstabResult7)
ChiSqResult8 = chi2_contingency(CrosstabResult8)
# ChiSqResult9 = chi2_contingency(CrosstabResult9)
ChiSqResult10 = chi2_contingency(CrosstabResult10)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test Aldi is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test Colruyt is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test Albert Heijn is:', ChiSqResult3[1])
print('The P-Value of the ChiSq Test Delhaize is:', ChiSqResult4[1])
print('The P-Value of the ChiSq Test Food Factory is:', ChiSqResult5[1])
print('The P-Value of the ChiSq Test Wouters is:', ChiSqResult6[1])
print('The P-Value of the ChiSq Test Vanessa is:', ChiSqResult7[1])
print('The P-Value of the ChiSq Test Tine is:', ChiSqResult8[1])
# print('The P-Value of the ChiSq Test Tom&Co is:', ChiSqResult9[1])
print('The P-Value of the ChiSq Test Maxi Zoo is:', ChiSqResult10[1])

The P-Value of the ChiSq Test Aldi is: 0.9996540649152295
The P-Value of the ChiSq Test Colruyt is: 0.9088660608901772
The P-Value of the ChiSq Test Albert Heijn is: 1.0
The P-Value of the ChiSq Test Delhaize is: 1.0
The P-Value of the ChiSq Test Food Factory is: 1.0
The P-Value of the ChiSq Test Wouters is: 1.0
The P-Value of the ChiSq Test Vanessa is: 1.0
The P-Value of the ChiSq Test Tine is: 1.0
The P-Value of the ChiSq Test Maxi Zoo is: 1.0
