## 5: CORRELATIONS - WHAT ITEM TYPES?

### Import libraries 

In [29]:
#Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
#NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Matplotlib is a plotting library for python and pyplot gives us a MatLab like plotting framework. We will use this in our plotter function to plot data.
import matplotlib.pyplot as plt
#Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
import dataframe_image as dfi

from matplotlib.ticker import StrMethodFormatter

# importing the required function for correlations
from scipy.stats import chi2_contingency

### Load and view data 

In [33]:
df = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2.csv")
# df.describe(include='all')

In [34]:
# use the corr function to display the correlation between all the features
data_corr = df.corr()
# data_corr

In [35]:
df.dtypes

week               int64
order_ID           int64
item_name         object
amount             int64
price_unit       float64
price_total      float64
item_type         object
category          object
day               object
time              object
store_type        object
store_name        object
promo               bool
reorder             bool
item_id            int64
type_id            int64
order_amount       int64
order_price      float64
day_num            int64
storetype_num      int64
storename_num      int64
cat_num            int64
time_num           int64
promo_num          int64
datetime          object
timestamp          int64
dates             object
times             object
times_min          int64
dates_days         int64
dtype: object

# 1. GENERAL CORRELATIONS: time/day/storetype-name

## 1.1. CORRELATION 1: item type vs day

In [6]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['day'])

CrosstabResult

day,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
alcoholic drinks,1,0,0,0,0,0,0
almond milk,0,0,1,0,0,0,0
andalouse sauce,0,0,0,0,1,0,0
apple sauce,1,0,2,0,1,0,0
apples,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...
veal,0,0,1,0,0,0,0
vegetable box,0,2,0,0,0,0,0
vegetable mix,0,0,0,0,3,0,0
yoghurt,1,3,2,1,1,0,0


In [7]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 0.007328794340601195


Significant (strong), we could limit options to what's bought per day:
> 1. Certain cat. are bought on specific days only/mostly
2. Could be by chance, but could be on purpose for some categories (e.g. bakery)
3. >  Let's check if this is also true for only the supermarket/times...

## 1.2. CORRELATION 2: item type vs time

In [4]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['time'])

CrosstabResult

time,afternoon,evening,morning,noon
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
alcoholic drinks,1,0,0,0
almond milk,0,1,0,0
andalouse sauce,1,0,0,0
apple sauce,2,1,1,0
apples,1,1,0,0
...,...,...,...,...
veal,0,0,1,0
vegetable box,0,2,0,0
vegetable mix,2,1,0,0
yoghurt,2,4,2,0


In [5]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 0.002121466627746871


Significant (strong) correlation, we could limit options to what's bought per time [in a supermarket]

## 1.3.1 CORRELATION 3: item type vs store

### 1.3.1 item type vs store type

In [8]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['store_type'])

CrosstabResult

store_type,bakery,butcher,drugstore,furniture store,supermarket
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
alcoholic drinks,0,0,0,0,1
almond milk,0,0,0,0,1
andalouse sauce,0,0,0,0,1
apple sauce,0,0,0,0,4
apples,0,0,0,0,2
...,...,...,...,...,...
veal,0,0,0,0,1
vegetable box,0,0,0,0,2
vegetable mix,0,0,0,0,3
yoghurt,0,0,0,0,8


In [9]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 7.298136204606577e-19


### 1.3.2. item type vs store name

In [10]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['store_name'])

CrosstabResult

store_name,Albert Heijn,Brabo,Carrefour,Delhaize,Ikea,Kruidvat,Okay,Sys,Versavel Poelman
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
alcoholic drinks,0,0,0,1,0,0,0,0,0
almond milk,0,0,1,0,0,0,0,0,0
andalouse sauce,0,0,0,0,0,0,1,0,0
apple sauce,0,0,1,1,0,0,2,0,0
apples,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
veal,0,0,0,0,0,0,1,0,0
vegetable box,2,0,0,0,0,0,0,0,0
vegetable mix,0,0,0,0,0,0,3,0,0
yoghurt,3,0,1,2,0,0,2,0,0


In [11]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 4.60371070098832e-15


## 1.4. item type vs promo

In [36]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['promo_num'])

CrosstabResult

promo_num,0,1
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1
alcoholic drinks,1,0
almond milk,1,0
andalouse sauce,1,0
apple sauce,4,0
apples,2,0
...,...,...
veal,1,0
vegetable box,0,2
vegetable mix,1,2
yoghurt,7,1


In [37]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 0.0003625346131603951


We could separate items that are primarily/only bought in promo and exclude them from regular groceries

# 2. Narrow down dataframe

## 2.1 Per DAY

In [38]:
# define data
Monday = 'Monday'
Tuesday = 'Tuesday'
Wednesday = 'Wednesday'
Thursday = 'Thursday'
Friday = 'Friday'
Saturday = 'Saturday'
Sunday = 'Sunday'

# select only these data (for day, store, store name)
df_Monday = df[df["day"] == Monday]
df_Tuesday = df[df["day"] == Tuesday]
df_Wednesday = df[df["day"] == Wednesday]
df_Thursday = df[df["day"] == Thursday]
df_Friday = df[df["day"] == Friday]
df_Saturday = df[df["day"] == Saturday]
df_Sunday = df[df["day"] == Sunday]

In [42]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['time'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['time'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['time'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['time'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['time'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['time'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['time'], dropna=False)

# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 0.04201822547743906
The P-Value of the ChiSq Test is, for Tuesdays: 0.07250903390723971
The P-Value of the ChiSq Test is, for Wednesdays: 0.30988495327136445
The P-Value of the ChiSq Test is, for Thursdays: 0.09812289522680583
The P-Value of the ChiSq Test is, for Fridays: 0.49221958873322486
The P-Value of the ChiSq Test is, for Saturdays: 0.43434402458059357
The P-Value of the ChiSq Test is, for Sundays: 1.0


In [43]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['store_type'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['store_type'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['store_type'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['store_type'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['store_type'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['store_type'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['store_type'], dropna=False)

# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 0.013901439708649498
The P-Value of the ChiSq Test is, for Tuesdays: 0.025016961216351507
The P-Value of the ChiSq Test is, for Wednesdays: 0.1090641579497725
The P-Value of the ChiSq Test is, for Thursdays: 0.9686462864551427
The P-Value of the ChiSq Test is, for Fridays: 0.8655512299356921
The P-Value of the ChiSq Test is, for Saturdays: 0.004323383270839815
The P-Value of the ChiSq Test is, for Sundays: 0.017341246964033406


In [44]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['store_name'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['store_name'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['store_name'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['store_name'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['store_name'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['store_name'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['store_name'], dropna=False)

# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 0.013901439708649408
The P-Value of the ChiSq Test is, for Tuesdays: 0.02501696121635147
The P-Value of the ChiSq Test is, for Wednesdays: 0.1090641579497723
The P-Value of the ChiSq Test is, for Thursdays: 0.2937140173009805
The P-Value of the ChiSq Test is, for Fridays: 0.4922195887332244
The P-Value of the ChiSq Test is, for Saturdays: 0.013851927042001286
The P-Value of the ChiSq Test is, for Sundays: 0.0003349433115878115


Narrowing down type per day vs time makes sense only for mondays

# 2. Specify categories

In [16]:
# data for different stores vs time
df_AHmornings = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_AHmorn.csv")
df_AHnoons = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_AHnoon.csv")
df_AHafternoons = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_AHafter.csv")

df_SYSmornings = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_SYSmorn.csv")
df_SYSnoons = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_SYSnoon.csv")

df_OKAYmornings = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_OKAYmorn.csv")
df_OKAYnoons = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_OKAYnoon.csv")
df_OKAYafternoons = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_OKAYafter.csv")

df_VERSAVELmornings = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_VERSmorn.csv")

df_DELHAIZEmornings = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_DELHmorn.csv")
df_DELHAIZEafternoons = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_DELHafter.csv")

df_KRUIDVATafternoons = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_KRUIDafter.csv")

df_BRABOafternoons = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_BRAafter.csv")

df_IKEAmornings = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_IKEAmorn.csv")

df_CARREFOURafternoons = pd.read_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_CARafter.csv")

In [24]:
df_CARREFOURafternoons.describe()

Unnamed: 0,week,order_ID,amount,price_unit,price_total,item_id,type_id,order_amount,order_price,day_num,storetype_num,storename_num,cat_num,time_num,timestamp,times_min,dates_days
count,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0
mean,1.0,1.0,1.0,2.203594,2.203594,15.0,12.870968,31.0,68.31142,2.0,4.0,2.0,8.83871,1.0,1.637435e+18,1858.0,20211120.0
std,0.0,0.0,0.0,1.72864,1.72864,9.092121,7.419532,0.0,0.0,0.0,0.0,0.0,2.945016,0.0,1040.927,0.0,0.0
min,1.0,1.0,1.0,0.57,0.57,0.0,0.0,31.0,68.31142,2.0,4.0,2.0,2.0,1.0,1.637435e+18,1858.0,20211120.0
25%,1.0,1.0,1.0,0.94,0.94,7.5,7.5,31.0,68.31142,2.0,4.0,2.0,7.0,1.0,1.637435e+18,1858.0,20211120.0
50%,1.0,1.0,1.0,1.59,1.59,15.0,13.0,31.0,68.31142,2.0,4.0,2.0,9.0,1.0,1.637435e+18,1858.0,20211120.0
75%,1.0,1.0,1.0,2.43286,2.43286,22.5,18.5,31.0,68.31142,2.0,4.0,2.0,10.0,1.0,1.637435e+18,1858.0,20211120.0
max,1.0,1.0,1.0,5.99,5.99,30.0,26.0,31.0,68.31142,2.0,4.0,2.0,15.0,1.0,1.637435e+18,1858.0,20211120.0


## 2.1. CORRELATION 1: item type vs time

In [18]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df_AHmornings['item_type'],columns=df_AHmornings['time'])

CrosstabResult

time,morning
item_type,Unnamed: 1_level_1
bread,15
chocolate milk,1
danish,8
fish spread,1
meat spread,1
pastry,2
sandwich,4


In [19]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 1.0


Significant (strong) correlation, we could limit options to what's bought per time [in a supermarket]

## 2.2. CORRELATION 2: item type vs day

In [6]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['day'])

CrosstabResult

day,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
alcoholic drinks,1,0,0,0,0,0,0
almond milk,0,0,1,0,0,0,0
andalouse sauce,0,0,0,0,1,0,0
apple sauce,1,0,2,0,1,0,0
apples,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...
veal,0,0,1,0,0,0,0
vegetable box,0,2,0,0,0,0,0
vegetable mix,0,0,0,0,3,0,0
yoghurt,1,3,2,1,1,0,0


In [7]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 0.007328794340601195


Significant (strong), we could limit options to what's bought per day:
> 1. Certain cat. are bought on specific days only/mostly
2. Could be by chance, but could be on purpose for some categories (e.g. bakery)
3. >  Let's check if this is also true for only the supermarket/times...

## 2.3.1 CORRELATION 3: item type vs store

### 2.3.1 item type vs store type

In [8]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['store_type'])

CrosstabResult

store_type,bakery,butcher,drugstore,furniture store,supermarket
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
alcoholic drinks,0,0,0,0,1
almond milk,0,0,0,0,1
andalouse sauce,0,0,0,0,1
apple sauce,0,0,0,0,4
apples,0,0,0,0,2
...,...,...,...,...,...
veal,0,0,0,0,1
vegetable box,0,0,0,0,2
vegetable mix,0,0,0,0,3
yoghurt,0,0,0,0,8


In [9]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 7.298136204606577e-19


### 2.3.2. item type vs store name

In [10]:
# Cross tabulation between category and store type
CrosstabResult =pd.crosstab(index=df['item_type'],columns=df['store_name'])

CrosstabResult

store_name,Albert Heijn,Brabo,Carrefour,Delhaize,Ikea,Kruidvat,Okay,Sys,Versavel Poelman
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
alcoholic drinks,0,0,0,1,0,0,0,0,0
almond milk,0,0,1,0,0,0,0,0,0
andalouse sauce,0,0,0,0,0,0,1,0,0
apple sauce,0,0,1,1,0,0,2,0,0
apples,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
veal,0,0,0,0,0,0,1,0,0
vegetable box,2,0,0,0,0,0,0,0,0
vegetable mix,0,0,0,0,0,0,3,0,0
yoghurt,3,0,1,2,0,0,2,0,0


In [11]:
# Performing Chi-sq test
ChiSqResult = chi2_contingency(CrosstabResult)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 4.60371070098832e-15


-
# 2. SPECIFIC CORRELATIONS: introduce third variable (day/time/store/time)

In [124]:
# define data
Monday = 'Monday'
Tuesday = 'Tuesday'
Wednesday = 'Wednesday'
Thursday = 'Thursday'
Friday = 'Friday'
Saturday = 'Saturday'
Sunday = 'Sunday'

# select only these data (for day, store, store name)
df_Monday = df[df["day"] == Monday]
df_Tuesday = df[df["day"] == Tuesday]
df_Wednesday = df[df["day"] == Wednesday]
df_Thursday = df[df["day"] == Thursday]
df_Friday = df[df["day"] == Friday]
df_Saturday = df[df["day"] == Saturday]
df_Sunday = df[df["day"] == Sunday]

In [125]:
# define data
store1 = 'Albert Heijn'
store2 = 'Carrefour'
store3 = 'Okay'
store4 = 'Delhaize'
store5 = 'Versavel Poelman'
store6 = 'Kruidvat'
store7 = 'Brabo'
store8 = 'Ikea'
store9 = 'Sys'

# select only these data for the df
df_store1 = df[df['store_name'] == store1]
df_store2 = df[df['store_name'] == store2]
df_store3 = df[df['store_name'] == store3]
df_store4 = df[df['store_name'] == store4]
df_store5 = df[df['store_name'] == store5]
df_store6 = df[df['store_name'] == store6]
df_store7 = df[df['store_name'] == store7]
df_store8 = df[df['store_name'] == store8]
df_store9 = df[df['store_name'] == store9] 

In [126]:
# to select only one
storet1 = 'supermarket'
storet2 = 'butcher'
storet3 = 'bakery'
storet4 = 'drugstore'
storet5 = 'furniture store'

# select only these data for the df
df_storet1 = df[df['store_type'] == storet1]
df_storet2 = df[df['store_type'] == storet2]
df_storet3 = df[df['store_type'] == storet3]
df_storet4 = df[df['store_type'] == storet4]
df_storet5 = df[df['store_type'] == storet5]

In [127]:
# aggregate afternoon and evening
# df['time'] = df['time'].replace(['afternoon', 'evening'], 'afternoon')

# select only these data (for time)
df_morning = df[df["time"] == 'morning']
df_noon = df[df["time"] == 'noon']
df_afternoon = df[df["time"] == 'afternoon']
df_evening = df[df["time"] == 'evening']

-
## 2.2: CORRELATION 2: item type vs day

## 2.2.1.: vs time

In [156]:
# Cross tabulation between category and store type
CrosstabResult_morning =pd.crosstab(index=df_morning['item_type'],columns=df_morning['day'])
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['day'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['day'])

In [129]:
# Performing Chi-sq test
ChiSqResult_morning = chi2_contingency(CrosstabResult_morning)
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for mornings:', ChiSqResult_morning[1])
print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])

The P-Value of the ChiSq Test is, for mornings: 0.07356115062220253
The P-Value of the ChiSq Test is, for noons: 0.39419366188146604
The P-Value of the ChiSq Test is, for afternoons: 0.2409068528224968


-
## 2.1. CORRELATION 1: item type vs TIME

## 2.1.1.: vs day

In [130]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['time'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['time'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['time'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['time'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['time'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['time'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['time'], dropna=False)


In [131]:
# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 0.04201822547743906
The P-Value of the ChiSq Test is, for Tuesdays: 0.07250903390723971
The P-Value of the ChiSq Test is, for Wednesdays: 0.30988495327136445
The P-Value of the ChiSq Test is, for Thursdays: 0.09812289522680583
The P-Value of the ChiSq Test is, for Fridays: 0.49221958873322486
The P-Value of the ChiSq Test is, for Saturdays: 0.43434402458059357
The P-Value of the ChiSq Test is, for Sundays: 1.0


- On some days it is clear which categories are bought at which times
- On other days, it does not matter what time it is, categories can be bought whenever
- For Sunday, they did not go shopping any other time than in the morning, you thus cant tell if they would buy other categories at other times

## 2.1.2.: vs store

#### 2.1.2.1. vs store name

In [132]:
# Cross tabulation between category and day
CrosstabResult1=pd.crosstab(index=df_store1['item_type'],columns=df_store1['time'])
CrosstabResult2=pd.crosstab(index=df_store2['item_type'],columns=df_store2['time'])
CrosstabResult3=pd.crosstab(index=df_store3['item_type'],columns=df_store3['time'])
CrosstabResult4=pd.crosstab(index=df_store4['item_type'],columns=df_store4['time'])
CrosstabResult5=pd.crosstab(index=df_store5['item_type'],columns=df_store5['time'])
CrosstabResult6=pd.crosstab(index=df_store6['item_type'],columns=df_store6['time'])
CrosstabResult7=pd.crosstab(index=df_store7['item_type'],columns=df_store7['time'])
CrosstabResult8=pd.crosstab(index=df_store8['item_type'],columns=df_store8['time'])
CrosstabResult9=pd.crosstab(index=df_store9['item_type'],columns=df_store9['time'])

In [133]:
# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)
ChiSqResult4 = chi2_contingency(CrosstabResult4)
ChiSqResult5 = chi2_contingency(CrosstabResult5)
ChiSqResult6 = chi2_contingency(CrosstabResult6)
ChiSqResult7 = chi2_contingency(CrosstabResult7)
ChiSqResult8 = chi2_contingency(CrosstabResult8)
ChiSqResult9 = chi2_contingency(CrosstabResult9)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test AH is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test Carrefour is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test Okay is:', ChiSqResult3[1])
print('The P-Value of the ChiSq Test Delhaize is:', ChiSqResult4[1])
print('The P-Value of the ChiSq Test Versavel Poelman is:', ChiSqResult5[1])
print('The P-Value of the ChiSq Test Kruidvat is:', ChiSqResult6[1])
print('The P-Value of the ChiSq Test Brabo is:', ChiSqResult7[1])
print('The P-Value of the ChiSq Test Ikea is:', ChiSqResult8[1])
print('The P-Value of the ChiSq Test Sys is:', ChiSqResult9[1])

The P-Value of the ChiSq Test AH is: 0.2553069052181278
The P-Value of the ChiSq Test Carrefour is: 1.0
The P-Value of the ChiSq Test Okay is: 0.5929013051789971
The P-Value of the ChiSq Test Delhaize is: 0.48913159160139275
The P-Value of the ChiSq Test Versavel Poelman is: 1.0
The P-Value of the ChiSq Test Kruidvat is: 1.0
The P-Value of the ChiSq Test Brabo is: 1.0
The P-Value of the ChiSq Test Ikea is: 1.0
The P-Value of the ChiSq Test Sys is: 0.9604070567886343


- Time vs category would only matter for Okay

#### 2.1.2.2 vs store type

In [134]:
# Cross tabulation between DAY and TIME
CrosstabResult1=pd.crosstab(index=df_storet1['item_type'],columns=df_storet1['time'])
CrosstabResult2=pd.crosstab(index=df_storet2['item_type'],columns=df_storet2['time'])
CrosstabResult3=pd.crosstab(index=df_storet3['item_type'],columns=df_storet3['time'])
CrosstabResult4=pd.crosstab(index=df_storet4['item_type'],columns=df_storet4['time'])
CrosstabResult5=pd.crosstab(index=df_storet5['item_type'],columns=df_storet5['time'])

In [135]:
# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)
ChiSqResult4 = chi2_contingency(CrosstabResult4)
ChiSqResult5 = chi2_contingency(CrosstabResult5)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test supermarket is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test butcher is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test bakery is:', ChiSqResult3[1])
print('The P-Value of the ChiSq Test drugstore is:', ChiSqResult4[1])
print('The P-Value of the ChiSq Test furniture store is:', ChiSqResult5[1])

The P-Value of the ChiSq Test supermarket is: 0.020303910136060987
The P-Value of the ChiSq Test butcher is: 0.8540426379636573
The P-Value of the ChiSq Test bakery is: 0.9604070567886343
The P-Value of the ChiSq Test drugstore is: 1.0
The P-Value of the ChiSq Test furniture store is: 1.0


- Time vs category does not matter for store type (mostly then for supermarkets)

-
## 2.2: CORRELATION 2: item types vs DAY

## 2.2.1.: vs time

In [143]:
# Cross tabulation between category and store type
CrosstabResult_morning =pd.crosstab(index=df_morning['item_type'],columns=df_morning['day'])
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['day'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['day'])

# CrosstabResult_afternoon

In [137]:
# Performing Chi-sq test
ChiSqResult_morning = chi2_contingency(CrosstabResult_morning)
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for mornings:', ChiSqResult_morning[1])
print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])

The P-Value of the ChiSq Test is, for mornings: 0.07356115062220253
The P-Value of the ChiSq Test is, for noons: 0.39419366188146604
The P-Value of the ChiSq Test is, for afternoons: 0.2409068528224968


- At noon, it does not matter what day it is to buy certain categories
  - (E.g., It's Monday noon, could buy fruit, but could also just buy snacks though they have not done this before.)
- In morning & afternoon: there is a correlation between category & day 
  - (E.g., Its Sunday morning, most likely they buy bakery items, but probably no canned foods or condiments.)

## 2.2.2.: vs store

#### 2.1.2.1. vs store name

In [138]:
# Cross tabulation between category and day
CrosstabResult1=pd.crosstab(index=df_store1['item_type'],columns=df_store1['day'])
CrosstabResult2=pd.crosstab(index=df_store2['item_type'],columns=df_store2['day'])
CrosstabResult3=pd.crosstab(index=df_store3['item_type'],columns=df_store3['day'])
CrosstabResult4=pd.crosstab(index=df_store4['item_type'],columns=df_store4['day'])
CrosstabResult5=pd.crosstab(index=df_store5['item_type'],columns=df_store5['day'])
CrosstabResult6=pd.crosstab(index=df_store6['item_type'],columns=df_store6['day'])
CrosstabResult7=pd.crosstab(index=df_store7['item_type'],columns=df_store7['day'])
CrosstabResult8=pd.crosstab(index=df_store8['item_type'],columns=df_store8['day'])
CrosstabResult9=pd.crosstab(index=df_store9['item_type'],columns=df_store9['day'])
CrosstabResult9

day,Monday,Saturday,Sunday,Tuesday,Wednesday
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bread,2,4,8,1,1
chocolate milk,0,1,0,0,0
danish,0,3,4,1,1
fish spread,0,1,0,0,0
meat spread,0,0,1,0,0
pastry,0,0,2,0,0
sandwich,0,1,1,2,1


In [139]:
# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)
ChiSqResult4 = chi2_contingency(CrosstabResult4)
ChiSqResult5 = chi2_contingency(CrosstabResult5)
ChiSqResult6 = chi2_contingency(CrosstabResult6)
ChiSqResult7 = chi2_contingency(CrosstabResult7)
ChiSqResult8 = chi2_contingency(CrosstabResult8)
ChiSqResult9 = chi2_contingency(CrosstabResult9)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test 1 is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test 2 is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test 3 is:', ChiSqResult3[1])
print('The P-Value of the ChiSq Test 4 is:', ChiSqResult4[1])
print('The P-Value of the ChiSq Test 5 is:', ChiSqResult5[1])
print('The P-Value of the ChiSq Test 6 is:', ChiSqResult6[1])
print('The P-Value of the ChiSq Test 7 is:', ChiSqResult7[1])
print('The P-Value of the ChiSq Test 8 is:', ChiSqResult8[1])
print('The P-Value of the ChiSq Test 9 is:', ChiSqResult9[1])

The P-Value of the ChiSq Test 1 is: 0.9916297197883326
The P-Value of the ChiSq Test 2 is: 1.0
The P-Value of the ChiSq Test 3 is: 0.4407906202626398
The P-Value of the ChiSq Test 4 is: 0.48913159160139275
The P-Value of the ChiSq Test 5 is: 0.5905214502087732
The P-Value of the ChiSq Test 6 is: 1.0
The P-Value of the ChiSq Test 7 is: 1.0
The P-Value of the ChiSq Test 8 is: 1.0
The P-Value of the ChiSq Test 9 is: 0.865542455654743


No correlation. It does not matter in what store they are when buying cat X on day X

#### 2.1.2.2 vs store type

In [140]:
# Cross tabulation between DAY and TIME
CrosstabResult1=pd.crosstab(index=df_storet1['item_type'],columns=df_storet1['day'])
CrosstabResult2=pd.crosstab(index=df_storet2['item_type'],columns=df_storet2['day'])
CrosstabResult3=pd.crosstab(index=df_storet3['item_type'],columns=df_storet3['day'])
CrosstabResult4=pd.crosstab(index=df_storet4['item_type'],columns=df_storet4['day'])
CrosstabResult5=pd.crosstab(index=df_storet5['item_type'],columns=df_storet5['day'])
CrosstabResult1

day,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
alcoholic drinks,1,0,0,0,0,0,0
almond milk,0,0,1,0,0,0,0
andalouse sauce,0,0,0,0,1,0,0
apple sauce,1,0,2,0,1,0,0
apples,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...
veal,0,0,1,0,0,0,0
vegetable box,0,2,0,0,0,0,0
vegetable mix,0,0,0,0,3,0,0
yoghurt,1,3,2,1,1,0,0


In [141]:
# Performing Chi-sq test
ChiSqResult1 = chi2_contingency(CrosstabResult1)
ChiSqResult2 = chi2_contingency(CrosstabResult2)
ChiSqResult3 = chi2_contingency(CrosstabResult3)
ChiSqResult4 = chi2_contingency(CrosstabResult4)
ChiSqResult5 = chi2_contingency(CrosstabResult5)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test 1 (supermarkets) is:', ChiSqResult1[1])
print('The P-Value of the ChiSq Test 2 is:', ChiSqResult2[1])
print('The P-Value of the ChiSq Test 3 is:', ChiSqResult3[1])
print('The P-Value of the ChiSq Test 4 is:', ChiSqResult4[1])
print('The P-Value of the ChiSq Test 5 is:', ChiSqResult5[1])

The P-Value of the ChiSq Test 1 (supermarkets) is: 0.0020143937344595624
The P-Value of the ChiSq Test 2 is: 0.626987087442622
The P-Value of the ChiSq Test 3 is: 0.865542455654743
The P-Value of the ChiSq Test 4 is: 1.0
The P-Value of the ChiSq Test 5 is: 1.0


Only for supermarkets it matters what items are are bought on which day

-
## 2.3: CORRELATION 3: item types vs store name

### 2.3.1: vs time

In [142]:
# Cross tabulation between category and store type
CrosstabResult_morning =pd.crosstab(index=df_morning['item_type'],columns=df_morning['store_name'])
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['store_name'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['store_name'])
CrosstabResult_evening =pd.crosstab(index=df_evening['item_type'],columns=df_evening['store_name'])

In [73]:
# Performing Chi-sq test
ChiSqResult_morning = chi2_contingency(CrosstabResult_morning)
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)
ChiSqResult_evening = chi2_contingency(CrosstabResult_evening)


# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for mornings:', ChiSqResult_morning[1])
print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])
print('The P-Value of the ChiSq Test is, for evenings:', ChiSqResult_evening[1])

The P-Value of the ChiSq Test is, for mornings: 1.1084771504656664e-20
The P-Value of the ChiSq Test is, for noons: 0.012904767011066262
The P-Value of the ChiSq Test is, for afternoons: 0.462864009144089
The P-Value of the ChiSq Test is, for evenings: 0.05579996797855117


#### Aggregating afternoon and evening (too few evening values)

In [161]:
# aggregate afternoon and evening
df['time'] = df['time'].replace(['afternoon', 'evening'], 'afternoon')

# select only these data (for time)
df_morning = df[df["time"] == morning]
df_noon = df[df["time"] == noon]
df_afternoon = df[df["time"] == afternoon]

In [164]:
# Cross tabulation between category and store type
CrosstabResult_morning =pd.crosstab(index=df_morning['item_type'],columns=df_morning['store_name'])
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['store_name'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['store_name'])

# CrosstabResult_afternoon

In [167]:
CrosstabResult_afternoon

store_name,Albert Heijn,Brabo,Carrefour,Delhaize,Kruidvat,Okay
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
alcoholic drinks,0,0,0,1,0,0
almond milk,0,0,1,0,0,0
andalouse sauce,0,0,0,0,0,1
apple sauce,0,0,1,1,0,1
apples,0,0,0,1,0,1
...,...,...,...,...,...,...
tonic,0,0,0,0,0,1
vegetable box,2,0,0,0,0,0
vegetable mix,0,0,0,0,0,3
yoghurt,3,0,1,1,0,1


In [165]:
# Performing Chi-sq test
ChiSqResult_morning = chi2_contingency(CrosstabResult_morning)
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for mornings:', ChiSqResult_morning[1])
print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])

The P-Value of the ChiSq Test is, for mornings: 1.1084771504656664e-20
The P-Value of the ChiSq Test is, for noons: 0.012904767011066262
The P-Value of the ChiSq Test is, for afternoons: 0.5135745255228084


#### For specific categories

In [262]:
# select only these data (for time)
df_morning = df_AHnoon[df_AHnoon["time"] == morning]
df_noon = df_AHnoon[df_AHnoon["time"] == noon]
df_afternoon = df_AHnoon[df_AHnoon["time"] == afternoon]

In [261]:
# Cross tabulation between category and store type
CrosstabResult_morning =pd.crosstab(index=df_morning['item_type'],columns=df_morning['store_name'])
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['store_name'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['store_name'])

# # Performing Chi-sq test
ChiSqResult_morning = chi2_contingency(CrosstabResult_morning)
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for mornings:', ChiSqResult_morning[1])
print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])

The P-Value of the ChiSq Test is, for mornings: 4.364552937318244e-17
The P-Value of the ChiSq Test is, for noons: 0.3208471988621342
The P-Value of the ChiSq Test is, for afternoons: 0.2750704809241535


### 2.3.2: vs day

In [44]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['store_name'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['store_name'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['store_name'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['store_name'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['store_name'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['store_name'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['store_name'], dropna=False)
CrosstabResult_Sunday

store_name,Albert Heijn,Delhaize,Sys,Versavel Poelman
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
beef,0,2,0,0
bell peppers,0,1,0,0
blueberries,0,1,0,0
bread,0,0,8,0
broccoli,0,1,0,0
carrots,0,1,0,0
cauliflower,0,1,0,0
cereal,0,1,0,0
charcuterie,0,4,0,3
cheese,0,5,0,0


In [45]:
# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 0.013901439708649408
The P-Value of the ChiSq Test is, for Tuesdays: 0.02501696121635147
The P-Value of the ChiSq Test is, for Wednesdays: 0.1090641579497723
The P-Value of the ChiSq Test is, for Thursdays: 0.2937140173009805
The P-Value of the ChiSq Test is, for Fridays: 0.4922195887332244
The P-Value of the ChiSq Test is, for Saturdays: 0.013851927042001286
The P-Value of the ChiSq Test is, for Sundays: 0.0003349433115878115


-
## 2.4: CORRELATION 4: item type vs store TYPE

### 2.4.1: vs time

In [171]:
# aggregate afternoon and evening
df['time'] = df['time'].replace(['afternoon', 'evening'], 'afternoon')

# Cross tabulation between category and store type
CrosstabResult_morning =pd.crosstab(index=df_morning['item_type'],columns=df_morning['store_type'])
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['store_type'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['store_type'])

In [169]:
CrosstabResult_afternoon

store_type,butcher,drugstore,supermarket
item_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
alcoholic drinks,0,0,1
almond milk,0,0,1
andalouse sauce,0,0,1
apple sauce,0,0,3
apples,0,0,2
...,...,...,...
tonic,0,0,1
vegetable box,0,0,2
vegetable mix,0,0,3
yoghurt,0,0,6


In [172]:
# Performing Chi-sq test
ChiSqResult_morning = chi2_contingency(CrosstabResult_morning)
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for mornings:', ChiSqResult_morning[1])
print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])

The P-Value of the ChiSq Test is, for mornings: 7.350991261367724e-15
The P-Value of the ChiSq Test is, for noons: 0.05038045108893573
The P-Value of the ChiSq Test is, for afternoons: 0.9916088795681783


#### For specific categories

In [263]:
# select only these data (for time)
df_morning = df_AHnoon[df_AHnoon["time"] == morning]
df_noon = df_AHnoon[df_AHnoon["time"] == noon]
df_afternoon = df_AHnoon[df_AHnoon["time"] == afternoon]

In [264]:
# Cross tabulation between category and store type
CrosstabResult_morning =pd.crosstab(index=df_morning['item_type'],columns=df_morning['store_type'])
CrosstabResult_noon =pd.crosstab(index=df_noon['item_type'],columns=df_noon['store_type'])
CrosstabResult_afternoon =pd.crosstab(index=df_afternoon['item_type'],columns=df_afternoon['store_type'])

# Performing Chi-sq test
ChiSqResult_morning = chi2_contingency(CrosstabResult_morning)
ChiSqResult_noon = chi2_contingency(CrosstabResult_noon)
ChiSqResult_afternoon = chi2_contingency(CrosstabResult_afternoon)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for mornings:', ChiSqResult_morning[1])
print('The P-Value of the ChiSq Test is, for noons:', ChiSqResult_noon[1])
print('The P-Value of the ChiSq Test is, for afternoons:', ChiSqResult_afternoon[1])

The P-Value of the ChiSq Test is, for mornings: 5.845289203614577e-10
The P-Value of the ChiSq Test is, for noons: 0.3208471988621342
The P-Value of the ChiSq Test is, for afternoons: 0.7057584769449716


### 2.4.2: vs day

In [49]:
# Cross tabulation between category and store type
CrosstabResult_Monday =pd.crosstab(index=df_Monday['item_type'],columns=df_Monday['store_type'], dropna=False)
CrosstabResult_Tuesday =pd.crosstab(index=df_Tuesday['item_type'],columns=df_Tuesday['store_type'], dropna=False)
CrosstabResult_Wednesday =pd.crosstab(index=df_Wednesday['item_type'],columns=df_Wednesday['store_type'], dropna=False)
CrosstabResult_Thursday =pd.crosstab(index=df_Thursday['item_type'],columns=df_Thursday['store_type'], dropna=False)
CrosstabResult_Friday =pd.crosstab(index=df_Friday['item_type'],columns=df_Friday['store_type'], dropna=False)
CrosstabResult_Saturday =pd.crosstab(index=df_Saturday['item_type'],columns=df_Saturday['store_type'], dropna=False)
CrosstabResult_Sunday =pd.crosstab(index=df_Sunday['item_type'],columns=df_Sunday['store_type'], dropna=False)
# CrosstabResult_Sunday

In [50]:
# Performing Chi-sq test
Monday = chi2_contingency(CrosstabResult_Monday)
Tuesday = chi2_contingency(CrosstabResult_Tuesday)
Wednesday = chi2_contingency(CrosstabResult_Wednesday)
Thursday = chi2_contingency(CrosstabResult_Thursday)
Friday = chi2_contingency(CrosstabResult_Friday)
Saturday = chi2_contingency(CrosstabResult_Saturday)
Sunday = chi2_contingency(CrosstabResult_Sunday)

# P-Value is the Probability of H0 being True
# If P-Value > 0.05 then only we Accept the assumption(H0)

print('The P-Value of the ChiSq Test is, for Mondays:', Monday[1])
print('The P-Value of the ChiSq Test is, for Tuesdays:', Tuesday[1])
print('The P-Value of the ChiSq Test is, for Wednesdays:', Wednesday[1])
print('The P-Value of the ChiSq Test is, for Thursdays:', Thursday[1])
print('The P-Value of the ChiSq Test is, for Fridays:', Friday[1])
print('The P-Value of the ChiSq Test is, for Saturdays:', Saturday[1])
print('The P-Value of the ChiSq Test is, for Sundays:', Sunday[1])

The P-Value of the ChiSq Test is, for Mondays: 0.013901439708649498
The P-Value of the ChiSq Test is, for Tuesdays: 0.025016961216351507
The P-Value of the ChiSq Test is, for Wednesdays: 0.1090641579497725
The P-Value of the ChiSq Test is, for Thursdays: 0.9686462864551427
The P-Value of the ChiSq Test is, for Fridays: 0.8655512299356921
The P-Value of the ChiSq Test is, for Saturdays: 0.004323383270839815
The P-Value of the ChiSq Test is, for Sundays: 0.017341246964033406


-
# New dataframes per store name & time

In [311]:
# to select only one
store1 = 'Albert Heijn'
store2 = 'Sys'
store3 = 'Okay'
store4 = 'Delhaize'
store5 = 'Versavel Poelman'
store6 = 'Kruidvat'
store7 = 'Brabo'
store8 = 'Ikea'
store9 = 'Carrefour'

# select only these data for the df
df_AH = df[df['store_name'] == store1]
df_SYS = df[df['store_name'] == store2]
df_OKAY = df[df['store_name'] == store3]
df_DELHAIZE = df[df['store_name'] == store4]
df_VERSAVEL = df[df['store_name'] == store5]
df_KRUIDVAT = df[df['store_name'] == store6]
df_BRABO = df[df['store_name'] == store7]
df_IKEA = df[df['store_name'] == store8]
df_CARREFOUR = df[df['store_name'] == store9]

## Time dataframes for ALBERT HEIJN

In [319]:
# select only these time data for the df_AH
df_mornings = df_AH[df_AH['time'] == 'morning']
df_noons = df_AH[df_AH['time'] == 'noon']
df_afternoons = df_AH[df_AH['time'] == 'afternoon']

In [320]:
df_morning = df_mornings.groupby(['category'])['order_ID'].nunique()
df_morning = pd.DataFrame (df_morning)
df_morning = df_morning.reset_index()
df_morning = df_morning.loc[~(df_morning==0).any(axis=1)]
df_morning = df_morning.rename(columns={"order_ID": "weight"})

df_noon = df_noons.groupby(['category'])['order_ID'].nunique()
df_noon = pd.DataFrame (df_noon)
df_noon = df_noon.reset_index()
df_noon = df_noon.loc[~(df_noon==0).any(axis=1)]
df_noon = df_noon.rename(columns={"order_ID": "weight"})

df_afternoon = df_afternoons.groupby(['category'])['order_ID'].nunique()
df_afternoon = pd.DataFrame (df_afternoon)
df_afternoon = df_afternoon.reset_index()
df_afternoon = df_afternoon.loc[~(df_afternoon==0).any(axis=1)]
df_afternoon = df_afternoon.rename(columns={"order_ID": "weight"})

In [324]:
df_noon

Unnamed: 0,category,weight
0,bakery,1
1,fruit & vegetables,6
2,"meat, fish & veggie",1
3,snacks,1


In [325]:
df_morning.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_AHmornCAT.csv", index = None, header=True)
df_noon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_AHnoonCAT.csv", index = None, header=True)
df_afternoon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_AHafterCAT.csv", index = None, header=True)

## Time dataframes for SYS

In [326]:
# select only these time data for the df_AH
df_mornings = df_SYS[df_SYS['time'] == 'morning']
df_noons = df_SYS[df_SYS['time'] == 'noon']
df_afternoons = df_SYS[df_SYS['time'] == 'afternoon']

In [327]:
df_morning = df_mornings.groupby(['category'])['order_ID'].nunique()
df_morning = pd.DataFrame (df_morning)
df_morning = df_morning.reset_index()
df_morning = df_morning.loc[~(df_morning==0).any(axis=1)]
df_morning = df_morning.rename(columns={"order_ID": "weight"})

df_noon = df_noons.groupby(['category'])['order_ID'].nunique()
df_noon = pd.DataFrame (df_noon)
df_noon = df_noon.reset_index()
df_noon = df_noon.loc[~(df_noon==0).any(axis=1)]
df_noon = df_noon.rename(columns={"order_ID": "weight"})

df_afternoon = df_afternoons.groupby(['category'])['order_ID'].nunique()
df_afternoon = pd.DataFrame (df_afternoon)
df_afternoon = df_afternoon.reset_index()
df_afternoon = df_afternoon.loc[~(df_afternoon==0).any(axis=1)]
df_afternoon = df_afternoon.rename(columns={"order_ID": "weight"})

In [328]:
df_noon

Unnamed: 0,category,weight
0,bakery,1


In [329]:
df_morning.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_SYSmornCAT.csv", index = None, header=True)
df_noon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_SYSnoonCAT.csv", index = None, header=True)
df_afternoon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_SYSafterCAT.csv", index = None, header=True)

## Time dataframes for OKAY

In [334]:
# select only these time data for the df_AH
df_mornings = df_OKAY[df_OKAY['time'] == 'morning']
df_noons = df_OKAY[df_OKAY['time'] == 'noon']
df_afternoons = df_OKAY[df_OKAY['time'] == 'afternoon']

In [335]:
df_morning = df_mornings.groupby(['category'])['order_ID'].nunique()
df_morning = pd.DataFrame (df_morning)
df_morning = df_morning.reset_index()
df_morning = df_morning.loc[~(df_morning==0).any(axis=1)]
df_morning = df_morning.rename(columns={"order_ID": "weight"})

df_noon = df_noons.groupby(['category'])['order_ID'].nunique()
df_noon = pd.DataFrame (df_noon)
df_noon = df_noon.reset_index()
df_noon = df_noon.loc[~(df_noon==0).any(axis=1)]
df_noon = df_noon.rename(columns={"order_ID": "weight"})

df_afternoon = df_afternoons.groupby(['category'])['order_ID'].nunique()
df_afternoon = pd.DataFrame (df_afternoon)
df_afternoon = df_afternoon.reset_index()
df_afternoon = df_afternoon.loc[~(df_afternoon==0).any(axis=1)]
df_afternoon = df_afternoon.rename(columns={"order_ID": "weight"})

In [336]:
df_noon

Unnamed: 0,category,weight
0,breakfast & spreads,1
1,canned foods,1
2,dairy & plant based,1


In [337]:
df_morning.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_OKAYmornCAT.csv", index = None, header=True)
df_noon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_OKAYnoonCAT.csv", index = None, header=True)
df_afternoon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_OKAYafterCAT.csv", index = None, header=True)

## Time dataframes for DELHAIZE

In [338]:
# select only these time data for the df_AH
df_mornings = df_DELHAIZE[df_DELHAIZE['time'] == 'morning']
df_noons = df_DELHAIZE[df_DELHAIZE['time'] == 'noon']
df_afternoons = df_DELHAIZE[df_DELHAIZE['time'] == 'afternoon']

In [339]:
df_morning = df_mornings.groupby(['category'])['order_ID'].nunique()
df_morning = pd.DataFrame (df_morning)
df_morning = df_morning.reset_index()
df_morning = df_morning.loc[~(df_morning==0).any(axis=1)]
df_morning = df_morning.rename(columns={"order_ID": "weight"})

df_noon = df_noons.groupby(['category'])['order_ID'].nunique()
df_noon = pd.DataFrame (df_noon)
df_noon = df_noon.reset_index()
df_noon = df_noon.loc[~(df_noon==0).any(axis=1)]
df_noon = df_noon.rename(columns={"order_ID": "weight"})

df_afternoon = df_afternoons.groupby(['category'])['order_ID'].nunique()
df_afternoon = pd.DataFrame (df_afternoon)
df_afternoon = df_afternoon.reset_index()
df_afternoon = df_afternoon.loc[~(df_afternoon==0).any(axis=1)]
df_afternoon = df_afternoon.rename(columns={"order_ID": "weight"})

In [343]:
df_morning

Unnamed: 0,category,weight
0,bakery,1
1,beverages,2
2,breakfast & spreads,1
3,condiments,1
4,dairy & plant based,2
5,frozen foods,1
6,fruit & vegetables,2
7,"meat, fish & veggie",2
8,snacks,2


In [341]:
df_morning.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_DELHmornCAT.csv", index = None, header=True)
df_noon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_DELHnoonCAT.csv", index = None, header=True)
df_afternoon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_DELHafterCAT.csv", index = None, header=True)

## Time dataframes for VERSAVEL POELMAN

In [344]:
# select only these time data for the df_AH
df_mornings = df_VERSAVEL[df_VERSAVEL['time'] == 'morning']
df_noons = df_VERSAVEL[df_VERSAVEL['time'] == 'noon']
df_afternoons = df_VERSAVEL[df_VERSAVEL['time'] == 'afternoon']

In [345]:
df_morning = df_mornings.groupby(['category'])['order_ID'].nunique()
df_morning = pd.DataFrame (df_morning)
df_morning = df_morning.reset_index()
df_morning = df_morning.loc[~(df_morning==0).any(axis=1)]
df_morning = df_morning.rename(columns={"order_ID": "weight"})

df_noon = df_noons.groupby(['category'])['order_ID'].nunique()
df_noon = pd.DataFrame (df_noon)
df_noon = df_noon.reset_index()
df_noon = df_noon.loc[~(df_noon==0).any(axis=1)]
df_noon = df_noon.rename(columns={"order_ID": "weight"})

df_afternoon = df_afternoons.groupby(['category'])['order_ID'].nunique()
df_afternoon = pd.DataFrame (df_afternoon)
df_afternoon = df_afternoon.reset_index()
df_afternoon = df_afternoon.loc[~(df_afternoon==0).any(axis=1)]
df_afternoon = df_afternoon.rename(columns={"order_ID": "weight"})

In [346]:
df_morning

Unnamed: 0,category,weight
0,dairy & plant based,1
1,"meat, fish & veggie",3


In [347]:
df_morning.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_VERSmornCAT.csv", index = None, header=True)
df_noon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_VERSnoonCAT.csv", index = None, header=True)
df_afternoon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_VERSafterCAT.csv", index = None, header=True)

## Time dataframes for KRUIDVAT

In [348]:
# select only these time data for the df_AH
df_mornings = df_KRUIDVAT[df_KRUIDVAT['time'] == 'morning']
df_noons = df_KRUIDVAT[df_KRUIDVAT['time'] == 'noon']
df_afternoons = df_KRUIDVAT[df_KRUIDVAT['time'] == 'afternoon']

In [349]:
df_morning = df_mornings.groupby(['category'])['order_ID'].nunique()
df_morning = pd.DataFrame (df_morning)
df_morning = df_morning.reset_index()
df_morning = df_morning.loc[~(df_morning==0).any(axis=1)]
df_morning = df_morning.rename(columns={"order_ID": "weight"})

df_noon = df_noons.groupby(['category'])['order_ID'].nunique()
df_noon = pd.DataFrame (df_noon)
df_noon = df_noon.reset_index()
df_noon = df_noon.loc[~(df_noon==0).any(axis=1)]
df_noon = df_noon.rename(columns={"order_ID": "weight"})

df_afternoon = df_afternoons.groupby(['category'])['order_ID'].nunique()
df_afternoon = pd.DataFrame (df_afternoon)
df_afternoon = df_afternoon.reset_index()
df_afternoon = df_afternoon.loc[~(df_afternoon==0).any(axis=1)]
df_afternoon = df_afternoon.rename(columns={"order_ID": "weight"})

In [351]:
df_afternoon

Unnamed: 0,category,weight
0,beverages,1
1,care & hyygiene,1
2,snacks,1


In [352]:
df_morning.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_KRUIDmornCAT.csv", index = None, header=True)
df_noon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_KRUIDnoonCAT.csv", index = None, header=True)
df_afternoon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_KRUIDafterCAT.csv", index = None, header=True)

## Time dataframes for BRABO

In [353]:
# select only these time data for the df_AH
df_mornings = df_BRABO[df_BRABO['time'] == 'morning']
df_noons = df_BRABO[df_BRABO['time'] == 'noon']
df_afternoons = df_BRABO[df_BRABO['time'] == 'afternoon']

In [354]:
df_morning = df_mornings.groupby(['category'])['order_ID'].nunique()
df_morning = pd.DataFrame (df_morning)
df_morning = df_morning.reset_index()
df_morning = df_morning.loc[~(df_morning==0).any(axis=1)]
df_morning = df_morning.rename(columns={"order_ID": "weight"})

df_noon = df_noons.groupby(['category'])['order_ID'].nunique()
df_noon = pd.DataFrame (df_noon)
df_noon = df_noon.reset_index()
df_noon = df_noon.loc[~(df_noon==0).any(axis=1)]
df_noon = df_noon.rename(columns={"order_ID": "weight"})

df_afternoon = df_afternoons.groupby(['category'])['order_ID'].nunique()
df_afternoon = pd.DataFrame (df_afternoon)
df_afternoon = df_afternoon.reset_index()
df_afternoon = df_afternoon.loc[~(df_afternoon==0).any(axis=1)]
df_afternoon = df_afternoon.rename(columns={"order_ID": "weight"})

In [355]:
df_afternoon

Unnamed: 0,category,weight
0,"meat, fish & veggie",1


In [356]:
df_morning.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_BRAmornCAT.csv", index = None, header=True)
df_noon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_BRAnoonCAT.csv", index = None, header=True)
df_afternoon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_BRAafterCAT.csv", index = None, header=True)

## Time dataframes for IKEA

In [357]:
# select only these time data for the df_AH
df_mornings = df_IKEA[df_IKEA['time'] == 'morning']
df_noons = df_IKEA[df_IKEA['time'] == 'noon']
df_afternoons = df_IKEA[df_IKEA['time'] == 'afternoon']

In [358]:
df_morning = df_mornings.groupby(['category'])['order_ID'].nunique()
df_morning = pd.DataFrame (df_morning)
df_morning = df_morning.reset_index()
df_morning = df_morning.loc[~(df_morning==0).any(axis=1)]
df_morning = df_morning.rename(columns={"order_ID": "weight"})

df_noon = df_noons.groupby(['category'])['order_ID'].nunique()
df_noon = pd.DataFrame (df_noon)
df_noon = df_noon.reset_index()
df_noon = df_noon.loc[~(df_noon==0).any(axis=1)]
df_noon = df_noon.rename(columns={"order_ID": "weight"})

df_afternoon = df_afternoons.groupby(['category'])['order_ID'].nunique()
df_afternoon = pd.DataFrame (df_afternoon)
df_afternoon = df_afternoon.reset_index()
df_afternoon = df_afternoon.loc[~(df_afternoon==0).any(axis=1)]
df_afternoon = df_afternoon.rename(columns={"order_ID": "weight"})

In [362]:
df_morning

Unnamed: 0,category,weight
0,beverages,1


In [363]:
df_morning.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_IKEAmornCAT.csv", index = None, header=True)
df_noon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_IKEAnoonCAT.csv", index = None, header=True)
df_afternoon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_IKEAafterCAT.csv", index = None, header=True)

## Time dataframes for CARREFOUR

In [364]:
# select only these time data for the df_AH
df_mornings = df_CARREFOUR[df_CARREFOUR['time'] == 'morning']
df_noons = df_CARREFOUR[df_CARREFOUR['time'] == 'noon']
df_afternoons = df_CARREFOUR[df_CARREFOUR['time'] == 'afternoon']

In [365]:
df_morning = df_mornings.groupby(['category'])['order_ID'].nunique()
df_morning = pd.DataFrame (df_morning)
df_morning = df_morning.reset_index()
df_morning = df_morning.loc[~(df_morning==0).any(axis=1)]
df_morning = df_morning.rename(columns={"order_ID": "weight"})

df_noon = df_noons.groupby(['category'])['order_ID'].nunique()
df_noon = pd.DataFrame (df_noon)
df_noon = df_noon.reset_index()
df_noon = df_noon.loc[~(df_noon==0).any(axis=1)]
df_noon = df_noon.rename(columns={"order_ID": "weight"})

df_afternoon = df_afternoons.groupby(['category'])['order_ID'].nunique()
df_afternoon = pd.DataFrame (df_afternoon)
df_afternoon = df_afternoon.reset_index()
df_afternoon = df_afternoon.loc[~(df_afternoon==0).any(axis=1)]
df_afternoon = df_afternoon.rename(columns={"order_ID": "weight"})

In [368]:
df_afternoon

Unnamed: 0,category,weight
0,breakfast & spreads,1
1,canned foods,1
2,condiments,1
3,dairy & plant based,1
4,fruit & vegetables,1
5,"meat, fish & veggie",1
6,snacks,1


In [369]:
df_morning.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_CATmornCAT.csv", index = None, header=True)
df_noon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_CATnoonCAT.csv", index = None, header=True)
df_afternoon.to_csv (r"C:\Users\20204113\OneDrive - TU Eindhoven\2_Research\1_Groceries\DATA\9th week - narrative (3rd attempt)\HH2\df\df_HH2_CARafterCAT.csv", index = None, header=True)