In [30]:
# pip install mlxtend

In [31]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

In [32]:
df = pd.read_csv('GroceryStoreDataSet.csv', names = ['products'], sep = ',')
df.head(10)

Unnamed: 0,products
0,"MILK,BREAD,BISCUIT"
1,"BREAD,MILK,BISCUIT,CORNFLAKES"
2,"BREAD,TEA,BOURNVITA"
3,"JAM,MAGGI,BREAD,MILK"
4,"MAGGI,TEA,BISCUIT"
5,"BREAD,TEA,BOURNVITA"
6,"MAGGI,TEA,CORNFLAKES"
7,"MAGGI,BREAD,TEA,BISCUIT"
8,"JAM,MAGGI,BREAD,TEA"
9,"BREAD,MILK"


In [33]:
df.shape

(20, 1)


Let's split the products and create a list called by 'data',

In [34]:
data = list(df["products"].apply(lambda x:x.split(",") ))
data

[['MILK', 'BREAD', 'BISCUIT'],
 ['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGER', 'BISCUIT'],
 ['COFFEE', 'SUGER', 'CORNFLAKES'],
 ['BREAD', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

Apriori Algorithm and One-Hot Encoding

Apriori's algorithm transforms True/False or 1/0.
Using TransactionEncoder, we convert the list to a One-Hot Encoded Boolean list.
Products that customers bought or did not buy during shopping will now be represented by values 1 and 0.

In [35]:
#Let's transform the list, with one-hot encoding
from mlxtend.preprocessing import TransactionEncoder
a = TransactionEncoder()
a_data = a.fit(data).transform(data)
df = pd.DataFrame(a_data,columns=a.columns_)
df = df.replace(False,0)
df

Unnamed: 0,BISCUIT,BOURNVITA,BREAD,COCK,COFFEE,CORNFLAKES,JAM,MAGGI,MILK,SUGER,TEA
0,True,0,True,0,0,0,0,0,True,0,0
1,True,0,True,0,0,True,0,0,True,0,0
2,0,True,True,0,0,0,0,0,0,0,True
3,0,0,True,0,0,0,True,True,True,0,0
4,True,0,0,0,0,0,0,True,0,0,True
5,0,True,True,0,0,0,0,0,0,0,True
6,0,0,0,0,0,True,0,True,0,0,True
7,True,0,True,0,0,0,0,True,0,0,True
8,0,0,True,0,0,0,True,True,0,0,True
9,0,0,True,0,0,0,0,0,True,0,0


Applying Apriori and Resulting

The next step is to create the Apriori Model. We can change all the parameters in the Apriori Model in the mlxtend package.
I will try to use minimum support parameters for this modeling.
For this, I set a min_support value with a threshold value of 20% and printed them on the screen as well.

In [36]:
# #set a threshold value for the support value and calculate the support value.
# df = apriori(df, min_support = 0.2, use_colnames = True, verbose = 1)
# # df

In [37]:
# from mlxtend.frequent_patterns import apriori

# # Assuming you have already read the CSV file and created the DataFrame 'df'

# # Convert the DataFrame to a format suitable for Apriori (binary values)
# df_encoded = df.applymap(lambda x: 1 if pd.notna(x) else 0)

# # Set a threshold value for the support
# min_support_threshold = 0.2

# # Use Apriori algorithm to mine frequent itemsets
# frequent_itemsets = apriori(df_encoded, min_support=min_support_threshold, use_colnames=True, verbose=1)

# # Display the frequent itemsets along with their support values
# frequent_itemsets


In [38]:
from mlxtend.frequent_patterns import apriori

# Assuming you have already read the CSV file and created the DataFrame 'df'

# Convert the DataFrame to a boolean format suitable for Apriori
df_boolean = df.notna()

# Set a threshold value for the support
min_support_threshold = 0.2

# Use Apriori algorithm to mine frequent itemsets
frequent_itemsets = apriori(df_boolean, min_support=min_support_threshold, use_colnames=True, verbose=1)

# Display the frequent itemsets along with their support values
frequent_itemsets


Processing 110 combinations | Sampling itemset size 2Processing 495 combinations | Sampling itemset size 3Processing 1320 combinations | Sampling itemset size 4Processing 2310 combinations | Sampling itemset size 5Processing 2772 combinations | Sampling itemset size 6Processing 2310 combinations | Sampling itemset size 7Processing 1320 combinations | Sampling itemset size 8Processing 495 combinations | Sampling itemset size 9Processing 110 combinations | Sampling itemset size 10Processing 11 combinations | Sampling itemset size 11


Unnamed: 0,support,itemsets
0,1.0,(BISCUIT)
1,1.0,(BOURNVITA)
2,1.0,(BREAD)
3,1.0,(COCK)
4,1.0,(COFFEE)
...,...,...
2042,1.0,"(COFFEE, BOURNVITA, JAM, MILK, TEA, MAGGI, SUG..."
2043,1.0,"(COFFEE, BOURNVITA, JAM, MILK, TEA, MAGGI, SUG..."
2044,1.0,"(COFFEE, JAM, MILK, TEA, MAGGI, SUGER, COCK, B..."
2045,1.0,"(COFFEE, BOURNVITA, JAM, MILK, TEA, MAGGI, SUG..."


I chose the 60% minimum confidence value. In other words, when product X is purchased, we can say that the purchase of product Y is 60% or more.

In [39]:
# #Let's view our interpretation values using the Associan rule function.
# df_ar = association_rules(df, metric = "confidence", min_threshold = 0.6)
# # df_ar


In [40]:
# Generate association rules from the frequent itemsets
association_rules_df = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# Display the generated association rules
association_rules_df


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(BOURNVITA),(BISCUIT),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
1,(BISCUIT),(BOURNVITA),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
2,(BREAD),(BISCUIT),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
3,(BISCUIT),(BREAD),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
4,(COCK),(BISCUIT),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
...,...,...,...,...,...,...,...,...,...,...
173047,(SUGER),"(BOURNVITA, CORNFLAKES, JAM, MILK, TEA, MAGGI,...",1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
173048,(COCK),"(BOURNVITA, CORNFLAKES, JAM, MILK, TEA, MAGGI,...",1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
173049,(BREAD),"(BOURNVITA, CORNFLAKES, JAM, MILK, TEA, MAGGI,...",1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
173050,(COFFEE),"(BOURNVITA, CORNFLAKES, JAM, MILK, TEA, MAGGI,...",1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
