In [2]:
!pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.1-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.1-py3-none-any.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.4 MB 1.5 MB/s eta 0:00:01
   -------------- ------------------------- 0.5/1.4 MB 4.7 MB/s eta 0:00:01
   ---------------------------------------  1.4/1.4 MB 10.2 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 9.2 MB/s eta 0:00:00
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.1


# Data Preparation
- Load necessary packages
- Load the data and prepare it for further analysis

In [3]:
# Import 'pandas' for working with data frames
import pandas as pd

# Import 'matplotlib.pyplot' and 'seaborn' for plotting and data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Import 'apriori' and 'association_rules' from 'mlxtend'
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [4]:
import warnings
# Suppress every warning for the rest of the session so library warnings do
# not clutter the cell outputs.
# NOTE(review): this also hides deprecation warnings from pandas/mlxtend -
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [5]:
# Read the supermarket transactions from disk.
# Note: the file must already be a binary matrix (one row per transaction,
# one 0/1 column per item), because that is the layout used further below.
df1 = pd.read_csv(filepath_or_buffer = 'supermarket_binarymat.csv')

# Preview the first five rows
df1.head(5)

Unnamed: 0,transID,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,berries,...,UHT-milk,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Dimensions of the loaded data as (number of rows, number of columns)
df1.shape

(9835, 170)

In [8]:
# Check the data type of every column; the item columns should all be
# integer 0/1 indicators before they are converted to Boolean later on
df1.dtypes

transID             int64
abrasive cleaner    int64
artif. sweetener    int64
baby cosmetics      int64
baby food           int64
                    ...  
white bread         int64
white wine          int64
whole milk          int64
yogurt              int64
zwieback            int64
Length: 170, dtype: object

In [9]:
# Drop the transaction ID as it will not be useful in this analysis.
# The result is assigned back (instead of using inplace = True) and
# errors = 'ignore' is passed so that the cell can be re-run safely: running it
# a second time no longer raises a KeyError for the already-removed column.
df1 = df1.drop(columns = 'transID', errors = 'ignore')

In [None]:
# Alternative way of removing the column, equivalent to the cell above:
# df1.drop(columns = ['transID'], inplace = True)

# EDA
It is almost always useful to examine the most frequent items in the data set. Here the 25 most frequent items are shown together with their purchase counts.

In [7]:
# Obtain the purchase count of each item: the column-wise sum gives the number
# of transactions that contain the item
# (assumes the 'transID' column has already been dropped from df1)
S = df1.sum(axis = 0)

# Sort the counts in descending order and keep the 25 best-selling items
Top = S.sort_values(ascending = False)[:25]
print(Top)

# Also draw the counts as a horizontal bar chart. The order is reversed because
# barh places its first entry at the bottom, and the best seller should be on top.
fig, ax = plt.subplots(figsize = (8, 8))
ax.barh(Top.index[::-1], Top.values[::-1])
ax.set(title = 'Top 25 most frequent items',
       xlabel = 'Number of transactions',
       ylabel = 'Item')
plt.show()

whole milk               2513
other vegetables         1903
rolls/buns               1809
soda                     1715
yogurt                   1372
bottled water            1087
root vegetables          1072
tropical fruit           1032
shopping bags             969
sausage                   924
pastry                    875
citrus fruit              814
bottled beer              792
newspapers                785
canned beer               764
pip fruit                 744
fruit/vegetable juice     711
whipped/sour cream        705
brown bread               638
domestic eggs             624
frankfurter               580
margarine                 576
coffee                    571
pork                      567
butter                    545
dtype: int64


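- **Shallow copy** (`deep = False`): creates a new DataFrame object that shares the original's underlying data instead of duplicating it
- **Deep copy** (`deep = True`): duplicates the data as well, so the two DataFrames are completely independent of each other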

In [15]:
# Build a Boolean version of the data for the frequent-itemset search.
# Note: the Boolean conversion is not strictly required, but it is
# computationally faster.
# A deep copy of df1 is taken first so that df2 is fully independent of df1,
# then every column is cast to bool in the same chained expression.
df2 = df1.copy(deep = True).astype(bool)

# Preview the first five rows of the converted data
df2.head(5)

Unnamed: 0,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,berries,beverages,...,UHT-milk,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [16]:
# Check the data types again to confirm that every column of the converted
# frame is now Boolean
df2.dtypes

abrasive cleaner    bool
artif. sweetener    bool
baby cosmetics      bool
baby food           bool
bags                bool
                    ... 
white bread         bool
white wine          bool
whole milk          bool
yogurt              bool
zwieback            bool
Length: 169, dtype: object

# Building Association Rules

You will need to call two functions:
- The first will identify *frequent itemsets*, in other words, sets that meet a specified support threshold
- The second will build *association rules* that meet a specified lift (alternatively, confidence) threshold

Note that there may be no rules that meet the specified threshold. In this case, the threshold value will need to be reduced. 

In [11]:
# Find the frequent itemsets: combinations of items that occur in at least 1%
# of all transactions. use_colnames = True reports the item names rather than
# the column positions.
frequent_itemsets = apriori(
    df2,
    use_colnames = True,
    min_support = 0.01,
)

In [15]:
# Derive association rules between the frequent itemsets, evaluated by lift.
# No min_threshold is passed here, so the library's default threshold applies
# to the lift values.
rules = association_rules(df = frequent_itemsets, metric = 'lift')

In [16]:
# Display the generated rules; for a long frame the notebook shows only the
# first and last rows and elides the middle
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(beef),(other vegetables),0.052466,0.193493,0.019725,0.375969,1.943066,0.009574,1.292416,0.512224
1,(other vegetables),(beef),0.193493,0.052466,0.019725,0.101944,1.943066,0.009574,1.055095,0.601792
2,(beef),(rolls/buns),0.052466,0.183935,0.013625,0.259690,1.411858,0.003975,1.102329,0.307866
3,(rolls/buns),(beef),0.183935,0.052466,0.013625,0.074074,1.411858,0.003975,1.023337,0.357463
4,(beef),(root vegetables),0.052466,0.108998,0.017387,0.331395,3.040367,0.011668,1.332628,0.708251
...,...,...,...,...,...,...,...,...,...,...
611,"(yogurt, whole milk)",(whipped/sour cream),0.056024,0.071683,0.010880,0.194192,2.709053,0.006864,1.152033,0.668309
612,"(whipped/sour cream, whole milk)",(yogurt),0.032232,0.139502,0.010880,0.337539,2.419607,0.006383,1.298943,0.606250
613,(yogurt),"(whipped/sour cream, whole milk)",0.139502,0.032232,0.010880,0.077988,2.419607,0.006383,1.049627,0.681826
614,(whipped/sour cream),"(yogurt, whole milk)",0.071683,0.056024,0.010880,0.151773,2.709053,0.006864,1.112881,0.679582


In [10]:
# Obtain the frequent itemsets with the 'apriori' method, keeping every itemset
# whose support is at least 0.01 and labelling items by their column names.
# NOTE(review): this is the same call, with the same arguments, as the earlier
# apriori cell, so it rebuilds an identical table.
frequent_itemsets = apriori(
    df2,
    use_colnames = True,
    min_support = 0.01,
)

In [11]:
# Generate association rules from the frequent itemsets with the default
# settings of 'association_rules'.
# Note: the default metric is 'confidence' and the default threshold is 0.8,
# so the result can be empty when no rule is that strong.
rules = association_rules(df = frequent_itemsets)

# Preview the first five rules (if any)
rules.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


In [12]:
# Generate association rules from the frequent itemsets, this time keeping
# only the rules whose 'lift' is at least 1.5
rules = association_rules(
    frequent_itemsets,
    min_threshold = 1.5,
    metric = 'lift',
)

# Preview the first five rules
rules.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(other vegetables),(beef),0.193493,0.052466,0.019725,0.101944,1.943066,0.009574,1.055095,0.601792
1,(beef),(other vegetables),0.052466,0.193493,0.019725,0.375969,1.943066,0.009574,1.292416,0.512224
2,(root vegetables),(beef),0.108998,0.052466,0.017387,0.159515,3.040367,0.011668,1.127366,0.753189
3,(beef),(root vegetables),0.052466,0.108998,0.017387,0.331395,3.040367,0.011668,1.332628,0.708251
4,(whole milk),(beef),0.255516,0.052466,0.021251,0.083168,1.58518,0.007845,1.033487,0.495856


# Subsetting Rules Based on Criteria
We can zero in on rules that meet user-specified criteria, such as lift, confidence or support.

In [13]:
# Keep only the rules that are both strong (lift of at least 2) and reliable
# (confidence of at least 0.55)
rules.loc[lambda r: r['lift'].ge(2) & r['confidence'].ge(0.55)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
217,"(butter, other vegetables)",(whole milk),0.020031,0.255516,0.01149,0.573604,2.244885,0.006371,1.745992,0.565878
222,"(root vegetables, citrus fruit)",(other vegetables),0.017692,0.193493,0.010371,0.586207,3.029608,0.006948,1.949059,0.68199
241,"(curd, yogurt)",(whole milk),0.017285,0.255516,0.010066,0.582353,2.279125,0.005649,1.782567,0.571107
248,"(other vegetables, domestic eggs)",(whole milk),0.022267,0.255516,0.012303,0.552511,2.162336,0.006613,1.663694,0.549779
289,"(root vegetables, tropical fruit)",(other vegetables),0.021047,0.193493,0.012303,0.584541,3.020999,0.008231,1.941244,0.683367
357,"(root vegetables, tropical fruit)",(whole milk),0.021047,0.255516,0.011998,0.570048,2.230969,0.00662,1.731553,0.563627
363,"(root vegetables, yogurt)",(whole milk),0.025826,0.255516,0.01454,0.562992,2.203354,0.007941,1.703594,0.560625


# Sorting Rules

In [14]:
# Order the rules from the strongest to the weakest association (highest lift
# first). The sorted frame is assigned back to `rules` rather than sorted with
# inplace = True, which is the recommended pandas style and keeps the step an
# explicit, chainable expression.
rules = rules.sort_values(by = 'lift', ascending = False)

# Show the five rules with the highest lift
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
243,(curd),"(whole milk, yogurt)",0.053279,0.056024,0.010066,0.188931,3.372304,0.007081,1.163866,0.743056
242,"(whole milk, yogurt)",(curd),0.056024,0.053279,0.010066,0.179673,3.372304,0.007081,1.154078,0.745217
224,"(citrus fruit, other vegetables)",(root vegetables),0.028876,0.108998,0.010371,0.359155,3.295045,0.007224,1.390354,0.717225
225,(root vegetables),"(citrus fruit, other vegetables)",0.108998,0.028876,0.010371,0.095149,3.295045,0.007224,1.073242,0.78172
332,"(other vegetables, yogurt)",(whipped/sour cream),0.043416,0.071683,0.010168,0.234192,3.267062,0.007056,1.212206,0.725409


# Targeting antecedents and consequents

This is a very frequently used approach. You may be interested in discovering what else people who purchase yogurt and whole milk also buy, or which baskets lead people to also buy whole milk. Can you think of how this information might be used for product placement or promotions?

In [15]:
# Rules whose antecedent is exactly the basket {whole milk, yogurt}: what else
# do shoppers who buy both of these items tend to buy?
rules.loc[lambda r: r['antecedents'] == {'whole milk', 'yogurt'}]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
242,"(whole milk, yogurt)",(curd),0.056024,0.053279,0.010066,0.179673,3.372304,0.007081,1.154078,0.745217
377,"(whole milk, yogurt)",(whipped/sour cream),0.056024,0.071683,0.01088,0.194192,2.709053,0.006864,1.152033,0.668309
371,"(whole milk, yogurt)",(tropical fruit),0.056024,0.104931,0.01515,0.270417,2.577089,0.009271,1.226823,0.648285
364,"(whole milk, yogurt)",(root vegetables),0.056024,0.108998,0.01454,0.259528,2.381025,0.008433,1.203289,0.614436
235,"(whole milk, yogurt)",(citrus fruit),0.056024,0.082766,0.010269,0.183303,2.214725,0.005633,1.123103,0.581028
337,"(whole milk, yogurt)",(other vegetables),0.056024,0.193493,0.022267,0.397459,2.054131,0.011427,1.338511,0.543633
352,"(whole milk, yogurt)",(rolls/buns),0.056024,0.183935,0.015557,0.277677,1.509648,0.005252,1.129779,0.35763


In [16]:
# Rules that lead to whole milk alone and have a lift above 2: which baskets
# make a whole-milk purchase markedly more likely?
rules.loc[lambda r: (r['consequents'] == {'whole milk'}) & r['lift'].gt(2)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
241,"(curd, yogurt)",(whole milk),0.017285,0.255516,0.010066,0.582353,2.279125,0.005649,1.782567,0.571107
217,"(butter, other vegetables)",(whole milk),0.020031,0.255516,0.01149,0.573604,2.244885,0.006371,1.745992,0.565878
357,"(root vegetables, tropical fruit)",(whole milk),0.021047,0.255516,0.011998,0.570048,2.230969,0.00662,1.731553,0.563627
363,"(root vegetables, yogurt)",(whole milk),0.025826,0.255516,0.01454,0.562992,2.203354,0.007941,1.703594,0.560625
248,"(other vegetables, domestic eggs)",(whole milk),0.022267,0.255516,0.012303,0.552511,2.162336,0.006613,1.663694,0.549779
378,"(whipped/sour cream, yogurt)",(whole milk),0.020742,0.255516,0.01088,0.52451,2.052747,0.00558,1.565719,0.523711
342,"(root vegetables, rolls/buns)",(whole milk),0.024301,0.255516,0.01271,0.523013,2.046888,0.0065,1.560804,0.524192
266,"(other vegetables, pip fruit)",(whole milk),0.026131,0.255516,0.013523,0.51751,2.025351,0.006846,1.543003,0.519843
372,"(tropical fruit, yogurt)",(whole milk),0.029283,0.255516,0.01515,0.517361,2.02477,0.007668,1.542528,0.521384
338,"(other vegetables, yogurt)",(whole milk),0.043416,0.255516,0.022267,0.512881,2.007235,0.011174,1.52834,0.524577
