Using a dataset from the UCI ML repository, apply the Apriori algorithm to the given data to find frequently occurring itemsets, and generate strong association rules using support and confidence thresholds (for example, market basket analysis).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Read the basket table; the first CSV column is used as the row index.
df1 = pd.read_csv(
    './basket_analysis.csv',
    index_col=0,
)

In [3]:
# Preview the first five rows of the raw table.
df1.head(n=5)

Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [4]:
# Summarise the raw table: column names, non-null counts, dtypes and memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 999 entries, 0 to 998
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Apple         999 non-null    bool 
 1   Bread         999 non-null    bool 
 2   Butter        999 non-null    bool 
 3   Cheese        999 non-null    bool 
 4   Corn          999 non-null    bool 
 5   Dill          999 non-null    bool 
 6   Eggs          999 non-null    bool 
 7   Ice cream     999 non-null    bool 
 8   Kidney Beans  999 non-null    bool 
 9   Milk          999 non-null    bool 
 10  Nutmeg        999 non-null    bool 
 11  Onion         999 non-null    bool 
 12  Sugar         999 non-null    bool 
 13  Unicorn       999 non-null    bool 
 14  Yogurt        999 non-null    bool 
 15  chocolate     999 non-null    bool 
dtypes: bool(16)
memory usage: 23.4 KB


In [5]:
# Cast every boolean item flag to a 64-bit integer (True -> 1, False -> 0).
# NOTE(review): the info() outputs show this raises memory use from 23.4 KB to
# 132.7 KB; mlxtend documents boolean input as accepted by apriori, so confirm
# whether this cast is needed at all.
df = df1.astype(np.int64)

In [6]:
# Preview the first five rows after the integer cast.
df.head(n=5)

Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,0,1,0,0,1,1,0,1,0,0,0,0,1,0,1,1
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,1,0,1,0,0,1,0,1,0,1,0,0,0,0,1,1
3,0,0,1,1,0,1,0,0,0,1,1,1,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Summarise the converted table: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 999 entries, 0 to 998
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Apple         999 non-null    int64
 1   Bread         999 non-null    int64
 2   Butter        999 non-null    int64
 3   Cheese        999 non-null    int64
 4   Corn          999 non-null    int64
 5   Dill          999 non-null    int64
 6   Eggs          999 non-null    int64
 7   Ice cream     999 non-null    int64
 8   Kidney Beans  999 non-null    int64
 9   Milk          999 non-null    int64
 10  Nutmeg        999 non-null    int64
 11  Onion         999 non-null    int64
 12  Sugar         999 non-null    int64
 13  Unicorn       999 non-null    int64
 14  Yogurt        999 non-null    int64
 15  chocolate     999 non-null    int64
dtypes: int64(16)
memory usage: 132.7 KB


In [8]:
# Mine all itemsets that appear in at least 20% of the baskets, labelled by column name.
# mlxtend emits a DeprecationWarning when apriori receives a non-boolean frame.
# Passing a boolean view of the 0/1 data removes that warning at its source, so the
# cell no longer needs %%capture, which would also have hidden any other output.
frequent_itemsets = apriori(df.astype(bool), min_support=0.2, use_colnames=True)

In [9]:
# Show the Python type of the object returned by apriori.
print(type(frequent_itemsets))

<class 'pandas.core.frame.DataFrame'>


In [10]:
# List the frequent itemsets together with their support values.
print(frequent_itemsets)

     support                itemsets
0   0.383383                 (Apple)
1   0.384384                 (Bread)
2   0.420420                (Butter)
3   0.404404                (Cheese)
4   0.407407                  (Corn)
5   0.398398                  (Dill)
6   0.384384                  (Eggs)
7   0.410410             (Ice cream)
8   0.408408          (Kidney Beans)
9   0.405405                  (Milk)
10  0.401401                (Nutmeg)
11  0.403403                 (Onion)
12  0.409409                 (Sugar)
13  0.389389               (Unicorn)
14  0.420420                (Yogurt)
15  0.421421             (chocolate)
16  0.207207     (Ice cream, Butter)
17  0.202202  (Kidney Beans, Butter)
18  0.202202     (chocolate, Butter)
19  0.200200  (Cheese, Kidney Beans)
20  0.202202  (Ice cream, chocolate)
21  0.211211       (Milk, chocolate)


In [11]:
# Derive association rules from the frequent itemsets, keeping rules whose
# support is at least 0.2.
# NOTE(review): the itemsets were already mined with min_support=0.2, so this
# filter looks redundant and no confidence threshold is applied anywhere;
# consider metric="confidence" with an explicit cut-off.
rules = association_rules(
    frequent_itemsets,
    metric="support",
    min_threshold=0.2,
)

In [12]:
# Print only the headline columns of every rule.
print(rules.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']])

       antecedents     consequents   support  confidence      lift
0      (Ice cream)        (Butter)  0.207207    0.504878  1.200889
1         (Butter)     (Ice cream)  0.207207    0.492857  1.200889
2   (Kidney Beans)        (Butter)  0.202202    0.495098  1.177626
3         (Butter)  (Kidney Beans)  0.202202    0.480952  1.177626
4      (chocolate)        (Butter)  0.202202    0.479810  1.141262
5         (Butter)     (chocolate)  0.202202    0.480952  1.141262
6         (Cheese)  (Kidney Beans)  0.200200    0.495050  1.212143
7   (Kidney Beans)        (Cheese)  0.200200    0.490196  1.212143
8      (Ice cream)     (chocolate)  0.202202    0.492683  1.169098
9      (chocolate)     (Ice cream)  0.202202    0.479810  1.169098
10          (Milk)     (chocolate)  0.211211    0.520988  1.236263
11     (chocolate)          (Milk)  0.211211    0.501188  1.236263


In [13]:
# Direction-independent key for each rule: the sorted items of the antecedent
# and consequent sets combined, so A -> B and B -> A share the same key.
# Built with a plain comprehension instead of DataFrame.apply(axis=1): apply
# returns a DataFrame (not a Series) when `rules` has no rows, which makes the
# column assignment fail, and it is slower on non-empty frames.
rules['rule_pair'] = [
    tuple(sorted(antecedent | consequent))
    for antecedent, consequent in zip(rules['antecedents'], rules['consequents'])
]

In [14]:
# Keep the first rule listed for each item pair, then discard the helper key column.
# NOTE(review): A -> B and B -> A carry different confidence values; this keeps
# whichever direction appears first, not necessarily the stronger one.
rules_unique = (
    rules.loc[~rules['rule_pair'].duplicated(keep='first')]
    .drop(columns='rule_pair')
)

In [15]:
# Print the headline columns of the de-duplicated rules.
print(rules_unique.loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']])

       antecedents     consequents   support  confidence      lift
0      (Ice cream)        (Butter)  0.207207    0.504878  1.200889
2   (Kidney Beans)        (Butter)  0.202202    0.495098  1.177626
4      (chocolate)        (Butter)  0.202202    0.479810  1.141262
6         (Cheese)  (Kidney Beans)  0.200200    0.495050  1.212143
8      (Ice cream)     (chocolate)  0.202202    0.492683  1.169098
10          (Milk)     (chocolate)  0.211211    0.520988  1.236263
