# Market Basket Analysis Using Apriori Algorithm

In [1]:
# Import Libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from efficient_apriori import apriori

In [2]:
# Load the basket data (one boolean column per product) and preview the first rows
df = pd.read_csv(filepath_or_buffer='./data/basket_analysis.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [3]:
# Column information: name, non-null count and dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Unnamed: 0    999 non-null    int64
 1   Apple         999 non-null    bool 
 2   Bread         999 non-null    bool 
 3   Butter        999 non-null    bool 
 4   Cheese        999 non-null    bool 
 5   Corn          999 non-null    bool 
 6   Dill          999 non-null    bool 
 7   Eggs          999 non-null    bool 
 8   Ice cream     999 non-null    bool 
 9   Kidney Beans  999 non-null    bool 
 10  Milk          999 non-null    bool 
 11  Nutmeg        999 non-null    bool 
 12  Onion         999 non-null    bool 
 13  Sugar         999 non-null    bool 
 14  Unicorn       999 non-null    bool 
 15  Yogurt        999 non-null    bool 
 16  chocolate     999 non-null    bool 
dtypes: bool(16), int64(1)
memory usage: 23.5 KB


In [4]:
# Display the dataset dimensions as a (rows, columns) tuple
print("The total rows and columns of the basket_analysis dataset is", df.shape)

The total rows and columns of the basket_analysis dataset is (999, 17)


In [5]:
# Display the descriptive summary statistics.
# describe() only summarises numeric columns by default, so the boolean item
# columns are left out and only the 'Unnamed: 0' ID column is reported.
df.describe()

Unnamed: 0.1,Unnamed: 0
count,999.0
mean,499.0
std,288.530761
min,0.0
25%,249.5
50%,499.0
75%,748.5
max,998.0


In [6]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0      0
Apple           0
Bread           0
Butter          0
Cheese          0
Corn            0
Dill            0
Eggs            0
Ice cream       0
Kidney Beans    0
Milk            0
Nutmeg          0
Onion           0
Sugar           0
Unicorn         0
Yogurt          0
chocolate       0
dtype: int64

In [7]:
# Remove the ID column ('Unnamed: 0'); it is not a product and must not end up
# in the transactions.
# drop() returns a new, independent frame, so adding columns to df in later cells
# does not raise SettingWithCopyWarning the way a .loc column slice can.
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,True
995,True,False,False,False,True,False,False,False,True,True,True,False,False,False,True,False
996,True,False,False,False,True,True,False,False,False,False,False,False,True,False,False,True
997,False,False,True,True,True,False,True,True,True,False,True,False,True,False,True,True


In [8]:
# Build one transaction per row: a tuple holding the names of the items bought.
# Only the item columns are scanned ('Bought' itself is skipped), so the cell can
# be re-run without feeding its own output back in. No NaN placeholder column is
# created first: it was redundant, and np.NaN no longer exists in NumPy 2.0.
item_columns = [column for column in df.columns if column != 'Bought']
df['Bought'] = df[item_columns].apply(
    lambda row: tuple(row[row == True].index.values), axis=1
)

In [15]:
# Mine frequent itemsets and association rules from the per-row transactions.
# Itemsets must reach 20% support; the confidence floor is 0, so rules are not
# filtered by confidence.
itemsets, rules = apriori(
    df['Bought'].tolist(),
    min_support=0.2,
    min_confidence=0,
)

In [16]:
# Print every mined association rule, one per line. Each rule's text carries
# its confidence, support, lift and conviction.
for rule in rules:
    print(str(rule))

{Ice cream} -> {Butter} (conf: 0.505, supp: 0.207, lift: 1.201, conv: 1.171)
{Butter} -> {Ice cream} (conf: 0.493, supp: 0.207, lift: 1.201, conv: 1.163)
{Kidney Beans} -> {Butter} (conf: 0.495, supp: 0.202, lift: 1.178, conv: 1.148)
{Butter} -> {Kidney Beans} (conf: 0.481, supp: 0.202, lift: 1.178, conv: 1.140)
{chocolate} -> {Butter} (conf: 0.480, supp: 0.202, lift: 1.141, conv: 1.114)
{Butter} -> {chocolate} (conf: 0.481, supp: 0.202, lift: 1.141, conv: 1.115)
{Kidney Beans} -> {Cheese} (conf: 0.490, supp: 0.200, lift: 1.212, conv: 1.168)
{Cheese} -> {Kidney Beans} (conf: 0.495, supp: 0.200, lift: 1.212, conv: 1.172)
{chocolate} -> {Ice cream} (conf: 0.480, supp: 0.202, lift: 1.169, conv: 1.133)
{Ice cream} -> {chocolate} (conf: 0.493, supp: 0.202, lift: 1.169, conv: 1.140)
{chocolate} -> {Milk} (conf: 0.501, supp: 0.211, lift: 1.236, conv: 1.192)
{Milk} -> {chocolate} (conf: 0.521, supp: 0.211, lift: 1.236, conv: 1.208)


In [17]:
# Sanity check: support of 'Bread' computed by hand, to compare with the count
# apriori reports for ('Bread',). The mean of a boolean column is the share of
# rows where it is True, i.e. (baskets containing Bread) / (all baskets).
df['Bread'].mean()

0.3843843843843844

In [18]:
# Number of itemset sizes present in the result (itemsets is keyed by itemset size)
len(itemsets)

2

In [25]:
# Number of frequent 2-item itemsets (item pairs that met the support threshold)
len(itemsets[2])

6

In [22]:
# Full result: itemset size -> {itemset tuple: number of transactions containing it}
itemsets

{1: {('Bread',): 384,
  ('Corn',): 407,
  ('Dill',): 398,
  ('Ice cream',): 410,
  ('Sugar',): 409,
  ('Yogurt',): 420,
  ('chocolate',): 421,
  ('Milk',): 405,
  ('Apple',): 383,
  ('Butter',): 420,
  ('Cheese',): 404,
  ('Nutmeg',): 401,
  ('Onion',): 403,
  ('Unicorn',): 389,
  ('Eggs',): 384,
  ('Kidney Beans',): 408},
 2: {('Butter', 'Ice cream'): 207,
  ('Butter', 'Kidney Beans'): 202,
  ('Butter', 'chocolate'): 202,
  ('Cheese', 'Kidney Beans'): 200,
  ('Ice cream', 'chocolate'): 202,
  ('Milk', 'chocolate'): 211}}