# Market Basket Analysis in Python using Apriori Algorithm

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from apyori import apriori
import warnings
warnings.filterwarnings('ignore')

### Importing Dataset

In [2]:
data = pd.read_csv('Groceries_dataset.csv')

### Exploring Dataset

In [3]:
data.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [4]:
data.isnull().sum()

Member_number      0
Date               0
itemDescription    0
dtype: int64

In [5]:
data.shape

(38765, 3)

### Statistical Info 

In [6]:
data.describe()

Unnamed: 0,Member_number
count,38765.0
mean,3003.641868
std,1153.611031
min,1000.0
25%,2002.0
50%,3005.0
75%,4007.0
max,5000.0


### String to Numeric data

In [7]:
encoder = LabelEncoder()
data['Date'] = encoder.fit_transform(data['Date'])
Date  = {index : label for index, label in enumerate(encoder.classes_)}
Date 

{0: '01-01-2014',
 1: '01-01-2015',
 2: '01-02-2014',
 3: '01-02-2015',
 4: '01-03-2014',
 5: '01-03-2015',
 6: '01-04-2014',
 7: '01-04-2015',
 8: '01-05-2014',
 9: '01-05-2015',
 10: '01-06-2014',
 11: '01-06-2015',
 12: '01-07-2014',
 13: '01-07-2015',
 14: '01-08-2014',
 15: '01-08-2015',
 16: '01-09-2014',
 17: '01-09-2015',
 18: '01-10-2014',
 19: '01-10-2015',
 20: '01-11-2014',
 21: '01-11-2015',
 22: '01-12-2014',
 23: '01-12-2015',
 24: '02-01-2014',
 25: '02-01-2015',
 26: '02-02-2014',
 27: '02-02-2015',
 28: '02-03-2014',
 29: '02-03-2015',
 30: '02-04-2014',
 31: '02-04-2015',
 32: '02-05-2014',
 33: '02-05-2015',
 34: '02-06-2014',
 35: '02-06-2015',
 36: '02-07-2014',
 37: '02-07-2015',
 38: '02-08-2014',
 39: '02-08-2015',
 40: '02-09-2014',
 41: '02-09-2015',
 42: '02-10-2014',
 43: '02-10-2015',
 44: '02-11-2014',
 45: '02-11-2015',
 46: '02-12-2014',
 47: '02-12-2015',
 48: '03-01-2014',
 49: '03-01-2015',
 50: '03-02-2014',
 51: '03-02-2015',
 52: '03-03-2014',
 53

In [8]:
data['itemDescription'] = encoder.fit_transform(data['itemDescription'])
itemDescription  = {index : label for index, label in enumerate(encoder.classes_)}
itemDescription 

{0: 'Instant food products',
 1: 'UHT-milk',
 2: 'abrasive cleaner',
 3: 'artif. sweetener',
 4: 'baby cosmetics',
 5: 'bags',
 6: 'baking powder',
 7: 'bathroom cleaner',
 8: 'beef',
 9: 'berries',
 10: 'beverages',
 11: 'bottled beer',
 12: 'bottled water',
 13: 'brandy',
 14: 'brown bread',
 15: 'butter',
 16: 'butter milk',
 17: 'cake bar',
 18: 'candles',
 19: 'candy',
 20: 'canned beer',
 21: 'canned fish',
 22: 'canned fruit',
 23: 'canned vegetables',
 24: 'cat food',
 25: 'cereals',
 26: 'chewing gum',
 27: 'chicken',
 28: 'chocolate',
 29: 'chocolate marshmallow',
 30: 'citrus fruit',
 31: 'cleaner',
 32: 'cling film/bags',
 33: 'cocoa drinks',
 34: 'coffee',
 35: 'condensed milk',
 36: 'cooking chocolate',
 37: 'cookware',
 38: 'cream',
 39: 'cream cheese ',
 40: 'curd',
 41: 'curd cheese',
 42: 'decalcifier',
 43: 'dental care',
 44: 'dessert',
 45: 'detergent',
 46: 'dish cleaner',
 47: 'dishes',
 48: 'dog food',
 49: 'domestic eggs',
 50: 'female sanitary products',
 51: 

In [9]:
data['itemDescription'].value_counts()

164    2502
102    1898
122    1716
138    1514
165    1334
       ... 
124       5
5         4
4         3
79        1
114       1
Name: itemDescription, Length: 167, dtype: int64

In [10]:
data['Member_number'].value_counts()

3180    36
3737    33
3050    33
2051    33
2625    31
        ..
2503     2
3301     2
1775     2
3723     2
2417     2
Name: Member_number, Length: 3898, dtype: int64

### grouping dataset to form a list of products bought by same customer on same date

In [11]:
data=data.groupby(['Member_number','Date'])['itemDescription'].apply(lambda x: list(x))

In [12]:
data.head()

Member_number  Date
1000           341     [130, 164, 132, 165]
               562          [164, 105, 128]
               565                 [20, 92]
               597                [130, 73]
               633               [138, 108]
Name: itemDescription, dtype: object

### apriori takes list as an input, after that converting dtaset to a list

In [13]:
transactions = data.values.tolist()
transactions[:10]

[[130, 164, 132, 165],
 [164, 105, 128],
 [20, 92],
 [130, 73],
 [138, 108],
 [56, 40],
 [130, 164, 122],
 [164, 138],
 [8, 162],
 [56, 138, 160]]

### Applying Apyori algorithm

In [14]:
rules = apriori(transactions, min_support=0.00030,min_confidence = 0.05,min_lift = 2,min_length = 2)
results = list(rules)
results

[RelationRecord(items=frozenset({138, 3}), support=0.00046782062420637575, ordered_statistics=[OrderedStatistic(items_base=frozenset({3}), items_add=frozenset({138}), confidence=0.2413793103448276, lift=2.4857251346797353)]),
 RelationRecord(items=frozenset({9, 35}), support=0.0003341575887188398, ordered_statistics=[OrderedStatistic(items_base=frozenset({35}), items_add=frozenset({9}), confidence=0.05102040816326531, lift=2.34177413296607)]),
 RelationRecord(items=frozenset({164, 13}), support=0.0008688097306689834, ordered_statistics=[OrderedStatistic(items_base=frozenset({13}), items_add=frozenset({164}), confidence=0.34210526315789475, lift=2.1662805978127717)]),
 RelationRecord(items=frozenset({151, 15}), support=0.0003341575887188398, ordered_statistics=[OrderedStatistic(items_base=frozenset({151}), items_add=frozenset({15}), confidence=0.07352941176470588, lift=2.087705101015738)]),
 RelationRecord(items=frozenset({20, 84}), support=0.00040098910646260775, ordered_statistics=[Or

In [15]:
def inspect(results):
    lhs         = [tuple(result[2][0][0])[0] for result in results]
    rhs         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    confidences = [result[2][0][2] for result in results]
    lifts       = [result[2][0][3] for result in results]
    return list(zip(lhs, rhs, supports, confidences, lifts))
ordered_results = pd.DataFrame(inspect(results), columns = ['Left Hand Side', 'Right Hand Side', 'Support', 'Confidence','Lift'] )

### Results

In [16]:
ordered_results

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
0,3,138,0.000468,0.241379,2.485725
1,35,9,0.000334,0.051020,2.341774
2,13,164,0.000869,0.342105,2.166281
3,151,15,0.000334,0.073529,2.087705
4,84,20,0.000401,0.120000,2.557778
...,...,...,...,...,...
99,160,165,0.000601,0.204545,2.381800
100,130,165,0.000401,0.206897,2.409178
101,164,130,0.001470,0.131737,2.182917
102,105,138,0.000334,0.090909,7.817659
