In [1]:
import pandas as pd
import numpy as np
from itertools import combinations

In [2]:
data = pd.read_csv('/content/drive/My Drive/GroceryStoreDataSet.csv',sep=',',header=None,index_col=False)
data['I1'],data['I2'],data['I3'],data['I4'] = np.nan,np.nan,np.nan,np.nan
for r in range(data.shape[0]):
  l = data.iloc[r,0].split(',')
  n = len(l)
  for i in range(1,n+1):
    data.iloc[r,i] = l[i-1]
data.drop(data[[0]],inplace=True)
data

Unnamed: 0,0,I1,I2,I3,I4
1,"BREAD,MILK,BISCUIT,CORNFLAKES",BREAD,MILK,BISCUIT,CORNFLAKES
2,"BREAD,TEA,BOURNVITA",BREAD,TEA,BOURNVITA,
3,"JAM,MAGGI,BREAD,MILK",JAM,MAGGI,BREAD,MILK
4,"MAGGI,TEA,BISCUIT",MAGGI,TEA,BISCUIT,
5,"BREAD,TEA,BOURNVITA",BREAD,TEA,BOURNVITA,
6,"MAGGI,TEA,CORNFLAKES",MAGGI,TEA,CORNFLAKES,
7,"MAGGI,BREAD,TEA,BISCUIT",MAGGI,BREAD,TEA,BISCUIT
8,"JAM,MAGGI,BREAD,TEA",JAM,MAGGI,BREAD,TEA
9,"BREAD,MILK",BREAD,MILK,,
10,"COFFEE,COCK,BISCUIT,CORNFLAKES",COFFEE,COCK,BISCUIT,CORNFLAKES


### Initializing minimum support score, records lookup table and itemlist
#### Here, let minimum support score = 2

In [3]:
min_sup,records = 2,[]
for i in range(0,data.shape[0]):
  records.append([str(data.values[i,j]) for j in range(1,len(data.columns)) if str(data.values[i,j]) != 'nan'])
itemlist = sorted([item for sublist in records for item in sublist if item != np.nan])
records

[['BREAD', 'MILK', 'BISCUIT', 'CORNFLAKES'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['JAM', 'MAGGI', 'BREAD', 'MILK'],
 ['MAGGI', 'TEA', 'BISCUIT'],
 ['BREAD', 'TEA', 'BOURNVITA'],
 ['MAGGI', 'TEA', 'CORNFLAKES'],
 ['MAGGI', 'BREAD', 'TEA', 'BISCUIT'],
 ['JAM', 'MAGGI', 'BREAD', 'TEA'],
 ['BREAD', 'MILK'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'COCK', 'BISCUIT', 'CORNFLAKES'],
 ['COFFEE', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'COCK'],
 ['BREAD', 'SUGER', 'BISCUIT'],
 ['COFFEE', 'SUGER', 'CORNFLAKES'],
 ['BREAD', 'SUGER', 'BOURNVITA'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['BREAD', 'COFFEE', 'SUGER'],
 ['TEA', 'MILK', 'COFFEE', 'CORNFLAKES']]

### STAGE 1 - First frequent itemset (k=1)

In [4]:
def stage_1(itemlist,min_sup):
  c1 = {i: itemlist.count(i) for i in itemlist}
  l1 = {}
  for key,val in c1.items():
    if val >= min_sup:
      l1[key] = val
  return c1,l1

# Test run
c1,l1 = stage_1(itemlist,min_sup)
print(c1)
print(l1)

df_stage1 = pd.DataFrame(l1,index=['sup_count']).T
df_stage1

{'BISCUIT': 6, 'BOURNVITA': 4, 'BREAD': 12, 'COCK': 3, 'COFFEE': 8, 'CORNFLAKES': 6, 'JAM': 2, 'MAGGI': 5, 'MILK': 4, 'SUGER': 6, 'TEA': 7}
{'BISCUIT': 6, 'BOURNVITA': 4, 'BREAD': 12, 'COCK': 3, 'COFFEE': 8, 'CORNFLAKES': 6, 'JAM': 2, 'MAGGI': 5, 'MILK': 4, 'SUGER': 6, 'TEA': 7}


Unnamed: 0,sup_count
BISCUIT,6
BOURNVITA,4
BREAD,12
COCK,3
COFFEE,8
CORNFLAKES,6
JAM,2
MAGGI,5
MILK,4
SUGER,6


#### Conclusion : All items pass the minimum support threshold

### STAGE 2 - Rule of pairing (k=2)

In [5]:
'''Function to check if for each subset of the current itemlist(k), whether the combination of k-1 items(previous grouping/pairing),
  belongs to the previous itemlist, so that it qualifies to be a frequent itemlist. 
  Arguments : current itemlist, previous itemlist, n(= k-1)'''
def check_freq(curr,prev,n):
  if n > 1:
    subsets = list(combinations(curr,n))
  else:
    subsets = curr
  for item in subsets:
    if not item in prev:
      return False
    else:
      return True

'''Function to check if i1 is a sublist/subset of i2'''
def sublist(i1,i2):
  return set(i1) <= set(i2)

def stage_2(l1,records,min_sup):
  l1 = sorted(list(l1.keys()))
  L1 = list(combinations(l1,2))
  c2,l2 = {},{}
  for it1 in L1:
    count = 0
    for it2 in records:
      if sublist(it1,it2):
        count += 1
    c2[it1] = count
  for key,val in c2.items():
    if val >= min_sup:
      if check_freq(key,l1,1):
        l2[key] = val
  return c2,l2

# Test run
c2,l2 = stage_2(l1,records,min_sup)
l2 = {key: value for key,value in l2.items() if value != 0}
print(c2)
print(l2)
print("No. of itemsets = {}, No. of frequent itemsets = {}".format(len(list(c2)),len(list(l2))))
df_stage2 = pd.DataFrame(l2,index=['sup_count']).T
df_stage2

{('BISCUIT', 'BOURNVITA'): 0, ('BISCUIT', 'BREAD'): 3, ('BISCUIT', 'COCK'): 2, ('BISCUIT', 'COFFEE'): 2, ('BISCUIT', 'CORNFLAKES'): 3, ('BISCUIT', 'JAM'): 0, ('BISCUIT', 'MAGGI'): 2, ('BISCUIT', 'MILK'): 1, ('BISCUIT', 'SUGER'): 1, ('BISCUIT', 'TEA'): 2, ('BOURNVITA', 'BREAD'): 3, ('BOURNVITA', 'COCK'): 0, ('BOURNVITA', 'COFFEE'): 1, ('BOURNVITA', 'CORNFLAKES'): 0, ('BOURNVITA', 'JAM'): 0, ('BOURNVITA', 'MAGGI'): 0, ('BOURNVITA', 'MILK'): 0, ('BOURNVITA', 'SUGER'): 2, ('BOURNVITA', 'TEA'): 2, ('BREAD', 'COCK'): 1, ('BREAD', 'COFFEE'): 3, ('BREAD', 'CORNFLAKES'): 1, ('BREAD', 'JAM'): 2, ('BREAD', 'MAGGI'): 3, ('BREAD', 'MILK'): 3, ('BREAD', 'SUGER'): 4, ('BREAD', 'TEA'): 4, ('COCK', 'COFFEE'): 3, ('COCK', 'CORNFLAKES'): 2, ('COCK', 'JAM'): 0, ('COCK', 'MAGGI'): 0, ('COCK', 'MILK'): 0, ('COCK', 'SUGER'): 0, ('COCK', 'TEA'): 0, ('COFFEE', 'CORNFLAKES'): 4, ('COFFEE', 'JAM'): 0, ('COFFEE', 'MAGGI'): 0, ('COFFEE', 'MILK'): 1, ('COFFEE', 'SUGER'): 4, ('COFFEE', 'TEA'): 1, ('CORNFLAKES', 'JAM

Unnamed: 0,Unnamed: 1,sup_count
BISCUIT,BREAD,3
BISCUIT,COCK,2
BISCUIT,COFFEE,2
BISCUIT,CORNFLAKES,3
BISCUIT,MAGGI,2
BISCUIT,TEA,2
BOURNVITA,BREAD,3
BOURNVITA,SUGER,2
BOURNVITA,TEA,2
BREAD,COFFEE,3


### STAGE 3 - Rule of Self-Join (k=3)


In [6]:
def stage_3(l2,records,min_sup):
  l2 = list(l2.keys())
  L2 = sorted(list(set([item for temp in l2 for item in temp])))
  L2 = list(combinations(L2,3))
  c3,l3 = {},{}
  for it1 in L2:
    count = 0
    for it2 in records:
      if sublist(it1,it2):
        count += 1
    c3[it1] = count
  for key,val in c3.items():
    if val >= min_sup:
      if check_freq(key,l2,2):
        l3[key] = val
  return c3,l3

# Test run
c3,l3 = stage_3(l2,records,min_sup)
l3 = {key: value for key,value in l3.items() if value != 0}
print(c3)
print(l3)
print("No. of itemsets = {}, No. of frequent itemsets = {}".format(len(list(c3)),len(list(l3))))
df_stage3 = pd.DataFrame(l3,index=['sup_count']).T
df_stage3

{('BISCUIT', 'BOURNVITA', 'BREAD'): 0, ('BISCUIT', 'BOURNVITA', 'COCK'): 0, ('BISCUIT', 'BOURNVITA', 'COFFEE'): 0, ('BISCUIT', 'BOURNVITA', 'CORNFLAKES'): 0, ('BISCUIT', 'BOURNVITA', 'JAM'): 0, ('BISCUIT', 'BOURNVITA', 'MAGGI'): 0, ('BISCUIT', 'BOURNVITA', 'MILK'): 0, ('BISCUIT', 'BOURNVITA', 'SUGER'): 0, ('BISCUIT', 'BOURNVITA', 'TEA'): 0, ('BISCUIT', 'BREAD', 'COCK'): 0, ('BISCUIT', 'BREAD', 'COFFEE'): 0, ('BISCUIT', 'BREAD', 'CORNFLAKES'): 1, ('BISCUIT', 'BREAD', 'JAM'): 0, ('BISCUIT', 'BREAD', 'MAGGI'): 1, ('BISCUIT', 'BREAD', 'MILK'): 1, ('BISCUIT', 'BREAD', 'SUGER'): 1, ('BISCUIT', 'BREAD', 'TEA'): 1, ('BISCUIT', 'COCK', 'COFFEE'): 2, ('BISCUIT', 'COCK', 'CORNFLAKES'): 2, ('BISCUIT', 'COCK', 'JAM'): 0, ('BISCUIT', 'COCK', 'MAGGI'): 0, ('BISCUIT', 'COCK', 'MILK'): 0, ('BISCUIT', 'COCK', 'SUGER'): 0, ('BISCUIT', 'COCK', 'TEA'): 0, ('BISCUIT', 'COFFEE', 'CORNFLAKES'): 2, ('BISCUIT', 'COFFEE', 'JAM'): 0, ('BISCUIT', 'COFFEE', 'MAGGI'): 0, ('BISCUIT', 'COFFEE', 'MILK'): 0, ('BISCUIT',

Unnamed: 0,Unnamed: 1,Unnamed: 2,sup_count
BISCUIT,COCK,COFFEE,2
BISCUIT,COCK,CORNFLAKES,2
BISCUIT,COFFEE,CORNFLAKES,2
BISCUIT,MAGGI,TEA,2
BOURNVITA,BREAD,TEA,2
BREAD,COFFEE,SUGER,2
BREAD,JAM,MAGGI,2
BREAD,MAGGI,TEA,2
COCK,COFFEE,CORNFLAKES,2


### STAGE 4 (LAST STAGE) - Rule of quadruplets (k=4)
##### **How can we confidently say that this is the last stage ?** --> Since the maximum no. of items a person has bought in any transaction is 4, so we can't possibly form frequent itemlists of size 5. Hence, we stop at 4. Now we check if quadruplet is acceptable or we've to settle for triplets.

In [7]:
def stage_4(l3,records,min_sup):
  l3 = list(l3.keys())
  L3 = sorted(list(set([item for temp in l3 for item in temp])))
  L3 = list(combinations(L3,4))
  c4,l4 = {},{}
  for it1 in L3:
    count = 0
    for it2 in records:
      if sublist(it1,it2):
        count += 1
    c4[it1] = count
  for key,val in c4.items():
    if val >= min_sup:
      if check_freq(key,l3,3):
        l4[key] = val
  return c4,l4

# Test run
c4,l4 = stage_4(l3,records,min_sup)
l4 = {key: value for key,value in l4.items() if value != 0}
print(c4)
print(l4)
print("No. of itemsets = {}, No. of frequent itemsets = {}".format(len(list(c4)),len(list(l4))))
df_stage4 = pd.DataFrame(l4,index=['sup_count']).T
df_stage4

{('BISCUIT', 'BOURNVITA', 'BREAD', 'COCK'): 0, ('BISCUIT', 'BOURNVITA', 'BREAD', 'COFFEE'): 0, ('BISCUIT', 'BOURNVITA', 'BREAD', 'CORNFLAKES'): 0, ('BISCUIT', 'BOURNVITA', 'BREAD', 'JAM'): 0, ('BISCUIT', 'BOURNVITA', 'BREAD', 'MAGGI'): 0, ('BISCUIT', 'BOURNVITA', 'BREAD', 'SUGER'): 0, ('BISCUIT', 'BOURNVITA', 'BREAD', 'TEA'): 0, ('BISCUIT', 'BOURNVITA', 'COCK', 'COFFEE'): 0, ('BISCUIT', 'BOURNVITA', 'COCK', 'CORNFLAKES'): 0, ('BISCUIT', 'BOURNVITA', 'COCK', 'JAM'): 0, ('BISCUIT', 'BOURNVITA', 'COCK', 'MAGGI'): 0, ('BISCUIT', 'BOURNVITA', 'COCK', 'SUGER'): 0, ('BISCUIT', 'BOURNVITA', 'COCK', 'TEA'): 0, ('BISCUIT', 'BOURNVITA', 'COFFEE', 'CORNFLAKES'): 0, ('BISCUIT', 'BOURNVITA', 'COFFEE', 'JAM'): 0, ('BISCUIT', 'BOURNVITA', 'COFFEE', 'MAGGI'): 0, ('BISCUIT', 'BOURNVITA', 'COFFEE', 'SUGER'): 0, ('BISCUIT', 'BOURNVITA', 'COFFEE', 'TEA'): 0, ('BISCUIT', 'BOURNVITA', 'CORNFLAKES', 'JAM'): 0, ('BISCUIT', 'BOURNVITA', 'CORNFLAKES', 'MAGGI'): 0, ('BISCUIT', 'BOURNVITA', 'CORNFLAKES', 'SUGER'):

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,sup_count
BISCUIT,COCK,COFFEE,CORNFLAKES,2


### CONCLUSIONS :
1. People tend to buy biscuit, cock, coffee, cornflakes together.
2. For the 2nd part of the conclusion, we create association rules on l3.

### BUILDING THE **ASSOCIATION RULES for MINING (ARM)** :

In [8]:
items = {**l1,**l2,**l3,**l4}
items

{('BISCUIT', 'BREAD'): 3,
 ('BISCUIT', 'COCK'): 2,
 ('BISCUIT', 'COCK', 'COFFEE'): 2,
 ('BISCUIT', 'COCK', 'COFFEE', 'CORNFLAKES'): 2,
 ('BISCUIT', 'COCK', 'CORNFLAKES'): 2,
 ('BISCUIT', 'COFFEE'): 2,
 ('BISCUIT', 'COFFEE', 'CORNFLAKES'): 2,
 ('BISCUIT', 'CORNFLAKES'): 3,
 ('BISCUIT', 'MAGGI'): 2,
 ('BISCUIT', 'MAGGI', 'TEA'): 2,
 ('BISCUIT', 'TEA'): 2,
 ('BOURNVITA', 'BREAD'): 3,
 ('BOURNVITA', 'BREAD', 'TEA'): 2,
 ('BOURNVITA', 'SUGER'): 2,
 ('BOURNVITA', 'TEA'): 2,
 ('BREAD', 'COFFEE'): 3,
 ('BREAD', 'COFFEE', 'SUGER'): 2,
 ('BREAD', 'JAM'): 2,
 ('BREAD', 'JAM', 'MAGGI'): 2,
 ('BREAD', 'MAGGI'): 3,
 ('BREAD', 'MAGGI', 'TEA'): 2,
 ('BREAD', 'MILK'): 3,
 ('BREAD', 'SUGER'): 4,
 ('BREAD', 'TEA'): 4,
 ('COCK', 'COFFEE'): 3,
 ('COCK', 'COFFEE', 'CORNFLAKES'): 2,
 ('COCK', 'CORNFLAKES'): 2,
 ('COFFEE', 'CORNFLAKES'): 4,
 ('COFFEE', 'SUGER'): 4,
 ('CORNFLAKES', 'MILK'): 2,
 ('CORNFLAKES', 'TEA'): 2,
 ('JAM', 'MAGGI'): 2,
 ('MAGGI', 'TEA'): 4,
 'BISCUIT': 6,
 'BOURNVITA': 4,
 'BREAD': 12,
 

In [15]:
'''Working on l3 to break the triplets to form dual pair + individual item comnbination sets for forming the association rules (like, {A,B,C} => {A,B} --> {C} and more)'''
assc_sets = []
for it1 in list(l3.keys()):
  assc_subset = list(combinations(it1,2))
  assc_sets.append(assc_subset)

'''Implementing the association rule.
   An association rule is formed iff the confidence of that rule exceeds the minimum confidence threshold.
   Assuming minimum confidence = 50%
'''
min_conf = 50
# Function to calculate support score
def sup_calc(it,items):
  return items[it]
# Calculating confidence
l3_assc = list(l3.keys())
selected_assc = []
for i in range(len(l3_assc)):
  for it1 in assc_sets[i]:
    denom = it1
    num = set(l3_assc[i]) - set(it1)
    confidence = ((sup_calc(l3_assc[i],items))/(sup_calc(it1,items)))*100
    if confidence > min_conf:
      print("Confidence of the association rule {} --> {} = {:.2f}%".format(denom,num,confidence))
      print("STATUS : SELECTED RULE\n")
    else:
      print("Confidence of the association rule {} --> {} = {:.2f}%".format(denom,num,confidence))
      print("STATUS : REJECTED RULE\n")

Confidence of the association rule ('BISCUIT', 'COCK') --> {'COFFEE'} = 100.00%
STATUS : SELECTED RULE

Confidence of the association rule ('BISCUIT', 'COFFEE') --> {'COCK'} = 100.00%
STATUS : SELECTED RULE

Confidence of the association rule ('COCK', 'COFFEE') --> {'BISCUIT'} = 66.67%
STATUS : SELECTED RULE

Confidence of the association rule ('BISCUIT', 'COCK') --> {'CORNFLAKES'} = 100.00%
STATUS : SELECTED RULE

Confidence of the association rule ('BISCUIT', 'CORNFLAKES') --> {'COCK'} = 66.67%
STATUS : SELECTED RULE

Confidence of the association rule ('COCK', 'CORNFLAKES') --> {'BISCUIT'} = 100.00%
STATUS : SELECTED RULE

Confidence of the association rule ('BISCUIT', 'COFFEE') --> {'CORNFLAKES'} = 100.00%
STATUS : SELECTED RULE

Confidence of the association rule ('BISCUIT', 'CORNFLAKES') --> {'COFFEE'} = 66.67%
STATUS : SELECTED RULE

Confidence of the association rule ('COFFEE', 'CORNFLAKES') --> {'BISCUIT'} = 50.00%
STATUS : REJECTED RULE

Confidence of the association rule ('B