# Market Basket Analysis Using Apriori Algorithm From Scratch

This notebook is for the Activity in Data Mining subject. I scripted the algorithm with my own minds from scratch but I used some essential libraries to make it easier. This activity was developed by Dominic Antigua

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from itertools import combinations, permutations 

In [2]:
# Reading the dataset
df = pd.read_csv('./data/basket_analysis.csv')

In [3]:
# Display the first 5 data in the dataset
df.head()

Unnamed: 0.1,Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [4]:
# Removing the ID column
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [5]:
# Get the frequency in a given condition
def check_freq(i, df):
    if type(i) == tuple:
        l = list(tuple(j for x in i for j in (x if isinstance(x, tuple) else (x,))))
        return df.groupby(l)[l[0]].sum().ravel()[-1]
    return df[df[i] == True][i].sum()

# Support function
def support(items, df, dec=4):
    return np.round((check_freq(items, df) / df.shape[0]), dec)

# Confidence function
def confidence(items, df, dec=4):
    return np.round(support(items, df) / support(items[0], df), dec)

# Lift function
def lift(items, df, dec=4):
    return np.round(support(items, df) / np.prod([support(i, df) for i in items]), dec)

In [6]:
# Destack or merging multiple tuples as one
def merge_tuples(t):
    return set(tuple(j for i in (t) for j in (i if isinstance(i, tuple) else (i,))))

In [7]:
# Pruning function
def pruning(itemset, df, min_supp):
    k_itemset = dict()
    for i in itemset:
        if support(i, df) >= min_supp:
            k_itemset[i] = check_freq(i, df)
    return k_itemset

def comb_rules(rule):
    rule_set = list()
    for j in rule:
        for i in range(len(j)):
            d = list(j)
            d.pop(i)
            rule_set.append(list(combinations([j[i], tuple(d)], 2)))
    return sum(rule_set, [])

# Rules function
def rules(ck, df, min_supp):
    comb = []
    for i in range(2, len(ck)+1):
        for j in ck[i]:
            if support(j, df) > min_supp:
                comb.append(j)
    perm = comb_rules(comb)
    rules_set = []
    for i in perm:
        key = f'{set(i[:1])} => {set(i[1:])}'
        rules_set.append(dict({key: {'support': support(i, df), 'confidence': confidence(i, df), 'lift': lift(i, df)}}))
    return rules_set
    
# Apriori Algorithm
def apriori(df, min_supp = 0.2):
    itemset = list(zip(df.columns))
    ck = dict()
    k = 1
    while True:
        k_itemset = pruning(itemset, df, min_supp)
        if k_itemset:
            ck[k] = k_itemset
            k += 1
            itemset = list(combinations(merge_tuples(k_itemset.keys()), k))
        else:
            break
    return (ck, rules(ck, df, min_supp))

In [16]:
# Run the Apriori Algorithm
ck, lk = apriori(df, min_supp=0.2)

In [19]:
# Output the rules
for i in lk:
    for k, v in i.items():
        print(k,':',v)

{'chocolate'} => {('Milk',)} : {'support': 0.2112, 'confidence': 0.5012, 'lift': 1.2363}
{'Milk'} => {('chocolate',)} : {'support': 0.2112, 'confidence': 0.521, 'lift': 1.2363}
{'chocolate'} => {('Ice cream',)} : {'support': 0.2022, 'confidence': 0.4798, 'lift': 1.1692}
{'Ice cream'} => {('chocolate',)} : {'support': 0.2022, 'confidence': 0.4927, 'lift': 1.1692}
{'chocolate'} => {('Butter',)} : {'support': 0.2022, 'confidence': 0.4798, 'lift': 1.1414}
{'Butter'} => {('chocolate',)} : {'support': 0.2022, 'confidence': 0.481, 'lift': 1.1414}
{'Kidney Beans'} => {('Cheese',)} : {'support': 0.2002, 'confidence': 0.4902, 'lift': 1.2122}
{'Cheese'} => {('Kidney Beans',)} : {'support': 0.2002, 'confidence': 0.4951, 'lift': 1.2122}
{'Kidney Beans'} => {('Butter',)} : {'support': 0.2022, 'confidence': 0.4951, 'lift': 1.1777}
{'Butter'} => {('Kidney Beans',)} : {'support': 0.2022, 'confidence': 0.481, 'lift': 1.1777}
{'Ice cream'} => {('Butter',)} : {'support': 0.2072, 'confidence': 0.5049, 'lif

## Conclusion:
If minimum support is 20%, then {Milk} => {Chocolate} and vice versa are the largest itemsets than the rest with support of 21%.
Therefore I conclude to this data that the lift and the confidence of the rules Milk => Chocolate and Chocolate => Milk is very high. The strength of the association or lift of these rule is 1.21. The market may probably want to put Milk and Chocolate close to each other. The other association of the rules are a bit less strong, but high enough to also put together at the same place in the market.