In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

! pip install mlxtend

Collecting mlxtend
  Obtaining dependency information for mlxtend from https://files.pythonhosted.org/packages/73/da/d5d77a9a7a135c948dbf8d3b873655b105a152d69e590150c83d23c3d070/mlxtend-0.23.0-py3-none-any.whl.metadata
  Downloading mlxtend-0.23.0-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.0-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: mlxtend
Successfully installed mlxtend-0.23.0


# Association Rule for Store Dataset

In this case study, we will explore how association rules can be used to analyze the items that are usually purchased together.

You can refer to these articles to learn about the Apriori algorithm and association rules:
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

## Load Data

We will use a dataset of transactions recorded in a retail store. You can get the dataset here:
https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv

In [29]:
# Fetch the retail transactions CSV from the gist and preview its first five rows
url = "https://gist.githubusercontent.com/Harsh-Git-Hub/2979ec48043928ad9033d8469928e751/raw/72de943e040b8bd0d087624b154d41b2ba9d9b60/retail_dataset.csv"
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


# Get the set of products that have been purchased


Get the unique products that have been purchased.

In [30]:
# Collapse every cell of the transaction table into a single 1-D array and
# keep the distinct values; missing cells (nan) are part of the result.
all_cells = df.to_numpy().ravel()
unique_products = set(all_cells)
print(unique_products)

{'Diaper', 'Wine', 'Bagel', 'Bread', 'Cheese', 'Milk', 'Eggs', nan, 'Meat', 'Pencil'}


## Preprocess Data

In this step, we will transform our dataset into a one-hot encoding: one column per product, with a 1 when the product is part of the transaction and a 0 otherwise.

In [31]:
# Create the itemset from the distinct values of every product column.
# All columns of `df` hold products (the row number is the index, not a
# column), so none may be skipped: slicing with `df.columns[1:]` would drop
# any product that only ever appears in the first column.
# Missing cells contribute a NaN entry, which is kept here on purpose.
itemset = set()
for col in df.columns:
    itemset.update(df[col].unique())

In [35]:
# One-hot encode the transactions: one 0/1 column per entry of `itemset`,
# set to 1 when that product appears anywhere in the transaction row.
# `df.eq(product).any(axis=1)` performs the membership check for all rows at
# once instead of calling a Python lambda per row and per product.
# A NaN entry in `itemset` never compares equal to any cell, so its column is
# all zeros.
encoded_df = pd.DataFrame(index=df.index)

for product in itemset:
    encoded_df[product] = df.eq(product).any(axis=1).astype(int)

# show the new dataframe
encoded_df

Unnamed: 0,Diaper,Wine,Bagel,Bread,Cheese,Milk,Eggs,NaN,Meat,Pencil
0,1,1,0,1,1,0,1,0,1,1
1,1,1,0,1,1,1,0,0,1,1
2,0,1,0,0,1,1,1,0,1,0
3,0,1,0,0,1,1,1,0,1,0
4,0,1,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
310,0,0,0,1,1,0,1,0,0,0
311,0,0,0,0,0,1,0,0,1,1
312,1,1,0,1,1,0,1,0,1,1
313,0,0,0,0,1,0,0,0,1,0


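Since the encoded dataframe contains a column for the empty (NaN) entries, we will drop it by keeping only the columns whose label is not NaN. (Selecting "all columns other than the first" is not reliable here, because the column order comes from a set and is therefore arbitrary.)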

In [24]:
# Keep only the columns labelled with a real product name, i.e. discard the
# column whose label is NaN, then preview the result.
is_product_column = ~encoded_df.columns.isna()
encoded_df = encoded_df.loc[:, is_product_column]
encoded_df.head()

Unnamed: 0,Diaper,Wine,Bagel,Bread,Cheese,Milk,Eggs,Meat,Pencil
0,1,1,0,1,1,0,1,1,1
1,1,1,0,1,1,1,0,1,1
2,0,1,0,0,1,1,1,1,0
3,0,1,0,0,1,1,1,1,0
4,0,1,0,0,0,0,0,1,1


## Apriori Algorithm

We will use the Apriori algorithm to determine the frequently purchased products.
For this case study, we will use min_support=0.2, i.e. an itemset must appear in at least 20% of the transactions to be considered frequent.

In [25]:
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

# Rebuild the list of purchased items for every transaction from the one-hot
# frame. Each row is turned into a boolean mask that selects the labels of the
# purchased products; this works for any row index (no assumption that the
# index is 0..n-1) and avoids one scalar `.loc` lookup per cell.
product_labels = encoded_df.columns.astype(str)
purchased_mask = encoded_df.to_numpy() == 1
transactions = [product_labels[row_mask].tolist() for row_mask in purchased_mask]

# Encode the transactions as the boolean item matrix expected by apriori
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_apriori = pd.DataFrame(te_ary, columns=te.columns_)

# Keep the itemsets that occur in at least 20% of the transactions
frequent_itemsets = apriori(df_apriori, min_support=0.2, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.425397,(Bagel)
1,0.504762,(Bread)
2,0.501587,(Cheese)
3,0.406349,(Diaper)
4,0.438095,(Eggs)
5,0.47619,(Meat)
6,0.501587,(Milk)
7,0.361905,(Pencil)
8,0.438095,(Wine)
9,0.279365,"(Bagel, Bread)"


Then, we will generate association rules from the frequent itemsets, using confidence as the metric with a minimum threshold of 0.6.

In [27]:
from mlxtend.frequent_patterns import association_rules

# Derive the rules from the frequent itemsets, keeping only those whose
# confidence reaches the 0.6 threshold.
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.6,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bagel),(Bread),0.425397,0.504762,0.279365,0.656716,1.301042,0.064641,1.44265,0.402687
1,(Eggs),(Cheese),0.438095,0.501587,0.298413,0.681159,1.358008,0.07867,1.563203,0.469167
2,(Cheese),(Meat),0.501587,0.47619,0.32381,0.64557,1.355696,0.084958,1.477891,0.526414
3,(Meat),(Cheese),0.47619,0.501587,0.32381,0.68,1.355696,0.084958,1.55754,0.500891
4,(Milk),(Cheese),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
5,(Cheese),(Milk),0.501587,0.501587,0.304762,0.607595,1.211344,0.053172,1.270148,0.350053
6,(Wine),(Cheese),0.438095,0.501587,0.269841,0.615942,1.227986,0.050098,1.297754,0.330409
7,(Eggs),(Meat),0.438095,0.47619,0.266667,0.608696,1.278261,0.05805,1.338624,0.387409
8,"(Eggs, Cheese)",(Meat),0.298413,0.47619,0.215873,0.723404,1.519149,0.073772,1.893773,0.487091
9,"(Eggs, Meat)",(Cheese),0.266667,0.501587,0.215873,0.809524,1.613924,0.082116,2.616667,0.518717


Provide explanation about __antecedent support__, __consequent support__, __support__, __confidence__, __lift__, __leverage__ and __conviction__

The key metrics commonly used in association rule mining are:

- The support metric is defined for itemsets, not association rules. It measures the abundance or frequency (often interpreted as significance or importance) of an itemset in a database, i.e. the fraction of transactions that contain the itemset. For a rule A → C, the table produced by the association rule mining algorithm contains three different support metrics:
  1. Antecedent Support: the proportion of transactions in the dataset that contain the antecedent A of the rule.
    
  2. Consequent Support: the proportion of transactions in the dataset that contain the consequent C of the rule.

  3. Support: the proportion of transactions in the dataset that contain the combined itemset A ∪ C, i.e. both the antecedent and the consequent.

- Confidence: measures the reliability of the rule. It is the probability of finding the consequent in a transaction given that the transaction contains the antecedent.

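- Lift: measures how much more often the antecedent A and the consequent C occur together than would be expected if they were statistically independent: $\text{lift}(A \rightarrow C) = \frac{\text{confidence}(A \rightarrow C)}{\text{support}(C)}$. A lift value greater than 1 indicates a positive association, while a lift of exactly 1 means that A and C are independent.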

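- Leverage: measures the difference between the observed frequency of the antecedent A and the consequent C appearing together and the frequency that would be expected if A and C were independent: $\text{leverage}(A \rightarrow C) = \text{support}(A \cup C) - \text{support}(A) \times \text{support}(C)$. A value of 0 indicates independence.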

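- Conviction: measures how strongly the consequent C depends on the antecedent A, by comparing how often A would occur without C if they were independent with how often the rule actually fails: $\text{conviction}(A \rightarrow C) = \frac{1 - \text{support}(C)}{1 - \text{confidence}(A \rightarrow C)}$. A value of 1 means A and C are independent, and a high conviction value means that the consequent depends strongly on the antecedent (it becomes infinite when the confidence is 1).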

$a^2$