# Association Rule Analysis
A simple association-rule example using five items: I1, I2, I3, I4 and I5.

## Libraries

In [3]:
# Numeric / tabular / plotting stack plus sys (used for the module check below).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
import mlxtend
# Sanity check: report whether mlxtend is loaded in this session.
print("AR Library mlxtend present ", 'mlxtend' in sys.modules)

AR Library mlxtend present  True


### AR Libraries

In [4]:
#pip install mlxtend   #install this from anaconda prompt as Admin
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import time
import logging

## Create Transactional Data

In [5]:
# Each inner list is one transaction (basket) and holds the IDs of the
# items that were present in that transaction.
transactions = [
    ['I1', 'I2', 'I5'],
    ['I2', 'I4'],
    ['I2', 'I3'],
    ['I1', 'I2', 'I4'],
    ['I1', 'I3'],
    ['I2', 'I3'],
    ['I1', 'I3'],
    ['I1', 'I2', 'I3', 'I5'],
    ['I1', 'I2', 'I3'],
]
transactions

[['I1', 'I2', 'I5'],
 ['I2', 'I4'],
 ['I2', 'I3'],
 ['I1', 'I2', 'I4'],
 ['I1', 'I3'],
 ['I2', 'I3'],
 ['I1', 'I3'],
 ['I1', 'I2', 'I3', 'I5'],
 ['I1', 'I2', 'I3']]

In [6]:
# Convert the transaction lists to a one-hot (boolean) encoding; the analysis below works on this form
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
te_ary  # boolean array: one row per transaction, one column per distinct item
# True means the item was present in that transaction

array([[ True,  True, False, False,  True],
       [False,  True, False,  True, False],
       [False,  True,  True, False, False],
       [ True,  True, False,  True, False],
       [ True, False,  True, False, False],
       [False,  True,  True, False, False],
       [ True, False,  True, False, False],
       [ True,  True,  True, False,  True],
       [ True,  True,  True, False, False]])

In [7]:
te.columns_  # item labels, in the same order as the columns of te_ary

['I1', 'I2', 'I3', 'I4', 'I5']

In [9]:
# Wrap the encoded array in a DataFrame, labelling each column with its item ID
df = pd.DataFrame(te_ary, columns=te.columns_)
df  # one row per transaction; True means the item is present

Unnamed: 0,I1,I2,I3,I4,I5
0,True,True,False,False,True
1,False,True,False,True,False
2,False,True,True,False,False
3,True,True,False,True,False
4,True,False,True,False,False
5,False,True,True,False,False
6,True,False,True,False,False
7,True,True,True,False,True
8,True,True,True,False,False


In [10]:
# Shape of the transaction matrix: (number of transactions, number of items);
# each True/False cell says whether the item is present in that transaction
df.shape

(9, 5)

In [11]:
# Get back the original transaction lists from the encoded boolean array
orgtrans1 = te_ary[:]  # full slice: every row of the encoded array
te.inverse_transform(orgtrans1)

[['I1', 'I2', 'I5'],
 ['I2', 'I4'],
 ['I2', 'I3'],
 ['I1', 'I2', 'I4'],
 ['I1', 'I3'],
 ['I2', 'I3'],
 ['I1', 'I3'],
 ['I1', 'I2', 'I3', 'I5'],
 ['I1', 'I2', 'I3']]

## Frequent Item Set

In [13]:
#%%% Frequent itemsets - the most important step
# First find the itemsets that occur often enough (support >= threshold);
# the association rules are then derived from these itemsets.
# Support is a fraction of all transactions: with the 9 transactions used here,
# a support of 0.1 corresponds to 0.1 * 9 = 0.9 transactions, so the threshold
# of 0.01 below keeps every itemset that occurs at least once.
support_threshold = 0.01
# Implementation reference:
# https://github.com/rasbt/mlxtend/blob/master/mlxtend/frequent_patterns/apriori.py
frequent_itemsets = apriori(df, min_support=support_threshold, use_colnames=True)
frequent_itemsets
# Each row is an itemset with its support value (only itemsets meeting min_support).
# e.g. 0.44 for (I1, I2) means I1 and I2 are present together in 44% of the transactions.

Unnamed: 0,support,itemsets
0,0.666667,(I1)
1,0.777778,(I2)
2,0.666667,(I3)
3,0.222222,(I4)
4,0.222222,(I5)
5,0.444444,"(I1, I2)"
6,0.444444,"(I1, I3)"
7,0.111111,"(I4, I1)"
8,0.222222,"(I5, I1)"
9,0.444444,"(I3, I2)"


In [14]:
print(frequent_itemsets) #dataframe with the itemsets

     support          itemsets
0   0.666667              (I1)
1   0.777778              (I2)
2   0.666667              (I3)
3   0.222222              (I4)
4   0.222222              (I5)
5   0.444444          (I1, I2)
6   0.444444          (I1, I3)
7   0.111111          (I4, I1)
8   0.222222          (I5, I1)
9   0.444444          (I3, I2)
10  0.222222          (I4, I2)
11  0.222222          (I5, I2)
12  0.111111          (I5, I3)
13  0.222222      (I3, I1, I2)
14  0.111111      (I4, I1, I2)
15  0.222222      (I5, I1, I2)
16  0.111111      (I5, I1, I3)
17  0.111111      (I5, I3, I2)
18  0.111111  (I5, I3, I1, I2)


In [16]:
#help(association_rules)

## Association Rules

### Support
Support = number of occurrences of the item (or itemset) / total number of transactions.
The higher the value, the more frequently the items were bought together in that combination.

In [15]:
# Output: a DataFrame of rules, one row per antecedents -> consequents pair
supportRules3 = association_rules(frequent_itemsets, metric="support", min_threshold = .4)
print(supportRules3)
# The 5th column is the rule's support.
# e.g. 0.44 in row 0 means I1 and I2 are present together in 44% of the transactions.
# With metric="support" and min_threshold=.4, only rules with support >= .4 are listed.

  antecedents consequents  antecedent support  consequent support   support  \
0        (I1)        (I2)            0.666667            0.777778  0.444444   
1        (I2)        (I1)            0.777778            0.666667  0.444444   
2        (I1)        (I3)            0.666667            0.666667  0.444444   
3        (I3)        (I1)            0.666667            0.666667  0.444444   
4        (I3)        (I2)            0.666667            0.777778  0.444444   
5        (I2)        (I3)            0.777778            0.666667  0.444444   

   confidence      lift  leverage  conviction  
0    0.666667  0.857143 -0.074074    0.666667  
1    0.571429  0.857143 -0.074074    0.777778  
2    0.666667  1.000000  0.000000    1.000000  
3    0.666667  1.000000  0.000000    1.000000  
4    0.666667  0.857143 -0.074074    0.666667  
5    0.571429  0.857143 -0.074074    0.777778  


In [16]:
# First rows of the rules table, shown as a rendered DataFrame rather than printed text
supportRules3.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(I1),(I2),0.666667,0.777778,0.444444,0.666667,0.857143,-0.074074,0.666667
1,(I2),(I1),0.777778,0.666667,0.444444,0.571429,0.857143,-0.074074,0.777778
2,(I1),(I3),0.666667,0.666667,0.444444,0.666667,1.0,0.0,1.0
3,(I3),(I1),0.666667,0.666667,0.444444,0.666667,1.0,0.0,1.0
4,(I3),(I2),0.666667,0.777778,0.444444,0.666667,0.857143,-0.074074,0.666667


In [19]:
# Show only the columns generally used to judge a rule: support, confidence and lift
print(supportRules3[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

  antecedents consequents   support  confidence      lift
0        (I2)        (I1)  0.444444    0.571429  0.857143
1        (I1)        (I2)  0.444444    0.666667  0.857143
2        (I3)        (I1)  0.444444    0.666667  1.000000
3        (I1)        (I3)  0.444444    0.666667  1.000000
4        (I2)        (I3)  0.444444    0.571429  0.857143
5        (I3)        (I2)  0.444444    0.666667  0.857143


In [None]:
### Support with different threshold values

In [17]:
# Same rule generation with a lower support threshold (.2), so more rules qualify
supportRules2 = association_rules(frequent_itemsets, metric="support", min_threshold = .2)
print(supportRules2[['antecedents', 'consequents', 'support', 'confidence','lift']])

   antecedents consequents   support  confidence      lift
0         (I1)        (I2)  0.444444    0.666667  0.857143
1         (I2)        (I1)  0.444444    0.571429  0.857143
2         (I1)        (I3)  0.444444    0.666667  1.000000
3         (I3)        (I1)  0.444444    0.666667  1.000000
4         (I5)        (I1)  0.222222    1.000000  1.500000
5         (I1)        (I5)  0.222222    0.333333  1.500000
6         (I3)        (I2)  0.444444    0.666667  0.857143
7         (I2)        (I3)  0.444444    0.571429  0.857143
8         (I4)        (I2)  0.222222    1.000000  1.285714
9         (I2)        (I4)  0.222222    0.285714  1.285714
10        (I5)        (I2)  0.222222    1.000000  1.285714
11        (I2)        (I5)  0.222222    0.285714  1.285714
12    (I1, I3)        (I2)  0.222222    0.500000  0.642857
13    (I3, I2)        (I1)  0.222222    0.500000  0.750000
14    (I1, I2)        (I3)  0.222222    0.500000  0.750000
15        (I3)    (I1, I2)  0.222222    0.333333  0.7500

### Confidence

In [24]:
#%%%% Confidence
# Keep the rules whose confidence is at least .6
confidence6 = association_rules(frequent_itemsets, metric="confidence", min_threshold=.6)
#print(confidence6)  # full table with every metric column
print(confidence6[['antecedents', 'consequents', 'support','confidence']])

     antecedents consequents   support  confidence
0           (I1)        (I2)  0.444444    0.666667
1           (I3)        (I1)  0.444444    0.666667
2           (I1)        (I3)  0.444444    0.666667
3           (I5)        (I1)  0.222222    1.000000
4           (I3)        (I2)  0.444444    0.666667
5           (I4)        (I2)  0.222222    1.000000
6           (I5)        (I2)  0.222222    1.000000
7       (I4, I1)        (I2)  0.111111    1.000000
8       (I2, I5)        (I1)  0.222222    1.000000
9       (I1, I5)        (I2)  0.222222    1.000000
10          (I5)    (I2, I1)  0.222222    1.000000
11      (I3, I5)        (I1)  0.111111    1.000000
12      (I3, I5)        (I2)  0.111111    1.000000
13  (I2, I3, I5)        (I1)  0.111111    1.000000
14  (I1, I3, I5)        (I2)  0.111111    1.000000
15      (I3, I5)    (I2, I1)  0.111111    1.000000


### Lift

In [27]:
#%%%% Lift: generally > 1 for strong associations
# Keep the rules whose lift is at least 1
lift1 = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
#print(lift1)  # full table with every metric column
print(lift1[['antecedents', 'consequents', 'support', 'lift', 'confidence']])

     antecedents   consequents   support      lift  confidence
0           (I3)          (I1)  0.444444  1.000000    0.666667
1           (I1)          (I3)  0.444444  1.000000    0.666667
2           (I5)          (I1)  0.222222  1.500000    1.000000
3           (I1)          (I5)  0.222222  1.500000    0.333333
4           (I2)          (I4)  0.222222  1.285714    0.285714
5           (I4)          (I2)  0.222222  1.285714    1.000000
6           (I2)          (I5)  0.222222  1.285714    0.285714
7           (I5)          (I2)  0.222222  1.285714    1.000000
8       (I2, I1)          (I4)  0.111111  1.125000    0.250000
9       (I4, I1)          (I2)  0.111111  1.285714    1.000000
10          (I2)      (I4, I1)  0.111111  1.285714    0.142857
11          (I4)      (I2, I1)  0.111111  1.125000    0.500000
12      (I2, I5)          (I1)  0.222222  1.500000    1.000000
13      (I1, I5)          (I2)  0.222222  1.285714    1.000000
14      (I2, I1)          (I5)  0.222222  2.250000    0

In [26]:
# Lift with a stricter threshold: keep only the rules whose lift is at least 2
lift2 = association_rules(frequent_itemsets, metric="lift", min_threshold=2)
#print(lift2)  # high positive correlation
print(lift2[['antecedents', 'consequents', 'support', 'lift', 'confidence']])

    antecedents   consequents   support  lift  confidence
0      (I2, I1)          (I5)  0.222222  2.25        0.50
1          (I5)      (I2, I1)  0.222222  2.25        1.00
2  (I2, I3, I1)          (I5)  0.111111  2.25        0.50
3      (I3, I5)      (I2, I1)  0.111111  2.25        1.00
4      (I2, I1)      (I3, I5)  0.111111  2.25        0.25
5          (I5)  (I2, I3, I1)  0.111111  2.25        0.50


In [None]:
# Confidence and support thresholds combined with the lift threshold
# Three conditions together: lift >= 2 (applied when lift2 was built), confidence > .5, support > .2
lift2[(lift2.confidence > .5) & (lift2.support > .2)]

#### Different Threshold Values

In [20]:
# Minimum support of .3; show only the key rule columns
association_rules(
    frequent_itemsets, metric="support", min_threshold=.3
).loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(I1),(I2),0.444444,0.666667,0.857143
1,(I2),(I1),0.444444,0.571429,0.857143
2,(I1),(I3),0.444444,0.666667,1.0
3,(I3),(I1),0.444444,0.666667,1.0
4,(I3),(I2),0.444444,0.666667,0.857143
5,(I2),(I3),0.444444,0.571429,0.857143


In [21]:
# Minimum lift of 1; show only the key rule columns
association_rules(
    frequent_itemsets, metric="lift", min_threshold=1
).loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(I1),(I3),0.444444,0.666667,1.0
1,(I3),(I1),0.444444,0.666667,1.0
2,(I5),(I1),0.222222,1.0,1.5
3,(I1),(I5),0.222222,0.333333,1.5
4,(I4),(I2),0.222222,1.0,1.285714
5,(I2),(I4),0.222222,0.285714,1.285714
6,(I5),(I2),0.222222,1.0,1.285714
7,(I2),(I5),0.222222,0.285714,1.285714
8,"(I4, I1)",(I2),0.111111,1.0,1.285714
9,"(I1, I2)",(I4),0.111111,0.25,1.125


In [22]:
# Minimum confidence of .6; show only the key rule columns
association_rules(
    frequent_itemsets, metric="confidence", min_threshold=.6
).loc[:, ['antecedents', 'consequents', 'support', 'confidence', 'lift']]

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(I1),(I2),0.444444,0.666667,0.857143
1,(I1),(I3),0.444444,0.666667,1.0
2,(I3),(I1),0.444444,0.666667,1.0
3,(I5),(I1),0.222222,1.0,1.5
4,(I3),(I2),0.444444,0.666667,0.857143
5,(I4),(I2),0.222222,1.0,1.285714
6,(I5),(I2),0.222222,1.0,1.285714
7,"(I4, I1)",(I2),0.111111,1.0,1.285714
8,"(I5, I1)",(I2),0.222222,1.0,1.285714
9,"(I5, I2)",(I1),0.222222,1.0,1.5


### Part-1 Over : Interpret the results 

In [None]:
### Analysis

In [29]:
# Recompute the frequent itemsets with a higher minimum support (0.2).
# This rebinds `frequent_itemsets`, so every cell run afterwards sees this smaller table.
# NOTE(review): some outputs shown further down still list itemsets with support 0.111,
# which suggests the cells were executed out of order - re-run top to bottom to confirm.
frequent_itemsets = apriori(df, min_support=0.2, use_colnames = True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(I1)
1,0.777778,(I2)
2,0.666667,(I3)
3,0.222222,(I4)
4,0.222222,(I5)
5,0.444444,"(I2, I1)"
6,0.444444,"(I3, I1)"
7,0.222222,"(I5, I1)"
8,0.444444,"(I2, I3)"
9,0.222222,"(I2, I4)"


In [24]:
# Look up the row for one specific itemset: exactly {I1, I2}
frequent_itemsets[ frequent_itemsets['itemsets'] == {'I1', 'I2'} ]

Unnamed: 0,support,itemsets
5,0.444444,"(I1, I2)"


In [25]:
# Look up the row for the single-item itemset {I1}
frequent_itemsets[ frequent_itemsets['itemsets'] == {'I1'} ]

Unnamed: 0,support,itemsets
0,0.666667,(I1)


In [26]:
# Add a column holding the number of items in each itemset
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.666667,(I1),1
1,0.777778,(I2),1
2,0.666667,(I3),1
3,0.222222,(I4),1
4,0.222222,(I5),1
5,0.444444,"(I1, I2)",2
6,0.444444,"(I1, I3)",2
7,0.111111,"(I4, I1)",2
8,0.222222,"(I5, I1)",2
9,0.444444,"(I3, I2)",2


In [27]:
# Itemsets of any size (length >= 1) whose support is at least 0.3
frequent_itemsets[ (frequent_itemsets['length'] >= 1) & (frequent_itemsets[ 'support'] >= 0.3) ]

Unnamed: 0,support,itemsets,length
0,0.666667,(I1),1
1,0.777778,(I2),1
2,0.666667,(I3),1
5,0.444444,"(I1, I2)",2
6,0.444444,"(I1, I3)",2
9,0.444444,"(I3, I2)",2


In [28]:
# Only the two-item itemsets whose support is at least 0.3
frequent_itemsets[ (frequent_itemsets['length'] == 2) & (frequent_itemsets[ 'support'] >= 0.3) ]

Unnamed: 0,support,itemsets,length
5,0.444444,"(I1, I2)",2
6,0.444444,"(I1, I3)",2
9,0.444444,"(I3, I2)",2


##  Another Example with Item Names

Links

http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/
https://www.kaggle.com/datatheque/association-rules-mining-market-basket-analysis
http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

- Summary
  - Metrics: support, confidence, lift
  - Workflow: find the frequent itemsets, then derive rules (thresholds on support, confidence, lift)
  - X -> Y: decide which rules are interesting
  - Applications: combo plans, store re-layout, discounts, advertising, recommendation systems

# The same nine baskets as in the first example, with item names instead of IDs
transactions = [
    ['Bread', 'Butter', 'Jam'],
    ['Butter', 'Cheese'],
    ['Butter', 'Egg'],
    ['Bread', 'Butter', 'Cheese'],
    ['Bread', 'Egg'],
    ['Butter', 'Egg'],
    ['Bread', 'Egg'],
    ['Bread', 'Butter', 'Egg', 'Jam'],
    ['Bread', 'Butter', 'Egg'],
]
# One-hot encode the baskets: one boolean column per distinct item
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
te_ary
te.columns_
# DataFrame view of the encoded baskets, columns labelled by item name
df = pd.DataFrame(te_ary, columns=te.columns_)
df

#https://pypi.org/project/efficient-apriori/
#https://stackabuse.com/association-rule-mining-via-apriori-algorithm-in-python
#pip install apyori   #install this once from a terminal / anaconda prompt
# Import under an alias so that mlxtend's `apriori` is not shadowed.
from apyori import apriori as apyori_apriori

# apyori works on the raw transactions (an iterable of item lists), not on the
# one-hot DataFrame: iterating a DataFrame yields its column labels, so passing
# `df` would treat each column name as a "transaction".
# The thresholds are kept from the tutorial linked above; on a dataset this
# small a lift threshold of 3 may leave no rules at all.
# NOTE(review): min_length does not appear among apyori's documented keywords
# (min_support, min_confidence, min_lift, max_length) - confirm it has any effect.
# The result is stored under its own name so mlxtend's `association_rules`
# function stays usable.
apyori_rules = apyori_apriori(transactions, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(apyori_rules)
association_results
print(len(association_results))
# Guard the peek at the first result: the list is empty when no rule passes the thresholds.
if association_results:
    print(association_results[0])
for item in association_results:
    # Each result is a record: (items, support, ordered_statistics).
    # The first ordered statistic is (items_base, items_add, confidence, lift);
    # the rule is printed from its base/add sets so that it matches the
    # confidence and lift printed below.
    first_stat = item[2][0]
    base_items, added_items = first_stat[0], first_stat[1]
    print("Rule: " + ", ".join(sorted(base_items)) + " -> " + ", ".join(sorted(added_items)))

    # second index of the record
    print("Support: " + str(item[1]))

    # confidence and lift of the same ordered statistic
    print("Confidence: " + str(first_stat[2]))
    print("Lift: " + str(first_stat[3]))
    print("=====================================")

#%%% method 3 (under draft): the efficient-apriori package
#https://pypi.org/project/efficient-apriori/
#pip install efficient_apriori
from efficient_apriori import apriori

# Three small baskets, each given as a tuple of item names
transactions = [
    ('eggs', 'bacon', 'soup'),
    ('eggs', 'bacon', 'apple'),
    ('soup', 'bacon', 'banana'),
]
transactions
itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=1)
print(rules)  # [{eggs} -> {bacon}, {soup} -> {bacon}]

# Lower the support threshold, then list every rule that has two items on the
# left-hand side and one item on the right-hand side, ordered by lift.
itemsets, rules = apriori(transactions, min_support=0.2, min_confidence=1)
rules_rhs = filter(lambda r: len(r.lhs) == 2 and len(r.rhs) == 1, rules)
for rule in sorted(rules_rhs, key=lambda r: r.lift):
    print(rule)  # the rule together with its confidence, support, lift, ...

#with ids
from efficient_apriori import apriori
transactions = [('eggs', 'bacon', 'soup'),   ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')]
# Request transaction ids in the returned itemsets (output_transaction_ids=True)
itemsets, rules = apriori(transactions, output_transaction_ids=True)
print(itemsets)

transactions
#help(apriori)   #uncomment to read the function's documentation
# Looser thresholds: minimum support 0.2 and minimum confidence 0.3
itemsets2, rules2 = apriori(transactions, min_support=0.2, min_confidence = .3)
itemsets2
rules2

### List every rule with 1 item on the left-hand side and 1 item on the
### right-hand side, sorted by lift
rules_rhs = filter(lambda r: len(r.lhs) == 1 and len(r.rhs) == 1, rules2)
rules_rhs
for rule in sorted(rules_rhs, key=lambda r: r.lift):
    print(rule)  # the rule together with its confidence, support, lift, ...

#%%%

# End-to-end mlxtend run on the item-ID data.
# The mlxtend functions are imported here so that this section does not depend
# on which `apriori` / `association_rules` was bound last.
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

transactions = [
    ['I1', 'I2', 'I5'],
    ['I2', 'I4'],
    ['I2', 'I3'],
    ['I1', 'I2', 'I4'],
    ['I1', 'I3'],
    ['I2', 'I3'],
    ['I1', 'I3'],
    ['I1', 'I2', 'I3', 'I5'],
    ['I1', 'I2', 'I3'],
]
transactions
#----
# One-hot encode the transactions and wrap them in a DataFrame
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
te_ary
te.columns_
df = pd.DataFrame(te_ary, columns=te.columns_)
df
# Frequent itemsets, then the rules whose lift is at least 1.2
support_threshold = 0.01
frequent_itemsets = apriori(df, min_support=support_threshold, use_colnames=True)
frequent_itemsets
rules4 = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
rules4
# Number of items on the left (antecedents) and right (consequents) side of each rule
rules4["ant_len"] = rules4["antecedents"].apply(len)
rules4
rules4["con_len"] = rules4["consequents"].apply(len)
rules4
# Rules with a non-empty left side, confidence above 0.75 and lift above 1.2
rules4[
    (rules4['ant_len'] >= 1)
    & (rules4['confidence'] > 0.75)
    & (rules4['lift'] > 1.2)
]
# Rules whose left side is exactly {I1, I2}
rules4[rules4['antecedents'] == {'I1', 'I2'}]

#%%%
# The item-ID transactions again, this time analysed with efficient-apriori
transactions = [
    ['I1', 'I2', 'I5'],
    ['I2', 'I4'],
    ['I2', 'I3'],
    ['I1', 'I2', 'I4'],
    ['I1', 'I3'],
    ['I2', 'I3'],
    ['I1', 'I3'],
    ['I1', 'I2', 'I3', 'I5'],
    ['I1', 'I2', 'I3'],
]
transactions

from efficient_apriori import apriori
# Alternative toy data:
# transactions = [('eggs', 'bacon', 'soup'), ('eggs', 'bacon', 'apple'), ('soup', 'bacon', 'banana')]
# First pass: default thresholds, with transaction ids requested in the output
itemsets, rules = apriori(transactions, output_transaction_ids=True)
print(itemsets)
# Second pass: minimum support 0.4 and minimum confidence 0.6
itemsets, rules = apriori(transactions, min_support=0.4, min_confidence=.6)
print(rules)