In [1]:
import pandas as pd
from itertools import combinations

In [2]:
# Toy market-basket data: one entry per transaction, with the purchased items
# stored as a single comma-separated string.
data = {
    'TransactionID': [1, 2, 3, 4, 5],
    'ItemsPurchased': [
        'Bread,Butter,Milk',
        'Bread,Diaper,Beer,Milk',
        'Milk,Diaper,Beer',
        'Bread,Butter',
        'Bread,Butter,Diaper,Milk',
    ],
}

In [3]:
# Load the toy transactions into a DataFrame (one row per transaction).
df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,TransactionID,ItemsPurchased
0,1,"Bread,Butter,Milk"
1,2,"Bread,Diaper,Beer,Milk"
2,3,"Milk,Diaper,Beer"
3,4,"Bread,Butter"
4,5,"Bread,Butter,Diaper,Milk"


In [5]:
# Item strings per transaction, taken before the column is split into lists.
all_words_rows = df['ItemsPurchased'].tolist()

In [6]:
# Distinct item names, gathered by splitting every transaction string on commas.
# NOTE(review): `all_word` is not referenced again in the visible notebook.
all_word = set()
all_word.update(*(basket.split(',') for basket in all_words_rows))

In [7]:
# Turn each comma-separated string into a list of item names.
# The column is overwritten in place, so values that are no longer strings
# (e.g. lists left behind by an earlier run of this cell) are passed through
# unchanged; re-running the cell therefore does not raise AttributeError.
df['ItemsPurchased'] = df['ItemsPurchased'].apply(
    lambda x: x.split(',') if isinstance(x, str) else x
)

In [8]:
# Re-read the column as a list of rows and show it.
all_words_rows = df['ItemsPurchased'].tolist()
print(all_words_rows)

[['Bread', 'Butter', 'Milk'], ['Bread', 'Diaper', 'Beer', 'Milk'], ['Milk', 'Diaper', 'Beer'], ['Bread', 'Butter'], ['Bread', 'Butter', 'Diaper', 'Milk']]


In [9]:
# Accumulator for the distinct item names across all transactions.
all_words = set()


In [10]:
# Add every item of every transaction to the vocabulary set in one call.
all_words.update(*all_words_rows)



In [11]:
all_words

{'Beer', 'Bread', 'Butter', 'Diaper', 'Milk'}

In [12]:
def generate_itemsets(all_words):
    """Enumerate every non-empty combination of the distinct items.

    Parameters
    ----------
    all_words : iterable of hashable
        The item vocabulary. Duplicate entries are ignored, so each item
        appears at most once in any returned itemset.

    Returns
    -------
    set of tuple
        One tuple per non-empty subset of the distinct items. Items inside a
        tuple are in sorted order when they are mutually comparable, so the
        same vocabulary always yields the same tuples.

    Notes
    -----
    The result has ``2**n - 1`` entries for ``n`` distinct items.
    """
    unique_items = set(all_words)
    try:
        # A canonical order keeps the tuples independent of set hash order.
        ordered_items = sorted(unique_items)
    except TypeError:
        # Items that cannot be compared with each other: keep set order.
        ordered_items = list(unique_items)

    all_itemsets = set()
    for size in range(1, len(ordered_items) + 1):
        all_itemsets.update(combinations(ordered_items, size))
    return all_itemsets

In [13]:
# Every non-empty combination of the known items; these candidate itemsets are
# what the rule-generation and support-counting cells iterate over.
all_itemset = generate_itemsets(all_words)

In [14]:
# Sample itemset.
# NOTE(review): not referenced anywhere else in the visible notebook.
example_itemset = ['Beer', 'Bread', 'Butter', 'Diaper']

In [34]:
def generate_rules(item_set):
    """Build every association rule that can be formed from one itemset.

    Each rule splits the distinct items of ``item_set`` into a non-empty
    antecedent and a non-empty consequent.

    Parameters
    ----------
    item_set : iterable of hashable
        The items of the itemset. Duplicate entries are ignored.

    Returns
    -------
    list of tuple
        ``(antecedent, consequent, itemset)`` triples, each element a ``set``.
        Rules are ordered by antecedent size and, when the items are mutually
        comparable, by sorted item order. An itemset with fewer than two
        distinct items yields an empty list.
    """
    unique_items = set(item_set)
    try:
        # A canonical order makes the rule order independent of hash order.
        ordered_items = sorted(unique_items)
    except TypeError:
        # Items that cannot be compared with each other: keep set order.
        ordered_items = list(unique_items)

    rules = []
    # Bound by the number of DISTINCT items so the antecedent can never
    # swallow the whole itemset and leave an empty consequent.
    for n in range(1, len(ordered_items)):
        for combo in combinations(ordered_items, n):
            antecedent = set(combo)
            consequent = unique_items - antecedent
            # Each rule gets its own copy of the itemset so mutating one
            # rule's set cannot affect the others.
            rules.append((antecedent, consequent, set(unique_items)))
    return rules

In [16]:
# NOTE(review): this repeats the call that already produced `all_itemset`, and
# `get_itemsets` is not referenced anywhere else in the visible notebook.
get_itemsets = generate_itemsets(all_words)

In [17]:
# Empty frame with one column per distinct item; rows are filled in afterwards.
# Column order follows the iteration order of the `all_words` set.
binary_df = pd.DataFrame(columns=list(all_words))

In [18]:
binary_df.head()

Unnamed: 0,Beer,Bread,Milk,Diaper,Butter


In [19]:
# Build the transaction x item indicator table in one constructor call rather
# than enlarging the frame cell by cell through `.loc`.
# Every transaction becomes a row (even one with no items, which would
# otherwise be skipped and shrink the row count used for support), purchased
# items are marked with 1 and everything else is left as NaN. The column set
# and order are taken from the existing `binary_df`.
binary_df = pd.DataFrame(
    [dict.fromkeys(row, 1) for row in all_words_rows],
    columns=binary_df.columns,
)

In [20]:
binary_df.head()

Unnamed: 0,Beer,Bread,Milk,Diaper,Butter
0,,1.0,1.0,,1.0
1,1.0,1.0,1.0,1.0,
2,1.0,,1.0,1.0,
3,,1.0,,,1.0
4,,1.0,1.0,1.0,1.0


In [21]:
# Convert the 1 / missing markers to booleans: True where the cell equals 1,
# False everywhere else (missing cells included).
binary_df = binary_df.eq(1)

In [22]:
binary_df.dtypes

Beer      bool
Bread     bool
Milk      bool
Diaper    bool
Butter    bool
dtype: object

In [23]:
# Flatten the rules produced for every candidate itemset into a single list.
all_rules = [
    rule
    for candidate in all_itemset
    for rule in generate_rules(candidate)
]

In [24]:
# Count, for every candidate itemset, how many transactions contain all of its
# items. Itemsets that never occur get no entry in the dict.
support_dict = {}

for row in all_itemset:
    # Row-wise test over the itemset's columns: a transaction matches when
    # every one of those cells equals 1 (True counts as 1). This replaces the
    # per-row `.loc` lookups and does not depend on the index labels being
    # 0..n-1.
    matches = int(binary_df[list(row)].eq(1).all(axis=1).sum())
    if matches:
        support_dict[row] = matches


In [25]:
# Convert raw counts into relative support (share of all transactions) and
# store each itemset as a set so it can be compared with rule components.
n_transactions = len(binary_df)
support_list = [
    (set(items), count / n_transactions)
    for items, count in support_dict.items()
]

In [26]:
support_list

[({'Milk'}, 0.8),
 ({'Beer', 'Bread', 'Diaper'}, 0.2),
 ({'Butter'}, 0.6),
 ({'Beer', 'Diaper', 'Milk'}, 0.4),
 ({'Bread', 'Diaper', 'Milk'}, 0.4),
 ({'Beer', 'Bread'}, 0.2),
 ({'Beer', 'Bread', 'Milk'}, 0.2),
 ({'Bread', 'Butter', 'Diaper', 'Milk'}, 0.2),
 ({'Diaper'}, 0.6),
 ({'Bread', 'Diaper'}, 0.4),
 ({'Diaper', 'Milk'}, 0.6),
 ({'Bread', 'Butter', 'Diaper'}, 0.2),
 ({'Beer', 'Diaper'}, 0.4),
 ({'Beer', 'Bread', 'Diaper', 'Milk'}, 0.2),
 ({'Beer'}, 0.4),
 ({'Butter', 'Diaper', 'Milk'}, 0.2),
 ({'Butter', 'Diaper'}, 0.2),
 ({'Bread', 'Butter', 'Milk'}, 0.4),
 ({'Bread'}, 0.8),
 ({'Bread', 'Milk'}, 0.6),
 ({'Beer', 'Milk'}, 0.4),
 ({'Bread', 'Butter'}, 0.6),
 ({'Butter', 'Milk'}, 0.4)]

In [27]:
# One row per generated rule; the three columns mirror the
# (antecedent, consequent, itemset) triples returned by generate_rules.
col = ['antecedent', 'consequent', 'itemset']

rules_df = pd.DataFrame(all_rules, columns=col)


In [28]:
rules_df

Unnamed: 0,antecedent,consequent,itemset
0,{Beer},"{Bread, Diaper}","{Beer, Bread, Diaper}"
1,{Bread},"{Beer, Diaper}","{Beer, Bread, Diaper}"
2,{Diaper},"{Beer, Bread}","{Beer, Bread, Diaper}"
3,"{Beer, Bread}",{Diaper},"{Beer, Bread, Diaper}"
4,"{Beer, Diaper}",{Bread},"{Beer, Bread, Diaper}"
...,...,...,...
175,"{Beer, Bread, Milk}",{Butter},"{Beer, Bread, Butter, Milk}"
176,"{Beer, Butter, Milk}",{Bread},"{Beer, Bread, Butter, Milk}"
177,"{Bread, Butter, Milk}",{Beer},"{Beer, Bread, Butter, Milk}"
178,{Beer},{Butter},"{Beer, Butter}"


In [29]:
def get_support_value(itemset):
    """Look up the relative support of an itemset in ``support_list``.

    Parameters
    ----------
    itemset : set, frozenset or other iterable of hashable items
        The items to look up. Tuples and lists are converted to a set first,
        so item order and container type do not affect the result.

    Returns
    -------
    float or int
        The support stored for the matching entry of the module-level
        ``support_list``, or ``0`` when no entry matches (including when
        ``itemset`` cannot be converted to a set).
    """
    if not isinstance(itemset, (set, frozenset)):
        try:
            # A tuple or list never compares equal to a set, so normalise it.
            itemset = set(itemset)
        except TypeError:
            # Not iterable / unhashable elements: nothing can match.
            return 0

    for support_metric in support_list:
        if support_metric[0] == itemset:
            return support_metric[1]

    return 0

In [30]:
# Attach the support of each rule component as its own `<component>_support`
# column, looked up through get_support_value.
for component in ('antecedent', 'consequent', 'itemset'):
    rules_df[f'{component}_support'] = rules_df[component].apply(get_support_value)

### Confidence

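$$\text{Confidence}(A \Rightarrow B) = \frac{\text{Support}(A \cup B)}{\text{Support}(A)}$$

Here $\text{Support}(A \cup B)$ is the share of transactions that contain every item of both $A$ and $B$, so confidence is the fraction of transactions containing $A$ that also contain $B$.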


In [31]:
# Confidence of antecedent -> consequent: support of the full itemset divided
# by support of the antecedent.
# get_support_value returns 0 for itemsets that never occur, so a rule whose
# antecedent has zero support divides 0 by 0 here; pandas is expected to give
# NaN for that row rather than raise.
rules_df['Confidence'] = rules_df['itemset_support'] / rules_df['antecedent_support']

### Lift

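$$\text{Lift}(A \Rightarrow B) = \frac{\text{Support}(A \cup B) / \text{Support}(A)}{\text{Support}(B)} = \frac{\text{Confidence}(A \Rightarrow B)}{\text{Support}(B)}$$

A lift above 1 means $A$ and $B$ occur together more often than they would if they were independent; a lift below 1 means less often.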


In [32]:
# Lift of antecedent -> consequent: the rule's confidence divided by the
# support of the consequent. Requires the 'Confidence' column to exist.
rules_df['Lift'] = rules_df['Confidence'] / rules_df['consequent_support']

In [33]:
rules_df.head()

Unnamed: 0,antecedent,consequent,itemset,antecedent_support,consequent_support,itemset_support,Confidence,Lift
0,{Beer},"{Bread, Diaper}","{Beer, Bread, Diaper}",0.4,0.4,0.2,0.5,1.25
1,{Bread},"{Beer, Diaper}","{Beer, Bread, Diaper}",0.8,0.4,0.2,0.25,0.625
2,{Diaper},"{Beer, Bread}","{Beer, Bread, Diaper}",0.6,0.2,0.2,0.333333,1.666667
3,"{Beer, Bread}",{Diaper},"{Beer, Bread, Diaper}",0.2,0.6,0.2,1.0,1.666667
4,"{Beer, Diaper}",{Bread},"{Beer, Bread, Diaper}",0.4,0.8,0.2,0.5,0.625
