In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
import time

In [2]:
# Column labels supplied to read_csv; because names are given explicitly,
# the first line of the file is read as data rather than as a header.
header_list = [
    'Class', 'age', 'menopause', 'tumor-size', 'inv-nodes',
    'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat',
]

# '?' is parsed as a missing value, and every row containing one is dropped.
df = pd.read_csv('breast-cancer.csv', names=header_list, na_values='?').dropna()

In [3]:
!pip install fpgrowth_py



In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [5]:
# Import under an alias: a bare `fpgrowth` would rebind the name that the first
# cell imports from mlxtend.frequent_patterns, and the two functions take
# different arguments.
from fpgrowth_py import fpgrowth as fpgrowth_py_fpgrowth

# Each row becomes one transaction: the list of its values in these columns.
# Building the lists straight from the columns avoids a join/split round trip
# that would break on any value containing a comma.
itemlist_columns = ['tumor-size', 'Class', 'breast', 'breast-quad']
itemlist = pd.Series(df[itemlist_columns].values.tolist(), index=df.index)

freqItemSet, rules = fpgrowth_py_fpgrowth(itemlist, minSupRatio=0.3, minConf=0.2)
print(freqItemSet)
print(rules)
# Each printed rule is a list [antecedent, consequent, confidence], e.g.
# [{'right'}, {'no-recurrence-events'}, 0.727...] reads as
# right --> no-recurrence-events with confidence 0.727.

[{'left_up'}, {'left_low'}, {'right'}, {'right', 'no-recurrence-events'}, {'left'}, {'left', 'no-recurrence-events'}, {'no-recurrence-events'}]
[[{'right'}, {'no-recurrence-events'}, 0.7272727272727273], [{'no-recurrence-events'}, {'right'}, 0.4897959183673469], [{'left'}, {'no-recurrence-events'}, 0.6896551724137931], [{'no-recurrence-events'}, {'left'}, 0.5102040816326531]]


In [8]:
# List the column labels of df.
df.columns

Index(['Class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
       'deg-malig', 'breast', 'breast-quad', 'irradiat'],
      dtype='object')

In [13]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
from mlxtend.preprocessing import TransactionEncoder
import time


#support - sup(X) = num of transactions containing X / num of all transactions


#support - sup(X->Y) = num of transactions containing both X and Y / num of all transactions
#confidence - conf (X->Y) = sup(X->Y) / sup(X)
#lift(X->Y) = sup(X->Y) / (sup(X)*sup(Y))

# I = {i1, i2, ..., in} - set of n binary attributes called Items
# D = {t1, t2, ..., tm} - set of m transactions
    #- each ti has a unique transaction id
    #- each transaction is a subset of I
#open: https://www.dataversity.net/frequent-pattern-mining-association-support-business-analysis/


def ds_encoding(dataset):
    """One-hot encode a collection of transactions.

    `dataset` is handed to mlxtend's TransactionEncoder, which is fitted on
    it and then used to transform it. The encoded array is returned as a
    DataFrame whose columns are the encoder's `columns_`.
    """
    encoder = TransactionEncoder()
    encoded = encoder.fit_transform(dataset)
    return pd.DataFrame(encoded, columns=encoder.columns_)


def Apriori_Demo(dataset, min_support):
    """Mine frequent itemsets with Apriori and record the size of each one.

    Parameters
    ----------
    dataset : passed to `apriori` as its `df` argument.
    min_support : minimum support threshold forwarded to `apriori`.

    Returns
    -------
    The frame returned by `apriori` (called with use_colnames=True, no
    max_len limit and verbose=0), extended with a 'length' column holding
    the number of items in each entry of its 'itemsets' column.
    """
    mined = apriori(
        df=dataset,
        min_support=min_support,
        use_colnames=True,
        max_len=None,
        verbose=0,
    )
    # Size of every itemset, so callers can filter by itemset length.
    mined['length'] = mined['itemsets'].map(len)
    return mined

def FPGrowth_Demo(dataset, min_support):
    """Mine frequent itemsets with FP-growth and record the size of each one.

    Parameters
    ----------
    dataset : passed to `fpgrowth` as its `df` argument.
    min_support : minimum support threshold forwarded to `fpgrowth`.

    Returns
    -------
    The frame returned by `fpgrowth` (called with use_colnames=True and no
    max_len limit), extended with a 'length' column holding the number of
    items in each entry of its 'itemsets' column.
    """
    mined = fpgrowth(
        df=dataset,
        min_support=min_support,
        use_colnames=True,
        max_len=None,
    )
    # Size of every itemset, so callers can filter by itemset length.
    mined['length'] = mined['itemsets'].map(len)
    return mined

if __name__ == "__main__":
    # Lift pandas' display limits so the frames printed below are shown in full.
    for display_option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(display_option, None)

    # Columns whose values form each transaction; every column of df except
    # 'node-caps' is used.
    item_columns = ['tumor-size', 'Class', 'menopause', 'age', 'irradiat',
                    'breast', 'breast-quad', 'inv-nodes', 'deg-malig']

    # One transaction per row: join the row's values (as text) with commas,
    # then split the result back into a list of item strings.
    dataset = (
        df[item_columns]
        .astype(str)
        .apply(','.join, axis=1)
        .str.split(',')
    )
    min_support = 0.5

    # 1. Frequent-itemset mining on the one-hot encoded transactions.
    itemset = ds_encoding(dataset=dataset)
    freq_itemset_fp = FPGrowth_Demo(dataset=itemset, min_support=min_support)
    print(freq_itemset_fp)

    # 2. Rule mining, keeping only rules whose support is at least 0.5.
    # https://towardsdatascience.com/frequent-pattern-mining-association-and-correlations-8fa9f80c22ef
    assoc_rules = association_rules(
        df=freq_itemset_fp,
        metric="support",
        min_threshold=0.5,
        support_only=False,
    )
    print(assoc_rules)


#Complete demo:
#http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/


    support                         itemsets  length
0  0.776173                             (no)       1
1  0.754513                            (0-2)       1
2  0.707581           (no-recurrence-events)       1
3  0.537906                        (premeno)       1
4  0.523466                           (left)       1
5  0.649819                        (0-2, no)       2
6  0.599278      (0-2, no-recurrence-events)       2
7  0.592058       (no-recurrence-events, no)       2
8  0.530686  (no-recurrence-events, 0-2, no)       3
                    antecedents                  consequents  \
0                         (0-2)                         (no)   
1                          (no)                        (0-2)   
2                         (0-2)       (no-recurrence-events)   
3        (no-recurrence-events)                        (0-2)   
4        (no-recurrence-events)                         (no)   
5                          (no)       (no-recurrence-events)   
6   (0-2, no-recurrenc

In [14]:
# Display the frequent itemsets returned by FPGrowth_Demo.
freq_itemset_fp

Unnamed: 0,support,itemsets,length
0,0.776173,(no),1
1,0.754513,(0-2),1
2,0.707581,(no-recurrence-events),1
3,0.537906,(premeno),1
4,0.523466,(left),1
5,0.649819,"(0-2, no)",2
6,0.599278,"(0-2, no-recurrence-events)",2
7,0.592058,"(no-recurrence-events, no)",2
8,0.530686,"(no-recurrence-events, 0-2, no)",3


In [15]:
# Display the association rules derived from freq_itemset_fp.
assoc_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(0-2),(no),0.754513,0.776173,0.649819,0.861244,1.109603,0.064187,1.613096
1,(no),(0-2),0.776173,0.754513,0.649819,0.837209,1.109603,0.064187,1.507994
2,(0-2),(no-recurrence-events),0.754513,0.707581,0.599278,0.794258,1.122498,0.065399,1.421291
3,(no-recurrence-events),(0-2),0.707581,0.754513,0.599278,0.846939,1.122498,0.065399,1.603851
4,(no-recurrence-events),(no),0.707581,0.776173,0.592058,0.836735,1.078026,0.042852,1.370939
5,(no),(no-recurrence-events),0.776173,0.707581,0.592058,0.762791,1.078026,0.042852,1.232746
6,"(0-2, no-recurrence-events)",(no),0.599278,0.776173,0.530686,0.885542,1.140908,0.065542,1.955539
7,"(no-recurrence-events, no)",(0-2),0.592058,0.754513,0.530686,0.896341,1.187974,0.083971,2.368231
8,"(0-2, no)",(no-recurrence-events),0.649819,0.707581,0.530686,0.816667,1.154167,0.070886,1.595011
9,(no-recurrence-events),"(0-2, no)",0.707581,0.649819,0.530686,0.75,1.154167,0.070886,1.400722


In [16]:
# Preview the first rows of df again.
df.head()

Unnamed: 0,Class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
