In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pattern mining on relational data

In [2]:
mydf = pd.read_csv("../../../Dataset/credit_default_cleaned.csv", skipinitialspace=True)
mydf.head()

Unnamed: 0,limit,sex,education,status,age,ps-sep,ps-aug,ps-jul,ps-jun,ps-may,...,ba-jun,ba-may,ba-apr,pa-sep,pa-aug,pa-jul,pa-jun,pa-may,pa-apr,credit_default
0,50000,male,graduate school,single,25.0,2,0,0,0,0,...,49535,30358,30302,2130,1905,1811,1100,1100,1200,no
1,200000,male,university,married,54.0,-1,-1,-1,-1,-1,...,6335,4616,7956,10120,7852,6336,4622,7956,5499,no
2,30000,female,high school,married,41.0,2,3,2,2,2,...,30496,29731,29047,0,1700,1100,3,1053,1303,no
3,140000,female,university,single,28.0,0,0,0,0,0,...,25224,26855,23783,2000,2000,900,2000,10000,5000,no
4,60000,female,high school,married,36.0,1,2,2,0,0,...,48738,49601,52773,1788,0,1894,1801,3997,0,yes


From relational to transactional

In [3]:
from fim import apriori

Extract frequent patterns

Extract decision rules

Pattern mining on categorical data

In [4]:
df = pd.read_csv("titanic.csv", skipinitialspace=True, sep=',')

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Remove useless columns

In [6]:
column2drop = ['PassengerId', 'Name', 'Cabin', 'SibSp', 
               'Parch', 'Ticket']
df.drop(column2drop, axis=1, inplace=True)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.25,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.925,S
3,1,1,female,35.0,53.1,S
4,0,3,male,35.0,8.05,S


Apply binning to continuous features

In [7]:
mydf['AgeBin'] = pd.cut(mydf['age'].astype(int), 10, right=False)
mydf['LimitBin'] = pd.cut(mydf['limit'].astype(int), 50, right=False)

mydf.drop(['age', 'limit'], axis=1, inplace=True)

columnsTitles = ["credit_default", "sex", "education", "status", "AgeBin", "LimitBin", "ps-sep", "ps-aug", "ps-jul", "ps-jun", "ps-may", "ps-apr", "pa-sep", "pa-aug", "pa-jul", "pa-jun", "pa-may", "pa-apr", "ba-sep", "ba-aug", "ba-jul", "ba-jun", "ba-may", "ba-apr"]

mydf = mydf[columnsTitles]
mydf.head()

Unnamed: 0,credit_default,sex,education,status,AgeBin,LimitBin,ps-sep,ps-aug,ps-jul,ps-jun,...,pa-jul,pa-jun,pa-may,pa-apr,ba-sep,ba-aug,ba-jul,ba-jun,ba-may,ba-apr
0,no,male,graduate school,single,"[21.0, 26.4)","[40800.0, 56200.0)",2,0,0,0,...,1811,1100,1100,1200,48966,49985,50760,49535,30358,30302
1,no,male,university,married,"[53.4, 58.8)","[194800.0, 210200.0)",-1,-1,-1,-1,...,6336,4622,7956,5499,10755,9981,7843,6335,4616,7956
2,no,female,high school,married,"[37.2, 42.6)","[25400.0, 40800.0)",2,3,2,2,...,1100,3,1053,1303,30057,29280,30206,30496,29731,29047
3,no,female,university,single,"[26.4, 31.8)","[133200.0, 148600.0)",0,0,0,0,...,900,2000,10000,5000,56426,41717,42823,25224,26855,23783
4,yes,female,high school,married,"[31.8, 37.2)","[56200.0, 71600.0)",1,2,2,0,...,1894,1801,3997,0,47987,48716,47632,48738,49601,52773


Remap values (not labeling ps, since they are the only values in [-2, +6]

In [8]:
mydf['ba-sep'] = mydf['ba-sep'].astype(str) + '_ba-sep'
mydf['ba-aug'] = mydf['ba-aug'].astype(str) + '_ba-aug'
mydf['ba-jul'] = mydf['ba-jul'].astype(str) + '_ba-jul'
mydf['ba-jun'] = mydf['ba-jun'].astype(str) + '_ba-jun'
mydf['ba-may'] = mydf['ba-may'].astype(str) + '_ba-may'
mydf['ba-apr'] = mydf['ba-apr'].astype(str) + '_ba-apr'

mydf['pa-sep'] = mydf['pa-sep'].astype(str) + '_pa-sep'
mydf['pa-aug'] = mydf['pa-aug'].astype(str) + '_pa-aug'
mydf['pa-jul'] = mydf['pa-jul'].astype(str) + '_pa-jul'
mydf['pa-jun'] = mydf['pa-jun'].astype(str) + '_pa-jun'
mydf['pa-may'] = mydf['pa-may'].astype(str) + '_pa-may'
mydf['pa-apr'] = mydf['pa-apr'].astype(str) + '_pa-apr'

mydf['AgeBin'] = mydf['AgeBin'].astype(str) + '_Age'
mydf['LimitBin'] = mydf['LimitBin'].astype(str) + '_Limit'

mydf.head()

Unnamed: 0,credit_default,sex,education,status,AgeBin,LimitBin,ps-sep,ps-aug,ps-jul,ps-jun,...,pa-jul,pa-jun,pa-may,pa-apr,ba-sep,ba-aug,ba-jul,ba-jun,ba-may,ba-apr
0,no,male,graduate school,single,"[21.0, 26.4)_Age","[40800.0, 56200.0)_Limit",2,0,0,0,...,1811_pa-jul,1100_pa-jun,1100_pa-may,1200_pa-apr,48966_ba-sep,49985_ba-aug,50760_ba-jul,49535_ba-jun,30358_ba-may,30302_ba-apr
1,no,male,university,married,"[53.4, 58.8)_Age","[194800.0, 210200.0)_Limit",-1,-1,-1,-1,...,6336_pa-jul,4622_pa-jun,7956_pa-may,5499_pa-apr,10755_ba-sep,9981_ba-aug,7843_ba-jul,6335_ba-jun,4616_ba-may,7956_ba-apr
2,no,female,high school,married,"[37.2, 42.6)_Age","[25400.0, 40800.0)_Limit",2,3,2,2,...,1100_pa-jul,3_pa-jun,1053_pa-may,1303_pa-apr,30057_ba-sep,29280_ba-aug,30206_ba-jul,30496_ba-jun,29731_ba-may,29047_ba-apr
3,no,female,university,single,"[26.4, 31.8)_Age","[133200.0, 148600.0)_Limit",0,0,0,0,...,900_pa-jul,2000_pa-jun,10000_pa-may,5000_pa-apr,56426_ba-sep,41717_ba-aug,42823_ba-jul,25224_ba-jun,26855_ba-may,23783_ba-apr
4,yes,female,high school,married,"[31.8, 37.2)_Age","[56200.0, 71600.0)_Limit",1,2,2,0,...,1894_pa-jul,1801_pa-jun,3997_pa-may,0_pa-apr,47987_ba-sep,48716_ba-aug,47632_ba-jul,48738_ba-jun,49601_ba-may,52773_ba-apr


In [9]:
baskets = mydf.values.tolist()

In [10]:
baskets[0]

['no',
 'male',
 'graduate school',
 'single',
 '[21.0, 26.4)_Age',
 '[40800.0, 56200.0)_Limit',
 2,
 0,
 0,
 0,
 0,
 0,
 '2130_pa-sep',
 '1905_pa-aug',
 '1811_pa-jul',
 '1100_pa-jun',
 '1100_pa-may',
 '1200_pa-apr',
 '48966_ba-sep',
 '49985_ba-aug',
 '50760_ba-jul',
 '49535_ba-jun',
 '30358_ba-may',
 '30302_ba-apr']

In [11]:
itemsets = apriori(baskets, supp=1, zmin=2, target='a') 

In [12]:
print('Number of itemsets:', len(itemsets))

Number of itemsets: 197817


In [13]:
itemsets[:10]

[(('10000_pa-jun', 0), 100),
 (('10000_pa-may', 0), 104),
 (('10000_pa-may', 'no'), 100),
 (('10000_pa-apr', 0), 101),
 (('4000_pa-jun', 0), 111),
 (('4000_pa-jul', 0), 119),
 (('4000_pa-jul', 0, 'no'), 103),
 (('4000_pa-jul', 'no'), 104),
 (('10000_pa-sep', 0), 124),
 (('10000_pa-sep', 0, 'no'), 115)]

# All frequent patterns

In [62]:
#zmin -> minimum number of items per itemset
#supp -> support
#conf -> confidence
#target -> a = all
#report -> ascl = 
    #(a, absolute itemset support);
    #(s, relative itemset support as a fraction);
    #(c, rule confidence as a fraction);
    #(l, lift value of a rule)
#OBS: confidence value useless if not taking into account the association rules
for s in range(1, 100):
    #for c in range(1, 100):
    rules = apriori(baskets, supp=s, zmin=2, target='a', conf=60, report='scl')
    numRules= len(rules)
    if (numRules == 0):
        break
    print('~~~~~~~~~Supp: ', s, ' Conf: 60 Number of rules:', len(rules))
    for r in rules:
        if r[0] == 'yes':
            print(r)
                

~~~~~~~~~Supp:  1  Conf: 60 Number of rules: 197817
~~~~~~~~~Supp:  2  Conf: 60 Number of rules: 41668
~~~~~~~~~Supp:  3  Conf: 60 Number of rules: 13285
~~~~~~~~~Supp:  4  Conf: 60 Number of rules: 6038
~~~~~~~~~Supp:  5  Conf: 60 Number of rules: 2396
~~~~~~~~~Supp:  6  Conf: 60 Number of rules: 1176
~~~~~~~~~Supp:  7  Conf: 60 Number of rules: 618
~~~~~~~~~Supp:  8  Conf: 60 Number of rules: 401
~~~~~~~~~Supp:  9  Conf: 60 Number of rules: 275
~~~~~~~~~Supp:  10  Conf: 60 Number of rules: 203
~~~~~~~~~Supp:  11  Conf: 60 Number of rules: 157
~~~~~~~~~Supp:  12  Conf: 60 Number of rules: 122
~~~~~~~~~Supp:  13  Conf: 60 Number of rules: 96
~~~~~~~~~Supp:  14  Conf: 60 Number of rules: 80
~~~~~~~~~Supp:  15  Conf: 60 Number of rules: 71
~~~~~~~~~Supp:  16  Conf: 60 Number of rules: 61
~~~~~~~~~Supp:  17  Conf: 60 Number of rules: 57
~~~~~~~~~Supp:  18  Conf: 60 Number of rules: 45
~~~~~~~~~Supp:  19  Conf: 60 Number of rules: 38
~~~~~~~~~Supp:  20  Conf: 60 Number of rules: 37
~~~~~~~

# Closed frequent patterns

In [63]:
for s in range(1, 100):
    #for c in range(1, 100):
    rules = apriori(baskets, supp=s, zmin=2, target='c', conf=60, report='scl')
    numRules= len(rules)
    if (numRules == 0):
        break
    print('~~~~~~~~~Supp: ', s, ' Conf: 60 Number of rules:', len(rules))
    for r in rules:
        if r[0] == 'yes':
            print(r)

~~~~~~~~~Supp:  1  Conf: 60 Number of rules: 48759
~~~~~~~~~Supp:  2  Conf: 60 Number of rules: 14928
~~~~~~~~~Supp:  3  Conf: 60 Number of rules: 6120
~~~~~~~~~Supp:  4  Conf: 60 Number of rules: 3052
~~~~~~~~~Supp:  5  Conf: 60 Number of rules: 1614
~~~~~~~~~Supp:  6  Conf: 60 Number of rules: 934
~~~~~~~~~Supp:  7  Conf: 60 Number of rules: 563
~~~~~~~~~Supp:  8  Conf: 60 Number of rules: 379
~~~~~~~~~Supp:  9  Conf: 60 Number of rules: 268
~~~~~~~~~Supp:  10  Conf: 60 Number of rules: 203
~~~~~~~~~Supp:  11  Conf: 60 Number of rules: 157
~~~~~~~~~Supp:  12  Conf: 60 Number of rules: 122
~~~~~~~~~Supp:  13  Conf: 60 Number of rules: 96
~~~~~~~~~Supp:  14  Conf: 60 Number of rules: 80
~~~~~~~~~Supp:  15  Conf: 60 Number of rules: 71
~~~~~~~~~Supp:  16  Conf: 60 Number of rules: 61
~~~~~~~~~Supp:  17  Conf: 60 Number of rules: 57
~~~~~~~~~Supp:  18  Conf: 60 Number of rules: 45
~~~~~~~~~Supp:  19  Conf: 60 Number of rules: 38
~~~~~~~~~Supp:  20  Conf: 60 Number of rules: 37
~~~~~~~~~S

# Maximal frequent patterns

In [64]:
for s in range(1, 100):
    #for c in range(1, 100):
    rules = apriori(baskets, supp=s, zmin=2, target='m', conf=60, report='scl')
    numRules= len(rules)
    if (numRules == 0):
        break
    print('~~~~~~~~~Supp: ', s, ' Conf: 60 Number of rules:', len(rules))
    for r in rules:
        if r[0] == 'yes':
            print(r)
                

~~~~~~~~~Supp:  1  Conf: 60 Number of rules: 5798
~~~~~~~~~Supp:  2  Conf: 60 Number of rules: 2152
~~~~~~~~~Supp:  3  Conf: 60 Number of rules: 1129
~~~~~~~~~Supp:  4  Conf: 60 Number of rules: 628
~~~~~~~~~Supp:  5  Conf: 60 Number of rules: 419
~~~~~~~~~Supp:  6  Conf: 60 Number of rules: 284
~~~~~~~~~Supp:  7  Conf: 60 Number of rules: 220
~~~~~~~~~Supp:  8  Conf: 60 Number of rules: 155
~~~~~~~~~Supp:  9  Conf: 60 Number of rules: 123
~~~~~~~~~Supp:  10  Conf: 60 Number of rules: 99
~~~~~~~~~Supp:  11  Conf: 60 Number of rules: 85
~~~~~~~~~Supp:  12  Conf: 60 Number of rules: 65
~~~~~~~~~Supp:  13  Conf: 60 Number of rules: 48
~~~~~~~~~Supp:  14  Conf: 60 Number of rules: 42
~~~~~~~~~Supp:  15  Conf: 60 Number of rules: 33
~~~~~~~~~Supp:  16  Conf: 60 Number of rules: 26
~~~~~~~~~Supp:  17  Conf: 60 Number of rules: 24
~~~~~~~~~Supp:  18  Conf: 60 Number of rules: 26
~~~~~~~~~Supp:  19  Conf: 60 Number of rules: 22
~~~~~~~~~Supp:  20  Conf: 60 Number of rules: 23
~~~~~~~~~Supp:  2

# Association rules

In [67]:
for s in range(1, 100):
    for c in range(1, 100, 10):
        numYes=0
        rules = apriori(baskets, supp=s, zmin=2, target='r', conf=60, report='scl')
        numRules= len(rules)
        if (numRules == 0):
            break
        print('~~~~~~~~~Supp: ', s, ' Conf: ', c, ' Number of rules:', len(rules))
        for r in rules:
            if r[0] == 'yes':
                #print(r)
                numYes = numYes + 1
        set(rules[0][1])
                

~~~~~~~~~Supp:  1  Conf:  1  Number of rules: 1600845
~~~~~~~~~Supp:  1  Conf:  11  Number of rules: 1600845
~~~~~~~~~Supp:  1  Conf:  21  Number of rules: 1600845
~~~~~~~~~Supp:  1  Conf:  31  Number of rules: 1600845
~~~~~~~~~Supp:  1  Conf:  41  Number of rules: 1600845
~~~~~~~~~Supp:  1  Conf:  51  Number of rules: 1600845
~~~~~~~~~Supp:  1  Conf:  61  Number of rules: 1600845
~~~~~~~~~Supp:  1  Conf:  71  Number of rules: 1600845
~~~~~~~~~Supp:  1  Conf:  81  Number of rules: 1600845
~~~~~~~~~Supp:  1  Conf:  91  Number of rules: 1600845
~~~~~~~~~Supp:  2  Conf:  1  Number of rules: 341796
~~~~~~~~~Supp:  2  Conf:  11  Number of rules: 341796
~~~~~~~~~Supp:  2  Conf:  21  Number of rules: 341796
~~~~~~~~~Supp:  2  Conf:  31  Number of rules: 341796
~~~~~~~~~Supp:  2  Conf:  41  Number of rules: 341796
~~~~~~~~~Supp:  2  Conf:  51  Number of rules: 341796
~~~~~~~~~Supp:  2  Conf:  61  Number of rules: 341796
~~~~~~~~~Supp:  2  Conf:  71  Number of rules: 341796
~~~~~~~~~Supp:  2  C

~~~~~~~~~Supp:  21  Conf:  31  Number of rules: 92
~~~~~~~~~Supp:  21  Conf:  41  Number of rules: 92
~~~~~~~~~Supp:  21  Conf:  51  Number of rules: 92
~~~~~~~~~Supp:  21  Conf:  61  Number of rules: 92
~~~~~~~~~Supp:  21  Conf:  71  Number of rules: 92
~~~~~~~~~Supp:  21  Conf:  81  Number of rules: 92
~~~~~~~~~Supp:  21  Conf:  91  Number of rules: 92
~~~~~~~~~Supp:  22  Conf:  1  Number of rules: 79
~~~~~~~~~Supp:  22  Conf:  11  Number of rules: 79
~~~~~~~~~Supp:  22  Conf:  21  Number of rules: 79
~~~~~~~~~Supp:  22  Conf:  31  Number of rules: 79
~~~~~~~~~Supp:  22  Conf:  41  Number of rules: 79
~~~~~~~~~Supp:  22  Conf:  51  Number of rules: 79
~~~~~~~~~Supp:  22  Conf:  61  Number of rules: 79
~~~~~~~~~Supp:  22  Conf:  71  Number of rules: 79
~~~~~~~~~Supp:  22  Conf:  81  Number of rules: 79
~~~~~~~~~Supp:  22  Conf:  91  Number of rules: 79
~~~~~~~~~Supp:  23  Conf:  1  Number of rules: 65
~~~~~~~~~Supp:  23  Conf:  11  Number of rules: 65
~~~~~~~~~Supp:  23  Conf:  21  Nu

~~~~~~~~~Supp:  38  Conf:  1  Number of rules: 20
~~~~~~~~~Supp:  38  Conf:  11  Number of rules: 20
~~~~~~~~~Supp:  38  Conf:  21  Number of rules: 20
~~~~~~~~~Supp:  38  Conf:  31  Number of rules: 20
~~~~~~~~~Supp:  38  Conf:  41  Number of rules: 20
~~~~~~~~~Supp:  38  Conf:  51  Number of rules: 20
~~~~~~~~~Supp:  38  Conf:  61  Number of rules: 20
~~~~~~~~~Supp:  38  Conf:  71  Number of rules: 20
~~~~~~~~~Supp:  38  Conf:  81  Number of rules: 20
~~~~~~~~~Supp:  38  Conf:  91  Number of rules: 20
~~~~~~~~~Supp:  39  Conf:  1  Number of rules: 18
~~~~~~~~~Supp:  39  Conf:  11  Number of rules: 18
~~~~~~~~~Supp:  39  Conf:  21  Number of rules: 18
~~~~~~~~~Supp:  39  Conf:  31  Number of rules: 18
~~~~~~~~~Supp:  39  Conf:  41  Number of rules: 18
~~~~~~~~~Supp:  39  Conf:  51  Number of rules: 18
~~~~~~~~~Supp:  39  Conf:  61  Number of rules: 18
~~~~~~~~~Supp:  39  Conf:  71  Number of rules: 18
~~~~~~~~~Supp:  39  Conf:  81  Number of rules: 18
~~~~~~~~~Supp:  39  Conf:  91  Nu

~~~~~~~~~Supp:  54  Conf:  61  Number of rules: 7
~~~~~~~~~Supp:  54  Conf:  71  Number of rules: 7
~~~~~~~~~Supp:  54  Conf:  81  Number of rules: 7
~~~~~~~~~Supp:  54  Conf:  91  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  1  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  11  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  21  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  31  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  41  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  51  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  61  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  71  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  81  Number of rules: 7
~~~~~~~~~Supp:  55  Conf:  91  Number of rules: 7
~~~~~~~~~Supp:  56  Conf:  1  Number of rules: 7
~~~~~~~~~Supp:  56  Conf:  11  Number of rules: 7
~~~~~~~~~Supp:  56  Conf:  21  Number of rules: 7
~~~~~~~~~Supp:  56  Conf:  31  Number of rules: 7
~~~~~~~~~Supp:  56  Conf:  41  Number of rules: 7
~~~~~~~~~Supp:  56  Conf:  51  Number of rules: 7
~~

~~~~~~~~~Supp:  71  Conf:  21  Number of rules: 2
~~~~~~~~~Supp:  71  Conf:  31  Number of rules: 2
~~~~~~~~~Supp:  71  Conf:  41  Number of rules: 2
~~~~~~~~~Supp:  71  Conf:  51  Number of rules: 2
~~~~~~~~~Supp:  71  Conf:  61  Number of rules: 2
~~~~~~~~~Supp:  71  Conf:  71  Number of rules: 2
~~~~~~~~~Supp:  71  Conf:  81  Number of rules: 2
~~~~~~~~~Supp:  71  Conf:  91  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  1  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  11  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  21  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  31  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  41  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  51  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  61  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  71  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  81  Number of rules: 2
~~~~~~~~~Supp:  72  Conf:  91  Number of rules: 2
~~~~~~~~~Supp:  73  Conf:  1  Number of rules: 2
~~~~~~~~~Supp:  73  Conf:  11  Number of rules: 2
~~