In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pattern mining on relational data

In [2]:
mydf = pd.read_csv("../../../Dataset/credit_default_cleaned.csv", skipinitialspace=True)
mydf.head()

Unnamed: 0,limit,sex,education,status,age,ps-sep,ps-aug,ps-jul,ps-jun,ps-may,...,ba-jun,ba-may,ba-apr,pa-sep,pa-aug,pa-jul,pa-jun,pa-may,pa-apr,credit_default
0,50000,male,graduate school,single,25.0,2,0,0,0,0,...,49535,30358,30302,2130,1905,1811,1100,1100,1200,no
1,200000,male,university,married,54.0,-1,-1,-1,-1,-1,...,6335,4616,7956,10120,7852,6336,4622,7956,5499,no
2,30000,female,high school,married,41.0,2,3,2,2,2,...,30496,29731,29047,0,1700,1100,3,1053,1303,no
3,140000,female,university,single,28.0,0,0,0,0,0,...,25224,26855,23783,2000,2000,900,2000,10000,5000,no
4,60000,female,high school,married,36.0,1,2,2,0,0,...,48738,49601,52773,1788,0,1894,1801,3997,0,yes


From relational to transactional

In [3]:
from fim import apriori

Extract frequent patterns

Extract decision rules

Pattern mining on categorical data

In [4]:
df = pd.read_csv("titanic.csv", skipinitialspace=True, sep=',')

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Remove useless columns

In [6]:
column2drop = ['PassengerId', 'Name', 'Cabin', 'SibSp', 
               'Parch', 'Ticket']
df.drop(column2drop, axis=1, inplace=True)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.25,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.925,S
3,1,1,female,35.0,53.1,S
4,0,3,male,35.0,8.05,S


Apply binning to continuous features

In [7]:
mydf['AgeBin'] = pd.cut(mydf['age'].astype(int), 10, right=False)
mydf['LimitBin'] = pd.cut(mydf['limit'].astype(int), 50, right=False)

mydf.drop(['age', 'limit'], axis=1, inplace=True)

columnsTitles = ["credit_default", "sex", "education", "status", "AgeBin", "LimitBin", "ps-sep", "ps-aug", "ps-jul", "ps-jun", "ps-may", "ps-apr", "pa-sep", "pa-aug", "pa-jul", "pa-jun", "pa-may", "pa-apr", "ba-sep", "ba-aug", "ba-jul", "ba-jun", "ba-may", "ba-apr"]

mydf = mydf[columnsTitles]
mydf.head()

Unnamed: 0,sex,education,status,ps-sep,ps-aug,ps-jul,ps-jun,ps-may,ps-apr,ba-sep,...,ba-apr,pa-sep,pa-aug,pa-jul,pa-jun,pa-may,pa-apr,credit_default,AgeBin,LimitBin
0,male,graduate school,single,2,0,0,0,0,0,48966,...,30302,2130,1905,1811,1100,1100,1200,no,"[21.0, 26.4)","[40800.0, 56200.0)"
1,male,university,married,-1,-1,-1,-1,-1,-1,10755,...,7956,10120,7852,6336,4622,7956,5499,no,"[53.4, 58.8)","[194800.0, 210200.0)"
2,female,high school,married,2,3,2,2,2,0,30057,...,29047,0,1700,1100,3,1053,1303,no,"[37.2, 42.6)","[25400.0, 40800.0)"
3,female,university,single,0,0,0,0,0,0,56426,...,23783,2000,2000,900,2000,10000,5000,no,"[26.4, 31.8)","[133200.0, 148600.0)"
4,female,high school,married,1,2,2,0,0,0,47987,...,52773,1788,0,1894,1801,3997,0,yes,"[31.8, 37.2)","[56200.0, 71600.0)"


Remap values (not labeling ps, since they are the only values in [-2, +6]

In [9]:
mydf['ba-sep'] = mydf['ba-sep'].astype(str) + '_ba-sep'
mydf['ba-aug'] = mydf['ba-aug'].astype(str) + '_ba-aug'
mydf['ba-jul'] = mydf['ba-jul'].astype(str) + '_ba-jul'
mydf['ba-jun'] = mydf['ba-jun'].astype(str) + '_ba-jun'
mydf['ba-may'] = mydf['ba-may'].astype(str) + '_ba-may'
mydf['ba-apr'] = mydf['ba-apr'].astype(str) + '_ba-apr'

mydf['pa-sep'] = mydf['pa-sep'].astype(str) + '_pa-sep'
mydf['pa-aug'] = mydf['pa-aug'].astype(str) + '_pa-aug'
mydf['pa-jul'] = mydf['pa-jul'].astype(str) + '_pa-jul'
mydf['pa-jun'] = mydf['pa-jun'].astype(str) + '_pa-jun'
mydf['pa-may'] = mydf['pa-may'].astype(str) + '_pa-may'
mydf['pa-apr'] = mydf['pa-apr'].astype(str) + '_pa-apr'

mydf['AgeBin'] = mydf['AgeBin'].astype(str) + '_Age'
mydf['LimitBin'] = mydf['LimitBin'].astype(str) + '_Limit'

mydf.head()

Unnamed: 0,sex,education,status,ps-sep,ps-aug,ps-jul,ps-jun,ps-may,ps-apr,ba-sep,...,ba-apr,pa-sep,pa-aug,pa-jul,pa-jun,pa-may,pa-apr,credit_default,AgeBin,LimitBin
0,male,graduate school,single,2,0,0,0,0,0,48966_ba-sep_ba-sep,...,30302_ba-apr_ba-apr,2130_pa-sep_pa-sep,1905_pa-aug_pa-aug,1811_pa-jul_pa-jul,1100_pa-jun_pa-jun,1100_pa-may_pa-may,1200_pa-apr_pa-apr,no,"[21.0, 26.4)_Age_Age","[40800.0, 56200.0)_Limit_Limit"
1,male,university,married,-1,-1,-1,-1,-1,-1,10755_ba-sep_ba-sep,...,7956_ba-apr_ba-apr,10120_pa-sep_pa-sep,7852_pa-aug_pa-aug,6336_pa-jul_pa-jul,4622_pa-jun_pa-jun,7956_pa-may_pa-may,5499_pa-apr_pa-apr,no,"[53.4, 58.8)_Age_Age","[194800.0, 210200.0)_Limit_Limit"
2,female,high school,married,2,3,2,2,2,0,30057_ba-sep_ba-sep,...,29047_ba-apr_ba-apr,0_pa-sep_pa-sep,1700_pa-aug_pa-aug,1100_pa-jul_pa-jul,3_pa-jun_pa-jun,1053_pa-may_pa-may,1303_pa-apr_pa-apr,no,"[37.2, 42.6)_Age_Age","[25400.0, 40800.0)_Limit_Limit"
3,female,university,single,0,0,0,0,0,0,56426_ba-sep_ba-sep,...,23783_ba-apr_ba-apr,2000_pa-sep_pa-sep,2000_pa-aug_pa-aug,900_pa-jul_pa-jul,2000_pa-jun_pa-jun,10000_pa-may_pa-may,5000_pa-apr_pa-apr,no,"[26.4, 31.8)_Age_Age","[133200.0, 148600.0)_Limit_Limit"
4,female,high school,married,1,2,2,0,0,0,47987_ba-sep_ba-sep,...,52773_ba-apr_ba-apr,1788_pa-sep_pa-sep,0_pa-aug_pa-aug,1894_pa-jul_pa-jul,1801_pa-jun_pa-jun,3997_pa-may_pa-may,0_pa-apr_pa-apr,yes,"[31.8, 37.2)_Age_Age","[56200.0, 71600.0)_Limit_Limit"


In [15]:
baskets = mydf.values.tolist()

In [16]:
baskets[0]

['male',
 'graduate school',
 'single',
 2,
 0,
 0,
 0,
 0,
 0,
 '48966_ba-sep_ba-sep',
 '49985_ba-aug_ba-aug',
 '50760_ba-jul_ba-jul',
 '49535_ba-jun_ba-jun',
 '30358_ba-may_ba-may',
 '30302_ba-apr_ba-apr',
 '2130_pa-sep_pa-sep',
 '1905_pa-aug_pa-aug',
 '1811_pa-jul_pa-jul',
 '1100_pa-jun_pa-jun',
 '1100_pa-may_pa-may',
 '1200_pa-apr_pa-apr',
 'no',
 '[21.0, 26.4)_Age_Age',
 '[40800.0, 56200.0)_Limit_Limit']

In [17]:
itemsets = apriori(baskets, supp=1, zmin=2, target='a') 

In [18]:
print('Number of itemsets:', len(itemsets))

Number of itemsets: 197817


In [19]:
itemsets[:10]

[(('10000_pa-jun_pa-jun', 0), 100),
 (('10000_pa-may_pa-may', 0), 104),
 (('10000_pa-may_pa-may', 'no'), 100),
 (('10000_pa-apr_pa-apr', 0), 101),
 (('4000_pa-jun_pa-jun', 0), 111),
 (('4000_pa-jul_pa-jul', 0), 119),
 (('4000_pa-jul_pa-jul', 0, 'no'), 103),
 (('4000_pa-jul_pa-jul', 'no'), 104),
 (('10000_pa-sep_pa-sep', 0), 124),
 (('10000_pa-sep_pa-sep', 0, 'no'), 115)]

In [20]:
rules = apriori(baskets, supp=10, zmin=2, target='r', conf=60, 
                report='ascl') 

In [22]:
print('Number of rules:', len(rules))

Number of rules: 649


In [23]:
for r in rules:
    if r[0] == 'yes':
        print(r)

In [30]:
set(rules[0][1])

IndexError: list index out of range