In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pattern mining on relational data

In [2]:
mydf = pd.read_csv("../../../Dataset/credit_default_cleaned.csv", skipinitialspace=True)
mydf.head()

Unnamed: 0,limit,sex,education,status,age,ps-sep,ps-aug,ps-jul,ps-jun,ps-may,...,ba-jun,ba-may,ba-apr,pa-sep,pa-aug,pa-jul,pa-jun,pa-may,pa-apr,credit_default
0,50000,male,graduate school,single,25.0,2,0,0,0,0,...,49535,30358,30302,2130,1905,1811,1100,1100,1200,no
1,200000,male,university,married,54.0,-1,-1,-1,-1,-1,...,6335,4616,7956,10120,7852,6336,4622,7956,5499,no
2,30000,female,high school,married,41.0,2,3,2,2,2,...,30496,29731,29047,0,1700,1100,3,1053,1303,no
3,140000,female,university,single,28.0,0,0,0,0,0,...,25224,26855,23783,2000,2000,900,2000,10000,5000,no
4,60000,female,high school,married,36.0,1,2,2,0,0,...,48738,49601,52773,1788,0,1894,1801,3997,0,yes


From relational to transactional

In [4]:
from fim import apriori

ModuleNotFoundError: No module named 'fim'

Extract frequent patterns

In [None]:
df.isnull().sum()

Extract decision rules

Pattern mining on categorical data

In [14]:
df = pd.read_csv("../dataset/titanic.csv", skipinitialspace=True, sep=',')

In [15]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Correct missing values

In [17]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

df['Age'] = df['Age'].groupby([df['Sex'], df['Pclass']]).apply(
    lambda x: x.fillna(x.median()))
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
dtype: int64

Remove useless columns

In [18]:
column2drop = ['PassengerId', 'Name', 'Cabin', 'SibSp', 
               'Parch', 'Ticket']
df.drop(column2drop, axis=1, inplace=True)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,male,22.0,7.25,S,2
1,1,1,female,38.0,71.2833,C,2
2,1,3,female,26.0,7.925,S,1
3,1,1,female,35.0,53.1,S,2
4,0,3,male,35.0,8.05,S,1


Apply binning to continuous features

In [19]:
df['AgeBin'] = pd.cut(df['Age'].astype(int), 10, right=False)
df['FareBin'] = pd.cut(df['Fare'].astype(int), 10, right=False)

df.drop(['Age', 'Fare'], axis=1, inplace=True)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,FamilySize,AgeBin,FareBin
0,0,3,male,S,2,"[16.0, 24.0)","[0.0, 51.2)"
1,1,1,female,C,2,"[32.0, 40.0)","[51.2, 102.4)"
2,1,3,female,S,1,"[24.0, 32.0)","[0.0, 51.2)"
3,1,1,female,S,2,"[32.0, 40.0)","[51.2, 102.4)"
4,0,3,male,S,1,"[32.0, 40.0)","[0.0, 51.2)"


Remap values

In [20]:
df['Survived'] = df['Survived'].map(
    {0: 'Not Survived', 1: 'Survived'}).astype(str)
df['Pclass'] = df['Pclass'].map(
    {1: '1st', 2: '2nd', 3: '3rd'}).astype(str)

df['FamilySize'] = df['FamilySize'].astype(str) + '_Family'
df['AgeBin'] = df['AgeBin'].astype(str) + '_Age'
df['FareBin'] = df['FareBin'].astype(str) + '_Fare'

df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,FamilySize,AgeBin,FareBin
0,Not Survived,3rd,male,S,2_Family,"[16.0, 24.0)_Age","[0.0, 51.2)_Fare"
1,Survived,1st,female,C,2_Family,"[32.0, 40.0)_Age","[51.2, 102.4)_Fare"
2,Survived,3rd,female,S,1_Family,"[24.0, 32.0)_Age","[0.0, 51.2)_Fare"
3,Survived,1st,female,S,2_Family,"[32.0, 40.0)_Age","[51.2, 102.4)_Fare"
4,Not Survived,3rd,male,S,1_Family,"[32.0, 40.0)_Age","[0.0, 51.2)_Fare"


In [21]:
baskets = df.values.tolist()

In [25]:
baskets[0]

[1580, 1661, 2068, 2556, 2650, 4225]

In [26]:
itemsets = apriori(baskets, supp=1, zmin=2, target='a') 

In [27]:
print('Number of itemsets:', len(itemsets))

Number of itemsets: 392


In [28]:
itemsets[:10]

[((3086, 2443), 3),
 ((441, 2050), 3),
 ((441, 2650), 3),
 ((1278, 2243), 4),
 ((4805, 437), 3),
 ((396, 2650), 3),
 ((476, 2650), 3),
 ((4163, 2650), 3),
 ((385, 2729), 3),
 ((2498, 920), 3)]

In [29]:
rules = apriori(baskets, supp=10, zmin=2, target='r', conf=60, 
                report='ascl') 

In [22]:
print('Number of rule:', len(rules))

Number of rule: 0


In [23]:
for r in rules:
    if r[0] == 'Survived':
        print(r)

In [30]:
set(rules[0][1])

IndexError: list index out of range