<img align="right" width="250" src="http://www.sobigdata.eu/sites/default/files/logo-SoBigData-DEFINITIVO.png">
**Author:** [Riccardo Guidotti](http://kdd.isti.cnr.it/people/riccardo-guidotti)  
**Python version:**  3.x

In [39]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Pattern mining on relational data

In [40]:
#df = pd.read_csv("../dataset/small_transactions.csv", skipinitialspace=True, sep=',')
df = pd.read_csv("/Users/riccardo/Documents/Didattica/CorsoDM_2018_2019/Dataset/small_transactions.csv", 
                 skipinitialspace=True, sep=',', nrows=2000)
df.head()

Unnamed: 0,SCONTRINO_ID,COD_MKT_ID
0,2558064013053,1580
1,2558064013053,1661
2,2558064013053,2068
3,2558064013053,2556
4,2558064013053,2650


From relational to transactional

In [41]:
dt = df.groupby('SCONTRINO_ID')['COD_MKT_ID'].apply(list)
dt.head()

SCONTRINO_ID
2558064013053                 [1580, 1661, 2068, 2556, 2650, 4225]
2558064013054    [437, 1278, 1614, 2089, 2243, 2245, 2443, 2551...
2558064013055                         [151, 595, 2650, 4600, 4872]
2558064013056             [142, 437, 2499, 2515, 3458, 3675, 4044]
2558064013057                                          [437, 3087]
Name: COD_MKT_ID, dtype: object

In [42]:
baskets = dt.values

In [43]:
baskets[0:10]

array([list([1580, 1661, 2068, 2556, 2650, 4225]),
       list([437, 1278, 1614, 2089, 2243, 2245, 2443, 2551, 3448, 6172]),
       list([151, 595, 2650, 4600, 4872]),
       list([142, 437, 2499, 2515, 3458, 3675, 4044]), list([437, 3087]),
       list([445, 446, 2050, 2650, 5046]),
       list([483, 920, 1461, 1488, 2241, 3682, 4069, 4079, 4109, 4844, 4847]),
       list([1581, 2650, 2731, 3087, 4176]),
       list([384, 560, 2065, 2243, 2499, 4041]),
       list([607, 2122, 2650, 4655])], dtype=object)

In [44]:
from fim import apriori

Extract frequent patterns

In [45]:
itemsets = apriori(baskets, supp=1, zmin=2, target='a') 

In [46]:
print('Number of itemsets:', len(itemsets))

Number of itemsets: 392


In [47]:
itemsets[:10]

[((3086, 2443), 3),
 ((441, 2050), 3),
 ((441, 2650), 3),
 ((1278, 2243), 4),
 ((4805, 437), 3),
 ((396, 2650), 3),
 ((476, 2650), 3),
 ((4163, 2650), 3),
 ((385, 2729), 3),
 ((2498, 920), 3)]

Extract decision rules

In [48]:
rules = apriori(baskets, supp=1, zmin=2, target='r', conf=60, 
                report='ascl') 

In [49]:
print('Number of rule:', len(rules))

Number of rule: 1096


In [50]:
count = 0
for r in rules:
    if r[5] > 2 and r[4] > 0.7:
        print(r)
        count += 1
        if count == 10:
            break

(2443, (3086,), 3, 0.013215859030837005, 1.0, 9.869565217391305)
(2050, (441,), 3, 0.013215859030837005, 0.75, 13.096153846153847)
(2650, (441,), 3, 0.013215859030837005, 0.75, 2.541044776119403)
(2243, (1278,), 4, 0.01762114537444934, 1.0, 14.1875)
(437, (4805,), 3, 0.013215859030837005, 1.0, 9.458333333333334)
(2650, (396,), 3, 0.013215859030837005, 1.0, 3.388059701492537)
(2650, (4163,), 3, 0.013215859030837005, 1.0, 3.388059701492537)
(2729, (385,), 3, 0.013215859030837005, 0.75, 8.5125)
(2650, (4182, 445), 3, 0.013215859030837005, 1.0, 3.388059701492537)
(445, (4182, 2650), 3, 0.013215859030837005, 1.0, 5.27906976744186)


In [51]:
#help(apriori)

In [57]:
rules[0]

(2650, (595,), 2, 0.00881057268722467, 0.6666666666666666, 2.2587064676616917)

In [56]:
for b in baskets:
    if set(rules[0][1]) < set(b):
        print(b)
        break

[151, 595, 2650, 4600, 4872]


Pattern mining on categorical data

In [14]:
df = pd.read_csv("../dataset/titanic.csv", skipinitialspace=True, sep=',')

In [15]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Correct missing values

In [17]:
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

df['Age'] = df['Age'].groupby([df['Sex'], df['Pclass']]).apply(
    lambda x: x.fillna(x.median()))
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
dtype: int64

Remove useless columns

In [18]:
column2drop = ['PassengerId', 'Name', 'Cabin', 'SibSp', 
               'Parch', 'Ticket']
df.drop(column2drop, axis=1, inplace=True)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,male,22.0,7.25,S,2
1,1,1,female,38.0,71.2833,C,2
2,1,3,female,26.0,7.925,S,1
3,1,1,female,35.0,53.1,S,2
4,0,3,male,35.0,8.05,S,1


Apply binning to continuous features

In [19]:
df['AgeBin'] = pd.cut(df['Age'].astype(int), 10, right=False)
df['FareBin'] = pd.cut(df['Fare'].astype(int), 10, right=False)

df.drop(['Age', 'Fare'], axis=1, inplace=True)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,FamilySize,AgeBin,FareBin
0,0,3,male,S,2,"[16.0, 24.0)","[0.0, 51.2)"
1,1,1,female,C,2,"[32.0, 40.0)","[51.2, 102.4)"
2,1,3,female,S,1,"[24.0, 32.0)","[0.0, 51.2)"
3,1,1,female,S,2,"[32.0, 40.0)","[51.2, 102.4)"
4,0,3,male,S,1,"[32.0, 40.0)","[0.0, 51.2)"


Remap values

In [20]:
df['Survived'] = df['Survived'].map(
    {0: 'Not Survived', 1: 'Survived'}).astype(str)
df['Pclass'] = df['Pclass'].map(
    {1: '1st', 2: '2nd', 3: '3rd'}).astype(str)

df['FamilySize'] = df['FamilySize'].astype(str) + '_Family'
df['AgeBin'] = df['AgeBin'].astype(str) + '_Age'
df['FareBin'] = df['FareBin'].astype(str) + '_Fare'

df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked,FamilySize,AgeBin,FareBin
0,Not Survived,3rd,male,S,2_Family,"[16.0, 24.0)_Age","[0.0, 51.2)_Fare"
1,Survived,1st,female,C,2_Family,"[32.0, 40.0)_Age","[51.2, 102.4)_Fare"
2,Survived,3rd,female,S,1_Family,"[24.0, 32.0)_Age","[0.0, 51.2)_Fare"
3,Survived,1st,female,S,2_Family,"[32.0, 40.0)_Age","[51.2, 102.4)_Fare"
4,Not Survived,3rd,male,S,1_Family,"[32.0, 40.0)_Age","[0.0, 51.2)_Fare"


In [21]:
baskets = df.values.tolist()

In [25]:
baskets[0]

[1580, 1661, 2068, 2556, 2650, 4225]

In [26]:
itemsets = apriori(baskets, supp=1, zmin=2, target='a') 

In [27]:
print('Number of itemsets:', len(itemsets))

Number of itemsets: 392


In [28]:
itemsets[:10]

[((3086, 2443), 3),
 ((441, 2050), 3),
 ((441, 2650), 3),
 ((1278, 2243), 4),
 ((4805, 437), 3),
 ((396, 2650), 3),
 ((476, 2650), 3),
 ((4163, 2650), 3),
 ((385, 2729), 3),
 ((2498, 920), 3)]

In [29]:
rules = apriori(baskets, supp=10, zmin=2, target='r', conf=60, 
                report='ascl') 

In [22]:
print('Number of rule:', len(rules))

Number of rule: 0


In [23]:
for r in rules:
    if r[0] == 'Survived':
        print(r)

In [30]:
set(rules[0][1])

IndexError: list index out of range