# Apriori Algorithm

In [1]:
CSV_LOCATION = 'data/heart_2020_cleaned.csv'

In [2]:
# importing relevant libraries

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

In [3]:
# Import dataset from file

data = pd.read_csv(CSV_LOCATION, low_memory=False)

data

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [4]:
# Preprocess the data

data['HeartDisease'] = data['HeartDisease'].eq('Yes').astype(int)
data['Smoking'] = data['Smoking'].eq('Yes').astype(int)
data['AlcoholDrinking'] = data['AlcoholDrinking'].eq('Yes').astype(int)
data['Stroke'] = data['Stroke'].eq('Yes').astype(int)
data['DiffWalking'] = data['DiffWalking'].eq('Yes').astype(int)
data['Diabetic'] = data['Diabetic'].eq('Yes').astype(int)
data['PhysicalActivity'] = data['PhysicalActivity'].eq('Yes').astype(int)
data['Asthma'] = data['Asthma'].eq('Yes').astype(int)
data['KidneyDisease'] = data['KidneyDisease'].eq('Yes').astype(int)
data['SkinCancer'] = data['SkinCancer'].eq('Yes').astype(int)

def min_max_scaling(series):
  """ Normalise value as explained in https://datagy.io/pandas-normalize-column/ """
  return (series - series.min()) / (series.max() - series.min())

# Categorise BMI as per https://www.health.nsw.gov.au/heal/Pages/bmi.aspx
data['BMI'] = pd.cut(
  data['BMI'],
  bins=[0, 18.5, 25, 30, 100],
  labels=[0, 1, 2, 3] # ['Underweight', 'Normal', 'Overweight', 'Obese']
)

data['PhysicalHealth'] = min_max_scaling(data['PhysicalHealth'])
data['MentalHealth'] = min_max_scaling(data['MentalHealth'])

# Categorise amount of sleep as per https://www.sleep.org/how-sleep-works/how-many-hours-of-sleep-do-we-need/
data['SleepTime'] = pd.cut(
  data['SleepTime'],
  bins=[0, 7, 9, 24],
  labels=['Low', 'Normal', 'High']
)
data

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,0,1,0,0,0.100000,1.0,0,Female,55-59,White,1,1,Very good,Low,1,0,1
1,0,1,0,0,1,0.000000,0.0,0,Female,80 or older,White,0,1,Very good,Low,0,0,0
2,0,2,1,0,0,0.666667,1.0,0,Male,65-69,White,1,1,Fair,Normal,1,0,0
3,0,1,0,0,0,0.000000,0.0,0,Female,75-79,White,0,0,Good,Low,0,0,1
4,0,1,0,0,0,0.933333,0.0,1,Female,40-44,White,0,1,Very good,Normal,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,2,1,0,0,0.233333,0.0,1,Male,60-64,Hispanic,1,0,Fair,Low,1,0,0
319791,0,2,1,0,0,0.000000,0.0,0,Male,35-39,Hispanic,0,1,Very good,Low,1,0,0
319792,0,1,0,0,0,0.000000,0.0,0,Female,45-49,Hispanic,0,1,Good,Low,0,0,0
319793,0,3,0,0,0,0.000000,0.0,0,Female,25-29,Hispanic,0,0,Good,High,0,0,0


In [5]:
data['HeartDisease'] = pd.to_numeric(data['HeartDisease'])
data['Smoking'] = pd.to_numeric(data['Smoking'])
data['AlcoholDrinking'] = pd.to_numeric(data['AlcoholDrinking'])
data['Stroke'] = pd.to_numeric(data['Stroke'])
data['DiffWalking'] = pd.to_numeric(data['DiffWalking'])
data['Diabetic'] = pd.to_numeric(data['Diabetic'])
data['PhysicalActivity'] = pd.to_numeric(data['PhysicalActivity'])
data['Asthma'] = pd.to_numeric(data['Asthma'])
data['KidneyDisease'] = pd.to_numeric(data['KidneyDisease'])
data['SkinCancer'] = pd.to_numeric(data['SkinCancer'])

print(data.dtypes.value_counts())

int32       10
object       4
float64      2
category     1
category     1
dtype: int64


In [6]:
data2 = data[['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
              'Diabetic', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']]
print(data2)

        HeartDisease  Smoking  AlcoholDrinking  Stroke  DiffWalking  Diabetic  \
0                  0        1                0       0            0         1   
1                  0        0                0       1            0         0   
2                  0        1                0       0            0         1   
3                  0        0                0       0            0         0   
4                  0        0                0       0            1         0   
...              ...      ...              ...     ...          ...       ...   
319790             1        1                0       0            1         1   
319791             0        1                0       0            0         0   
319792             0        0                0       0            0         0   
319793             0        0                0       0            0         0   
319794             0        0                0       0            0         0   

        PhysicalActivity  A

In [7]:
data3 = data2.loc[data['HeartDisease'] != 0]
data3

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Diabetic,PhysicalActivity,Asthma,KidneyDisease,SkinCancer
5,1,1,0,0,1,0,0,0,0,0
10,1,1,0,0,1,1,0,1,0,0
35,1,1,0,1,1,1,1,0,0,1
42,1,0,0,0,1,1,0,0,0,1
43,1,1,0,0,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
319765,1,0,0,1,1,1,1,1,0,0
319767,1,1,0,0,1,1,1,0,0,0
319781,1,1,0,0,0,0,1,0,0,0
319786,1,1,0,0,0,1,1,1,0,0


In [8]:
data4 = data2.loc[data['HeartDisease'] != 1]
data3

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Diabetic,PhysicalActivity,Asthma,KidneyDisease,SkinCancer
5,1,1,0,0,1,0,0,0,0,0
10,1,1,0,0,1,1,0,1,0,0
35,1,1,0,1,1,1,1,0,0,1
42,1,0,0,0,1,1,0,0,0,1
43,1,1,0,0,1,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
319765,1,0,0,1,1,1,1,1,0,0
319767,1,1,0,0,1,1,1,0,0,0
319781,1,1,0,0,0,0,1,0,0,0
319786,1,1,0,0,0,1,1,1,0,0


In [9]:
freq_items_all = apriori(data2, min_support=0.1, use_colnames=True, verbose=1)
freq_items_all.head(7)

Processing 20 combinations | Sampling itemset size 2


Unnamed: 0,support,itemsets
0,0.412477,(Smoking)
1,0.13887,(DiffWalking)
2,0.127588,(Diabetic)
3,0.775362,(PhysicalActivity)
4,0.134061,(Asthma)
5,0.299855,"(PhysicalActivity, Smoking)"


In [10]:
freq_items_pos = apriori(data3, min_support=0.1, use_colnames=True, verbose=1)
freq_items_pos.head(7)

Processing 80 combinations | Sampling itemset size 43


Unnamed: 0,support,itemsets
0,1.0,(HeartDisease)
1,0.585869,(Smoking)
2,0.16034,(Stroke)
3,0.366346,(DiffWalking)
4,0.32722,(Diabetic)
5,0.638914,(PhysicalActivity)
6,0.180214,(Asthma)


In [11]:
freq_items_neg = apriori(data4, min_support=0.1, use_colnames=True, verbose=1)
freq_items_neg.head(7)

Processing 20 combinations | Sampling itemset size 2


Unnamed: 0,support,itemsets
0,0.396246,(Smoking)
1,0.117577,(DiffWalking)
2,0.108901,(Diabetic)
3,0.788135,(PhysicalActivity)
4,0.129741,(Asthma)
5,0.294451,"(PhysicalActivity, Smoking)"


In [12]:
rules = association_rules(freq_items_all, metric="confidence", min_threshold=0.9)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction


In [13]:
data5 = pd.get_dummies(data.drop(['PhysicalHealth', 'MentalHealth'], axis=1), drop_first=True)

data5

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Diabetic,PhysicalActivity,Asthma,KidneyDisease,SkinCancer,...,Race_Black,Race_Hispanic,Race_Other,Race_White,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,SleepTime_Normal,SleepTime_High
0,0,1,0,0,0,1,1,1,0,1,...,0,0,0,1,0,0,0,1,0,0
1,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,0,1,0,0,0,1,1,1,0,0,...,0,0,0,1,1,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,1,0,0,1,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
319791,0,1,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,1,0,0
319792,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
319793,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1


In [14]:
data6 = data5.loc[data['HeartDisease'] != 0]
data6

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Diabetic,PhysicalActivity,Asthma,KidneyDisease,SkinCancer,...,Race_Black,Race_Hispanic,Race_Other,Race_White,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,SleepTime_Normal,SleepTime_High
5,1,1,0,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1
10,1,1,0,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
35,1,1,0,1,1,1,1,0,0,1,...,0,0,0,1,0,0,1,0,0,0
42,1,0,0,0,1,1,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
43,1,1,0,0,1,1,1,0,1,0,...,0,0,0,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319765,1,0,0,1,1,1,1,1,0,0,...,0,1,0,0,0,0,1,0,0,0
319767,1,1,0,0,1,1,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
319781,1,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
319786,1,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,0,0,1,1,0


In [15]:
data7 = data5.loc[data['HeartDisease'] != 1]
data7

Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Diabetic,PhysicalActivity,Asthma,KidneyDisease,SkinCancer,...,Race_Black,Race_Hispanic,Race_Other,Race_White,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,SleepTime_Normal,SleepTime_High
0,0,1,0,0,0,1,1,1,0,1,...,0,0,0,1,0,0,0,1,0,0
1,0,0,0,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,0,1,0,0,0,1,1,1,0,0,...,0,0,0,1,1,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
4,0,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319789,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,1,0
319791,0,1,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,1,0,0
319792,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,1,0,0,0,0
319793,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1


In [21]:
freq_items_true_all = apriori(data5, min_support=0.1, use_colnames=True, verbose=1)
freq_items_true_all.head(7)

Processing 25 combinations | Sampling itemset size 54


Unnamed: 0,support,itemsets
0,0.412477,(Smoking)
1,0.13887,(DiffWalking)
2,0.127588,(Diabetic)
3,0.775362,(PhysicalActivity)
4,0.134061,(Asthma)
5,0.305752,(BMI_1)
6,0.357588,(BMI_2)


In [22]:
freq_items_true_pos = apriori(data6, min_support=0.1, use_colnames=True, verbose=1)
freq_items_true_pos.head(7)

Processing 144 combinations | Sampling itemset size 65


Unnamed: 0,support,itemsets
0,1.0,(HeartDisease)
1,0.585869,(Smoking)
2,0.16034,(Stroke)
3,0.366346,(DiffWalking)
4,0.32722,(Diabetic)
5,0.638914,(PhysicalActivity)
6,0.180214,(Asthma)


In [25]:
freq_items_true_neg = apriori(data7, min_support=0.1, use_colnames=True, verbose=1)
freq_items_true_neg.head(7)

Processing 30 combinations | Sampling itemset size 54


Unnamed: 0,support,itemsets
0,0.396246,(Smoking)
1,0.117577,(DiffWalking)
2,0.108901,(Diabetic)
3,0.788135,(PhysicalActivity)
4,0.129741,(Asthma)
5,0.312695,(BMI_1)
6,0.356881,(BMI_2)


In [19]:
rules = association_rules(freq_items_true_all, metric="confidence", min_threshold=0.1)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(PhysicalActivity),(Smoking),0.775362,0.412477,0.299855,0.386728,0.937576,-0.019964,0.958015
1,(Smoking),(PhysicalActivity),0.412477,0.775362,0.299855,0.726961,0.937576,-0.019964,0.822732
2,(Smoking),(BMI_1),0.412477,0.305752,0.118517,0.287329,0.939745,-0.007599,0.974149
3,(BMI_1),(Smoking),0.305752,0.412477,0.118517,0.387623,0.939745,-0.007599,0.959414
4,(Smoking),(BMI_2),0.412477,0.357588,0.150015,0.363693,1.017071,0.002518,1.009594


In [20]:
rules = association_rules(freq_items_true_pos, metric="confidence", min_threshold=0.1)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(HeartDisease),(Smoking),1.0,0.585869,0.585869,0.585869,1.0,0.0,1.0
1,(Smoking),(HeartDisease),0.585869,1.0,0.585869,1.0,1.0,0.0,inf
2,(HeartDisease),(Stroke),1.0,0.16034,0.16034,0.16034,1.0,0.0,1.0
3,(Stroke),(HeartDisease),0.16034,1.0,0.16034,1.0,1.0,0.0,inf
4,(HeartDisease),(DiffWalking),1.0,0.366346,0.366346,0.366346,1.0,0.0,1.0
