Step 1 - Load the data

In [1]:
import pandas as pd

# Read heart.csv into a DataFrame (the path is relative to the working directory).
df = pd.read_csv('archive/heart.csv')

# Quick sanity check of the load: show the first rows of a few leading columns.
preview_columns = ['age', 'sex', 'cp']
preview = df[preview_columns].head()
print(preview)

   age  sex  cp
0   63    1   3
1   37    1   2
2   41    0   1
3   56    1   1
4   57    0   0


Step 2 - Preprocessing

In [2]:
from preprocessing import one_hot_encode

# Columns carried into the frame used for association rule mining.
columns_to_use = [
    'age', 'sex', 'cp', 'trtbps', 'chol', 'fbs', 'restecg',
    'thalachh', 'exng', 'oldpeak', 'slp', 'caa', 'thall', 'output',
]
arm_df = df[columns_to_use]

# Numeric measurements that get expanded into indicator columns below.
numerical_columns = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']

# One-hot encode the numeric columns via the project helper.
# NOTE(review): the printed result shows one indicator per distinct value
# (age_29, oldpeak_3.2, ...). Indicators this sparse presumably never reach a
# high support threshold, so binning the values first may be what is wanted --
# confirm intent.
arm_df_encoded = one_hot_encode(arm_df, numerical_columns)

print(arm_df_encoded.head())

   sex  cp  fbs  restecg  exng  slp  caa  thall  output  age_29  ...  \
0    1   3    1        0     0    0    0      1       1       0  ...   
1    1   2    0        1     0    0    0      2       1       0  ...   
2    0   1    0        0     0    2    0      2       1       0  ...   
3    1   1    0        1     0    2    0      2       1       0  ...   
4    0   0    0        1     1    2    0      2       1       0  ...   

   oldpeak_3.2  oldpeak_3.4  oldpeak_3.5  oldpeak_3.6  oldpeak_3.8  \
0            0            0            0            0            0   
1            0            0            1            0            0   
2            0            0            0            0            0   
3            0            0            0            0            0   
4            0            0            0            0            0   

   oldpeak_4.0  oldpeak_4.2  oldpeak_4.4  oldpeak_5.6  oldpeak_6.2  
0            0            0            0            0            0  
1       

In [3]:
# Categorical columns that still need to be expanded into indicator columns.
categorical_columns = ['cp', 'restecg', 'slp', 'caa', 'thall']

# This cell rebinds `arm_df_encoded` from itself, and the encoded result no
# longer contains the raw categorical columns. Encoding only the columns that
# are still present makes the cell safe to re-run: the first run encodes all of
# them, later runs are a no-op instead of requesting columns that are gone.
pending_columns = [col for col in categorical_columns if col in arm_df_encoded.columns]
if pending_columns:
    arm_df_encoded = one_hot_encode(arm_df_encoded, pending_columns)

print(arm_df_encoded.head())

   sex  fbs  exng  output  age_29  age_34  age_35  age_37  age_38  age_39  \
0    1    1     0       1       0       0       0       0       0       0   
1    1    0     0       1       0       0       0       1       0       0   
2    0    0     0       1       0       0       0       0       0       0   
3    1    0     0       1       0       0       0       0       0       0   
4    0    0     1       1       0       0       0       0       0       0   

   ...  slp_2  caa_0  caa_1  caa_2  caa_3  caa_4  thall_0  thall_1  thall_2  \
0  ...      0      1      0      0      0      0        0        1        0   
1  ...      0      1      0      0      0      0        0        0        1   
2  ...      1      1      0      0      0      0        0        0        1   
3  ...      1      1      0      0      0      0        0        0        1   
4  ...      1      1      0      0      0      0        0        0        1   

   thall_3  
0        0  
1        0  
2        0  
3        0

Step 3 - Association Rule Mining (ARM)

In [4]:
from mlxtend.frequent_patterns import apriori, association_rules

# Thresholds: an itemset must occur in at least 40% of the rows, and a rule's
# confidence must be at least 0.6.
min_support = 0.4
min_confidence = 0.6

# Find frequent itemsets with Apriori.
# The encoded frame holds 0/1 integers; casting to bool yields the same
# itemsets while avoiding mlxtend's slower, deprecated non-bool input path.
frequent_itemsets = apriori(arm_df_encoded.astype(bool), min_support=min_support, use_colnames=True)

# Derive association rules, keeping those that meet the confidence threshold.
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=min_confidence)

# Order the rules by support, then confidence, both descending.
rules = rules.sort_values(['support', 'confidence'], ascending=False)

# Show the top rules.
print(rules.head())

  antecedents consequents  antecedent support  consequent support   support  \
0    (output)     (caa_0)            0.544554            0.577558  0.429043   
2    (output)   (thall_2)            0.544554            0.547855  0.429043   
3   (thall_2)    (output)            0.547855            0.544554  0.429043   
1     (caa_0)    (output)            0.577558            0.544554  0.429043   

   confidence      lift  leverage  conviction  zhangs_metric  
0    0.787879  1.364156  0.114531    1.991513       0.586120  
2    0.787879  1.438116  0.130706    2.131542       0.668896  
3    0.783133  1.438116  0.130706    2.100110       0.673779  
1    0.742857  1.364156  0.114531    1.771177       0.631911  




In [5]:
# Print the full rule table. `rules` is already a DataFrame (the result of
# `sort_values`), so re-wrapping it in `pd.DataFrame` is unnecessary.
print(rules)

  antecedents consequents  antecedent support  consequent support   support  \
0    (output)     (caa_0)            0.544554            0.577558  0.429043   
2    (output)   (thall_2)            0.544554            0.547855  0.429043   
3   (thall_2)    (output)            0.547855            0.544554  0.429043   
1     (caa_0)    (output)            0.577558            0.544554  0.429043   

   confidence      lift  leverage  conviction  zhangs_metric  
0    0.787879  1.364156  0.114531    1.991513       0.586120  
2    0.787879  1.438116  0.130706    2.131542       0.668896  
3    0.783133  1.438116  0.130706    2.100110       0.673779  
1    0.742857  1.364156  0.114531    1.771177       0.631911  
