In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the raw heart-attack prediction dataset; `data` stays the unmodified
# reference frame for the rest of the notebook.
data = pd.read_csv(
    filepath_or_buffer='../../data/raw/heart_attack_prediction_dataset.csv'
)

### Handle Categorical Features

In [3]:
# Names of all columns in the raw frame that pandas stores with object dtype
# (i.e. the string-valued columns).
categorical_features = data.columns[data.dtypes == 'object'].tolist()
categorical_features

['Patient ID',
 'Sex',
 'Blood Pressure',
 'Diet',
 'Country',
 'Continent',
 'Hemisphere']

In [4]:
# Build the working frame without the patient identifier and the three
# geographic columns, which this notebook treats as redundant features.
df = data.drop(
    ['Patient ID', 'Country', 'Continent', 'Hemisphere'],
    axis='columns',
)
df.head(n=5)

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Previous Heart Problems,Medication Use,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk
0,67,Male,208,158/88,72,0,0,1,0,0,...,0,0,9,6.615001,261404,31.251233,286,0,6,0
1,21,Male,389,165/93,98,1,1,1,1,1,...,1,0,1,4.963459,285768,27.194973,235,1,7,0
2,21,Female,324,174/99,72,1,0,0,0,0,...,1,1,9,9.463426,235282,28.176571,587,4,4,0
3,84,Male,383,163/100,73,1,1,1,0,1,...,1,0,9,7.648981,125640,36.464704,378,3,4,0
4,66,Male,318,91/88,93,1,1,1,1,0,...,1,0,6,1.514821,160555,21.809144,231,1,5,0


In [5]:
# Object-dtype columns that remain in the working frame after the drop.
categorical = df.columns[df.dtypes == 'object'].tolist()
categorical

['Sex', 'Blood Pressure', 'Diet']

In [6]:
# Class frequencies of the categorical feature Sex.
data.loc[:, "Sex"].value_counts()

Sex
Male      6111
Female    2652
Name: count, dtype: int64

In [7]:
# Frequencies of each distinct Blood Pressure string.
data.loc[:, "Blood Pressure"].value_counts()

Blood Pressure
101/93     8
146/94     8
145/104    7
106/64     7
94/109     7
          ..
163/70     1
101/68     1
124/62     1
161/78     1
158/71     1
Name: count, Length: 3915, dtype: int64

In [8]:
# Class frequencies of the categorical feature Diet.
data.loc[:, "Diet"].value_counts()

Diet
Healthy      2960
Average      2912
Unhealthy    2891
Name: count, dtype: int64

In [9]:
# Split the "systolic/diastolic" Blood Pressure strings into two integer columns.
# The strings are read from the untouched raw frame `data` instead of `df`, so
# this cell can be re-executed even after 'Blood Pressure' has been dropped from
# `df` (reading df['Blood Pressure'] at that point would raise a KeyError).
# `df` was derived from `data` by dropping columns only, so both frames share
# the same row index and the assignment lines up row for row.
df[['Systolic Pressure', 'Diastolic Pressure']] = (
    data['Blood Pressure'].str.split('/', expand=True).astype(int)
)
df.head()

Unnamed: 0,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,...,Stress Level,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,Systolic Pressure,Diastolic Pressure
0,67,Male,208,158/88,72,0,0,1,0,0,...,9,6.615001,261404,31.251233,286,0,6,0,158,88
1,21,Male,389,165/93,98,1,1,1,1,1,...,1,4.963459,285768,27.194973,235,1,7,0,165,93
2,21,Female,324,174/99,72,1,0,0,0,0,...,9,9.463426,235282,28.176571,587,4,4,0,174,99
3,84,Male,383,163/100,73,1,1,1,0,1,...,9,7.648981,125640,36.464704,378,3,4,0,163,100
4,66,Male,318,91/88,93,1,1,1,1,0,...,6,1.514821,160555,21.809144,231,1,5,0,91,88


In [10]:
# Drop the original Blood Pressure column now that it has been split into the
# Systolic/Diastolic Pressure columns.
# errors='ignore' keeps this self-referential reassignment re-runnable: on a
# second execution the column is already gone and the call is a no-op instead
# of raising a KeyError.
df = df.drop(columns=['Blood Pressure'], errors='ignore')

### One-Hot Encode Categorical Features

In [23]:
# Object-dtype columns still left to encode in the working frame.
categorical_feature = df.columns[df.dtypes == 'object'].tolist()
categorical_feature

['Sex', 'Diet']

In [34]:
# One-hot encode the two remaining categorical columns in a single chain:
#   1. Sex  -> one indicator column (drop_first=True removes the first level)
#   2. Diet -> one indicator column per level (all levels kept)
df_encoded = (
    df
    .pipe(pd.get_dummies, columns=['Sex'], drop_first=True)
    .pipe(pd.get_dummies, columns=['Diet'])
)

In [35]:
# Preview the first five rows of the encoded frame.
df_encoded.head(n=5)

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,...,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Heart Attack Risk,Systolic Pressure,Diastolic Pressure,Sex_Male,Diet_Average,Diet_Healthy,Diet_Unhealthy
0,67,208,72,0,0,1,0,0,4.168189,0,...,286,0,6,0,158,88,True,True,False,False
1,21,389,98,1,1,1,1,1,1.813242,1,...,235,1,7,0,165,93,True,False,False,True
2,21,324,72,1,0,0,0,0,2.078353,1,...,587,4,4,0,174,99,False,False,True,False
3,84,383,73,1,1,1,0,1,9.82813,1,...,378,3,4,0,163,100,True,True,False,False
4,66,318,93,1,1,1,1,0,5.804299,1,...,231,1,5,0,91,88,True,False,False,True


In [36]:
# Print the column dtypes and non-null counts of the encoded frame.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              8763 non-null   int64  
 1   Cholesterol                      8763 non-null   int64  
 2   Heart Rate                       8763 non-null   int64  
 3   Diabetes                         8763 non-null   int64  
 4   Family History                   8763 non-null   int64  
 5   Smoking                          8763 non-null   int64  
 6   Obesity                          8763 non-null   int64  
 7   Alcohol Consumption              8763 non-null   int64  
 8   Exercise Hours Per Week          8763 non-null   float64
 9   Previous Heart Problems          8763 non-null   int64  
 10  Medication Use                   8763 non-null   int64  
 11  Stress Level                     8763 non-null   int64  
 12  Sedentary Hours Per 

In [37]:
# Show the final column labels of the encoded frame.
df_encoded.columns

Index(['Age', 'Cholesterol', 'Heart Rate', 'Diabetes', 'Family History',
       'Smoking', 'Obesity', 'Alcohol Consumption', 'Exercise Hours Per Week',
       'Previous Heart Problems', 'Medication Use', 'Stress Level',
       'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides',
       'Physical Activity Days Per Week', 'Sleep Hours Per Day',
       'Heart Attack Risk', 'Systolic Pressure', 'Diastolic Pressure',
       'Sex_Male', 'Diet_Average', 'Diet_Healthy', 'Diet_Unhealthy'],
      dtype='object')

In [38]:
# Write the encoded dataset to the data_encoded folder (resolved relative to the
# notebook's working directory).
# DataFrame.to_csv does not create missing directories, so make sure the target
# folder exists first; exist_ok=True makes this safe to re-run.
# index=False keeps the row index out of the file.
encoded_path = Path("data_encoded/encoded_heart_attack_risk_dataset.csv")
encoded_path.parent.mkdir(parents=True, exist_ok=True)
df_encoded.to_csv(encoded_path, index=False)
