In [1]:
import pandas as pd

In [2]:
# Read the heart-disease CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='heart_attack_dataset.csv')

In [3]:
# Show the loaded DataFrame as this cell's output (first and last rows).
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [4]:
# Print the per-column non-null counts and dtypes to check for missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): the summary reports a minimum of 0 for RestingBP and
# Cholesterol; presumably 0 stands in for a missing measurement — confirm
# against the data source before relying on these columns.
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# reproducible across runs.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Name the target first, then treat every other column as a model input.
# Selecting by name (instead of slicing off the last column) stays correct
# even if the target is not the final column of the frame.
target_col = 'HeartDisease'
input_cols = [col for col in train_df.columns if col != target_col]

In [9]:
# Show the list of feature column names selected above.
input_cols

['Age',
 'Sex',
 'ChestPainType',
 'RestingBP',
 'Cholesterol',
 'FastingBS',
 'RestingECG',
 'MaxHR',
 'ExerciseAngina',
 'Oldpeak',
 'ST_Slope']

In [10]:
import numpy as np

In [12]:
# Separate features from labels for each split. Copies are taken so that the
# preprocessing applied afterwards does not write back into train_df / val_df.
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_col].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_col].copy()

In [13]:
# Classify the input columns by dtype using the untouched training split.
# train_inputs is modified by later cells (scaled values, added one-hot
# columns), so reading dtypes from it would give a different answer if this
# cell were re-run after those cells.
raw_train_inputs = train_df[input_cols]
numeric_cols = raw_train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = raw_train_inputs.select_dtypes('object').columns.tolist()

In [14]:
from sklearn.preprocessing import MinMaxScaler

In [15]:
# Learn each numeric column's min/max from the TRAINING rows only. Fitting on
# the full frame would leak validation-set statistics into preprocessing and
# make the validation score optimistic.
scaler_heart = MinMaxScaler()
scaler_heart.fit(train_df[numeric_cols])

# Apply the train-fitted scaling to both splits. The transform always reads
# from the untouched train_df / val_df, so re-running this cell never scales
# already-scaled values. Validation values outside the training range will
# map outside [0, 1].
train_inputs[numeric_cols] = scaler_heart.transform(train_df[numeric_cols])
val_inputs[numeric_cols] = scaler_heart.transform(val_df[numeric_cols])

In [16]:
# Inspect the training inputs after the numeric columns have been scaled.
train_inputs

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
795,0.285714,M,NAP,0.60,0.398010,1.0,Normal,0.943662,N,0.386364,Down
25,0.163265,M,NAP,0.65,0.346600,0.0,Normal,0.830986,N,0.295455,Up
84,0.571429,M,ASY,0.75,0.353234,1.0,Normal,0.457746,Y,0.409091,Flat
10,0.183673,F,NAP,0.65,0.349917,0.0,Normal,0.577465,N,0.295455,Up
344,0.469388,M,ASY,0.60,0.000000,1.0,Normal,0.309859,N,0.295455,Flat
...,...,...,...,...,...,...,...,...,...,...,...
106,0.408163,F,ASY,0.60,0.421227,0.0,ST,0.352113,N,0.295455,Up
270,0.346939,M,ASY,0.60,0.373134,0.0,Normal,0.563380,N,0.295455,Up
860,0.653061,M,ASY,0.65,0.419569,0.0,Normal,0.591549,Y,0.454545,Up
435,0.653061,M,ASY,0.76,0.000000,0.0,ST,0.408451,Y,0.295455,Up


In [17]:
from sklearn.preprocessing import OneHotEncoder

In [18]:
# One-hot encoder for the categorical inputs, fitted on the training rows.
# handle_unknown='ignore' makes a category that never appeared in training
# encode as all zeros instead of raising when other rows are transformed.
enc_heart = OneHotEncoder(handle_unknown='ignore')
enc_heart.fit(train_df[categorical_cols])
# Show the categories learned for each categorical column.
enc_heart.categories_

[array(['F', 'M'], dtype=object),
 array(['ASY', 'ATA', 'NAP', 'TA'], dtype=object),
 array(['LVH', 'Normal', 'ST'], dtype=object),
 array(['N', 'Y'], dtype=object),
 array(['Down', 'Flat', 'Up'], dtype=object)]

In [19]:
# Encode the categorical columns of every row, then convert the encoder's
# result to a dense array so it can be assigned to DataFrame columns.
encoded_all_rows = enc_heart.transform(df[categorical_cols])
one_hot_heart = encoded_all_rows.toarray()
one_hot_heart

array([[0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [20]:
# Names of the generated one-hot columns, in the same order as the columns of
# the encoded array.
encoded_cols = enc_heart.get_feature_names_out(categorical_cols).tolist()
print(encoded_cols)

['Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA', 'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down', 'ST_Slope_Flat', 'ST_Slope_Up']


In [21]:
# Attach the one-hot columns to the full frame.
# NOTE(review): this mutates `df` in place, so cells above that displayed or
# split `df` no longer reflect its current columns; re-running the split after
# this cell would carry the one-hot columns into train_df / val_df.
df[encoded_cols] = one_hot_heart
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,M,ATA,140,289,0,Normal,172,N,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,M,ATA,130,283,0,ST,98,N,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,54,M,NAP,150,195,0,Normal,122,N,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
914,68,M,ASY,144,193,1,Normal,141,N,3.4,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
916,57,F,ATA,130,236,0,LVH,174,N,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [22]:
# Encode each split from its own categorical columns with the fitted encoder.
# This removes the dependency on the extra columns written into `df` and on
# implicit index alignment, and the cell produces the same result each time
# it is re-run.
train_inputs[encoded_cols] = enc_heart.transform(train_inputs[categorical_cols]).toarray()
val_inputs[encoded_cols] = enc_heart.transform(val_inputs[categorical_cols]).toarray()

In [23]:
# Inspect the training inputs now that the one-hot columns have been added.
train_inputs

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
795,0.285714,M,NAP,0.60,0.398010,1.0,Normal,0.943662,N,0.386364,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
25,0.163265,M,NAP,0.65,0.346600,0.0,Normal,0.830986,N,0.295455,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
84,0.571429,M,ASY,0.75,0.353234,1.0,Normal,0.457746,Y,0.409091,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
10,0.183673,F,NAP,0.65,0.349917,0.0,Normal,0.577465,N,0.295455,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
344,0.469388,M,ASY,0.60,0.000000,1.0,Normal,0.309859,N,0.295455,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.408163,F,ASY,0.60,0.421227,0.0,ST,0.352113,N,0.295455,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
270,0.346939,M,ASY,0.60,0.373134,0.0,Normal,0.563380,N,0.295455,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
860,0.653061,M,ASY,0.65,0.419569,0.0,Normal,0.591549,Y,0.454545,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
435,0.653061,M,ASY,0.76,0.000000,0.0,ST,0.408451,Y,0.295455,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [24]:
# Final feature matrices: scaled numeric columns followed by the one-hot
# columns. The original string-valued categorical columns are left out.
model_feature_cols = numeric_cols + encoded_cols
X_train = train_inputs[model_feature_cols]
X_val = val_inputs[model_feature_cols]

In [25]:
# Inspect the all-numeric training feature matrix passed to the model.
X_train

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
795,0.285714,0.60,0.398010,1.0,0.943662,0.386364,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
25,0.163265,0.65,0.346600,0.0,0.830986,0.295455,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
84,0.571429,0.75,0.353234,1.0,0.457746,0.409091,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
10,0.183673,0.65,0.349917,0.0,0.577465,0.295455,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
344,0.469388,0.60,0.000000,1.0,0.309859,0.295455,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,0.408163,0.60,0.421227,0.0,0.352113,0.295455,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
270,0.346939,0.60,0.373134,0.0,0.563380,0.295455,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
860,0.653061,0.65,0.419569,0.0,0.591549,0.454545,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
435,0.653061,0.76,0.000000,0.0,0.408451,0.295455,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
# Logistic-regression classifier configured with the 'liblinear' solver;
# all other hyperparameters are left at their defaults.
model_heart = LogisticRegression(solver='liblinear')


In [28]:
# Fit the classifier on the training features and their HeartDisease labels.
model_heart.fit(X_train, train_targets)

In [30]:
# Predicted class labels for the rows the model was trained on.
train_preds = model_heart.predict(X_train)

In [31]:
from sklearn.metrics import accuracy_score

# Training accuracy: fraction of training rows whose predicted label matches
# the true label (arguments are passed as y_true, y_pred).
accuracy_score(train_targets, train_preds)

0.8692098092643051

In [32]:
# Predicted class labels for the held-out validation rows.
val_preds = model_heart.predict(X_val)

In [33]:
# Validation accuracy. accuracy_score expects (y_true, y_pred), so the true
# labels go first, matching the training-accuracy call. Accuracy itself is
# symmetric, so the value is unchanged, but the corrected order is safe to
# copy for metrics that are not symmetric.
accuracy_score(val_targets, val_preds)

0.8695652173913043