# Model Training for Heart Disease

In this notebook we will predict the person is likely to have heart disease(1) or not(0)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv("C:/HealthWellness/Notebook/data/heart.csv")

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
train, test=train_test_split(df, test_size=0.3)

In [5]:
train_y=train[['HeartDisease']]
test_y=test[['HeartDisease']]

In [6]:
train_inputs=train.drop(['HeartDisease'],axis=1)
test_inputs=test.drop(['HeartDisease'],axis=1)

In [7]:
# Categorizing columns based on data type
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.to_list()
binary_columns = ['FastingBS']
numeric_columns = df.select_dtypes(include=['number']).columns.to_list()

In [8]:
numeric_columns.remove('FastingBS')

In [9]:
numeric_columns.remove('HeartDisease')

In [10]:
categorical_columns

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [11]:
numeric_columns

['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

In [12]:
binary_columns

['FastingBS']

### Pipeline

In [13]:
numeric_transformer = Pipeline(steps=[('scaler',StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot',OneHotEncoder(handle_unknown='ignore'))])
binary_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))])

In [14]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='drop')

In [16]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[-1.41692469, -0.1364862 ,  0.67001975, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.15033491,  0.50271724,  0.67929248, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.57821151,  0.12984857, -1.87998105, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.41692469, -0.40282096, -1.87998105, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.04336576, -1.73449479, -1.87998105, ...,  1.        ,
         0.        ,  1.        ],
       [-0.66814064,  1.4615224 ,  0.81838344, ...,  1.        ,
         0.        ,  0.        ]])

In [17]:
train_x.shape

(642, 20)

In [18]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[ 1.15033491, -0.40282096,  0.98529258, ...,  1.        ,
         0.        ,  0.        ],
       [-0.34723319, -0.9354905 , -1.87998105, ...,  1.        ,
         0.        ,  0.        ],
       [-1.52389384, -0.66915573,  0.35474693, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.40155086,  0.39618334,  0.57729245, ...,  1.        ,
         0.        ,  0.        ],
       [-0.13329489, -0.66915573,  0.06729229, ...,  0.        ,
         1.        ,  0.        ],
       [-0.77510979, -0.9354905 , -1.87998105, ...,  1.        ,
         0.        ,  0.        ]])

In [19]:
test_x.shape

(276, 20)

### Baseline

In [20]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_y)

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
#Baseline Train Accuracy
dummy_train_pred = dummy_clf.predict(train_x)

baseline_train_acc = accuracy_score(train_y, dummy_train_pred)

print('Baseline Train Accuracy: {}' .format(baseline_train_acc))

Baseline Train Accuracy: 0.5389408099688473


In [23]:
#Baseline Test Accuracy
dummy_test_pred = dummy_clf.predict(test_x)

baseline_test_acc = accuracy_score(test_y, dummy_test_pred)

print('Baseline Test Accuracy: {}' .format(baseline_test_acc))

Baseline Test Accuracy: 0.5869565217391305


### Training Logistic Regression