# Model Training for Heart Disease

In this notebook we will predict the person is likely to have heart disease(1) or not(0)

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [10]:
df = pd.read_csv("../Notebook/data/heart.csv")

In [11]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [12]:
train, test=train_test_split(df, test_size=0.3)

In [13]:
train_y=train[['HeartDisease']]
test_y=test[['HeartDisease']]

In [14]:
train_inputs=train.drop(['HeartDisease'],axis=1)
test_inputs=test.drop(['HeartDisease'],axis=1)

In [15]:
# Categorizing columns based on data type
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.to_list()
binary_columns = ['FastingBS']
numeric_columns = df.select_dtypes(include=['number']).columns.to_list()

In [16]:
numeric_columns.remove('FastingBS')

In [17]:
numeric_columns.remove('HeartDisease')

In [18]:
categorical_columns

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [19]:
numeric_columns

['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

In [20]:
binary_columns

['FastingBS']

### Pipeline

In [21]:
numeric_transformer = Pipeline(steps=[('scaler',StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot',OneHotEncoder(handle_unknown='ignore'))])
binary_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))])

In [22]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='drop')

In [23]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[-0.69622953,  1.28363797, -1.81620438, ...,  1.        ,
         0.        ,  0.        ],
       [-1.55333281, -0.70240797,  0.37252645, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.55366657,  0.31898709, -1.81620438, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.16053998,  1.11340546,  0.89019308, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.8037012 , -0.70240797, -1.81620438, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.91083911,  0.31898709,  0.03649724, ...,  1.        ,
         0.        ,  0.        ]])

In [24]:
train_x.shape

(642, 20)

In [25]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[ 0.05373584,  0.14875458,  0.9446843 , ...,  0.        ,
         1.        ,  1.        ],
       [ 1.01797702, -0.13496627,  1.18081294, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.8037012 ,  0.43247543,  0.76304689, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.05373584, -0.70240797,  0.52691825, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.16087375, -7.51170836, -1.81620438, ...,  1.        ,
         0.        ,  0.        ],
       [-1.87474654, -1.26984967, -1.81620438, ...,  1.        ,
         0.        ,  1.        ]])

In [26]:
test_x.shape

(276, 20)

### Baseline

In [27]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_y)

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
#Baseline Train Accuracy
dummy_train_pred = dummy_clf.predict(train_x)

baseline_train_acc = accuracy_score(train_y, dummy_train_pred)

print('Baseline Train Accuracy: {}' .format(baseline_train_acc))

Baseline Train Accuracy: 0.5607476635514018


In [30]:
#Baseline Test Accuracy
dummy_test_pred = dummy_clf.predict(test_x)

baseline_test_acc = accuracy_score(test_y, dummy_test_pred)

print('Baseline Test Accuracy: {}' .format(baseline_test_acc))

Baseline Test Accuracy: 0.5362318840579711


### Training Logistic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(penalty='none')

log_reg.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


### Predicted vs actual

In [32]:
log_reg.predict(test_x)

array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1], dtype=int64)

In [33]:
# Create a new DataFrame

predictions = pd.DataFrame(log_reg.predict(test_x), columns=['Predicted'])

predictions

Unnamed: 0,Predicted
0,0
1,1
2,1
3,1
4,0
...,...
271,0
272,1
273,1
274,1


In [34]:
# Add the actual to the same DataFrame

predictions['Actual'] = np.array(test_y)

predictions

Unnamed: 0,Predicted,Actual
0,0,0
1,1,1
2,1,1
3,1,1
4,0,0
...,...,...
271,0,0
272,1,1
273,1,0
274,1,1


### Calculating overall Accuracy

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
#Predict the train values
train_y_pred = log_reg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8551401869158879

In [37]:
#Predict the test values
test_y_pred = log_reg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.9057971014492754

In [38]:
from sklearn.metrics import confusion_matrix

#We usually create the confusion matrix on test set
confusion_matrix(test_y, test_y_pred)

array([[109,  19],
       [  7, 141]], dtype=int64)

In [39]:
from sklearn.metrics import classification_report

#We usually create the classification report on test set
print(classification_report(test_y, test_y_pred))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89       128
           1       0.88      0.95      0.92       148

    accuracy                           0.91       276
   macro avg       0.91      0.90      0.90       276
weighted avg       0.91      0.91      0.91       276



As Model is not overfitted so we are not using regularisation

### SVM Binary classification

SVC (kernel='linear')

In [40]:
from sklearn.svm import SVC
 
lin_svm = SVC(kernel="linear")

lin_svm.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


Accuracy

In [41]:
from sklearn.metrics import accuracy_score

In [42]:
#Predict the train values
train_y_pred = lin_svm.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8535825545171339

In [43]:
#Predict the test values
test_y_pred = lin_svm.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.9057971014492754

Classification matrix

In [44]:
from sklearn.metrics import confusion_matrix

#We usually create the confusion matrix on test set
confusion_matrix(test_y, test_y_pred)

array([[109,  19],
       [  7, 141]], dtype=int64)

Classification report

In [45]:
from sklearn.metrics import classification_report

#We usually create the classification report on test set
print(classification_report(test_y, test_y_pred))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89       128
           1       0.88      0.95      0.92       148

    accuracy                           0.91       276
   macro avg       0.91      0.90      0.90       276
weighted avg       0.91      0.91      0.91       276



SVC (kernel='poly')

In [46]:
from sklearn.svm import SVC

# You need to enter a value for gamma. Remember, gamma controls the shape of the bell curve for rbf
# You can also set it is as gamma='scale'. This will be the default option in future releases

pol_svm = SVC(kernel="poly", degree=3, coef0=1, C=10)

pol_svm.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [47]:
#Predict the train values
train_y_pred = pol_svm.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.9610591900311527

In [48]:
#Predict the test values
test_y_pred = pol_svm.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8695652173913043

Here we can see overfitting is found

SVC(kernel='rbf')

In [49]:
rbf_svm = SVC(kernel="rbf", C=10, gamma='scale')

rbf_svm.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [50]:
#Predict the train values
train_y_pred = rbf_svm.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.9517133956386293

In [51]:
#Predict the test values
test_y_pred = rbf_svm.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8985507246376812

### Decision Tree

In [52]:
from sklearn.tree import DecisionTreeClassifier 

tree_clf = DecisionTreeClassifier(max_depth=5)

tree_clf.fit(train_x, train_y)

In [53]:
from sklearn.metrics import accuracy_score

In [54]:
#Train accuracy:
train_y_pred = tree_clf.predict(train_x)

print(accuracy_score(train_y, train_y_pred))

0.8909657320872274


In [55]:
#Test accuracy:
test_y_pred = tree_clf.predict(test_x)

print(accuracy_score(test_y, test_y_pred))

0.9021739130434783


In [56]:
from sklearn.metrics import confusion_matrix

#Test confusion matrix
confusion_matrix(test_y, test_y_pred)

array([[110,  18],
       [  9, 139]], dtype=int64)

Randomized Grid Search

In [57]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_grid = {'max_depth': randint(low=5, high=20), 
              'min_samples_leaf': randint(low=5, high=20)}

tree_gs = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, 
                             n_iter=15, cv=5, verbose=1,
                             scoring='accuracy',
                             return_train_score=True)

tree_gs.fit(train_x, train_y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [58]:
cvres = tree_gs.cv_results_

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.7803415697674418 {'max_depth': 18, 'min_samples_leaf': 9}
0.7819282945736434 {'max_depth': 13, 'min_samples_leaf': 6}
0.8254723837209301 {'max_depth': 12, 'min_samples_leaf': 18}
0.7928052325581396 {'max_depth': 14, 'min_samples_leaf': 8}
0.8099321705426356 {'max_depth': 17, 'min_samples_leaf': 12}
0.7990673449612403 {'max_depth': 10, 'min_samples_leaf': 11}
0.803718507751938 {'max_depth': 8, 'min_samples_leaf': 11}
0.8301598837209301 {'max_depth': 16, 'min_samples_leaf': 17}
0.7865431201550388 {'max_depth': 19, 'min_samples_leaf': 9}
0.8006298449612403 {'max_depth': 12, 'min_samples_leaf': 12}
0.8099200581395349 {'max_depth': 6, 'min_samples_leaf': 14}
0.8286337209302326 {'max_depth': 14, 'min_samples_leaf': 19}
0.7959181201550388 {'max_depth': 12, 'min_samples_leaf': 10}
0.8301598837209301 {'max_depth': 5, 'min_samples_leaf': 17}
0.8286216085271316 {'max_depth': 13, 'min_samples_leaf': 16}


In [59]:
#Find the best parameter set
tree_gs.best_params_

{'max_depth': 16, 'min_samples_leaf': 17}

In [60]:
tree_gs.best_estimator_

In [61]:
#Train accuracy:
train_y_pred = tree_gs.best_estimator_.predict(train_x)

print(accuracy_score(train_y, train_y_pred))

0.8551401869158879


In [62]:
#Test accuracy:
test_y_pred = tree_gs.best_estimator_.predict(test_x)

print(accuracy_score(test_y, test_y_pred))

0.8985507246376812


In [63]:
#Test confusion matrix
confusion_matrix(test_y, test_y_pred)

array([[111,  17],
       [ 11, 137]], dtype=int64)

### Random Forest

In [64]:
from sklearn.ensemble import RandomForestClassifier 

rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=-1) 

rnd_clf.fit(train_x, train_y)

  return fit_method(estimator, *args, **kwargs)


In [65]:
#Train accuracy

train_y_pred = rnd_clf.predict(train_x)

train_acc = accuracy_score(train_y
                           , train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.9906542056074766


In [66]:
#Test accuracy

test_y_pred = rnd_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.9094202898550725


### AdaBoost Classifier

In [67]:
from sklearn.ensemble import AdaBoostClassifier 


ada_clf = AdaBoostClassifier( 
            DecisionTreeClassifier(max_depth=5), n_estimators=50, 
            learning_rate=0.1) 


ada_clf.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [68]:
#Train accuracy

train_y_pred = ada_clf.predict(train_x)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 1.0


In [69]:
#Test accuracy

test_y_pred = ada_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8840579710144928


### Neural Network

In [70]:
from sklearn.neural_network import MLPClassifier

#Default settings create 1 hidden layer with 100 neurons
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,))

mlp_clf.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [71]:
#Predict the train values
train_y_pred = mlp_clf.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.9143302180685359

In [72]:
#Predict the test values
test_y_pred = mlp_clf.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.9021739130434783

In [73]:
#We usually create the confusion matrix on test set
confusion_matrix(test_y, test_y_pred)

array([[110,  18],
       [  9, 139]], dtype=int64)

In [74]:
#Default settings create 1 hidden layer with 100 neurons
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, verbose=True)

mlp_clf.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.69607343
Iteration 2, loss = 0.64722773
Iteration 3, loss = 0.60418984
Iteration 4, loss = 0.56725417
Iteration 5, loss = 0.53513296
Iteration 6, loss = 0.50773051
Iteration 7, loss = 0.48499008
Iteration 8, loss = 0.46494616
Iteration 9, loss = 0.44867948
Iteration 10, loss = 0.43476027
Iteration 11, loss = 0.42331678
Iteration 12, loss = 0.41350892
Iteration 13, loss = 0.40553308
Iteration 14, loss = 0.39845559
Iteration 15, loss = 0.39203301
Iteration 16, loss = 0.38612774
Iteration 17, loss = 0.38110951
Iteration 18, loss = 0.37646891
Iteration 19, loss = 0.37225896
Iteration 20, loss = 0.36854573
Iteration 21, loss = 0.36515934
Iteration 22, loss = 0.36196042
Iteration 23, loss = 0.35936553
Iteration 24, loss = 0.35653924
Iteration 25, loss = 0.35426568
Iteration 26, loss = 0.35154316
Iteration 27, loss = 0.34942096
Iteration 28, loss = 0.34765810
Iteration 29, loss = 0.34579009
Iteration 30, loss = 0.34423936
Iteration 31, loss = 0.34238589
Iteration 32, los



In [75]:
#Predict the train values
train_y_pred = mlp_clf.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.9968847352024922

In [76]:
#Predict the test values
test_y_pred = mlp_clf.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8804347826086957

In [77]:
#Increase neurons from 100 to 50
mlp_clf = MLPClassifier(max_iter=1000, verbose=False,
                        hidden_layer_sizes=(50,))

mlp_clf.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [78]:
#Predict the train values
train_y_pred = mlp_clf.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.9735202492211839

In [79]:
#Predict the test values
test_y_pred = mlp_clf.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8985507246376812

### Deep Neural Network

In [80]:
dnn_clf = MLPClassifier(hidden_layer_sizes=(50,25,10),
                       max_iter=1000)

dnn_clf.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [81]:
#Let's check the number of iterations:
dnn_clf.n_iter_

497

In [82]:
#Let's check the number of layers:
dnn_clf.n_layers_

5

In [83]:
#Predict the train values
train_y_pred = dnn_clf.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

1.0

In [84]:
#Predict the test values
test_y_pred = dnn_clf.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8623188405797102

In [85]:
dnn_clf = MLPClassifier(hidden_layer_sizes=(50,25,10),
                       max_iter=1000,
                       early_stopping=True)

dnn_clf.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [86]:
#Let's check the number of iterations:
dnn_clf.n_iter_

17

In [87]:
#Predict the train values
train_y_pred = dnn_clf.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8364485981308412

In [88]:
#Predict the test values
test_y_pred = dnn_clf.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8731884057971014