# Model Training for Heart Disease

In this notebook we will predict the person is likely to have heart disease(1) or not(0)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv("C:/HealthWellness/Notebook/data/heart.csv")

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
train, test=train_test_split(df, test_size=0.3)

In [5]:
train_y=train[['HeartDisease']]
test_y=test[['HeartDisease']]

In [6]:
train_inputs=train.drop(['HeartDisease'],axis=1)
test_inputs=test.drop(['HeartDisease'],axis=1)

In [7]:
# Categorizing columns based on data type
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.to_list()
binary_columns = ['FastingBS']
numeric_columns = df.select_dtypes(include=['number']).columns.to_list()

In [8]:
numeric_columns.remove('FastingBS')

In [9]:
numeric_columns.remove('HeartDisease')

In [10]:
categorical_columns

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [11]:
numeric_columns

['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

In [12]:
binary_columns

['FastingBS']

### Pipeline

In [13]:
numeric_transformer = Pipeline(steps=[('scaler',StandardScaler())])
categorical_transformer = Pipeline(steps=[('onehot',OneHotEncoder(handle_unknown='ignore'))])
binary_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))])

In [14]:
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns)],
        remainder='drop')

In [16]:
#Fit and transform the train data
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[-1.41692469, -0.1364862 ,  0.67001975, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.15033491,  0.50271724,  0.67929248, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.57821151,  0.12984857, -1.87998105, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.41692469, -0.40282096, -1.87998105, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.04336576, -1.73449479, -1.87998105, ...,  1.        ,
         0.        ,  1.        ],
       [-0.66814064,  1.4615224 ,  0.81838344, ...,  1.        ,
         0.        ,  0.        ]])

In [17]:
train_x.shape

(642, 20)

In [18]:
# Transform the test data
test_x = preprocessor.transform(test_inputs)

test_x

array([[ 1.15033491, -0.40282096,  0.98529258, ...,  1.        ,
         0.        ,  0.        ],
       [-0.34723319, -0.9354905 , -1.87998105, ...,  1.        ,
         0.        ,  0.        ],
       [-1.52389384, -0.66915573,  0.35474693, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.40155086,  0.39618334,  0.57729245, ...,  1.        ,
         0.        ,  0.        ],
       [-0.13329489, -0.66915573,  0.06729229, ...,  0.        ,
         1.        ,  0.        ],
       [-0.77510979, -0.9354905 , -1.87998105, ...,  1.        ,
         0.        ,  0.        ]])

In [19]:
test_x.shape

(276, 20)

### Baseline

In [20]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_y)

In [21]:
from sklearn.metrics import accuracy_score

In [22]:
#Baseline Train Accuracy
dummy_train_pred = dummy_clf.predict(train_x)

baseline_train_acc = accuracy_score(train_y, dummy_train_pred)

print('Baseline Train Accuracy: {}' .format(baseline_train_acc))

Baseline Train Accuracy: 0.5389408099688473


In [23]:
#Baseline Test Accuracy
dummy_test_pred = dummy_clf.predict(test_x)

baseline_test_acc = accuracy_score(test_y, dummy_test_pred)

print('Baseline Test Accuracy: {}' .format(baseline_test_acc))

Baseline Test Accuracy: 0.5869565217391305


### Training Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(penalty='none')

log_reg.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


### Predicted vs actual

In [25]:
log_reg.predict(test_x)

array([1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1], dtype=int64)

In [26]:
# Create a new DataFrame

predictions = pd.DataFrame(log_reg.predict(test_x), columns=['Predicted'])

predictions

Unnamed: 0,Predicted
0,1
1,1
2,0
3,1
4,1
...,...
271,0
272,0
273,1
274,0


In [27]:
# Add the actual to the same DataFrame

predictions['Actual'] = np.array(test_y)

predictions

Unnamed: 0,Predicted,Actual
0,1,1
1,1,1
2,0,0
3,1,1
4,1,1
...,...,...
271,0,0
272,0,0
273,1,1
274,0,0


### Calculating overall Accuracy

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
#Predict the train values
train_y_pred = log_reg.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.8566978193146417

In [30]:
#Predict the test values
test_y_pred = log_reg.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8731884057971014

In [31]:
from sklearn.metrics import confusion_matrix

#We usually create the confusion matrix on test set
confusion_matrix(test_y, test_y_pred)

array([[ 96,  18],
       [ 17, 145]], dtype=int64)

In [32]:
from sklearn.metrics import classification_report

#We usually create the classification report on test set
print(classification_report(test_y, test_y_pred))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85       114
           1       0.89      0.90      0.89       162

    accuracy                           0.87       276
   macro avg       0.87      0.87      0.87       276
weighted avg       0.87      0.87      0.87       276



As Model is not overfitted so we are not using regularisation

### SVM Binary classification

SVC (kernel='linear')

In [33]:
from sklearn.svm import SVC
 
lin_svm = SVC(kernel="linear")

lin_svm.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


Accuracy

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
#Predict the train values
train_y_pred = lin_svm.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.867601246105919

In [36]:
#Predict the test values
test_y_pred = lin_svm.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8840579710144928

Classification matrix

In [37]:
from sklearn.metrics import confusion_matrix

#We usually create the confusion matrix on test set
confusion_matrix(test_y, test_y_pred)

array([[ 97,  17],
       [ 15, 147]], dtype=int64)

Classification report

In [38]:
from sklearn.metrics import classification_report

#We usually create the classification report on test set
print(classification_report(test_y, test_y_pred))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86       114
           1       0.90      0.91      0.90       162

    accuracy                           0.88       276
   macro avg       0.88      0.88      0.88       276
weighted avg       0.88      0.88      0.88       276



SVC (kernel='poly')

In [39]:
from sklearn.svm import SVC

# You need to enter a value for gamma. Remember, gamma controls the shape of the bell curve for rbf
# You can also set it is as gamma='scale'. This will be the default option in future releases

pol_svm = SVC(kernel="poly", degree=3, coef0=1, C=10)

pol_svm.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [40]:
#Predict the train values
train_y_pred = pol_svm.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.9719626168224299

In [41]:
#Predict the test values
test_y_pred = pol_svm.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.822463768115942

Here we can see overfitting is found

SVC(kernel='rbf')

In [42]:
rbf_svm = SVC(kernel="rbf", C=10, gamma='scale')

rbf_svm.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [43]:
#Predict the train values
train_y_pred = rbf_svm.predict(train_x)

#Train accuracy
accuracy_score(train_y, train_y_pred)

0.9626168224299065

In [44]:
#Predict the test values
test_y_pred = rbf_svm.predict(test_x)

#Test accuracy
accuracy_score(test_y, test_y_pred)

0.8442028985507246

### Decision Tree

In [45]:
from sklearn.tree import DecisionTreeClassifier 

tree_clf = DecisionTreeClassifier(max_depth=5)

tree_clf.fit(train_x, train_y)

In [46]:
from sklearn.metrics import accuracy_score

In [47]:
#Train accuracy:
train_y_pred = tree_clf.predict(train_x)

print(accuracy_score(train_y, train_y_pred))

0.9018691588785047


In [48]:
#Test accuracy:
test_y_pred = tree_clf.predict(test_x)

print(accuracy_score(test_y, test_y_pred))

0.8623188405797102


In [49]:
from sklearn.metrics import confusion_matrix

#Test confusion matrix
confusion_matrix(test_y, test_y_pred)

array([[ 95,  19],
       [ 19, 143]], dtype=int64)

Randomized Grid Search

In [51]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_grid = {'max_depth': randint(low=5, high=20), 
              'min_samples_leaf': randint(low=5, high=20)}

tree_gs = RandomizedSearchCV(DecisionTreeClassifier(), param_grid, 
                             n_iter=15, cv=5, verbose=1,
                             scoring='accuracy',
                             return_train_score=True)

tree_gs.fit(train_x, train_y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [52]:
cvres = tree_gs.cv_results_

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.7959665697674418 {'max_depth': 12, 'min_samples_leaf': 6}
0.8037669573643411 {'max_depth': 15, 'min_samples_leaf': 9}
0.8193192829457365 {'max_depth': 7, 'min_samples_leaf': 17}
0.8177931201550388 {'max_depth': 12, 'min_samples_leaf': 18}
0.8115673449612404 {'max_depth': 14, 'min_samples_leaf': 10}
0.8193556201550388 {'max_depth': 15, 'min_samples_leaf': 13}
0.8177931201550388 {'max_depth': 9, 'min_samples_leaf': 18}
0.7975169573643411 {'max_depth': 19, 'min_samples_leaf': 6}
0.8271317829457365 {'max_depth': 18, 'min_samples_leaf': 15}
0.8271317829457365 {'max_depth': 10, 'min_samples_leaf': 15}
0.8301719961240309 {'max_depth': 5, 'min_samples_leaf': 5}
0.8115067829457365 {'max_depth': 12, 'min_samples_leaf': 8}
0.8255692829457365 {'max_depth': 5, 'min_samples_leaf': 14}
0.8255692829457365 {'max_depth': 18, 'min_samples_leaf': 14}
0.8224321705426357 {'max_depth': 6, 'min_samples_leaf': 8}


In [53]:
#Find the best parameter set
tree_gs.best_params_

{'max_depth': 5, 'min_samples_leaf': 5}

In [54]:
tree_gs.best_estimator_

In [55]:
#Train accuracy:
train_y_pred = tree_gs.best_estimator_.predict(train_x)

print(accuracy_score(train_y, train_y_pred))

0.8909657320872274


In [56]:
#Test accuracy:
test_y_pred = tree_gs.best_estimator_.predict(test_x)

print(accuracy_score(test_y, test_y_pred))

0.8695652173913043


In [57]:
#Test confusion matrix
confusion_matrix(test_y, test_y_pred)

array([[ 92,  22],
       [ 14, 148]], dtype=int64)

### Random Forest

In [64]:
from sklearn.ensemble import RandomForestClassifier 

rnd_clf = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs=-1) 

rnd_clf.fit(train_x, train_y)

  return fit_method(estimator, *args, **kwargs)


In [65]:
#Train accuracy

train_y_pred = rnd_clf.predict(train_x)

train_acc = accuracy_score(train_y
                           , train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 0.9844236760124611


In [66]:
#Test accuracy

test_y_pred = rnd_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.8913043478260869


### AdaBoost Classifier

In [67]:
from sklearn.ensemble import AdaBoostClassifier 


ada_clf = AdaBoostClassifier( 
            DecisionTreeClassifier(max_depth=5), n_estimators=50, 
            learning_rate=0.1) 


ada_clf.fit(train_x, train_y)

  y = column_or_1d(y, warn=True)


In [68]:
#Train accuracy

train_y_pred = ada_clf.predict(train_x)

train_acc = accuracy_score(train_y, train_y_pred)

print('Train acc: {}' .format(train_acc))

Train acc: 1.0


In [69]:
#Test accuracy

test_y_pred = ada_clf.predict(test_x)

test_acc = accuracy_score(test_y, test_y_pred)

print('Test acc: {}' .format(test_acc))

Test acc: 0.822463768115942
