In [1]:
# Numerical / tabular / plotting stack used throughout the notebook.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plot styling for the session.
sns.set()

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any deprecation or convergence warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the preprocessed dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='heart_preprocessed.csv')
data.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,0
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,0
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,0
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,0
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,0


In [3]:
# The label is the 'target' column; every other column is used as a predictor.
y = data['target']
X = data.drop(columns=['target'])

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1000,
)

# Show which container types the split returns.
for split_part in (X_train, y_train):
    print(type(split_part))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [5]:
# Standardize the features. The scaler learns its statistics from the training
# split only and the same fitted transformation is then applied to the test split.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The scaled data comes back as a plain array rather than a DataFrame.
print(type(X_train_scaled))

<class 'numpy.ndarray'>


## Creating Base Models

## Logistic Regression:

In [6]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with library defaults, fitted on the
# standardized training split.
Reg = LogisticRegression().fit(X_train_scaled, y_train)
train_accuracy = Reg.score(X_train_scaled, y_train)
print('Training Accuracy: ', train_accuracy)

Training Accuracy:  0.8760330578512396


In [7]:
# Interpreting coefficients: pair each feature name with its fitted weight.
# Reg was trained on standardized features, so every coefficient is expressed
# per one standard deviation of the corresponding feature.
coef = pd.DataFrame({
    'Features': X.columns.values,
    'Coefficients': Reg.coef_.ravel(),
})
coef.head()

Unnamed: 0,Features,Coefficients
0,age,0.016931
1,sex,0.840808
2,cp,-0.892122
3,trestbps,0.279955
4,chol,0.041359


In [8]:
# Exponentiate the log-odds coefficients to turn them into odds ratios.
exp = np.exp(coef['Coefficients'])

In [9]:
# Store the odds ratios as a new column alongside the raw coefficients
# (mutates `coef` in place).
coef['Odds Ratio'] = exp

In [10]:
# Rank the features from the largest to the smallest odds ratio.
coef.sort_values(by='Odds Ratio', ascending=False)



Unnamed: 0,Features,Coefficients,Odds Ratio
1,sex,0.840808,2.31824
11,ca,0.775146,2.17091
9,oldpeak,0.735741,2.087029
12,thal,0.652405,1.920154
8,exang,0.645179,1.906328
3,trestbps,0.279955,1.32307
4,chol,0.041359,1.042227
0,age,0.016931,1.017075
5,fbs,-0.144191,0.865723
6,restecg,-0.315886,0.729142


In [11]:
from sklearn.model_selection import KFold

# 10-fold cross-validation of logistic regression on the training split.
# train_test_split already shuffled the rows, so the folds are taken in order.
# random_state is deliberately NOT passed: it has no effect when shuffle=False
# and recent scikit-learn releases raise a ValueError for that combination.
# NOTE: X_train_scaled was standardized with statistics from the whole training
# split, so each validation fold has slightly influenced its own scaling.
kf = KFold(n_splits=10, shuffle=False)
acc = []

for train_index, test_index in kf.split(X_train_scaled, y_train):
    # X_train_scaled is a NumPy array (plain positional indexing), while
    # y_train is a pandas Series and needs .iloc to select by position.
    X_kf_train, X_kf_test = X_train_scaled[train_index], X_train_scaled[test_index]
    y_kf_train, y_kf_test = y_train.iloc[train_index], y_train.iloc[test_index]
    reg_new = LogisticRegression().fit(X_kf_train, y_kf_train)
    # Score the validation fold once; reuse the value for printing and averaging.
    val_acc = reg_new.score(X_kf_test, y_kf_test)
    print('Training Accuracy: ', reg_new.score(X_kf_train, y_kf_train))
    print('Validation Accuracy: ', val_acc)
    print('------------------------------------------------')
    acc.append(val_acc)


Training Accuracy:  0.8894009216589862
Validation Accuracy:  0.8
------------------------------------------------
Training Accuracy:  0.880184331797235
Validation Accuracy:  0.8
------------------------------------------------
Training Accuracy:  0.8532110091743119
Validation Accuracy:  0.8333333333333334
------------------------------------------------
Training Accuracy:  0.8669724770642202
Validation Accuracy:  0.9583333333333334
------------------------------------------------
Training Accuracy:  0.8623853211009175
Validation Accuracy:  0.9166666666666666
------------------------------------------------
Training Accuracy:  0.8715596330275229
Validation Accuracy:  0.8333333333333334
------------------------------------------------
Training Accuracy:  0.8807339449541285
Validation Accuracy:  0.8333333333333334
------------------------------------------------
Training Accuracy:  0.8807339449541285
Validation Accuracy:  0.8333333333333334
------------------------------------------------

In [12]:
# Average the per-fold validation accuracies collected in `acc`.
mean_cv_accuracy = np.asarray(acc).mean()
print('Mean Accuracy after K Fold Validation: ', mean_cv_accuracy)

Mean Accuracy after K Fold Validation:  0.8433333333333334


## SVM:

In [13]:
from sklearn import svm

# SVM model (library defaults) trained over the entire training set; this is
# the model evaluated on the test split later in the notebook.
Svm_Model = svm.SVC().fit(X_train_scaled, y_train)

# 10-fold cross-validation on the training split.
# The rows were already shuffled by train_test_split, so folds are taken in
# order. random_state is deliberately NOT passed: it has no effect when
# shuffle=False and recent scikit-learn releases raise a ValueError for it.
kf = KFold(n_splits=10, shuffle=False)
acc = []

for train_index, test_index in kf.split(X_train_scaled, y_train):
    # NumPy array -> positional indexing; pandas Series -> .iloc.
    X_kf_train, X_kf_test = X_train_scaled[train_index], X_train_scaled[test_index]
    y_kf_train, y_kf_test = y_train.iloc[train_index], y_train.iloc[test_index]
    svm_model_new = svm.SVC().fit(X_kf_train, y_kf_train)
    # Score the validation fold once; reuse the value for printing and averaging.
    val_acc = svm_model_new.score(X_kf_test, y_kf_test)
    print('Training Accuracy: ', svm_model_new.score(X_kf_train, y_kf_train))
    print('Validation Accuracy: ', val_acc)
    print('------------------------------------------------')
    acc.append(val_acc)

Training Accuracy:  0.9308755760368663
Validation Accuracy:  0.84
------------------------------------------------
Training Accuracy:  0.9447004608294931
Validation Accuracy:  0.72
------------------------------------------------
Training Accuracy:  0.9357798165137615
Validation Accuracy:  0.9166666666666666
------------------------------------------------
Training Accuracy:  0.9403669724770642
Validation Accuracy:  0.9166666666666666
------------------------------------------------
Training Accuracy:  0.9403669724770642
Validation Accuracy:  0.8333333333333334
------------------------------------------------
Training Accuracy:  0.9403669724770642
Validation Accuracy:  0.875
------------------------------------------------
Training Accuracy:  0.9311926605504587
Validation Accuracy:  0.875
------------------------------------------------
Training Accuracy:  0.9403669724770642
Validation Accuracy:  0.875
------------------------------------------------
Training Accuracy:  0.9449541284403

In [14]:
# Average the per-fold validation accuracies collected in `acc`.
mean_cv_accuracy = np.asarray(acc).mean()
print('Mean Accuracy after K Fold Validation: ', mean_cv_accuracy)

Mean Accuracy after K Fold Validation:  0.8518333333333332


## Random Forest Classifier:

In [15]:
from sklearn.ensemble import RandomForestClassifier

# RFC model trained over the entire training set; this is the model evaluated
# on the test split later in the notebook. The forest is seeded (same seed the
# notebook uses for train_test_split) so results are identical on every run.
RFC = RandomForestClassifier(
    n_estimators=500, min_samples_leaf=2, random_state=1000
).fit(X_train_scaled, y_train)

# 10-fold cross-validation on the training split.
# The rows were already shuffled by train_test_split, so folds are taken in
# order. random_state is deliberately NOT passed to KFold: it has no effect
# when shuffle=False and recent scikit-learn releases raise a ValueError for it.
kf = KFold(n_splits=10, shuffle=False)
acc = []

for train_index, test_index in kf.split(X_train_scaled, y_train):
    # NumPy array -> positional indexing; pandas Series -> .iloc.
    X_kf_train, X_kf_test = X_train_scaled[train_index], X_train_scaled[test_index]
    y_kf_train, y_kf_test = y_train.iloc[train_index], y_train.iloc[test_index]
    rfc_new = RandomForestClassifier(
        n_estimators=500, min_samples_leaf=2, random_state=1000
    ).fit(X_kf_train, y_kf_train)
    # Score the validation fold once (500 trees make scoring non-trivial) and
    # reuse the value for printing and averaging.
    val_acc = rfc_new.score(X_kf_test, y_kf_test)
    print('Training Accuracy: ', rfc_new.score(X_kf_train, y_kf_train))
    print('Validation Accuracy: ', val_acc)
    print('------------------------------------------------')
    acc.append(val_acc)



Training Accuracy:  0.9907834101382489
Validation Accuracy:  0.8
------------------------------------------------
Training Accuracy:  0.9861751152073732
Validation Accuracy:  0.76
------------------------------------------------
Training Accuracy:  0.9770642201834863
Validation Accuracy:  0.7916666666666666
------------------------------------------------
Training Accuracy:  0.981651376146789
Validation Accuracy:  0.8333333333333334
------------------------------------------------
Training Accuracy:  0.981651376146789
Validation Accuracy:  0.875
------------------------------------------------
Training Accuracy:  0.9908256880733946
Validation Accuracy:  0.875
------------------------------------------------
Training Accuracy:  0.9908256880733946
Validation Accuracy:  0.875
------------------------------------------------
Training Accuracy:  0.9862385321100917
Validation Accuracy:  0.875
------------------------------------------------
Training Accuracy:  0.9908256880733946
Validation A

In [16]:
# Average the per-fold validation accuracies collected in `acc`.
mean_cv_accuracy = np.asarray(acc).mean()
print('Mean Accuracy after K Fold Validation: ', mean_cv_accuracy)

Mean Accuracy after K Fold Validation:  0.8268333333333334


# Testing

In [17]:
# Report held-out test accuracy for each model trained on the full training split.
test_reports = (
    ('Testing Accuracy for Logistic Regression :', Reg),
    ('Testing Accuracy for Support Vector Machine :', Svm_Model),
    ('Testing Accuracy for Random Forest Classifier :', RFC),
)
for report_label, fitted_model in test_reports:
    print(report_label, fitted_model.score(X_test_scaled, y_test))

Testing Accuracy for Logistic Regression : 0.8032786885245902
Testing Accuracy for Support Vector Machine : 0.7704918032786885
Testing Accuracy for Random Forest Classifier : 0.7868852459016393


In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Predict the test split once per model and derive every metric from those
# predictions, instead of re-running predict() separately for each metric.
# Order everywhere: Logistic Regression, Support Vector Machine, Random Forest.
test_predictions = [model.predict(X_test_scaled) for model in (Reg, Svm_Model, RFC)]

# Accuracy of the cached predictions; same value a classifier's .score(X, y) reports.
scores = [accuracy_score(y_test, y_pred) for y_pred in test_predictions]

# Precision and recall with scikit-learn's default (binary) averaging.
precision = [precision_score(y_test, y_pred) for y_pred in test_predictions]

recall = [recall_score(y_test, y_pred) for y_pred in test_predictions]

In [19]:

# Gather the test-set metrics of the three models into one comparison table.
Summary = pd.DataFrame({
    'Model': ['Logistic Regression', 'Support Vector Machine', 'Random Forest Classifier'],
    'Accuracy': scores,
    'Precision': precision,
    'Recall': recall,
})

Summary.head()

Unnamed: 0,Model,Accuracy,Precision,Recall
0,Logistic Regression,0.803279,0.846154,0.733333
1,Support Vector Machine,0.770492,0.785714,0.733333
2,Random Forest Classifier,0.786885,0.793103,0.766667
