I would like to thank **Rob Harrand** for sharing the notebook and knowledge with us.
https://www.kaggle.com/tentotheminus9/what-causes-heart-disease-explaining-the-model

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/heart-disease-uci/heart.csv


# Path
Checking for path.

In [2]:
%ls ../input

[0m[01;34mheart-disease-uci[0m/


**Lets import all required libraries**

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc,classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
pd.options.mode.chained_assignment = None
np.random.seed(687)

# Data
We have to load our dataset, and have a look at our data.

In [4]:
df = pd.read_csv('../input/heart-disease-uci/heart.csv')
df.head(10)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


Now,lets change our columns names to their real representations.

In [5]:
df.rename({'cp':'chest_pain_type','trestbps':'resting_blood_pressure','chol':'cholesterol','fbs':'fasting_blood_sugar','restecg':'rest_ecg','thalach':'max_heart_rate_achieved','exang':'exercise_induced_angina','oldpeak':'st_depression','slope':'st_slope','ca':'num_major_vessels','thal':'thalassemia'},axis=1,inplace=True)

That looks nice,

In [6]:
df.head(5)

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,fasting_blood_sugar,rest_ecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope,num_major_vessels,thalassemia,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Now,we are going to change the values of our data to **string data** and later convert those values to **one-hot-encoding** for better interpretation of the data.

In [7]:
df['sex'][df['sex'] == 0] = 'female'
df['sex'][df['sex'] == 1] = 'male'
#print(len(df.columns))
df['chest_pain_type'][df['chest_pain_type'] == 0] = 'typical angina'
df['chest_pain_type'][df['chest_pain_type'] == 1] = 'atypical angina'
df['chest_pain_type'][df['chest_pain_type'] == 2] = 'non-anginal pain'
df['chest_pain_type'][df['chest_pain_type'] == 3] = 'asymptomatic'

df['fasting_blood_sugar'][df['fasting_blood_sugar'] == 0] = 'lower than 120mg/ml'
df['fasting_blood_sugar'][df['fasting_blood_sugar'] == 1] = 'greater than 120mg/ml'

df['rest_ecg'][df['rest_ecg'] == 0] = 'normal'
df['rest_ecg'][df['rest_ecg'] == 1] = 'ST-T wave abnormality'
df['rest_ecg'][df['rest_ecg'] == 2] = 'left ventricular hypertrophy'

df['exercise_induced_angina'][df['exercise_induced_angina'] == 0] = 'no'
df['exercise_induced_angina'][df['exercise_induced_angina'] == 1] = 'yes'

df['st_slope'][df['st_slope'] == 0] = 'upsloping'
df['st_slope'][df['st_slope'] == 1] = 'flat'
df['st_slope'][df['st_slope'] == 2] = 'downsloping'

df['thalassemia'][df['thalassemia'] == 1] = 'normal'
df['thalassemia'][df['thalassemia'] == 2] = 'fixed defect'
df['thalassemia'][df['thalassemia'] == 3] = 'reversable defect'

Lets have a look,

In [8]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,fasting_blood_sugar,rest_ecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope,num_major_vessels,thalassemia,target
0,63,male,asymptomatic,145,233,greater than 120mg/ml,normal,150,no,2.3,upsloping,0,normal,1
1,37,male,non-anginal pain,130,250,lower than 120mg/ml,ST-T wave abnormality,187,no,3.5,upsloping,0,fixed defect,1
2,41,female,atypical angina,130,204,lower than 120mg/ml,normal,172,no,1.4,downsloping,0,fixed defect,1
3,56,male,atypical angina,120,236,lower than 120mg/ml,ST-T wave abnormality,178,no,0.8,downsloping,0,fixed defect,1
4,57,female,typical angina,120,354,lower than 120mg/ml,ST-T wave abnormality,163,yes,0.6,downsloping,0,fixed defect,1


# Change few categories to object type
We have to change few categories to object type because they contain string data.

In [9]:
df['sex'] = df['sex'].astype('object')
df['chest_pain_type'] = df['chest_pain_type'].astype('object')
df['fasting_blood_sugar'] = df['fasting_blood_sugar'].astype('object')
df['rest_ecg'] = df['rest_ecg'].astype('object')
df['exercise_induced_angina'] = df['exercise_induced_angina'].astype('object')
df['st_slope'] = df['st_slope'].astype('object')
df['thalassemia'] = df['thalassemia'].astype('object')

Lets take a look again, here we can see that their types have been changed

In [10]:
df.dtypes

age                          int64
sex                         object
chest_pain_type             object
resting_blood_pressure       int64
cholesterol                  int64
fasting_blood_sugar         object
rest_ecg                    object
max_heart_rate_achieved      int64
exercise_induced_angina     object
st_depression              float64
st_slope                    object
num_major_vessels            int64
thalassemia                 object
target                       int64
dtype: object

The reason for adding this line is to prevent the loss of information.

In [11]:
df = pd.get_dummies(df,prefix=['st_slope'],columns=['st_slope'])

In [12]:
df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,cholesterol,fasting_blood_sugar,rest_ecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,num_major_vessels,thalassemia,target,st_slope_downsloping,st_slope_flat,st_slope_upsloping
0,63,male,asymptomatic,145,233,greater than 120mg/ml,normal,150,no,2.3,0,normal,1,0,0,1
1,37,male,non-anginal pain,130,250,lower than 120mg/ml,ST-T wave abnormality,187,no,3.5,0,fixed defect,1,0,0,1
2,41,female,atypical angina,130,204,lower than 120mg/ml,normal,172,no,1.4,0,fixed defect,1,1,0,0
3,56,male,atypical angina,120,236,lower than 120mg/ml,ST-T wave abnormality,178,no,0.8,0,fixed defect,1,1,0,0
4,57,female,typical angina,120,354,lower than 120mg/ml,ST-T wave abnormality,163,yes,0.6,0,fixed defect,1,1,0,0


Now, here we are going to create dummy variable for the categorical variables and drop the first column from each of them.

In [13]:
df = pd.get_dummies(df, drop_first=True)

In [14]:
df.dtypes

age                                          int64
resting_blood_pressure                       int64
cholesterol                                  int64
max_heart_rate_achieved                      int64
st_depression                              float64
num_major_vessels                            int64
target                                       int64
st_slope_downsloping                         uint8
st_slope_flat                                uint8
st_slope_upsloping                           uint8
sex_male                                     uint8
chest_pain_type_atypical angina              uint8
chest_pain_type_non-anginal pain             uint8
chest_pain_type_typical angina               uint8
fasting_blood_sugar_lower than 120mg/ml      uint8
rest_ecg_left ventricular hypertrophy        uint8
rest_ecg_normal                              uint8
exercise_induced_angina_yes                  uint8
thalassemia_fixed defect                     uint8
thalassemia_normal             

In [15]:
df.head()

Unnamed: 0,age,resting_blood_pressure,cholesterol,max_heart_rate_achieved,st_depression,num_major_vessels,target,st_slope_downsloping,st_slope_flat,st_slope_upsloping,...,chest_pain_type_atypical angina,chest_pain_type_non-anginal pain,chest_pain_type_typical angina,fasting_blood_sugar_lower than 120mg/ml,rest_ecg_left ventricular hypertrophy,rest_ecg_normal,exercise_induced_angina_yes,thalassemia_fixed defect,thalassemia_normal,thalassemia_reversable defect
0,63,145,233,150,2.3,0,1,0,0,1,...,0,0,0,0,0,1,0,0,1,0
1,37,130,250,187,3.5,0,1,0,0,1,...,0,1,0,1,0,0,0,1,0,0
2,41,130,204,172,1.4,0,1,1,0,0,...,1,0,0,1,0,1,0,1,0,0
3,56,120,236,178,0.8,0,1,1,0,0,...,1,0,0,1,0,0,0,1,0,0
4,57,120,354,163,0.6,0,1,1,0,0,...,0,0,1,1,0,0,1,1,0,0


Now we are done with our data, lets move to the model.

# Model
Now we are going to use a Support Vector Machine (SVM) on our data.

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', 1), df['target'], test_size = .271, random_state=37)
model = SVC(kernel='linear',gamma='scale',probability=True)
model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

# Performance
For understanding FPR(False positive rate)(Sensitivity) and TPR(True positive rate)(Specificity) refer to this link https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc

An AUC score above 0.9 is good.

In [17]:
acc = model.score(X_test,y_test)*100
print("Accuracy = ",acc)

y_predict = model.predict(X_test)
y_pred_quant = model.predict_proba(X_test)[:, 1]
y_pred_bin = model.predict(X_test)

confusion_matrix = confusion_matrix(y_test, y_pred_bin)
total=sum(sum(confusion_matrix))

sensitivity = confusion_matrix[0,0]/(confusion_matrix[0,0]+confusion_matrix[1,0])
print('Sensitivity : ', sensitivity )

specificity = confusion_matrix[1,1]/(confusion_matrix[1,1]+confusion_matrix[0,1])
print('Specificity : ', specificity)

fpr, tpr, thresholds = roc_curve(y_test, y_pred_quant)
print(auc(fpr, tpr))

Accuracy =  87.95180722891565
Sensitivity :  0.8717948717948718
Specificity :  0.8863636363636364
0.9172494172494172
