In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('../datafiles/heart disease/heart_disease.csv', sep=';')
df = df.dropna(how="all")

In [3]:
df = df.drop(df[df['major_vessels_colored'] == '?'].index)
df = df.drop(df[df['thal'] == '?'].index)

In [4]:
df.head()

Unnamed: 0,age,gender,chest_pain,rest_SBP,cholesterol,fasting_blood_sugar_over_120,rest_ECG,max_HR,exerc_ind_ang,ST_by_exercise,slope_peak_exc_ST,major_vessels_colored,thal,diameter_narrowing
0,63,male,typical ang,145,233,1,left vent hypertrophy,150,0,2.3,downsloping,0,fixed defect,0
1,67,male,asymptomatic,160,286,0,left vent hypertrophy,108,1,1.5,flat,3,normal,1
2,67,male,asymptomatic,120,229,0,left vent hypertrophy,129,1,2.6,flat,2,reversable defect,1
3,37,male,non-anginal,130,250,0,normal,187,0,3.5,downsloping,0,normal,0
4,41,female,atypical ang,130,204,0,left vent hypertrophy,172,0,1.4,upsloping,0,normal,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 297 entries, 0 to 301
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   age                           297 non-null    int64  
 1   gender                        297 non-null    object 
 2   chest_pain                    297 non-null    object 
 3   rest_SBP                      297 non-null    int64  
 4   cholesterol                   297 non-null    int64  
 5   fasting_blood_sugar_over_120  297 non-null    int64  
 6   rest_ECG                      297 non-null    object 
 7   max_HR                        297 non-null    int64  
 8   exerc_ind_ang                 297 non-null    int64  
 9   ST_by_exercise                297 non-null    float64
 10  slope_peak_exc_ST             297 non-null    object 
 11  major_vessels_colored         297 non-null    object 
 12  thal                          297 non-null    object 
 13  diame

In [6]:
df.isnull().sum()

age                             0
gender                          0
chest_pain                      0
rest_SBP                        0
cholesterol                     0
fasting_blood_sugar_over_120    0
rest_ECG                        0
max_HR                          0
exerc_ind_ang                   0
ST_by_exercise                  0
slope_peak_exc_ST               0
major_vessels_colored           0
thal                            0
diameter_narrowing              0
dtype: int64

In [7]:
le = preprocessing.LabelEncoder()
gender = le.fit_transform(list(df['gender']))
gender_names = df['gender'].unique()
chestpain = le.fit_transform(list(df['chest_pain']))
chestpain_names = df['chest_pain'].unique()
restecg = le.fit_transform(list(df['rest_ECG']))
restecg_names = df['rest_ECG'].unique()
slopepeak = le.fit_transform(list(df['slope_peak_exc_ST']))
slopepeak_names = df['slope_peak_exc_ST'].unique()
mvc = le.fit_transform(list(df['major_vessels_colored']))
mvc_names = df['major_vessels_colored'].unique()
thal = le.fit_transform(list(df['thal']))
thal_names = df['thal'].unique()

In [8]:
X = list(zip(
    list(df['age']),
    gender,
    list(df['rest_SBP']),
    list(df['cholesterol']),
    list(df['fasting_blood_sugar_over_120']),
    restecg,
    list(df['max_HR']),
    list(df['exerc_ind_ang']),
    list(df['ST_by_exercise']),
    slopepeak,
    mvc,
    thal,
    list(df['diameter_narrowing'])
))

y = list(chestpain)

In [9]:
sc = preprocessing.StandardScaler()
X = sc.fit_transform(X)

In [10]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y,test_size=0.3)

In [11]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
acc = model.score(X_test, y_test)
acc

0.5555555555555556

In [12]:
y_prediction = model.predict(X_test)

In [13]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       0.62      0.91      0.74        44
           1       0.33      0.17      0.22        18
           2       0.38      0.26      0.31        23
           3       1.00      0.20      0.33         5

    accuracy                           0.56        90
   macro avg       0.58      0.38      0.40        90
weighted avg       0.52      0.56      0.50        90



In [14]:
print(confusion_matrix(y_test, y_prediction))

[[40  0  4  0]
 [10  3  5  0]
 [11  6  6  0]
 [ 3  0  1  1]]


In [15]:
for x in range(len(y_prediction)):
    print('Predicted:', chestpain_names[y_prediction[x]], 'Actual:', chestpain_names[y_test[x]])

Predicted: typical ang Actual: typical ang
Predicted: typical ang Actual: typical ang
Predicted: typical ang Actual: typical ang
Predicted: non-anginal Actual: asymptomatic
Predicted: typical ang Actual: atypical ang
Predicted: typical ang Actual: asymptomatic
Predicted: non-anginal Actual: asymptomatic
Predicted: typical ang Actual: typical ang
Predicted: typical ang Actual: typical ang
Predicted: typical ang Actual: typical ang
Predicted: typical ang Actual: asymptomatic
Predicted: typical ang Actual: non-anginal
Predicted: typical ang Actual: asymptomatic
Predicted: non-anginal Actual: atypical ang
Predicted: typical ang Actual: non-anginal
Predicted: non-anginal Actual: non-anginal
Predicted: typical ang Actual: non-anginal
Predicted: typical ang Actual: typical ang
Predicted: typical ang Actual: typical ang
Predicted: typical ang Actual: non-anginal
Predicted: typical ang Actual: typical ang
Predicted: typical ang Actual: asymptomatic
Predicted: typical ang Actual: typical ang
Pre