In [10]:
''' Import libraries '''
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
''' Load data '''
file_path = "/Users/bryanmcelvy/Documents/datasets/Heart_Disease_Prediction.csv" # Make sure to change this
df_all = pd.read_csv(file_path).drop(["index"], axis=1)
df = df_all.drop("Heart Disease", axis=1) # DataFrame containing the dataset without the target variable

''' Drop removed feature (FBS over 120) from data '''
df.drop([df.columns[5]], axis=1, inplace=True)
df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,70,1,4,130,322,2,109,0,2.4,2,3,3
1,67,0,3,115,564,2,160,0,1.6,2,0,7
2,57,1,2,124,261,0,141,0,0.3,1,0,7
3,64,1,4,128,263,0,105,1,0.2,2,1,7
4,74,0,2,120,269,2,121,1,0.2,1,1,3


In [3]:
s_target = df_all["Heart Disease"]       # DataFrame containing only the target variable
labels = s_target.unique()
s_target.head()

0    Presence
1     Absence
2    Presence
3     Absence
4     Absence
Name: Heart Disease, dtype: object

In [4]:
''' Split into training and test sets '''
X = df.to_numpy()
y = s_target.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
''' Scale data '''
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
''' Create KNN Classifier and fit to data '''
for i in range(1,21):
    print(f"k = {i}")
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=labels))


k = 1
              precision    recall  f1-score   support

    Presence       0.82      0.82      0.82        40
     Absence       0.75      0.75      0.75        28

    accuracy                           0.79        68
   macro avg       0.79      0.79      0.79        68
weighted avg       0.79      0.79      0.79        68

k = 2
              precision    recall  f1-score   support

    Presence       0.76      0.93      0.83        40
     Absence       0.84      0.57      0.68        28

    accuracy                           0.78        68
   macro avg       0.80      0.75      0.76        68
weighted avg       0.79      0.78      0.77        68

k = 3
              precision    recall  f1-score   support

    Presence       0.85      0.85      0.85        40
     Absence       0.79      0.79      0.79        28

    accuracy                           0.82        68
   macro avg       0.82      0.82      0.82        68
weighted avg       0.82      0.82      0.82        68

k

In [17]:
''' Create logistic regression classifier and fit to data '''
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(classification_report(y_test, y_pred_lr, target_names=labels))

              precision    recall  f1-score   support

    Presence       0.88      0.88      0.88        40
     Absence       0.82      0.82      0.82        28

    accuracy                           0.85        68
   macro avg       0.85      0.85      0.85        68
weighted avg       0.85      0.85      0.85        68

