In [1]:
import pandas as pd
dataset = pd.read_csv('processed.cleveland.data')
dataset.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


# Heart Disease Databases

## Relevant Information
This database contains 76 attributes, but all published experiments
refer to using a subset of 14 of them.  In particular, the Cleveland
database is the only one that has been used by ML researchers to
this date.  The "goal" field refers to the presence of heart disease
in the patient.  It is integer valued from 0 (no presence) to 4.
Experiments with the Cleveland database have concentrated on simply
attempting to distinguish presence (values 1,2,3,4) from absence (value
0).

## Attribute Information

| Column | Description |
|:------:|:------------|
| age | age in years |
| sex | sex (1 = male; 0 = female) |
| cp | chest pain type (Value 1: typical angina, Value 2: atypical angina, Value 3: non-anginal pain, Value 4: asymptomatic) |
| trestbps | resting blood pressure (in mm Hg on admission to the hospital) |
| chol | serum cholestoral in mg/dl |
| fbs | (fasting blood sugar > 120 mg/dl)  (1 = true; 0 = false) |
| restecg | resting electrocardiographic results (Value 0: normal, Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria) |
| thalach | maximum heart rate achieved |
| exang | exercise induced angina (1 = yes; 0 = no) |
| oldpeak | ST depression induced by exercise relative to rest |
| slope | the slope of the peak exercise ST segment (Value 1: upsloping, Value 2: flat, Value 3: downsloping) |
| ca | number of major vessels (0-3) colored by flourosopy |
| num | diagnosis of heart disease |

In [2]:
dataset_copy = dataset.copy()
train_set = dataset_copy.sample(frac=2/3, random_state=0)
test_set = dataset_copy.drop(train_set.index)

In [3]:
train_classified = {}
for _, d in train_set.iterrows():
    t = tuple(f for f in d)
    if d['num'] == 0:
        train_classified[t] = False
    else:
        train_classified[t] = True

test_classified = {}
for _, d in test_set.iterrows():
    t = tuple(f for f in d)
    if d['num'] == 0:
        test_classified[t] = False
    else:
        test_classified[t] = True

In [56]:
import numpy as np

def knn(classified, k):
    def predict(point):
        distances = []
        for t in classified:
            distance = 0
            l = len(t)
            if len(t) != len(point):
                raise ValueError()
            for i in range(0, l):
                if t[i] != '?' and point[i] != '?': # ignores unkown fields
                    distance += (float(point[i]) - float(t[i])) ** 2
            distances.append((distance, classified[t]))
        distances.sort()
        prediction = 0
        for _, s in distances[:k]:
            if s:
                prediction += 1
            else:
                prediction -= 1
        return True if prediction >= 0 else False
    return predict

for k in [*range(1, 8)] + [10, 15]:
    # false positive
    fp = 0
    # false negative
    fn = 0
    # true positive
    tp = 0
    # true negative
    tn = 0
    for t in test_classified:
        actual = test_classified[t]
        predicted = knn(train_classified, k)(t)
        if actual == True and predicted == True:
            tp += 1
        if actual == True and predicted == False:
            fn += 1
        if actual == False and predicted == True:
            fp += 1
        if actual == False and predicted == False:
            tn += 1

    print(f'{f"k = {k}":-^20}\n')
    print(f'{tp:>2} | {fp:>2}\n{"":-^7}\n{fn:>2} | {tn:>2}\n')

    total = fp + fn + tp + tn
    precision = (tp + tn) / total
    print(f'precision: {precision}\n')

-------k = 1--------

27 | 23
-------
18 | 33

precision: 0.594059405940594

-------k = 2--------

37 | 34
-------
 8 | 22

precision: 0.5841584158415841

-------k = 3--------

29 | 17
-------
16 | 39

precision: 0.6732673267326733

-------k = 4--------

33 | 24
-------
12 | 32

precision: 0.6435643564356436

-------k = 5--------

29 | 15
-------
16 | 41

precision: 0.693069306930693

-------k = 6--------

33 | 22
-------
12 | 34

precision: 0.6633663366336634

-------k = 7--------

27 | 18
-------
18 | 38

precision: 0.6435643564356436

-------k = 10-------

33 | 20
-------
12 | 36

precision: 0.6831683168316832

-------k = 15-------

31 | 18
-------
14 | 38

precision: 0.6831683168316832

