# Notebook Setup

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [2]:
sns.set()
sns.set_style("ticks")
sns.set_palette("colorblind")
sns.set_context("notebook")

# Load Data

In [3]:
data = pd.read_csv("./data/cleveland.csv")
data = data.replace(to_replace="?", value=np.nan)
data = data.apply(pd.to_numeric)

print("--- Columns ---")
print(" | ".join(list(data.columns.to_numpy().flatten())))
print()

print("--- Head ---")
display(data.head(3))
print()

print("--- Tail ---")
display(data.tail(3))
print()

print("--- Bad Rows ---")
display(data[data.isnull().any(axis=1)])

--- Columns ---
age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | num

--- Head ---


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1



--- Tail ---


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0



--- Bad Rows ---


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
87,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,,0
166,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,,3.0,0
192,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,,7.0,1
266,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,,2
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,,7.0,0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0


# KNN

In [4]:
def point_distance(point1: np.ndarray, point2: np.ndarray) -> float:
    """
    Computes the Euclidean distance between two sets of points.
    """

    return np.linalg.norm(point1 - point2)


def knn(train: pd.DataFrame, test: pd.DataFrame, k: int) -> float:
    """
    Finds the median class from k-nearest neighbors.
    """

    predictions = []
    
    for _, point1 in test.iterrows():
        distances = []
        point1 = point1.values

        for _, point2 in train.iterrows():
            point2 = point2.values
            distances.append([point2[-1], point_distance(point1[:-1], point2[:-1])])

        distances = sorted(distances, key=lambda pair: pair[1])

        vote = np.median(distances[:k], axis=0)[0]
        predictions.append(vote)

    return np.array(predictions)


def knn_cross_validation(data: pd.DataFrame, k: int, n: int) -> float:
    """
    Applies a k-nn classifier to the input data using n-fold
    cross validation and returns relevant error metrics.
    """

    data = data.sample(frac=1)  # shuffles the data
    size = len(data) // n

    errors = []
    for x in range(n):
        train = data.iloc[np.r_[0 : x * size, (x + 1) * size : 100]]
        test = data.iloc[x * size : (x + 1) * size]
        
        predictions = knn(train, test, k)
        print(classification_report(test.num.values, predictions))

#     return np.mean(errors)

In [5]:
knn_cross_validation(data.dropna(), k=1, n=3)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        50
           1       0.00      0.00      0.00        20
           2       0.00      0.00      0.00        11
           3       0.10      1.00      0.18        10
           4       0.00      0.00      0.00         8

    accuracy                           0.10        99
   macro avg       0.02      0.20      0.04        99
weighted avg       0.01      0.10      0.02        99



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.63      0.51      0.56        53
           1       0.10      0.16      0.12        19
           2       0.07      0.08      0.07        13
           3       0.25      0.08      0.12        12
           4       0.14      0.50      0.22         2

    accuracy                           0.33        99
   macro avg       0.24      0.27      0.22        99
weighted avg       0.40      0.33      0.35        99

              precision    recall  f1-score   support

           0       0.62      0.61      0.62        57
           1       0.11      0.13      0.12        15
           2       0.18      0.18      0.18        11
           3       0.12      0.08      0.10        13
           4       0.20      0.33      0.25         3

    accuracy                           0.41        99
   macro avg       0.25      0.27      0.25        99
weighted avg       0.42      0.41      0.41        99



In [6]:
attribute_group1 = {
    
}
attribute_group2 = {
    
}