In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from scipy.stats import mode

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [None]:

# Load the processed Cleveland heart-disease data.
# The file has NO header row, so the column names are supplied explicitly;
# without header=None the first patient record would be consumed as the header.
# Missing values are encoded as '?' in the file.
CLEVELAND_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
CLEVELAND_COLUMNS = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]
# Columns at positions 5, 6, 8, 10 and 11 are not used in this notebook.
UNUSED_COLUMNS = ['fbs', 'restecg', 'exang', 'slope', 'ca']
# Binarise the target: 0/1 -> class 0, 2/3 -> class 1.
# NOTE(review): num == 4 is not in the mapping, becomes NaN and is dropped
# below -- confirm that excluding those rows is intended.
LABEL_MAP = {0: 0, 1: 0, 2: 1, 3: 1}

df_cleveland = (
    pd.read_csv(CLEVELAND_URL, header=None, names=CLEVELAND_COLUMNS, na_values='?')
    .drop(columns=UNUSED_COLUMNS)
    .assign(num=lambda frame: frame['num'].map(LABEL_MAP))
    # Drop rows with an unmapped label or a missing ('?') thal value.
    .dropna(subset=['num', 'thal'])
    .astype({'num': int, 'thal': float})
)

# Features are the first three columns (age, sex, cp); the target is the last column (num).
x = df_cleveland.iloc[:, 0:3]
y = df_cleveland.iloc[:, -1]
# Fixed random_state so the split (and every accuracy below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=42)


In [None]:

def euc_dist(p1, p2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    p1, p2 : array-like
        Coordinates of the two points. NumPy arrays, lists and tuples are
        accepted; both are converted with ``np.asarray`` before subtracting,
        so plain Python sequences no longer raise ``TypeError``.

    Returns
    -------
    float
        ``np.linalg.norm`` of the element-wise difference ``p1 - p2``.
    """
    difference = np.asarray(p1) - np.asarray(p2)
    return np.linalg.norm(difference)

In [None]:
def knn_scratch(X_train, X_test, y_train, y_test, k):
    """Classify each row of ``X_test`` by majority vote of its k nearest training rows.

    Distances are Euclidean. Neighbours at equal distance are taken in
    training-row order (stable sort), and a tied vote is resolved in favour
    of the smallest label, so results are deterministic.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features, one row per sample.
    X_test : pandas.DataFrame
        Rows to classify; must have the same number of columns as ``X_train``.
    y_train : pandas.Series
        Training labels, looked up through the index labels of ``X_train``.
    y_test : any
        Unused; accepted only to keep the existing call signature.
    k : int
        Number of neighbours that vote. If ``k`` exceeds the number of
        training rows, all training rows vote.

    Returns
    -------
    list
        One predicted label per row of ``X_test``, in row order.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Convert once, outside the loops, instead of once per distance computation.
    train_points = X_train.to_numpy()
    # Labels arranged in the same row order as train_points.
    train_labels = y_train.loc[X_train.index].to_numpy()

    y_hat = []
    for test_p in X_test.to_numpy():
        # Distance from this test point to every training row in one call.
        distances = np.linalg.norm(train_points - test_p, axis=1)
        nearest = np.argsort(distances, kind='stable')[:k]
        # np.unique returns sorted labels and argmax picks the first maximum,
        # so the smallest of the most frequent labels wins a tie.
        labels, votes = np.unique(train_labels[nearest], return_counts=True)
        y_hat.append(labels[np.argmax(votes)])
    return y_hat

In [None]:


# Sweep k = 1..29 and record the test accuracy of both classifiers on the same split.
external_k_results = []
my_k_results = []
# The loop variable is deliberately kept as ``i``: code after the loop may read its final value.
for i in range(1, 30):
    # From-scratch implementation.
    y_hat_pred = knn_scratch(X_train, X_test, y_train, y_test, k=i)
    # accuracy_score expects (y_true, y_pred) in that order.
    my_k_results.append(accuracy_score(y_test, y_hat_pred))

    # scikit-learn reference implementation with the same k.
    external_model = KNeighborsClassifier(n_neighbors=i)
    external_model.fit(X_train, y_train)
    external_model_pred = external_model.predict(X_test)
    external_k_results.append(accuracy_score(y_test, external_model_pred))

In [None]:

# Plot accuracy against k for both classifiers and report the best k of each.
plt.rcParams['figure.figsize'] = [20, 7]

# Derive the x-axis from the result lists themselves rather than from a loop
# variable left over by another cell.
plt.plot(range(1, len(my_k_results) + 1), my_k_results,
         color='orange', marker='o', label='knn_scratch')
plt.plot(range(1, len(external_k_results) + 1), external_k_results,
         color='blue', marker='*', linestyle='dashed', label='KNeighborsClassifier')
plt.xlabel('k (number of neighbours)')
plt.ylabel('accuracy on the test split')
plt.title('Test accuracy vs. k')
plt.legend()

for heading, results in (('\nMY MODEL: ', my_k_results),
                         ('\nKNeighborsClassifier ', external_k_results)):
    print(heading)
    max_accuracy = np.amax(results)
    print('best accuracy: ' + str(max_accuracy))
    # list.index returns the first occurrence, i.e. the smallest k reaching the best accuracy;
    # +1 because k starts at 1 while list positions start at 0.
    print('optimal k=' + str(results.index(max_accuracy) + 1))