In [1]:
import numpy as np
import statistics

In [2]:
class KNN_classifier():
    """k-nearest-neighbours classifier for rows that carry their own label.

    Every training row is a sequence ``[feature_0, ..., feature_n-1, label]``
    (the label is the LAST element). Query points hold the features only;
    any trailing extra elements in a query point are ignored.

    Parameters
    ----------
    distance_metric : str
        Either ``'Euclidean'`` or ``'Manhattan'``. The value is checked when
        a distance is actually computed, not at construction time.
    """

    def __init__(self, distance_metric):
        self.distance_metric = distance_metric

    def get_distance_metric(self, training_data_point, test_data_point):
        """Return the distance between a labelled training row and a query point.

        Parameters
        ----------
        training_data_point : sequence
            Feature values followed by the label; the last element is
            excluded from the distance.
        test_data_point : sequence
            Feature values of the query point. Must supply at least as many
            values as the training row has features.

        Returns
        -------
        float
            Euclidean or Manhattan distance over the feature columns
            (``0.0`` when the training row has no feature columns).

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not supported, or the query point
            has fewer values than the training row has features.
        """
        # Fail loudly instead of silently returning None for a typo'd metric.
        if self.distance_metric not in ('Euclidean', 'Manhattan'):
            raise ValueError(
                "Unsupported distance metric: %r (expected 'Euclidean' or "
                "'Manhattan')" % (self.distance_metric,)
            )

        # Drop the trailing label before measuring distance.
        train_features = np.asarray(training_data_point[:-1], dtype=float)
        n_features = len(train_features)

        query = np.asarray(test_data_point, dtype=float)
        if len(query) < n_features:
            raise ValueError(
                "test_data_point has %d values but the training row has %d "
                "features" % (len(query), n_features)
            )

        # Slice explicitly so numpy never broadcasts mismatched lengths.
        diff = train_features - query[:n_features]

        if self.distance_metric == 'Euclidean':
            # The square root is taken once, after the sum is complete.
            return np.sqrt(np.sum(diff ** 2))
        return np.sum(np.abs(diff))

    def nearest_neighbour(self, X_train, test_data, k):
        """Return the ``k`` training rows closest to ``test_data``.

        Parameters
        ----------
        X_train : sequence of rows
            Labelled training rows (label in the last position).
        test_data : sequence
            Feature values of the query point.
        k : int
            Number of neighbours to return; must satisfy
            ``1 <= k <= len(X_train)``.

        Returns
        -------
        list
            The ``k`` nearest training rows (labels included), ordered from
            nearest to farthest. Rows at equal distance keep their original
            relative order because the sort is stable.

        Raises
        ------
        ValueError
            If ``X_train`` is empty or ``k`` is outside ``1..len(X_train)``.
        """
        n_train = len(X_train)
        if n_train == 0:
            raise ValueError("X_train must contain at least one row")
        if not 1 <= k <= n_train:
            raise ValueError(
                "k must be between 1 and %d, got %r" % (n_train, k)
            )

        ranked = sorted(
            X_train,
            key=lambda row: self.get_distance_metric(row, test_data),
        )
        return ranked[:k]

    def predict(self, X_train, test_data, k):
        """Predict the label of ``test_data`` by majority vote of its neighbours.

        Parameters
        ----------
        X_train : sequence of rows
            Labelled training rows (label in the last position).
        test_data : sequence
            Feature values of the query point.
        k : int
            Number of neighbours that vote.

        Returns
        -------
        object
            The most common label among the ``k`` nearest rows, as computed
            by ``statistics.mode``. NOTE(review): tie handling is delegated
            to ``statistics.mode`` and depends on the Python version
            (first-encountered value on 3.8+, an error on older versions).
        """
        neighbors = self.nearest_neighbour(X_train, test_data, k)
        labels = [row[-1] for row in neighbors]
        return statistics.mode(labels)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
# Load the dataset; the path is relative to the notebook's working directory.
diabetes_data = pd.read_csv('diabetes.csv')

In [5]:
# Render the loaded frame for a quick visual check of columns and values.
diabetes_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [6]:
# Count missing values per column.
diabetes_data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# Separate the target column ('Outcome') from the feature columns.
Y = diabetes_data['Outcome']
X = diabetes_data.drop(columns=['Outcome'])

In [8]:
# Convert features and target to plain NumPy arrays for the hand-written
# classifier. np.asarray passes an existing ndarray through unchanged, so
# re-running this cell is safe (a second .to_numpy() call would fail because
# ndarrays have no such method).
X = np.asarray(X)
Y = np.asarray(Y)

In [9]:
# Hold out 20% of the rows for testing, stratified on Y, with a fixed seed
# so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [10]:
# KNN_classifier expects every training row to end with its label, so append
# Y_train as the last column. The guard makes the cell idempotent: once the
# label column is present (X_train is wider than X_test), re-running the cell
# does not add a second label column that would be mistaken for a feature.
if X_train.shape[1] == X_test.shape[1]:
    X_train = np.column_stack((X_train, Y_train))

In [11]:
# Check the shape after appending the label column.
X_train.shape

(614, 9)

In [12]:
# Build the classifier with the Manhattan distance for the neighbour search.
classifier = KNN_classifier(distance_metric='Manhattan')

In [13]:
# Classify a single test sample from its 5 nearest training rows
# (X_train carries the label in its last column).
prediction = classifier.predict(X_train, X_test[1], k=5)

In [28]:
# Feature vector of the test sample classified above.
X_test[1]

array([  3.  , 113.  ,  44.  ,  13.  ,   0.  ,  22.4 ,   0.14,  22.  ])

In [32]:
# True label of that same test sample.
Y_test[1]

0

In [34]:
# Predicted label for X_test[1]; compare with Y_test[1].
prediction

0.0

In [36]:
# (number of test samples, number of features)
X_test.shape

(154, 8)

In [38]:
# Number of rows in the test split.
X_test_size = X_test.shape[0]

In [46]:
# Classify every test row with k=7 neighbours (the single-sample cell uses
# k=5). Iterating over the rows directly avoids index bookkeeping and leaves
# the single-sample `prediction` variable untouched.
y_pred = [classifier.predict(X_train, test_row, k=7) for test_row in X_test]

In [48]:
# Ground-truth labels of the held-out test split.
y_true = Y_test

In [50]:
# Score the predictions against the true test labels.
accuracy = accuracy_score(y_true, y_pred)

In [52]:
# Report the accuracy scaled by 100 (i.e. as a percentage).
print(accuracy*100)

75.97402597402598
