## K-Nearest Neighbours Algorithm Implementation using Python

We are going to predict whether a person will have diabetes or not

Importing the necessary libraries/modules

In [None]:
import math

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

Loading the dataset

In [None]:
# Read the diabetes records from a CSV file given by a relative path.
dataset = pd.read_csv('diabetes.csv')

# Report the number of rows that were loaded, then preview the first few.
print(dataset.shape[0])
dataset.head()

768


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Replacing zeros with the means of the respective columns

In [None]:
# Columns in which a value of 0 is treated as missing and gets imputed.
zero_not_accepted = ['Glucose','BloodPressure','SkinThickness','BMI','Insulin']

for column in zero_not_accepted:
  # Mark zeros as missing so they are excluded from the column mean.
  # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
  dataset[column] = dataset[column].replace(0, np.nan)
  # The mean of the remaining values is truncated to an integer before use.
  mean = int(dataset[column].mean(skipna=True))
  dataset[column] = dataset[column].fillna(mean)
  # NOTE(review): the mean is computed over the full dataset, i.e. before the
  # train/test split performed later in the notebook.

Splitting the dataset into training and testing datasets

In [None]:
# The first eight columns are used as features; the ninth column is the target.
X = dataset.iloc[:, :8]
Y = dataset.iloc[:, 8]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Feature Scaling
For any algorithm that computes distances or assumes normality, we have to scale our features.

In [None]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted transformation to both the training and the test features.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

Choosing a starting value of k for this model using the square-root rule of thumb

In [None]:
# Rule-of-thumb starting point for k: the square root of the number of test samples.
# (`math` is imported in the import cell at the top of the notebook.)
math.sqrt(len(Y_test))

12.409673645990857

Rounded down this gives 12, but 12 is an even number, so we subtract one and use k = 11 (with two classes, an odd k avoids tied votes among the neighbours)

Defining the model : Init K-NN

In [None]:
# K-NN model that votes among the 11 nearest neighbours, measured by Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)

Fitting the model to the training data

In [None]:
# Train on the scaled training features and their labels; the fitted
# estimator is the cell's displayed value.
classifier.fit(X=X_train, y=Y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=11)

Predicting the test set results

In [None]:
# Predict a class label for every scaled test sample and display the labels.
Y_pred = classifier.predict(X=X_test)
Y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Evaluating the Model

Confusion matrix

In [None]:
# Cross-tabulate the true test labels against the predicted labels.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(cm)

[[94 13]
 [15 32]]


f1 score

In [None]:
# F1 score of the predictions on the test set.
print(f1_score(y_true=Y_test, y_pred=Y_pred))

0.6956521739130436


Accuracy

In [None]:
# Fraction of test samples whose predicted label matches the true label.
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))

0.8181818181818182
