K Nearest Neighbors (KNN) is one of the simplest supervised machine learning algorithms, mostly used to classify a data point based on how its neighbors are classified.

Project Objective: Predict whether a person has diabetes or not.

In [29]:
import math

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [30]:
# The column labels are supplied here and passed to read_csv via `names=`.
names = [
    'pregnacies',
    'glucose',
    'blood_pressure',
    'skin_thickness',
    'insulin',
    'bmi',
    'diabetes_pedigree',
    'age',
    'outcome',
]
dataset = pd.read_csv(
    'diabetes.csv',
    names=names,
)
# Preview the first five rows.
dataset.head(5)

Unnamed: 0,pregnacies,glucose,blood_pressure,skin_thickness,insulin,bmi,diabetes_pedigree,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
# Number of rows in the loaded frame.
dataset.shape[0]

768

In [31]:
# A value of 0 is not realistic for these measurements, so zeros in these
# columns are treated as missing and replaced with the column mean.
zero_not_accepted = ['glucose', 'blood_pressure', 'skin_thickness', 'insulin', 'bmi']

# NOTE(review): the mean is computed over every row of `dataset`, i.e. before
# the train/test split performed further down, so test rows influence the
# imputed value. Consider imputing after the split using training rows only.
for column in zero_not_accepted:
    # Mark zeros as missing. Use `np.nan`: the `np.NaN` alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    dataset[column] = dataset[column].replace(0, np.nan)
    # Mean of the remaining (non-missing) values; int() truncates it to a
    # whole number, which is kept here to preserve the original results.
    mean = int(dataset[column].mean(skipna=True))
    # Fill the missing entries with that truncated mean.
    dataset[column] = dataset[column].fillna(mean)

In [33]:
# Show the glucose column after the zero replacement.
print(dataset.loc[:, 'glucose'])

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: glucose, Length: 768, dtype: float64


In [34]:
# Separate predictors from the target, then hold out 20% of the rows for testing.
X = dataset.iloc[:, :8]  # the first eight columns: everything except the answers
y = dataset.iloc[:, 8]   # the ninth column only: the answers

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [35]:
# Standardize the features: the scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [36]:
# Square root of the number of test samples.
# NOTE(review): presumably a rough guide for choosing n_neighbors — confirm.
# (`math` is imported in the notebook's top import cell.)
math.sqrt(len(y_test))

12.409673645990857

In [37]:
# Set up the K-NN classifier: 11 neighbours with the Euclidean metric.
classifier = KNeighborsClassifier(
    metric='euclidean',
    p=2,
    n_neighbors=11,
)

# Fit the classifier on the scaled training data.
classifier.fit(X=X_train, y=y_train)

In [39]:
# Run the fitted classifier on the held-out test features.
y_pred = classifier.predict(X=X_test)

In [41]:
# Confusion matrix of the true test labels against the predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]


In [42]:
# F1 score of the predictions on the test split.
print(f1_score(y_true=y_test, y_pred=y_pred))

0.6956521739130436


In [43]:
# Fraction of test samples that were classified correctly.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8181818181818182
