In [1]:
# import required modules
import itertools
import numpy as np
import matplotlib.pylab as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
from sklearn import preprocessing

# Load the dataset from 'diabetes.csv' (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='diabetes.csv')
# Preview the first five rows
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Show the dimensions of the dataset as a (rows, columns) tuple
df.shape

(768, 9)

In [3]:
# Feature matrix: every column of df except the target column 'Outcome'
# (drop returns a new frame; df itself is left untouched)
x = df.drop('Outcome', axis=1)

# Preview the first five rows of the feature matrix
x.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [4]:
# Target vector: the 'Outcome' column extracted as a NumPy array
y = df.loc[:, 'Outcome'].values
# Preview the first five target values
y[:5]

array([1, 0, 1, 0, 1])

In [5]:
from sklearn.model_selection import train_test_split

# Split features and targets into train and test sets:
#   - 20% of the rows are held out for testing,
#   - the split is stratified on y,
#   - random_state is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [6]:
from sklearn.neighbors import KNeighborsClassifier as KNN

# Build a k-nearest-neighbours classifier with k=3 and train it on the
# training split (fit returns the estimator itself, so it can be chained).
# NOTE(review): the features are passed in without any scaling step; k-NN is
# distance-based, so consider standardizing them first — confirm whether
# leaving them unscaled is intentional.
knn = KNN(n_neighbors=3).fit(x_train, y_train)
# Display the fitted estimator as the cell output
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [7]:
# Predict labels for the whole test split and show the first five of them
knn.predict(x_test)[:5]

array([0, 0, 0, 0, 1])

In [8]:
# Score the fitted classifier on the held-out test split
knn.score(X=x_test, y=y_test)

0.6688311688311688

In [9]:
from sklearn.model_selection import cross_val_score

# Fresh (unfitted) 3-nearest-neighbours estimator used only for cross-validation
knn_cv = KNN(n_neighbors=3)
# Run 5-fold cross-validation over the complete dataset (x, y);
# the result holds one score per fold
cv_scores = cross_val_score(estimator=knn_cv, X=x, y=y, cv=5)
print(cv_scores)
# Report the mean of the per-fold scores
print("cv scores mean: {}".format(cv_scores.mean()))

[0.68181818 0.69480519 0.75324675 0.75163399 0.68627451]
cv scores mean: 0.7135557253204311


In [10]:
from sklearn.model_selection import GridSearchCV as gcv

# Estimator with default settings; n_neighbors is supplied by the grid search
knn2 = KNN()
# Candidate values for n_neighbors: the integers 1 through 24
param_grid = {'n_neighbors': np.arange(1, 25)}
# Evaluate every candidate with 5-fold cross-validation and fit the search.
# NOTE(review): the search is fit on the full dataset (x, y), which includes
# the rows held out in x_test — confirm this is intended.
knn_gscv = gcv(estimator=knn2, param_grid=param_grid, cv=5).fit(x, y)
# Report the best-performing n_neighbors value
print("Best Parameter of this nearest neighbor is: {}".format(knn_gscv.best_params_))
# Report the mean cross-validated score achieved by that value
print("Avarage Score of This model is: {}".format(knn_gscv.best_score_))

Best Parameter of this nearest neighbor is: {'n_neighbors': 14}
Avarage Score of This model is: 0.7578558696205755
