In [1]:
import pandas as pdd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from kydavra import PointBiserialCorrSelector
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [2]:
# Read csv file

df = pd.read_csv('diabetes.csv')

<IPython.core.display.Javascript object>

In [3]:
# Dataframe

df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
# Define X And y values

X = df.iloc[:, :-1].values
y = df['Outcome'].values

In [5]:
# Split Dataframe into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=69)

In [6]:
# Build model and fit train data

KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

KNeighborsClassifier()

In [7]:
# Predict test data

y_pred = KNN.predict(X_test)

In [8]:
# Get accuracy score

accuracy_score(y_test, y_pred)

0.70995670995671

In [9]:
# Select best features by correlation with kydavra

selector = PointBiserialCorrSelector()
best = selector.select(df, 'Outcome')
best

['Glucose', 'Pregnancies', 'BMI', 'Age']

In [10]:
# Define X and y with best features

X = df[best].values
y = df['Outcome'].values

In [11]:
# Split Dataframe into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=69)

In [12]:
# Build model, fit, predict and get accuracy score with best features

KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)
accuracy_score(y_test, y_pred)

0.7359307359307359

In [13]:
# Standardize features

SS = StandardScaler()
X_scaled = SS.fit_transform(X_train)
X_test_scaled = SS.transform(X_test)

In [14]:
# Fit, predict and get accuracy score with standardized features 

KNN.fit(X_scaled, y_train)
y_pred = KNN.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.7705627705627706

In [15]:
# Find best hyperparameters
# Search parameters

parameters = {
    'n_neighbors' : range(1, 20, 2),
    'p' : range(1, 5),
    'metric' : ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
    'weights' : ['uniform', 'distance']
}

In [16]:
# Search over specified parameter

GS = GridSearchCV(KNN, parameters, verbose=1, cv =20, n_jobs=-1, scoring = 'accuracy')
GS.fit(X_scaled, y_train)

Fitting 20 folds for each of 320 candidates, totalling 6400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 5128 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 6400 out of 6400 | elapsed:    5.0s finished


GridSearchCV(cv=20, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan', 'chebyshev',
                                    'minkowski'],
                         'n_neighbors': range(1, 20, 2), 'p': range(1, 5),
                         'weights': ['uniform', 'distance']},
             scoring='accuracy', verbose=1)

In [17]:
# Estimator that was chosen in search, and best parameters

GS.best_estimator_

KNeighborsClassifier(metric='chebyshev', n_neighbors=15, p=1)

In [18]:
# Best score of estimator

GS.best_score_

0.7727207977207977

In [19]:
# Parameter that gave the best results

GS.best_params_

{'metric': 'chebyshev', 'n_neighbors': 15, 'p': 1, 'weights': 'uniform'}