In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [3]:
diabetes=pd.read_csv('C:\\Users\\Gaya\\Desktop\\R\\Data mining\\final\\Diabetes.csv')

In [4]:
print(len(diabetes))

768


In [6]:
print(diabetes.head())

   NTS  PGC  DBP  TSFT  INS   BMI    DPF  Age Class
0    6  148   72    35    0  33.6  0.627   50   Yes
1    1   85   66    29    0  26.6  0.351   31    No
2    8  183   64     0    0  23.3  0.672   32   Yes
3    1   89   66    23   94  28.1  0.167   21    No
4    0  137   40    35  168  43.1  2.288   33   Yes


In [7]:
zero_not_accepted=['PGC','DBP','TSFT','INS','BMI']

In [8]:
for column in zero_not_accepted:
    diabetes[column]=diabetes[column].replace(0,np.NaN)
    mean=int(diabetes[column].mean(skipna=True))
    diabetes[column]=diabetes[column].replace(np.NaN,mean)

In [11]:
diabetes.head()

Unnamed: 0,NTS,PGC,DBP,TSFT,INS,BMI,DPF,Age,Class
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,Yes
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,No
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,Yes
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,No
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,Yes


In [16]:
X=diabetes.iloc[:,0:8]
Y=diabetes.iloc[:,8]

In [20]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [21]:
sc_X=StandardScaler()

In [22]:
X_train=sc_X.fit_transform(X_train)

In [25]:
X_test=sc_X.transform(X_test)

## Choosing K ... Rule of thumb sqrt of # smaples in dataset

In [30]:
import math
math.sqrt(len(Y_test))

15.198684153570664

In [35]:
classifier=KNeighborsClassifier(n_neighbors=15,p=2,metric='euclidean')

In [37]:
classifier.fit(X_train,Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=15, p=2,
           weights='uniform')

In [38]:
Y_pred=classifier.predict(X_test)

In [49]:
cm=confusion_matrix(Y_test,Y_pred)
print(cm)

[[124  27]
 [ 33  47]]


In [46]:
Y_pred=pd.DataFrame(Y_pred).replace('Yes',1)
Y_pred=pd.DataFrame(Y_pred).replace('No',0)
Y_train=pd.DataFrame(Y_train).replace('Yes',1)
Y_train=pd.DataFrame(Y_train).replace('No',0)
Y_test=pd.DataFrame(Y_test).replace('Yes',1)
Y_test=pd.DataFrame(Y_test).replace('No',0)

In [47]:
print(f1_score(Y_test,Y_pred))

0.6103896103896105


In [48]:
print(accuracy_score(Y_test,Y_pred))

0.7402597402597403


### Hyperparameter tuning with GridSearchCV

In [50]:
from sklearn.model_selection import GridSearchCV

In [80]:
diabetes=pd.read_csv('C:\\Users\\Gaya\\Desktop\\R\\Data mining\\final\\Diabetes.csv')

In [81]:
zero_not_accepted=['PGC','DBP','TSFT','INS','BMI']

In [82]:
for column in zero_not_accepted:
    diabetes[column]=diabetes[column].replace(0,np.NaN)
    mean=int(diabetes[column].mean(skipna=True))
    diabetes[column]=diabetes[column].replace(np.NaN,mean)

In [83]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=42)

In [84]:
sc_X=StandardScaler()

In [85]:
X_train=sc_X.fit_transform(X_train)

In [86]:
X_test=sc_X.transform(X_test)

In [88]:
param_grid = [
  {'n_neighbors': np.arange(10,30), 'metric': ['manhattan','euclidean']},
 ]

In [89]:
cl=KNeighborsClassifier(p=2)

In [90]:
mod2=GridSearchCV(cl, param_grid,n_jobs=-1,cv=5)

In [91]:
mod2.fit(X_train,Y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'n_neighbors': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
       27, 28, 29]), 'metric': ['manhattan', 'euclidean']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [92]:
Y_pred2=mod2.predict(X_test)

In [93]:
cm=confusion_matrix(Y_test,Y_pred2)
print(cm)

[[125  26]
 [ 37  43]]


In [94]:
Y_pred2=pd.DataFrame(Y_pred2).replace('Yes',1)
Y_pred2=pd.DataFrame(Y_pred2).replace('No',0)
Y_train=pd.DataFrame(Y_train).replace('Yes',1)
Y_train=pd.DataFrame(Y_train).replace('No',0)
Y_test=pd.DataFrame(Y_test).replace('Yes',1)
Y_test=pd.DataFrame(Y_test).replace('No',0)

In [95]:
print(f1_score(Y_test,Y_pred))

0.6103896103896105


In [96]:
print(accuracy_score(Y_test,Y_pred))

0.7402597402597403
