In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [3]:
dataset = pd.read_csv('daibetes.csv')
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,class
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [4]:
len(dataset)

767

In [9]:
dataset.size

6903

In [7]:
dataset.shape

(767, 9)

In [12]:
# replace Zeros
zero_not_accepted = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction',
                    'Age', 'class']

for column in zero_not_accepted:
    dataset[column] = dataset[column].replace(0, np.NaN)
    mean = int(dataset[column].mean(skipna = True))
    dataset[column] = dataset[column].replace(np.NaN, mean)

In [13]:
print(dataset['Glucose'])

0       85.0
1      183.0
2       89.0
3      137.0
4      116.0
5       78.0
6      115.0
7      197.0
8      125.0
9      110.0
10     168.0
11     139.0
12     189.0
13     166.0
14     100.0
15     118.0
16     107.0
17     103.0
18     115.0
19     126.0
20      99.0
21     196.0
22     119.0
23     143.0
24     125.0
25     147.0
26      97.0
27     145.0
28     117.0
29     109.0
       ...  
737     99.0
738    102.0
739    120.0
740    102.0
741    109.0
742    140.0
743    153.0
744    100.0
745    147.0
746     81.0
747    187.0
748    162.0
749    136.0
750    121.0
751    108.0
752    181.0
753    154.0
754    128.0
755    137.0
756    123.0
757    106.0
758    190.0
759     88.0
760    170.0
761     89.0
762    101.0
763    122.0
764    121.0
765    126.0
766     93.0
Name: Glucose, Length: 767, dtype: float64


In [14]:
#split data
x = dataset.iloc[:,0:8]
y = dataset.iloc[:, 8]
x_train,x_test,y_train,y_test = train_test_split(x, y, random_state = 0, test_size= 0.2)

In [16]:
#feature scaling
sc_x =StandardScaler()
x_train =sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
print(x_train)
print(x_test)

[[-0.12459448  2.00188135 -0.04088647 ... -0.59412526 -0.76640618
   0.23925174]
 [-0.12459448 -0.75606157  0.28998359 ... -1.34773992 -0.7345974
  -1.04800381]
 [ 1.55770547  0.06475478 -0.20632151 ...  0.42035603 -0.56398668
   0.06761766]
 ...
 [ 0.88478549 -0.26357176 -0.70262661 ... -0.73905115  0.73728155
   0.06761766]
 [ 0.5483255   0.09758743  0.28998359 ...  0.18847459 -1.02955153
   1.78395839]
 [-0.79751446  0.22891805  0.12454856 ...  0.10151905  0.32955084
  -0.70473566]]
[[ 1.22124548  1.47655889  2.77150909 ...  0.73919301 -0.90231642
   0.83997099]
 [ 0.21186551  0.32741601  0.62085366 ... -0.82600669 -0.84159057
   3.07121394]
 [ 0.88478549  1.24673031 -1.52980177 ... -0.28978087  0.32087571
   0.49670285]
 ...
 [ 0.5483255   0.39308131  0.62085366 ...  1.98555573 -0.69122179
   1.0974221 ]
 [-0.12459448 -0.1650738  -0.53719157 ... -0.2463031   0.04616353
  -0.96218677]
 [-0.12459448 -0.55906564 -0.37175654 ... -1.8115028  -0.69700521
  -0.96218677]]


In [17]:
len(y)

767

In [18]:
import math
math.sqrt(len(x_train))

24.758836806279895

In [19]:
math.sqrt(len(x_test))

12.409673645990857

In [23]:
#Define the model ; init k-nn
classifier = KNeighborsClassifier(n_neighbors = 11, p= 2, metric ='euclidean')

In [25]:
#frist model
classifier.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [26]:
# predict the test 
y_pred = classifier.predict(x_test)
y_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1.])

In [28]:
#Evaluation model
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[154]]


In [29]:
print(f1_score(y_test, y_pred))

1.0


In [30]:
print(accuracy_score(y_test, y_pred))

1.0
