In [2]:
import math

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the diabetes data, then report the row count followed by the first five rows.
dataset = pd.read_csv("datasets/diabetes.csv")
print(len(dataset), dataset.head(), sep="\n")

768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
# Replace zeros with the column mean.
# In the columns listed below a zero is treated as a missing measurement.
zero_not_accepted = ["Glucose", "BloodPressure", "SkinThickness", "BMI", "Insulin"]

for column in zero_not_accepted:
    # Mark zeros as missing so they do not pull the mean down.
    # `np.nan` is the supported spelling; the `np.NaN` / `np.NAN` aliases
    # were removed in NumPy 2.0 and raise AttributeError there.
    dataset[column] = dataset[column].replace(0, np.nan)

    # The mean is truncated to an integer before it is used as the fill value.
    # NOTE(review): truncation loses precision for fractional columns such as
    # BMI, and the mean is computed over the full dataset before the
    # train/test split, so test rows influence the fill value. Confirm that
    # both are acceptable.
    mean = int(dataset[column].mean(skipna=True))

    dataset[column] = dataset[column].fillna(mean)

In [5]:
# Split the data into training and test sets.

# Columns 0-7 are used as the features; column 8 is used as the label.
X = dataset.iloc[:, :8]
Y = dataset.iloc[:, 8]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Feature scaling

# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both the training and the test features.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [7]:
# Choose k: the floor of the square root of the number of test labels.
# NOTE(review): k is derived from the size of the test set here; the sqrt(n)
# rule of thumb is more commonly applied to the training-set size. Confirm
# this is intended.
neighbors = math.floor(math.sqrt(len(Y_test)))
if neighbors % 2 == 0:
    # Step down to an odd k so a two-class majority vote cannot end in a tie.
    neighbors -= 1

# Define the model: k-nearest neighbours with the Euclidean distance metric.
classifier = KNeighborsClassifier(n_neighbors=neighbors, p=2, metric="euclidean")

# Fit the model on the scaled training data.
classifier.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [8]:
# Predict the class labels for the test set and display them.
y_pred = classifier.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### A confusion matrix is used to evaluate a classification model.
It summarizes the predictions on the test set as a matrix that compares the actual classes with the predicted classes, which makes the results easier to read and interpret, like this:
|            | Predicted no | Predicted yes |  
| ---------- | -------------| ------------- |  
| Actual no  |     TN       |     FP        |  
| Actual yes |     FN       |     TP        |  


|            | Predicted no         | Predicted yes        |
|:-----------|---------------------:|:--------------------:|
| Actual no  | true negatives (TN)  | false positives (FP) |
| Actual yes | false negatives (FN) | true positives (TP)  |


In [9]:
# Evaluate the model with a confusion matrix
# (rows: actual classes, columns: predicted classes).

cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]


In [10]:
# Data scientists tend to care more about the F1 score because it combines
# precision and recall into a single number.
print(f1_score(y_true=Y_test, y_pred=y_pred))

0.6956521739130436


In [13]:
# On the other hand, the general public or business decision makers will ask
# about the accuracy score, i.e. how accurate the model is. In my view, the
# developer should also pay attention to the F1 score (F-score).
print(accuracy_score(y_true=Y_test, y_pred=y_pred))

0.8181818181818182
