## Read and Describe the Dataset

In [26]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Relative location of the diabetes CSV (resolved against the notebook's working directory).
dataset_path = './diabetes.csv'
# Parse the CSV into a DataFrame; later cells read from `dataset`.
dataset = pd.read_csv(filepath_or_buffer=dataset_path)

In [27]:
# Preview the first five rows to check column names and value formats.
dataset.head(n=5)

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Age,Outcome
0,148,72,35.0,0,33.6,50,1
1,85,66,29.0,0,26.6,31,0
2,183,64,0.0,0,23.3,32,1
3,89,66,23.0,94,28.1,21,0
4,137,40,35.0,168,43.1,33,1


In [28]:
# (number of rows, number of columns) of the loaded table.
dataset.shape

(768, 7)

In [29]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
# A count below the row total marks a column with missing (NaN) entries.
# NOTE(review): a minimum of 0.0 for Glucose, BloodPressure and BMI looks like a
# placeholder for missing measurements rather than a real reading — confirm.
dataset.describe()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Age,Outcome
count,768.0,768.0,763.0,768.0,768.0,768.0,768.0
mean,120.894531,69.105469,20.520315,79.799479,31.992578,33.240885,0.348958
std,31.972618,19.355807,15.966929,115.244002,7.88416,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,21.0,0.0
25%,99.0,62.0,0.0,0.0,27.3,24.0,0.0
50%,117.0,72.0,23.0,30.5,32.0,29.0,0.0
75%,140.25,80.0,32.0,127.25,36.6,41.0,1.0
max,199.0,122.0,99.0,846.0,67.1,81.0,1.0


## Data Preprocessing 

In [30]:
# Number of missing (NaN) entries in each column.
dataset.isna().sum()

Glucose          0
BloodPressure    0
SkinThickness    5
Insulin          0
BMI              0
Age              0
Outcome          0
dtype: int64

In [31]:
from sklearn.impute import SimpleImputer

# Replace missing cells (NaN) with each column's most frequent value.
# Other available strategies: 'mean', 'median', 'constant'.
# `missing_values` names the placeholder to replace; `fill_value` is only
# consulted when strategy='constant', so it is not passed here.
# NOTE(review): the imputer is fitted on every row and every column of `dataset`
# (the Outcome label included), i.e. before the train/test split — confirm this
# is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# The result is a NumPy array with the same columns as `dataset`, label included.
X = imputer.fit_transform(dataset)

## Split the dataset into training set and test set

In [32]:
# X -> features: keep every imputed column except the last one (Outcome).
# Slicing to a fixed width, rather than deleting "the last column", keeps this
# cell idempotent: re-running it does not strip a further feature from X.
feature_count = dataset.shape[1] - 1
X = X[:, :feature_count]
# Y -> label
Y = dataset.Outcome

In [33]:
from sklearn.model_selection import train_test_split

# Use 70% of the rows for training and the remaining rows for testing;
# the fixed random_state makes the shuffle reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=42,
)

## Classification Algorithms (KNN, Naïve Bayes)

### KNN Algorithm

In [34]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)

# Predict labels for the held-out rows the model was not trained on.
knn_predict = knn.predict(X_test)

### Naïve Bayes Algorithm

In [35]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with its default settings, fitted on the training split.
nb = GaussianNB()
nb.fit(X=X_train, y=Y_train)
# Predicted labels for the held-out test rows.
nb_predict = nb.predict(X=X_test)

## Compare the behavior of KNN and Naïve Bayes

In [36]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Score the KNN predictions against the true test labels.
# Each entry pairs the printed line's format string with the metric that fills it.
knn_report = (
    ("accuracy score: %.2f ", accuracy_score),
    ("precision score: %.2f ", precision_score),
    ("recall score: %.2f ", recall_score),
    ("F1 score: %.2f ", f1_score),
)
for template, metric in knn_report:
    print(template % metric(Y_test, knn_predict))

accuracy score: 0.69 
precision score: 0.55 
recall score: 0.55 
F1 score: 0.55 


In [37]:
# Score the Naive Bayes predictions against the true test labels.
# Each entry pairs the printed line's format string with the metric that fills it.
nb_report = (
    ("accuracy score: %.2f ", accuracy_score),
    ("precision score: %.2f ", precision_score),
    ("recall score: %.2f ", recall_score),
    ("F1 score: %.2f ", f1_score),
)
for template, metric in nb_report:
    print(template % metric(Y_test, nb_predict))

accuracy score: 0.72 
precision score: 0.60 
recall score: 0.59 
F1 score: 0.59 
