In [1]:
#===========================================================
#  Stage 1: Load data set and perform data pre-processing
#===========================================================

# Import pandas and load data downloaded from https://www.kaggle.com/uciml/pima-indians-diabetes-database#diabetes.csv: 
import pandas as pd
d = pd.read_csv("E:\PythonLearning\diabetes.csv")

# Drop Pregnancies and Outcome column: 
d1 = d.drop(["Pregnancies", "Outcome"], axis = 1)

# Convert zeros to NA values: 
import numpy as np
d1 = d1.replace(0, np.nan)

# Replace NA values by mean: 
d1 = d1.fillna(d1.mean())

# Add Pregnancies column: 
df = d1.assign(Pregnancies = d["Pregnancies"])

# Write a function for normalizing 0 - 1 data: 
def normalize_01(data):
    nho_nhat = data.min()
    lon_nhat = data.max(0)
    scaled = (data - nho_nhat) / (lon_nhat - nho_nhat)
    return scaled


# Apply this function for our data: 
df = df[["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", 
        "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"]].apply(normalize_01)

# Add Outcome column: 
df = df.assign(Outcome = d["Outcome"])

In [2]:
#========================================================================
#  Using GridSearchCV for Searching Multiple Parameters Simultaneously
#========================================================================

X = df[df.columns[0:8]]
Y = df["Outcome"]

# Set a range of K for evaluating KNN model: 
so_lang_gieng = np.array([1, 3, 5, 7])

# Another Parameter besides K that we might turn is the voting weights: 
weight_options = ["uniform", "distance"]

# Set a parameter grid: 
param_grid = dict(n_neighbors = so_lang_gieng, weights = weight_options)

# Import KNeighborsClassifier() function for using KNN: 
from sklearn.neighbors import KNeighborsClassifier

# Instantiate a KNN model: 
knn = KNeighborsClassifier(n_neighbors = 1)

# Fit the grid with our data:
from sklearn.grid_search import GridSearchCV
grid = GridSearchCV(knn, param_grid, cv = 10, scoring = "accuracy") 
grid.fit(X, Y)

# Print results: 
grid.grid_scores_



[mean: 0.68490, std: 0.04113, params: {'n_neighbors': 1, 'weights': 'uniform'},
 mean: 0.68490, std: 0.04113, params: {'n_neighbors': 1, 'weights': 'distance'},
 mean: 0.73307, std: 0.04665, params: {'n_neighbors': 3, 'weights': 'uniform'},
 mean: 0.72917, std: 0.04866, params: {'n_neighbors': 3, 'weights': 'distance'},
 mean: 0.75260, std: 0.05535, params: {'n_neighbors': 5, 'weights': 'uniform'},
 mean: 0.75391, std: 0.05574, params: {'n_neighbors': 5, 'weights': 'distance'},
 mean: 0.75391, std: 0.05474, params: {'n_neighbors': 7, 'weights': 'uniform'},
 mean: 0.75521, std: 0.05547, params: {'n_neighbors': 7, 'weights': 'distance'}]

In [3]:
# Show the optimal K and method of weighting: 
grid.best_params_

{'n_neighbors': 7, 'weights': 'distance'}

In [4]:
# Show average accuracy rate (10-fold cross validation): 
grid.best_score_

0.7552083333333334

In [5]:
# Show all information about the best KNN: 
grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='distance')

In [6]:
#==============================================================
#  Stage 3: Using best parameters for building KNN classifier
#==============================================================

# Import function for spliting data: 
from sklearn.model_selection import train_test_split

# Split our data: 
train, test = train_test_split(df, 
                               # Use 80% data for training KNN: 
                               test_size = 0.2, 
                               # For reproducing results: 
                               random_state = 0, 
                               stratify = df["Outcome"])

X_train = train[train.columns[0:8]]
y_train = train[train.columns[-1]]

X_test = test[test.columns[0:8]]
y_test = test[test.columns[-1]]


# Train KNN model with best parameters: 
best_knn = KNeighborsClassifier(n_neighbors = 7, weights = "distance")

# Fit with X our data: 
best_knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='distance')

In [7]:
# Make predictions: 
pred = best_knn.predict(X_test)
pred

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0],
      dtype=int64)

In [8]:
# Calculate accuracy rate on test data: 
import numpy as np
np.mean(pred == y_test)

0.7662337662337663

In [9]:
# We can show probability for our predictions: 
pred_prob = best_knn.predict_proba(X_test)
pd.DataFrame(pred_prob).head()

Unnamed: 0,0,1
0,0.868386,0.131614
1,1.0,0.0
2,1.0,0.0
3,0.600912,0.399088
4,0.418403,0.581597
