In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/diabetes/diabetes.csv


# Load the dataset diabetes.csv, available on Kaggle

In [2]:
# Read the diabetes CSV straight into a DataFrame (no need to pre-create an
# empty frame that is immediately overwritten) and preview the first 10 rows.
DataSet = pd.read_csv("/kaggle/input/diabetes/diabetes.csv")
DataSet.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


# In the columns [Glucose, BloodPressure, SkinThickness, Insulin, BMI], treat zeros as missing values:
# replace them with NaN, then fill each NaN with the (rounded) mean of its column

In [3]:
# Columns 1 .. -4 (everything after the first column and before the last three)
# are the ones in which a 0 is treated as a missing measurement.
for column in DataSet.columns[1:-3]:
    # Build the cleaned column and assign it back explicitly. Calling
    # replace()/fillna() with inplace=True on DataSet[column] mutates a
    # temporary under pandas Copy-on-Write and would leave DataSet unchanged;
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
    with_missing = DataSet[column].replace(0, np.nan)
    column_mean = round(with_missing.mean(skipna=True))
    DataSet[column] = with_missing.fillna(column_mean)
DataSet.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,Pedigree,Age,Outcome
0,6,148.0,72.0,35.0,156.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,156.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,156.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.0,156.0,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.0,29.0,156.0,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.0,156.0,32.0,0.232,54,1


In [4]:
# Features: the first eight columns. Target: everything from the ninth column
# on, kept as a DataFrame so it can later be indexed by column name.
X, y = DataSet.iloc[:, :8], DataSet.iloc[:, 8:]

# Split DataSet into Training and Testing

In [5]:
# Hold out 20% of the rows for testing; the remaining 80% is used for training.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Compute the distance from every data point in the testing set to every point in the training set

In [6]:
def distance_with_all_train_Points(X_train, X_test):
    """Compute the Euclidean distance from every test row to every training row.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables; both are iterated with ``itertuples()``, so each row is
        identified by its index label and described by its remaining values.

    Returns
    -------
    dict
        Maps ``(test_index_label, train_index_label)`` to the Euclidean
        distance between the two rows' feature values.
    """
    # itertuples() yields (index_label, feature_1, ..., feature_n). Split every
    # training row once up front instead of re-converting it for each test row.
    train_rows = [(row[0], list(row[1:])) for row in X_train.itertuples()]

    eculidean_distance = {}
    for test_row in X_test.itertuples():
        test_label = test_row[0]
        test_features = list(test_row[1:])
        for train_label, train_features in train_rows:
            eculidean_distance[(test_label, train_label)] = distance.euclidean(
                test_features, train_features
            )
    return eculidean_distance

# This function takes the Euclidean distances and sorts them to find which training points are the nearest neighbors of each test point. It then checks the labels of the k nearest neighbors (here we use k=5). Based on those labels, it assigns a label to every point in the testing set and returns them as Output_labels.

In [7]:
def k_nearest_neighbors(eculidean_distance, X_test, y, k):
    """Label every test point by majority vote among its k nearest training points.

    Parameters
    ----------
    eculidean_distance : dict
        Maps ``(test_index_label, train_index_label)`` to a distance, as
        produced by ``distance_with_all_train_Points``.
    X_test : pandas.DataFrame
        Test rows; only its index is used, to decide which points to label and
        in which order.
    y : pandas.DataFrame or pandas.Series
        Class labels addressable by the training index labels. A DataFrame
        must have an ``"Outcome"`` column; a Series is used as-is.
    k : int
        Number of nearest neighbours that vote.

    Returns
    -------
    list of int
        One entry per row of ``X_test``: 1 when strictly more than half of the
        counted neighbours have label 1, otherwise 0 (ties and "no neighbours"
        both yield 0).
    """
    # Accept either the DataFrame holding the "Outcome" column or a bare Series.
    labels = y["Outcome"] if hasattr(y, "columns") else y

    # Group the distances by test point in a single pass, rather than scanning
    # the whole dictionary again for every test row.
    neighbours_by_test_point = {}
    for (test_label, train_label), dist in eculidean_distance.items():
        neighbours_by_test_point.setdefault(test_label, []).append((train_label, dist))

    Output_labels = []
    for test_label in X_test.index:
        candidates = neighbours_by_test_point.get(test_label, [])
        # sorted() is stable, so equal distances keep their original order;
        # max(k, 0) makes a non-positive k select no neighbours at all.
        nearest = sorted(candidates, key=lambda pair: pair[1])[:max(k, 0)]

        # The distance keys carry index *labels*, so look the class up with
        # .loc; positional .iloc is only right when y has a default RangeIndex.
        yes = sum(1 for train_label, _ in nearest if labels.loc[train_label] == 1)
        no = len(nearest) - yes

        Output_labels.append(1 if yes > no else 0)

    return Output_labels

# Here we call those two functions to get the predicted labels for the testing dataset

In [8]:
# Distances from every test point to every training point
eculidean_distance = distance_with_all_train_Points(X_train=X_train, X_test=X_test)
# Predicted label for each test point, voting among its 5 nearest neighbours
Output_labels = k_nearest_neighbors(
    eculidean_distance=eculidean_distance,
    X_test=X_test,
    y=y,
    k=5,
)

# Now we use scikit-learn's KNeighborsClassifier(). We'll train this model and then predict with it.

In [9]:
# Reference model: scikit-learn's KNN with the same number of neighbours
neighbor = KNeighborsClassifier(n_neighbors=5)
# fit() expects a 1-D label array, so flatten the single-column label frame
neighbor.fit(X_train, np.ravel(y_train))

KNeighborsClassifier()

# Prediction using scikit learn model

In [10]:
# Labels predicted by the fitted scikit-learn model for the held-out rows
y_pred = neighbor.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

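# Output_labels are the labels predicted above by our own k-nearest-neighbors implementation; they are predictions, not ground-truth labels. Passing them to score() as the reference measures how often scikit-learn's predictions agree with ours: a value of 1.0 means the two implementations predict exactly the same labels. It does not measure accuracy against the true labels in y_test.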

In [11]:
# Output_labels are predictions from the hand-written KNN, not ground truth,
# so real accuracy has to be measured against the held-out labels in y_test.
true_labels = np.ravel(y_test)
print("scikit-learn KNN accuracy on the test split:", accuracy_score(true_labels, y_pred))
print("hand-written KNN accuracy on the test split:", accuracy_score(true_labels, Output_labels))

# Fraction of test points on which scikit-learn's predictions match the
# hand-written KNN's predictions (1.0 means the two implementations agree).
neighbor.score(X_test, Output_labels)

1.0