# Implementation of KNN

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

KNN is a lazy learning algorithm: during training it only stores the data, and all of the distance computation is deferred until prediction time.

In [2]:
## Implementing K Nearest Neighbours from scratch
from collections import Counter
class K_NN:
  """K-nearest-neighbours classifier for pandas inputs, written from scratch.

  Training rows are matched to their labels through the index labels of
  ``x_train``, so the ``x`` and ``y`` passed to ``fit`` must share an index.

  Parameters
  ----------
  k : int, default 5
      Number of nearest training rows that vote on each prediction.
  """

  def __init__(self,k=5):
    self.k=k


  def fit(self,x,y):
    """Store the training features ``x`` and labels ``y``; nothing is computed here."""
    self.x_train = x
    self.y_train = y

  def _distances(self, test_point):
    """Return the Euclidean distance from ``test_point`` to every training row.

    The result is a Series carrying the same index labels as ``x_train``, so
    each distance stays attached to the row it belongs to (ties included).
    """
    # Subtracting a row from the frame aligns on column labels, exactly like
    # the per-row Series subtraction would.
    squared_diff = (self.x_train - test_point) ** 2
    return np.sqrt(squared_diff.sum(axis=1))

  def get_euclidean(self,test_point):
    """Return a ``{distance: index_label}`` dict for ``test_point``.

    Kept for backward compatibility. Because the distance is the key, training
    rows lying at exactly the same distance collapse into a single entry (the
    last such row wins), which is why ``predict`` does not rely on this dict.
    """
    return {dist: label for label, dist in self._distances(test_point).items()}


  def predict(self,x):
    """Predict a label for every row of ``x`` by majority vote of the k nearest rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows to classify; iterated with ``iterrows``.

    Returns
    -------
    list
        One predicted label per row of ``x``, in row order. When several
        labels have the same number of votes, the one met first among the
        neighbours (closest first) is returned.
    """
    predictions =[]

    for _, row in x.iterrows():
      # A stable sort keeps equally distant rows in training order, so the
      # selection of the k neighbours is deterministic and no row is lost.
      nearest = self._distances(row).sort_values(kind="mergesort").index[:self.k]
      # Labels come from the data stored by fit(), not from a notebook global.
      votes = Counter(self.y_train[label] for label in nearest)
      predictions.append(votes.most_common(1)[0][0])

    return predictions

# KNN implementation that assumes both X and y are NumPy ndarrays:


from collections import Counter
class K_NN1:
  """K-nearest-neighbours classifier for array inputs, written from scratch.

  Training samples and labels are looked up by position, so ``x`` and ``y``
  given to ``fit`` are expected to be aligned, indexable sequences
  (the notebook uses NumPy ndarrays).
  """

  def __init__(self,k=5):
    # k is the number of neighbours that take part in the vote.
    self.k = k

  def fit(self,x,y):
    """Keep references to the training samples and labels; no computation."""
    self.x_train, self.y_train = x, y

  def get_euclidean(self,test_point, train_point):
    """Return the Euclidean distance between two points."""
    offset = train_point - test_point
    return np.sqrt(np.sum(offset ** 2))

  def predict(self,X):
    """Return a list with the majority label of the k nearest neighbours of each sample in ``X``."""
    labels_out = []
    for sample in X:
      # Distance from this sample to every stored training sample, in order.
      gaps = []
      for stored in self.x_train:
        gaps.append(self.get_euclidean(sample, stored))
      # Positions of the k smallest distances.
      nearest = np.argsort(gaps)[:self.k]
      votes = Counter(self.y_train[pos] for pos in nearest)
      winner, _ = votes.most_common()[0]
      labels_out.append(winner)
    return labels_out



In [4]:
# Load the Iris CSV, split it 70/30 and evaluate the DataFrame-based K_NN.
# NOTE(review): "/content/IRIS.csv" looks like a Colab upload path - adjust when running elsewhere.
df = pd.read_csv("/content/IRIS.csv")
y = df['species']
x = df.drop(columns=['species'])  # columns= already implies axis=1
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3,random_state = 43)
model =K_NN()
model.fit(x_train,y_train)
predictions = model.predict(x_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Accuracy score for KNN",accuracy_score(y_test,predictions))

# Same experiment on scikit-learn's bundled Iris arrays, for the ndarray-based K_NN1.
# This rebinds x, y and the four split variables to ndarrays; the cells below
# use these ndarray versions.
iris = datasets.load_iris()
x,y = iris.data,iris.target
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3,random_state = 43)
model2 =K_NN1()
model2.fit(x_train,y_train)
predictions2 = model2.predict(x_test)
print("Accuracy score for KNN2",accuracy_score(y_test,predictions2))

Accuracy score for KNN 0.9777777777777777
Accuracy score for KNN2 0.9777777777777777


In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Reference result: scikit-learn's KNeighborsClassifier with 5 neighbours,
# fitted and scored on the current x_train / x_test split.
sklearn_KNN= KNeighborsClassifier(n_neighbors=5)
sklearn_KNN.fit(x_train,y_train)
sk_predictions=sklearn_KNN.predict(x_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
print("Accuracy score for Sklearn KNN",accuracy_score(y_test,sk_predictions))

Accuracy score for Sklearn KNN 0.9777777777777777
