# KNN Algorithm implementation 

In [1]:
#Declaring all import statements
import numpy as np
import math

The implementation of the KNN algorithm below is generalized and works for any value of k from 1 up to the number of training samples.

In [2]:
def distance(arr1, arr2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    arr1, arr2 : array-like of numbers
        Coordinates of the two points. Both must have the same shape.

    Returns
    -------
    numpy.float64
        Square root of the summed squared coordinate differences.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape. An index-based loop over
        ``len(arr1)`` would instead silently ignore the extra coordinates of a
        longer ``arr2`` and return a wrong distance.
    """
    # Converting to float first also prevents unsigned-integer inputs from
    # wrapping around when the coordinates are subtracted.
    point_a = np.asarray(arr1, dtype=float)
    point_b = np.asarray(arr2, dtype=float)
    if point_a.shape != point_b.shape:
        raise ValueError(
            "distance() needs points of equal shape, got {} and {}".format(
                point_a.shape, point_b.shape
            )
        )
    diff = point_a - point_b
    # Reducing along the first axis yields a scalar for ordinary 1-D points.
    return np.sqrt(np.sum(diff * diff, axis=0))

In [3]:
def predict(X_train, y_train, sample, k):
    """Predict the class of ``sample`` by majority vote of its k nearest neighbours.

    The distance from ``sample`` to every training point is obtained from
    ``distance``; the ``k`` closest training points are kept and the label
    that occurs most often among them is returned.

    Parameters
    ----------
    X_train : indexable sequence of points (e.g. a 2-D numpy array)
        Training points; ``X_train[i]`` is one point.
    y_train : indexable sequence of labels
        ``y_train[i]`` is the label of ``X_train[i]``.
    sample : point
        The point to classify.
    k : int
        Number of neighbours that vote; must satisfy
        ``1 <= k <= len(X_train)``.

    Returns
    -------
    The element of ``y_train`` with the most votes. When several labels tie,
    the one that comes first in the internal neighbour-slot order wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training
        samples.
    """
    n_train = len(X_train)
    # Without this check a k larger than the training set would leave unused
    # neighbour slots, each of which would cast a spurious vote for y_train[0].
    if not 1 <= k <= n_train:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "({}), got {}".format(n_train, k)
        )

    # The first k training rows fill the neighbour slots unconditionally.
    slot_index = list(range(k))
    slot_dist = [distance(X_train[i], sample) for i in range(k)]

    for i in range(k, n_train):
        d = distance(X_train[i], sample)
        # Locate the farthest neighbour currently kept ...
        farthest_slot = 0
        farthest_dist = 0
        for j in range(k):
            if slot_dist[j] > farthest_dist:
                farthest_dist = slot_dist[j]
                farthest_slot = j
        # ... and evict it only when the current row is strictly closer.
        if d < farthest_dist:
            slot_index[farthest_slot] = i
            slot_dist[farthest_slot] = d

    votes = [y_train[idx] for idx in slot_index]
    # max() returns the first label (in slot order) that reaches the top count.
    return max(votes, key=votes.count)

def knn(X_train,y_train,new_sample,k):
    """Classify every entry of ``new_sample`` with the k-nearest-neighbour rule.

    Each entry ``new_sample[i]`` is passed, together with the training points
    ``X_train``, their labels ``y_train`` and ``k``, to ``predict``; the
    predicted labels are collected in input order.

    The caller is expected to pass a ``k`` between 1 and the number of
    training samples.

    Returns a numpy array holding one predicted label per entry of
    ``new_sample``.
    """
    labels = [
        predict(X_train, y_train, new_sample[row], k)
        for row in range(len(new_sample))
    ]
    return np.array(labels)

# IRIS Dataset prediction using KNN algorithm (K=1)

In [4]:
# Load the Iris dataset that ships with scikit-learn.
# Later cells read the features from `iris_data.data` and the class labels
# from `iris_data.target`.
from sklearn.datasets import load_iris
iris_data = load_iris()

In [5]:
# Split the Iris features and labels into training and test sets.
# random_state=100 fixes the shuffle so the same split is produced on every run;
# no test_size is passed, so scikit-learn's default split proportion is used.
from sklearn.model_selection import train_test_split
Iris_X_train, Iris_X_test, Iris_y_train, Iris_y_test = train_test_split(iris_data.data, iris_data.target,random_state=100)

In [6]:
# Classify every Iris test sample with the 1-nearest-neighbour rule (k=1).
iris_pred = knn(Iris_X_train, Iris_y_train, Iris_X_test, 1)
print("Array of predicted Labels:", iris_pred)
print("\n")

# Element-wise comparison with the true test labels: one boolean per sample.
iris_correct = iris_pred == Iris_y_test
iris_wrong = iris_pred != Iris_y_test

# Accuracy (in percent), absolute number of mistakes, and test error rate.
print(
    "The accuracy % for Nearest Neighbour applied to the IRIS dataset for K=1: ",
    np.mean(iris_correct) * 100,
    "%",
)
print("Number of prediction errors: ", np.sum(iris_wrong))
print(
    "The test error rate for Nearest Neighbour applied to the IRIS dataset for K=1: ",
    np.mean(iris_wrong),
)

Array of predicted Labels: [2 0 2 0 2 2 0 0 2 0 0 2 0 0 2 1 1 1 2 2 2 0 2 0 1 2 1 0 1 2 1 1 1 0 0 1 0
 1]


The accuracy % for Nearest Neighbour applied to the IRIS dataset for K=1:  97.36842105263158 %
Number of prediction errors:  1
The test error rate for Nearest Neighbour applied to the IRIS dataset for K=1:  0.02631578947368421


# Ionosphere dataset prediction using KNN algorithm (K = 1)

In [7]:
# Load the ionosphere data from a comma-separated text file into a 2-D array.
ion = np.genfromtxt("ionosphere.txt", delimiter=",")
# Columns 0-33 are used as the features, column 34 as the class label.
data = ion[:,0:34]
target = ion[:,34]

In [8]:
# Split the ionosphere features (`data`) and labels (`target`) into training and test sets.
# random_state=100 fixes the shuffle so the same split is produced on every run;
# no test_size is passed, so scikit-learn's default split proportion is used.
from sklearn.model_selection import train_test_split
Ion_X_train, Ion_X_test, Ion_y_train, Ion_y_test = train_test_split(data,target,random_state=100)

In [9]:
# Classify every ionosphere test sample with the 1-nearest-neighbour rule (k=1).
ion_pred = knn(Ion_X_train, Ion_y_train, Ion_X_test, 1)
print("Array of predicted Labels:", ion_pred)
print("\n")

# Element-wise comparison with the true test labels: one boolean per sample.
ion_correct = ion_pred == Ion_y_test
ion_wrong = ion_pred != Ion_y_test

# Accuracy (in percent), absolute number of mistakes, and test error rate.
print(
    "The accuracy % for Nearest Neighbour applied to the ionosphere dataset for K=1: ",
    np.mean(ion_correct) * 100,
    "%",
)
print("Number of prediction errors: ", np.sum(ion_wrong))
print(
    "The test error rate for Nearest Neighbour applied to the ionosphere dataset for K=1: ",
    np.mean(ion_wrong),
)

Array of predicted Labels: [ 1.  1. -1.  1.  1. -1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.
  1. -1. -1. -1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1. -1.  1.  1.
  1.  1. -1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
 -1.  1. -1.  1. -1.  1.  1.  1.  1.  1.  1. -1. -1.  1. -1. -1.  1.  1.
 -1. -1. -1.  1. -1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.]


The accuracy % for Nearest Neighbour applied to the ionosphere dataset for K=1:  88.63636363636364 %
Number of prediction errors:  10
The test error rate for Nearest Neighbour applied to the ionosphere dataset for K=1:  0.11363636363636363


# Observations about the dataset

There are duplicate data points in the Iris dataset: for example, samples 101 and 142 have identical feature values and the same class label.

In [10]:
# Feature values of Iris sample 101 (to be compared with sample 142).
iris_data.data[101]

array([5.8, 2.7, 5.1, 1.9])

In [11]:
# Feature values of Iris sample 142 (to be compared with sample 101).
iris_data.data[142]

array([5.8, 2.7, 5.1, 1.9])

In [12]:
# Class label of Iris sample 101.
iris_data.target[101]

2

In [13]:
# Class label of Iris sample 142.
iris_data.target[142]

2

The second column of the ionosphere dataset has no variation: every value shown below is 0.

In [14]:
# Display the whole second column (index 1) of the ionosphere array.
ion[:,1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

# Results at a glance

– the test error rate for Nearest Neighbour applied to the iris dataset: 0.02631578947368421

– the test error rate for Nearest Neighbour applied to ionosphere.txt: 0.11363636363636363

# References

1. https://scikit-learn.org/stable/index.html    

2. https://numpy.org/doc/stable/

3. https://docs.python.org/3/library/math.html