In [199]:
import pandas as pd
import numpy as np
import math

In [200]:
data = pd.read_csv("Iris.csv")

In [201]:
data

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [202]:
data.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [203]:
len(data)

150

In [204]:
data.shape

(150, 6)

In [None]:
import numpy as np


class KNearestNeighbor:
    def __init__(self, k):
        self.k = k
        self.eps = 1e-8

    def train(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X_test, num_loops=0):
        if num_loops == 0:
            distances = self.compute_distance_vectorized(X_test)

        elif num_loops == 1:
            distances = self.compute_distance_one_loop(X_test)

        else:
            distances = self.compute_distance_two_loops(X_test)

        return self.predict_labels(distances)

    def compute_distance_two_loops(self, X_test):
        """
        Inefficient naive implementation, use only
        as a way of understanding what kNN is doing
        """

        num_test = X_test.shape[0]
        num_train = self.X_train.shape[0]
        distances = np.zeros((num_test, num_train))

        for i in range(num_test):
            for j in range(num_train):
                # (Taking sqrt is not necessary: min distance won't change since sqrt is monotone)
                distances[i, j] = np.sqrt(
                    self.eps + np.sum((X_test[i, :] - self.X_train[j, :]) ** 2)
                )

        return distances

    def compute_distance_one_loop(self, X_test):
        """
        Much better than two-loops but not as fast as fully vectorized version.
        Utilize Numpy broadcasting in X_train - X_test[i,:]
        """
        num_test = X_test.shape[0]
        num_train = self.X_train.shape[0]
        distances = np.zeros((num_test, num_train))

        for i in range(num_test):
            # (Taking sqrt is not necessary: min distance won't change since sqrt is monotone)
            distances[i, :] = np.sqrt(
                self.eps + np.sum((self.X_train - X_test[i, :]) ** 2, axis=1)
            )

        return distances

    def compute_distance_vectorized(self, X_test):
        """
        Can be tricky to understand this, we utilize heavy
        vecotorization as well as numpy broadcasting.
        Idea: if we have two vectors a, b (two examples)
        and for vectors we can compute (a-b)^2 = a^2 - 2a (dot) b + b^2
        expanding on this and doing so for every vector lends to the 
        heavy vectorized formula for all examples at the same time.
        """
        X_test_squared = np.sum(X_test ** 2, axis=1, keepdims=True)
        X_train_squared = np.sum(self.X_train ** 2, axis=1, keepdims=True)
        two_X_test_X_train = np.dot(X_test, self.X_train.T)

        # (Taking sqrt is not necessary: min distance won't change since sqrt is monotone)
        return np.sqrt(
            self.eps + X_test_squared - 2 * two_X_test_X_train + X_train_squared.T
        )

    def predict_labels(self, distances):
        num_test = distances.shape[0]
        y_pred = np.zeros(num_test)

        for i in range(num_test):
            y_indices = np.argsort(distances[i, :])
            k_closest_classes = self.y_train[y_indices[: self.k]].astype(int)
            y_pred[i] = np.argmax(np.bincount(k_closest_classes))

        return y_pred


if __name__ == "__main__":
    X = np.array([[1, 1], [3, 1], [1, 4], [2, 4], [3, 3], [5, 1]])
    y = np.array([0, 0, 0, 1, 1, 1])

    KNN = KNearestNeighbor(k=1)
    KNN.train(X, y)
    y_pred = KNN.predict(X, num_loops=0)
    print(f"Accuracy: {sum(y_pred == y) / y.shape[0]}")

In [205]:
data.drop(columns="Id", inplace = True)

In [206]:
data

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [207]:
data['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [208]:
data['Species'].replace(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], ['0','1','2'], inplace=True)

In [209]:
data

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [210]:
df = data.sample(frac=1)
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
18,5.7,3.8,1.7,0.3,0
27,5.2,3.5,1.5,0.2,0
140,6.7,3.1,5.6,2.4,2
104,6.5,3.0,5.8,2.2,2
76,6.8,2.8,4.8,1.4,1
...,...,...,...,...,...
102,7.1,3.0,5.9,2.1,2
32,5.2,4.1,1.5,0.1,0
146,6.3,2.5,5.0,1.9,2
5,5.4,3.9,1.7,0.4,0


In [211]:
X = df.iloc[:, :-1]

In [212]:
X.shape

(150, 4)

In [213]:
X

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
18,5.7,3.8,1.7,0.3
27,5.2,3.5,1.5,0.2
140,6.7,3.1,5.6,2.4
104,6.5,3.0,5.8,2.2
76,6.8,2.8,4.8,1.4
...,...,...,...,...
102,7.1,3.0,5.9,2.1
32,5.2,4.1,1.5,0.1
146,6.3,2.5,5.0,1.9
5,5.4,3.9,1.7,0.4


In [214]:
Y = df.iloc[:, -1:]

In [215]:
Y
Y.shape

(150, 1)

In [216]:
Y.nunique()

Species    3
dtype: int64

In [217]:
Y

Unnamed: 0,Species
18,0
27,0
140,2
104,2
76,1
...,...
102,2
32,0
146,2
5,0


In [218]:
ratio = 0.75

rows = df.shape[0]
train_size = int(rows*ratio)
train_size

112

In [219]:
Xtrain = X[:train_size]
print(Xtrain.shape)
Xtest = X[train_size:]
print(Xtest.shape)

(112, 4)
(38, 4)


In [220]:
Ytrain = Y[:train_size]
print(Ytrain.shape)
Ytest = Y[train_size:]
print(Ytest.shape)

(112, 1)
(38, 1)


In [221]:
Xtrain = np.array(Xtrain)
Xtest = np.array(Xtest)
Ytrain = np.array(Ytrain)
Ytest = np.array(Ytest)

In [222]:
def eucli_dist(v1,v2):
    dist = 0
    for i in range(len(v1)):
        dist += ((v1[i] - v2[i])**2)
    return math.sqrt(dist)

In [223]:
def find_neighbors(k,Xtrain, Ytrain, point):
    distance = []
    
    for i in range(len(Xtrain)):
        distance.append(eucli_dist(Xtrain[i],point))
        
        
    distance = np.array(distance)
    
    index_dist = distance.argsort()
    trainx = Xtrain[index_dist]
    trainy = Ytrain[index_dist]
    concat = np.concatenate((trainx,trainy), 1)
    
    neighbors = concat[:k]
    
    return neighbors

In [250]:
def predict(k,Xtrain, Ytrain,point):
    neighbor_classes = []
    majority = {}
    neighbors = find_neighbors(k,Xtrain, Ytrain,point)
    
    for i in neighbors:
        neighbor_classes.append(i[-1])
    
    unique_classes = set(neighbor_classes)
    for class_value in list(unique_classes):
        majority.update({class_value:neighbor_classes.count(class_value)})
    
    vote = max(list(majority.values()))
    for key,v in majority.items():
        if v == vote:
            prediction = key
    
    return prediction

In [265]:
def test_prediction(k,Xtrain, Ytrain,Xtest,Ytest,index):
    pred = predict(k,Xtrain,Ytrain,Xtest[index])
    #print("predicted value=\n", pred)
    if pred == Ytest[index]:
        #print("Actual value = \n", Ytest[index])
        #print("predicted value is equal to actual value")
        return True
    else:
        #print("not equal")
        return False

In [274]:
times = Ytest.shape[0]
k = 5
count = 0
for i in range(times):
    val = test_prediction(k,Xtrain,Ytrain,Xtest,Ytest,i)
    if val:
        count += 1

accuracy = (count/times) * 100
print("Accuracy achieved on Ytest = \n", accuracy)
    

Accuracy achieved on Ytest = 
 100.0
