In [251]:
import numpy as np
from collections import Counter

In [252]:
# Toy fruit dataset: every row holds three numeric features followed by the
# class label (a string) in the last position.
# NOTE(review): the features look like weight, size and a colour/type code —
# confirm the intended meaning with the author.
data = [
    [150, 7.0, 1, 'Apple'],
    [120, 6.5, 0, 'Banana'],
    [180, 7.5, 2, 'Orange'],
    [155, 7.2, 1, 'Apple'],
    [110, 6.0, 0, 'Banana'],
    [190, 7.8, 2, 'Orange'],
    [145, 7.1, 1, 'Apple'],
    [115, 6.3, 0, 'Banana']
]

# Integer id for each class name (not referenced by the other cells visible here).
labels = {"Apple": 0, "Banana": 1, "Orange": 2}


In [253]:
def data_split(data, n_features):
    """Separate a table of samples into a feature matrix and a label vector.

    Parameters
    ----------
    data : sequence of rows
        Each row holds the feature values first and the label right after them.
    n_features : int
        Number of leading columns that are features; the column at index
        ``n_features`` is taken as the label.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` is the first ``n_features`` columns converted to float;
        ``y`` is the label column as produced by ``np.array(data)``.
    """
    table = np.array(data)
    # Mixed numeric/string rows become a string array, so the features
    # have to be converted back to float explicitly.
    features = table[:, :n_features].astype(float)
    targets = table[:, n_features]
    return features, targets


In [254]:
# The first three columns of `data` are the numeric features; the column at
# index 3 is the class label.
n_features = 3
X_train, y_train = data_split(data, n_features)
# Echo the labels to check the split.
y_train

array(['Apple', 'Banana', 'Orange', 'Apple', 'Banana', 'Orange', 'Apple',
       'Banana'], dtype='<U32')

In [255]:
#implementing train_test_split similar to the function in sklearn
def train_test_split(X, y, test_size):
    """Split features and labels into a leading train part and a trailing test part.

    The split is positional (no shuffling): the first
    ``int(n_rows * (1 - test_size))`` rows are used for training and the
    remaining rows for testing.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        Feature matrix, one sample per row.
    y : array-like
        Labels, one per row of ``X``.
    test_size : float
        Fraction of the rows (between 0 and 1) reserved for the test set.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    rows_X = X.shape[0]
    rows_y = len(y)
    if rows_X != rows_y:
        raise ValueError(
            f"X and y must have the same number of samples, got {rows_X} and {rows_y}"
        )

    #Since test_size lies between 0 and 1, we determine a split index to split the dataset into test/train data
    split_index = int(rows_X*(1 - test_size))

    X_train = X[:split_index, :]
    X_test = X[split_index:, :]
    y_train = y[:split_index]
    # The test labels are the rows AFTER the split index, matching X_test.
    y_test = y[split_index:]

    return X_train, X_test, y_train, y_test
    
    

We define accuracy as
$$
    \text{accuracy} = 1 - \frac{\sum_{i=1}^{n} \mathbf{1}(y_{\text{pred}}[i] \neq y_{\text{true}}[i])}{n}
$$

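Where $\mathbf{1}(\cdot)$ is the indicator function, equal to 1 when $y_{\text{pred}}[i] \neq y_{\text{true}}[i]$ and 0 otherwise, and $n$ is the length of $y_{\text{pred}}$. Intuitively, an accuracy of 1 means every prediction is correct and an accuracy of 0 means every prediction is wrong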

In [256]:
def accuracy(y_pred, y_true):
    """Return the fraction of predictions that match the true labels.

    Elements are compared position by position over ``len(y_pred)`` entries;
    the result is ``1 - mismatches / len(y_pred)``.
    """
    n_samples = len(y_pred)
    mismatches = sum(1 for i in range(n_samples) if y_pred[i] != y_true[i])
    return 1 - mismatches/n_samples
        

We define the $p^{th}$ norm to compute distance in an $n$-dimensional space

$$
     L_p(x_1, x_2) = \|x_1 - x_2\|_p = \left( \sum_{i = 1}^{n} \left| x_{1i} - x_{2i} \right|^p \right)^{\frac{1}{p}}
$$

When $p = 1$, the distance is called the **Manhattan Distance** <br>
When $p = 2$, the distance is called the **Euclidean Distance** <br>
For a general $p \geq 1$ (in particular $p > 2$), it is called the **Minkowski Distance**, of which the two cases above are special instances

In [257]:
def euclidean_distance(a, b):
    """Return the L2 (Euclidean) distance between the arrays ``a`` and ``b``."""
    difference = a - b
    return np.linalg.norm(difference)

In [258]:
def minkowski_distance(a, b, p):
    """Return the L_p (Minkowski) distance between the arrays ``a`` and ``b``.

    Computes ``(sum(|a - b| ** p)) ** (1 / p)``.
    """
    gaps = np.abs(a - b)
    total = np.sum(gaps**p)
    return total**(1 / p)

In [259]:
def manhattan_distance(a, b):
    """Return the L1 (Manhattan) distance: the sum of absolute coordinate differences."""
    gaps = np.abs(a - b)
    return np.sum(gaps)

We normalize each column of X_train using **min-max normalization**, i.e.
$$
    x_{new} = \frac{x - x_{min}}{x_{max} - x_{min}}
$$
Where $x_{min}$ and $x_{max}$ are the minimum and maximum elements of the corresponding column respectively. <br>
This ensures that every feature in X_train is rescaled to lie between 0 and 1

In [260]:
def normalize(arr):
    """Min-max normalize ``arr`` in place so its values span [0, 1].

    Each element becomes ``(x - min) / (max - min)``, where min and max are
    taken over the whole of ``arr``. The array is modified in place (callers
    pass column views of a larger matrix) and nothing is returned.

    Edge cases:
    * An empty ``arr`` is left untouched.
    * If all values are equal (``max == min``) every element is set to 0
      instead of dividing by zero and producing NaN.

    Note: the result is written back into ``arr``, so ``arr`` should have a
    floating-point dtype; an integer array would truncate the scaled values.
    """
    if len(arr) == 0:
        return

    min_val = np.min(arr)
    value_range = np.max(arr) - min_val

    if value_range == 0:
        # Constant input: the min-max ratio is 0/0, so map everything to 0.
        arr[:] = np.zeros(np.shape(arr))
        return

    #modifying the values in place
    arr[:] = (np.asarray(arr) - min_val)/value_range
    return

In [261]:
def normalization(X_train):
    """Min-max normalize every column of the 2-D array ``X_train`` in place.

    Each column is handed to ``normalize`` as a view, so the changes are
    written straight into ``X_train``. Nothing is returned.
    """
    # Rows of the transpose are views onto the columns of X_train.
    for column in X_train.T:
        normalize(column)
    return

In [262]:
# Rescale every feature column of X_train to [0, 1]; normalization() mutates X_train in place.
normalization(X_train)

In [263]:
class KNN:
    """k-nearest-neighbours classifier with a selectable L_p distance.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples that vote on the predicted class.
    p : int or float or None, default None
        Order of the distance: 1 selects Manhattan, 2 selects Euclidean, any
        other value selects the Minkowski distance of that order. ``None``
        falls back to the Euclidean distance.
    """

    #the parameter p defines which distance metric we are using
    def __init__(self, k = 3, p = None):
        self.k = k
        self.p = p
        self.X_train = None
        self.y_train = None


    def fit(self, X_train, y_train):
        """Store the training samples and their labels (no model is built)."""
        # predict_one indexes the labels with an index array, so keep both as ndarrays.
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)


    def distance(self, p):
        """Return the two-argument distance function for order ``p``.

        ``p == 1`` gives ``manhattan_distance``, ``p == 2`` or ``None`` gives
        ``euclidean_distance``; anything else gives a Minkowski distance of
        order ``p``.
        """
        if p == 1:
            return manhattan_distance
        if p is None or p == 2:
            return euclidean_distance
        # Bind the requested order, not self.p, so the argument is honoured.
        return lambda x1, x2: minkowski_distance(x1, x2, p=p)


    def predict(self, X_test):
        """Return a list with the predicted class for every row of ``X_test``."""
        X_test = np.asarray(X_test)
        return [self.predict_one(X_test[row]) for row in range(X_test.shape[0])]


    def predict_one(self, x):
        """Return the majority class among the ``k`` training samples nearest to ``x``.

        Ties between classes are resolved by ``Counter.most_common``.
        """
        # The metric depends only on self.p, so resolve it once per query.
        distance_fn = self.distance(self.p)

        #distance between each row of X_train and the new point
        distances = np.array([distance_fn(train_row, x) for train_row in self.X_train])

        #indices of the closest k neighbours to the new point
        idx_min_k = np.argsort(distances)[:self.k]

        #calculating the mode of the classes of the k closest neighbours
        classes_min_k = self.y_train[idx_min_k]
        return Counter(classes_min_k).most_common(1)[0][0]
        

In [264]:
#We first evaluate the test data using k = 3 and euclidean distance
test_data_raw = np.array([
    [118, 6.2, 0],  # Expected: Banana
    [160, 7.3, 1],  # Expected: Apple
    [185, 7.7, 2]   # Expected: Orange
])

# Scale the test points with the TRAINING min/max so that train and test
# features share one scale. Normalizing the test set by its own statistics
# would map its smallest row to 0 and its largest to 1 whatever their values.
# X_train was already normalized in place, so the raw training features are
# re-derived from `data`.
X_train_raw, _ = data_split(data, n_features)
train_min = X_train_raw.min(axis=0)
train_max = X_train_raw.max(axis=0)
test_data = (test_data_raw - train_min) / (train_max - train_min)

y_true = ["Banana", "Apple", "Orange"]

knn = KNN(k=3, p = 2)
knn.fit(X_train, y_train)

y_pred = knn.predict(test_data)

print(accuracy(y_pred, y_true))

1.0


We use KNN with varying $k$ and varying distance metric to predict the classes for each point on the test dataset, and correspondingly, print the accuracy

In [265]:

def evaluation(k_values=range(1, 6), p_values=range(1, 4)):
    """Print predictions and accuracy of KNN on the test set for a grid of k and p.

    For every ``k`` in ``k_values`` and every ``p`` in ``p_values`` a fresh
    ``KNN`` is fitted on the notebook-level ``X_train`` / ``y_train``, used to
    predict ``test_data``, and scored against ``y_true``. One line is printed
    per combination, with a blank gap after each value of ``k``.

    Parameters
    ----------
    k_values : iterable of int, default range(1, 6)
        Neighbour counts to try.
    p_values : iterable, default range(1, 4)
        Distance orders to try (1 = Manhattan, 2 = Euclidean, otherwise Minkowski).
    """
    for k in k_values:
        for p in p_values:
            knn = KNN(k = k, p = p)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(test_data)
            accuracy_score = accuracy(y_pred, y_true)
            print(f"Predictions for k = {k}, p = {p}: {y_pred}, accuracy = {accuracy_score : .2f}")
        print("\n\n")
        

In [266]:
# Run the sweep over k and p; prints the predictions and accuracy for each combination.
evaluation()

Predictions for k = 1, p = 1: ['Banana', 'Apple', 'Orange'], accuracy =  1.00
Predictions for k = 1, p = 2: ['Banana', 'Apple', 'Orange'], accuracy =  1.00
Predictions for k = 1, p = 3: ['Banana', 'Apple', 'Orange'], accuracy =  1.00



Predictions for k = 2, p = 1: ['Banana', 'Apple', 'Orange'], accuracy =  1.00
Predictions for k = 2, p = 2: ['Banana', 'Apple', 'Orange'], accuracy =  1.00
Predictions for k = 2, p = 3: ['Banana', 'Apple', 'Orange'], accuracy =  1.00



Predictions for k = 3, p = 1: ['Banana', 'Apple', 'Orange'], accuracy =  1.00
Predictions for k = 3, p = 2: ['Banana', 'Apple', 'Orange'], accuracy =  1.00
Predictions for k = 3, p = 3: ['Banana', 'Apple', 'Orange'], accuracy =  1.00



Predictions for k = 4, p = 1: ['Banana', 'Apple', 'Orange'], accuracy =  1.00
Predictions for k = 4, p = 2: ['Banana', 'Apple', 'Orange'], accuracy =  1.00
Predictions for k = 4, p = 3: ['Banana', 'Apple', 'Orange'], accuracy =  1.00



Predictions for k = 5, p = 1: ['Banana', 'Apple', 'A