In [None]:
import sys
import time
import numpy as np
import heapq as hq
import matplotlib
import matplotlib.pyplot as plt

from numpy.linalg import norm
from prettytable import PrettyTable
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
# Raise NumPy's summarisation threshold to sys.maxsize so printed arrays are
# shown in full instead of being elided with "...".
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Load the dataset. The two trailing arguments are np.load's `mmap_mode` and
# `allow_pickle` parameters, spelled out here by keyword.
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load
# files from a trusted source.
data = np.load('data.npy', mmap_mode='r', allow_pickle=True)

# Task 2.2.1
# Count how many samples carry each label (the label is read from column 3).
labels = {}
for x in data[:, 3]:
    if x in labels:
        labels[x] += 1
    else:
        labels[x] = 1

# Use a small font while drawing so the rotated tick labels stay legible.
matplotlib.rcParams['font.size'] = 6
plt.figure(figsize=(20, 3))
plt.bar(
    range(len(labels)),
    list(labels.values()),
    tick_label=list(labels.keys()),
)
plt.xlabel('Labels')
plt.ylabel('Number of samples')
plt.xticks(rotation=90)
plt.show()
# Put the font size back to 10 for the figures that follow.
matplotlib.rcParams['font.size'] = 10


In [None]:
# Task 2.3.1
class KNN:
    """Brute-force k-nearest-neighbour classifier over pre-computed embeddings.

    Every row of ``data`` (and every query passed to ``compute``/``classify``)
    is indexed positionally: index 2 holds the embedding used when
    ``encoderType == 'VIT'``, index 1 holds the embedding used for any other
    encoder type, and (for ``data`` rows only) index 3 holds the label.

    Parameters
    ----------
    data : sequence of rows
        Training rows as described above.
    k : int
        Number of neighbours that vote.
    encoderType : str
        ``'VIT'`` selects index 2 of a row, anything else selects index 1.
    distanceMetric : str
        One of ``'manhattan'``, ``'euclidean'`` or ``'cosine'``.
    """

    def __init__(self, data, k=1, encoderType='VIT', distanceMetric='manhattan'):
        self.k = k
        self.data = data
        self.encoderType = encoderType
        self.distanceMetric = distanceMetric

    def setK(self, k):
        """Set the number of neighbours that vote."""
        self.k = k

    def setEncoderType(self, encoderType):
        """Select which embedding column is compared (see class docstring)."""
        self.encoderType = encoderType

    def setDistanceMetric(self, distanceMetric):
        """Select the metric used by ``getDistance``."""
        self.distanceMetric = distanceMetric

    def getDistance(self, vec1, vec2):
        """Return the distance between two embeddings as a Python float.

        Inputs are flattened first, so a ``(1, d)`` embedding and a ``(d,)``
        embedding are treated alike and the cosine branch yields a scalar
        rather than a ``(1, 1)`` array.

        Raises
        ------
        ValueError
            If ``self.distanceMetric`` is not a supported metric (previously
            this fell through and silently returned ``None``).
        """
        a = np.asarray(vec1).ravel()
        b = np.asarray(vec2).ravel()
        if self.distanceMetric == 'manhattan':
            return float(np.sum(np.abs(a - b)))
        if self.distanceMetric == 'euclidean':
            return float(norm(a - b))
        if self.distanceMetric == 'cosine':
            denominator = norm(a) * norm(b)
            if denominator == 0:
                # Cosine similarity is undefined for a zero vector; treat it
                # as similarity 0 instead of propagating NaN into the ranking.
                return 1.0
            return float(1 - np.dot(a, b) / denominator)
        raise ValueError(
            "Unknown distance metric %r; expected 'manhattan', 'euclidean' or 'cosine'"
            % (self.distanceMetric,)
        )

    def getLabel(self, arr):
        """Return the majority label among the neighbours in ``arr``.

        Parameters
        ----------
        arr : iterable of ``[negated_distance, label]`` pairs
            May be in any order (for example heap order).

        Ties between equally frequent labels - including the case where every
        neighbour has a different label - are resolved in favour of the label
        whose closest neighbour is nearest to the query. The previous
        implementation returned the heap root, i.e. the *farthest* of the k
        neighbours, when all labels were distinct, and the lexicographically
        largest label for other ties.

        Raises
        ------
        ValueError
            If ``arr`` is empty.
        """
        votes = {}
        nearest = {}
        for neg_distance, label in arr:
            votes[label] = votes.get(label, 0) + 1
            # Distances are stored negated, so a larger value means closer.
            if label not in nearest or neg_distance > nearest[label]:
                nearest[label] = neg_distance
        if not votes:
            raise ValueError("cannot pick a label from an empty neighbour list")
        top_count = max(votes.values())
        candidates = [label for label, count in votes.items() if count == top_count]
        return max(candidates, key=lambda label: nearest[label])

    def compute(self, vec):
        """Predict the label of a single query row.

        Raises
        ------
        ValueError
            If ``self.k`` is smaller than 1 or ``self.data`` has no rows.
        """
        if self.k < 1:
            raise ValueError("k must be at least 1, got %r" % (self.k,))
        idx = 2 if self.encoderType == 'VIT' else 1
        query = vec[idx]
        # The row position is the secondary sort key, so equal distances never
        # fall back to comparing labels and ties keep their data order.
        neighbours = hq.nsmallest(
            self.k,
            (
                (self.getDistance(query, row[idx]), position, row[3])
                for position, row in enumerate(self.data)
            ),
        )
        if not neighbours:
            raise ValueError("cannot classify without any training rows")
        return self.getLabel([[-distance, label] for distance, _, label in neighbours])

    def classify(self, x_test):
        """Predict a label for every row of ``x_test``; returns a list."""
        return [self.compute(vec) for vec in x_test]
        

In [None]:
def get_scores(actual, predicted):
    """Score predictions against the true labels.

    Returns a list ``[f1, accuracy, precision, recall]``. F1, precision and
    recall are computed with ``average='weighted'`` and ``zero_division=0``.
    """
    weighted = dict(zero_division=0, average='weighted')
    return [
        f1_score(actual, predicted, **weighted),
        accuracy_score(actual, predicted),
        precision_score(actual, predicted, **weighted),
        recall_score(actual, predicted, **weighted),
    ]

def print_results(actual, predicted):
    """Print a two-column table of the four scores returned by ``get_scores``."""
    values = get_scores(actual, predicted)
    measure_names = ('F1-score', 'Accuracy', 'Precision', 'Recall')
    table = PrettyTable(['Measure', 'Value'])
    for measure, value in zip(measure_names, values):
        table.add_row([measure, value])
    print(table)

def default_knn(train, test, k, enc, dist):
    """Time scikit-learn's ``KNeighborsClassifier`` at prediction.

    Parameters
    ----------
    train : sequence of rows
        Training rows; the embedding is read from index 2 when ``enc`` is
        ``'VIT'`` and from index 1 otherwise, the label from index 3.
    test : sequence of rows
        Rows to predict; only the embedding column is read.
    k : int
        Passed to the classifier as ``n_neighbors``.
    enc : str
        Encoder type, selects the embedding column as described above.
    dist : str
        Passed to the classifier as ``metric``.

    Returns
    -------
    float
        Seconds spent inside ``predict`` only; fitting is not timed.
    """
    idx = 2 if enc == 'VIT' else 1
    # Each stored embedding is indexed with [0] to obtain the feature vector.
    x_train = np.array([row[idx][0] for row in train])
    y_train = np.array([row[3] for row in train])
    x_test = [row[idx][0] for row in test]

    dKnn = KNeighborsClassifier(n_neighbors=k, metric=dist)
    dKnn.fit(x_train, y_train)

    # perf_counter is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted, which corrupts short measurements.
    start = time.perf_counter()
    dKnn.predict(x_test)
    return time.perf_counter() - start

In [None]:
# Task 2.4.1
# Fixed seed so the split, and therefore every score below, is reproducible
# across kernel restarts.
train, test = train_test_split(data, train_size=0.8, random_state=42)
knn = KNN(train)
x_test = test[:, 0:3]
y_test = test[:, 3]

# Evaluate every (k, encoder, distance) triplet; each result row is
# [accuracy rounded to 2 decimals, k, encoder, distance].
results = []
encoder_metrics = ['ResNet', "VIT"]
distance_metrics = ['manhattan', 'euclidean', 'cosine']
for k in range(1, 35, 2):
    for enc in encoder_metrics:
        for dist in distance_metrics:
            knn.setK(k)
            knn.setEncoderType(enc)
            knn.setDistanceMetric(dist)
            y_pred = knn.classify(x_test)
            score = get_scores(y_test, y_pred)
            # score[1] is the accuracy entry of get_scores' result.
            results.append([round(score[1], 2), k, enc, dist])

# ordered rank list of top 20 triplets
top = sorted(results, key=lambda x: x[0], reverse=True)[:20]
print('Best Triplet: ', top[0])
print()

t = PrettyTable(['Accuracy', 'K', 'Encoder', 'Distance'])
for vec in top:
    t.add_row(vec)
print(t)

# k vs accuracy plot using VIT and manhattan
chosen = {}
for vec in results:
    if vec[2] == 'VIT' and vec[3] == 'manhattan':
        chosen[vec[1]] = vec[0]

# Plot against the actual k values. The x axis previously used
# range(len(chosen)), i.e. positions 0..16, while being labelled 'K value'.
plt.plot(list(chosen.keys()), list(chosen.values()))
plt.ylabel('Accuracy')
plt.xlabel('K value')
plt.title('Accuracy vs K (VIT, manhattan)')
plt.show()

In [None]:
# Task 2.6.1
# For each train fraction, time three variants at inference:
#   optimised knn - the fastest (k, encoder, distance) triplet of the custom KNN
#   best knn      - the most accurate triplet of the custom KNN
#   default knn   - scikit-learn's classifier configured with the best triplet
tvtSize = []
optTime = []
bestTime = []
defTime = []

val = {}
for i in range(1, 10):
    size = i / 10
    tvtSize.append(size)
    # Fixed seed so the timings are taken on the same split every run.
    train, test = train_test_split(data, train_size=size, random_state=42)
    x_test = test[:, 0:3]
    y_test = test[:, 3]

    # Each entry is [seconds, accuracy, k, encoder, distance].
    timeVal = []
    tknn = KNN(train)
    for k in range(1, 35, 2):
        for enc in encoder_metrics:
            for dist in distance_metrics:
                tknn.setK(k)
                tknn.setEncoderType(enc)
                tknn.setDistanceMetric(dist)

                # perf_counter is monotonic and high resolution, unlike the
                # wall clock returned by time.time().
                start = time.perf_counter()
                y_pred = tknn.classify(x_test)
                end = time.perf_counter()
                score = get_scores(y_test, y_pred)
                timeVal.append([end - start, score[1], k, enc, dist])

    # min/max return the first extreme entry, which is the same element the
    # previous full sort followed by [0] selected.
    optVal = min(timeVal, key=lambda x: x[0])
    bestVal = max(timeVal, key=lambda x: x[1])
    optTime.append(optVal[0])
    bestTime.append(bestVal[0])

    defVal = default_knn(train, x_test, bestVal[2], bestVal[3], bestVal[4])
    defTime.append(defVal)

    # Compare the integer step rather than the float fraction (size == 0.8)
    # so the check does not depend on floating-point rounding.
    if i == 8:
        val['optimised knn'] = optVal[0]
        val['best knn'] = bestVal[0]
        val['default knn'] = defVal

plt.bar(range(len(val)), list(val.values()), tick_label=list(val.keys()))
plt.ylabel('Inference Time')
plt.title('Inference time at train size 0.8')
plt.show()

plt.plot(tvtSize, bestTime, label='best knn', color='blue')
plt.plot(tvtSize, optTime, label='optimised knn', color='red')
plt.plot(tvtSize, defTime, label='default knn', color='green')
plt.ylabel('Inference Time')
plt.xlabel('Train dataset size')
plt.legend()
plt.show()