In [1]:
from scipy.io import loadmat as sloadmat
from numpy import linalg as LA
from tqdm import tqdm
from time import time
import pandas as pd
import numpy as np

# 1. Implement k-NN method (50 Points)

In [2]:
class KNN:
    def __init__(self, k):
        self.k = k
    
    def fit(self, X, y):
        self.data = X
        self.lbls = y
            
    def predict(self, y_test, dist_metric):
        preds = []
        if dist_metric == "manhattan":
            for testpoint in tqdm(y_test):
                distances = [ np.sum(abs(x-testpoint)) for x in self.data]
                preds.append(np.max(self.lbls[np.argsort(distances)][:self.k]))
        elif dist_metric == "cosine":
            for testpoint in tqdm(y_test):
                distances = [ 1-(x*testpoint)/(LA.norm(x)*LA.norm(x)) for x in self.data]
                preds.append(np.max(self.lbls[np.argsort(distances)][:self.k]))
        elif dist_metric == 'minkowski':
            for testpoint in tqdm(y_test):
                distances = [ np.sum((testpoint - x)**2) for x in self.data]
                preds.append(np.max(self.lbls[np.argsort(distances)][:self.k]))
        elif dist_metric == "euclidean":
            for testpoint in tqdm(y_test):
                distances = [ np.sum(np.sqrt((testpoint - x)**2)) for x in self.data]
                preds.append(np.max(self.lbls[np.argsort(distances)][:self.k]))
        elif dist_metric == "chebyshev":
            for testpoint in tqdm(y_test):
                distances = [ np.max(x-testpoint) for x in self.data]
                preds.append(np.max(self.lbls[np.argsort(distances)][:self.k]))
            

        return  preds

- Add all the combinations of distance metrics. 
- Apply for all the combinations of k. 
- Reduce the dimensions using different techniques.

# 2. Load train and test mat files, perform k-NN and report acuracy on the test dataset (30 Points)

In [3]:
X_train, y_train = sloadmat('train.mat')['features'], sloadmat('train.mat')['labels']
X_test, y_test = sloadmat('test.mat')['features'], sloadmat('test.mat')['labels']

print("Train data")
print(X_train.shape, y_train.shape)
print("5 train datapoints:", X_train[:5])
print("5 train labels:", y_train)

Train data
(1123, 1000) (1123, 1)
5 train datapoints: [[-0.19087052 -0.81624687 -0.16324869 ... -0.17326276 -0.33678705
   1.453196  ]
 [-1.0689135  -0.66361743 -0.5476942  ...  0.23691764 -0.7915029
   1.6607733 ]
 [ 0.21004608 -0.54553837 -0.23526147 ...  0.22105977  0.3269091
  -0.5789186 ]
 [-0.3347178  -0.52735007  0.08525061 ...  0.24418375  0.4792558
  -0.7996379 ]
 [ 0.44703373 -0.24532022  0.16423625 ...  0.23674938 -0.02424169
  -1.0602354 ]]
5 train labels: [[ 1]
 [ 1]
 [ 1]
 ...
 [10]
 [10]
 [10]]


In [4]:
print("Test data")
print(X_test.shape, y_test.shape)
print("5 test datapoints:", X_test[:5])
print("5 test labels:", y_test[:5])

Test data
(958, 1000) (958, 1)
5 test datapoints: [[-0.1810047  -0.3263064   0.09900231 ...  0.0896374   0.30170187
  -0.315777  ]
 [-0.08604874 -0.24425426  0.03491393 ...  0.15105137  0.14930305
  -0.50344443]
 [-0.02536274 -0.22994491  0.0929625  ...  0.21274611  0.08147568
  -0.5320467 ]
 [-0.4775491  -0.35678953  0.28851986 ... -0.18793373  0.49184024
  -0.32741845]
 [-0.22974673 -0.176327    0.07313281 ...  0.01462969  0.28931898
  -0.59351134]]
5 test labels: [[1]
 [1]
 [1]
 [1]
 [1]]


In [5]:
model = KNN(k=10)
model.fit(X_train, y_train)

In [6]:
methods = ['chebyshev', "euclidean", "minkowski", "cosine", "manhattan"]
for meth in methods:
    start = time()
    y_preds = model.predict(X_test, meth)
    print(f"Test accuracy with {meth} is {(y_preds == y_test).sum()/len(y_test) } with time complexity {time()- start}")

100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:51<00:00, 18.46it/s]


Test accuracy with chebyshev is 96.67640918580376 with time complexity 51.91572833061218


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:59<00:00, 16.06it/s]


Test accuracy with euclidean is 96.41127348643006 with time complexity 59.66495370864868


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [01:06<00:00, 14.40it/s]


Test accuracy with minkowski is 96.35386221294364 with time complexity 66.54495143890381


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [03:28<00:00,  4.59it/s]


Test accuracy with cosine is 94.0 with time complexity 208.56397104263306


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:59<00:00, 16.10it/s]

Test accuracy with manhattan is 96.41127348643006 with time complexity 59.50673174858093






# 3. Reduce the dimensionality of features using PCA to low dimensions (e.g., 10, 3 ) and report the accuracy of the test datasets (20 Points)

In [7]:
from sklearn.decomposition import PCA

n_components = 10
pca = PCA(n_components=n_components)
train_reduced = pca.fit_transform(X_train)
test_reduced = pca.transform(X_test)
model = KNN(k=10)
model.fit(train_reduced, y_train)


methods = ['chebyshev', "euclidean", "minkowski", "cosine", "manhattan"]
for meth in methods:
    start = time()
    y_preds = model.predict(test_reduced, meth)
    print(f"Test accuracy with {meth} is {(y_preds == y_test).sum()/len(y_test) } with time complexity {time()- start}")

100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:09<00:00, 99.82it/s]


Test accuracy with chebyshev is 96.88413361169103 with time complexity 9.60222578048706


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:10<00:00, 94.10it/s]


Test accuracy with euclidean is 96.20772442588726 with time complexity 10.189552307128906


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:11<00:00, 82.25it/s]


Test accuracy with minkowski is 96.26826722338204 with time complexity 11.655941009521484


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:22<00:00, 42.05it/s]


Test accuracy with cosine is 92.0 with time complexity 22.841721296310425


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:10<00:00, 92.12it/s]

Test accuracy with manhattan is 96.20772442588726 with time complexity 10.41386103630066





In [8]:
n_components = 5
pca = PCA(n_components=n_components)
train_reduced = pca.fit_transform(X_train)
test_reduced = pca.transform(X_test)
model = KNN(k=10)
model.fit(train_reduced, y_train)

methods = ['chebyshev', "euclidean", "minkowski", "cosine", "manhattan"]
for meth in methods:
    start = time()
    y_preds = model.predict(test_reduced, meth)
    print(f"Test accuracy with {meth} is {(y_preds == y_test).sum()/len(y_test) } with time complexity {time()- start}")

100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:12<00:00, 76.93it/s]


Test accuracy with chebyshev is 96.58141962421712 with time complexity 12.466671228408813


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:14<00:00, 65.22it/s]


Test accuracy with euclidean is 96.38204592901879 with time complexity 14.695978164672852


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:12<00:00, 75.15it/s]


Test accuracy with minkowski is 96.36534446764092 with time complexity 12.756106615066528


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:26<00:00, 35.57it/s]


Test accuracy with cosine is 92.0 with time complexity 26.943304777145386


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:09<00:00, 97.39it/s]

Test accuracy with manhattan is 96.38204592901879 with time complexity 9.860225677490234





In [9]:
n_components = 3
pca = PCA(n_components=n_components)
train_reduced = pca.fit_transform(X_train)
test_reduced = pca.transform(X_test)
model = KNN(k=10)
model.fit(train_reduced, y_train)

methods = ['chebyshev', "euclidean", "minkowski", "cosine", "manhattan"]
for meth in methods:
    start = time()
    y_preds = model.predict(test_reduced, meth)
    print(f"Test accuracy with {meth} is {(y_preds == y_test).sum()/len(y_test) } with time complexity {time()- start}")

100%|███████████████████████████████████████████████████████████████████████████████| 958/958 [00:09<00:00, 104.60it/s]


Test accuracy with chebyshev is 97.57724425887265 with time complexity 9.162171363830566


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:13<00:00, 69.01it/s]


Test accuracy with euclidean is 96.43319415448852 with time complexity 13.888290166854858


100%|███████████████████████████████████████████████████████████████████████████████| 958/958 [00:08<00:00, 110.16it/s]


Test accuracy with minkowski is 96.45511482254697 with time complexity 8.714414119720459


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:23<00:00, 40.47it/s]


Test accuracy with cosine is 92.0 with time complexity 23.679378986358643


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:14<00:00, 67.57it/s]

Test accuracy with manhattan is 96.43319415448852 with time complexity 14.184147357940674





In [10]:
n_components = 2
pca = PCA(n_components=n_components)
train_reduced = pca.fit_transform(X_train)
test_reduced = pca.transform(X_test)

model = KNN(k=10)
model.fit(train_reduced, y_train)

methods = ['chebyshev', "euclidean", "minkowski", "cosine", "manhattan"]
for meth in methods:
    start = time()
    y_preds = model.predict(test_reduced, meth)
    print(f"Test accuracy with {meth} is {(y_preds == y_test).sum()/len(y_test) } with time complexity {time()- start}")

100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:10<00:00, 91.83it/s]


Test accuracy with chebyshev is 99.0 with time complexity 10.447240591049194


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:14<00:00, 68.22it/s]


Test accuracy with euclidean is 97.08350730688936 with time complexity 14.053858518600464


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:11<00:00, 83.65it/s]


Test accuracy with minkowski is 97.18162839248434 with time complexity 11.459871768951416


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:25<00:00, 37.73it/s]


Test accuracy with cosine is 92.0 with time complexity 25.39868950843811


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:10<00:00, 89.14it/s]

Test accuracy with manhattan is 97.08350730688936 with time complexity 10.752355813980103





In [11]:
n_components = 1
pca = PCA(n_components=n_components)
train_reduced = pca.fit_transform(X_train)
test_reduced = pca.transform(X_test)

model = KNN(k=10)
model.fit(train_reduced, y_train)

methods = ['chebyshev', "euclidean", "minkowski", "cosine", "manhattan"]
for meth in methods:
    start = time()
    y_preds = model.predict(test_reduced, meth)
    print(f"Test accuracy with {meth} is {(y_preds == y_test).sum()/len(y_test) } with time complexity {time()- start}")

100%|███████████████████████████████████████████████████████████████████████████████| 958/958 [00:08<00:00, 119.22it/s]


Test accuracy with chebyshev is 99.0 with time complexity 8.03973937034607


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:16<00:00, 57.42it/s]


Test accuracy with euclidean is 97.04592901878914 with time complexity 16.688974857330322


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:15<00:00, 63.15it/s]


Test accuracy with minkowski is 97.04592901878914 with time complexity 15.181360006332397


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:25<00:00, 38.11it/s]


Test accuracy with cosine is 92.0 with time complexity 25.14989137649536


100%|████████████████████████████████████████████████████████████████████████████████| 958/958 [00:12<00:00, 79.52it/s]

Test accuracy with manhattan is 97.04592901878914 with time complexity 12.059455871582031



