# LAB 4

## KNN design and implementation

In [1]:
import pandas as pd
import numpy as np
from collections import Counter

1. Load the Iris dataset

In [299]:
# Source of the raw Iris data (UCI repository). The file has no header row,
# so pandas is told not to treat the first record as column names.
IRIS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
df = pd.read_csv(IRIS_URL, header=None)

2. X_test and y_test creation

In [300]:
# Work on a plain NumPy array: columns 0-3 are the features, column 4 is the label.
iris_array = df.to_numpy()

# A dedicated, seeded generator makes the train/test split reproducible
# after a kernel restart (the original split changed on every run).
SEED = 42
TEST_FRACTION = 0.2
rng = np.random.default_rng(SEED)

# Sample 20% of the row indexes, without replacement, as the test set.
indexes_array = np.arange(iris_array.shape[0])
test_indexes = rng.choice(
    indexes_array,
    size=int(iris_array.shape[0] * TEST_FRACTION),
    replace=False,
)
test_array = iris_array[test_indexes, :]

# Cast the features to float so that distance computations run on a numeric
# array instead of an object-typed one (the string label column forces the
# combined array to a generic dtype).
X_test = test_array[:, 0:4].astype(float)
y_test = test_array[:, 4]

3. X_train and y_train creation

In [301]:
# Training rows are all the rows that were not sampled for the test set.
# np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
train_indexes = indexes_array[~np.isin(indexes_array, test_indexes)]

# Cast the features to float so that distance computations run on a numeric
# array instead of an object-typed one.
X_train = iris_array[train_indexes, 0:4].astype(float)
y_train = iris_array[train_indexes, 4]

In [302]:
# Count how many training samples carry each label (quick check of the class balance of the split).
Counter(y_train)

Counter({'Iris-setosa': 42, 'Iris-versicolor': 38, 'Iris-virginica': 40})

4. Implement fit for KNN

In [303]:
class KNearestNeighbors:
    """K-nearest-neighbours classifier skeleton: only ``fit`` is implemented here."""

    def __init__(self, k, distance_metric="euclidean"):
        """
        Store the hyper-parameters of the model.
        :param k : stored as-is; intended as the number of neighbours (not used by this skeleton).
        :param distance_metric : name of the distance to use, stored as-is.
        """
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """
        Store the 'prior knowledge' of your model that will be used
        to predict new labels.
        :param X : input data points, ndarray, shape = (R,C).
        :param y : input labels, ndarray, shape = (R,).
        """
        # KNN is a lazy learner: fitting only memorises the training set.
        self.xtrain = X
        self.ytrain = y

    def predict(self, X):
        """Run the KNN classification on X.
        :param X: input data points, ndarray, shape = (N,C).
        :return: labels : ndarray, shape = (N,).
        :raises NotImplementedError: always, prediction is not implemented in this skeleton.
        """
        # Fail loudly instead of silently returning None from an unimplemented method.
        raise NotImplementedError("KNearestNeighbors.predict is not implemented yet")

5. Define distance functions

In [304]:
def euclidean_distance(p,q):
    """Return the Euclidean (L2) distance between the arrays p and q."""
    delta = p - q
    squared_sum = (delta ** 2).sum()
    return np.sqrt(squared_sum)

def cosine_distance(p,q):
    """Return 1 - |cos(p, q)|, where cos is the cosine of the angle between p and q.

    Because of the absolute value, vectors pointing in opposite directions
    are at distance 0, exactly like vectors pointing in the same direction.
    """
    dot_product = (p * q).sum()
    norm_p = np.sqrt((p ** 2).sum())
    norm_q = np.sqrt((q ** 2).sum())
    cosine = dot_product / (norm_p * norm_q)
    return 1 - abs(cosine)

def manhattan_distance(p,q):
    """Return the Manhattan (L1) distance between the arrays p and q."""
    absolute_differences = abs(p - q)
    return absolute_differences.sum()

In [305]:
# a = np.array([1,2,3])
# b = np.array([1,3,2])
# euclidean_distance(a,b)
# cosine_distance(a,b)
# manhattan_distance(a,b)

6. Implement predict for KNN

In [306]:
class KNearestNeighbors:
    """K-nearest-neighbours classifier with a selectable distance metric.

    Supported metrics: 'euclidean', 'cosine', 'manhattan'.
    """

    def __init__(self, k, distance_metric="euclidean"):
        """
        Store the hyper-parameters of the model.
        :param k : number of nearest training points that vote for each prediction.
        :param distance_metric : name of the distance to use.
        """
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """
        Store the 'prior knowledge' of your model that will be used
        to predict new labels.
        :param X : input data points, ndarray, shape = (R,C).
        :param y : input labels, ndarray, shape = (R,).
        """
        # KNN is a lazy learner: fitting only memorises the training set.
        self.xtrain = X
        self.ytrain = y

    def _resolve_metric(self):
        """Return the distance function selected by ``self.distance_metric``.

        :raises ValueError: if the metric name is not one of the supported ones.
        """
        if self.distance_metric == 'euclidean':
            return euclidean_distance
        if self.distance_metric == 'cosine':
            return cosine_distance
        if self.distance_metric == 'manhattan':
            return manhattan_distance
        # Raising (instead of printing and returning None) stops a typo in the
        # metric name from surfacing later as a confusing failure downstream.
        raise ValueError(
            "Unknown metric: {!r} (expected 'euclidean', 'cosine' or 'manhattan')".format(
                self.distance_metric
            )
        )

    def predict(self, X):
        """Run the KNN classification on X.
        :param X: input data points, ndarray, shape = (N,C).
        :return: labels : ndarray, shape = (N,).
        :raises ValueError: if ``distance_metric`` is not a supported metric name.
        """
        # Kept so that code inspecting `xtest` after a prediction keeps working.
        self.xtest = X
        compute_distance = self._resolve_metric()

        y_out = []
        for p in X:
            # Distance from the query point to every training point.
            distances = [compute_distance(p, q) for q in self.xtrain]
            # Positions of the k closest training points.
            nearest = np.argsort(distances)[:self.k]
            # Majority vote among the labels of those neighbours.
            y_out.append(label_assign(self.ytrain, nearest))

        return np.array(y_out)


In [307]:
def label_assign(y_train, ind):
    """Pick a label by majority vote among ``y_train[ind]``.

    :param y_train: array of training labels.
    :param ind: indexes (into y_train) of the samples that vote.
    :return: the most frequent label; on a tie, one of the tied labels chosen at random.
    """
    candidates, counts = np.unique(y_train[ind], return_counts=True)
    is_most_frequent = counts == np.max(counts)
    winners = candidates[is_most_frequent]
    # With a single winner the random choice is deterministic; it only matters for ties.
    return np.random.choice(winners, 1)[0]

7. Now let’s fit the KNN model with the X_train and y_train data.

In [309]:
# Build a 5-nearest-neighbours classifier based on the cosine distance,
# memorise the training set, then label every test sample.
knn1 = KNearestNeighbors(k=5, distance_metric="cosine")
knn1.fit(X=X_train, y=y_train)
y_pred = knn1.predict(X=X_test)

y_pred

array(['Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
       'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',
       'Iris-virginica', 'Iris-virginica', 'Iris-versicolor'],
      dtype='<U15')

In [310]:
def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that match the true labels.

    :param y_true: true labels, array-like, shape = (N,).
    :param y_pred: predicted labels, array-like, shape = (N,).
    :return: number of matching positions divided by N.
    :raises ValueError: if the two inputs do not have the same shape.
    """
    # Accept lists as well as arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Without this check a length mismatch (or a None prediction) would be
    # compared as a whole instead of element by element.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                y_true.shape, y_pred.shape
            )
        )
    return (y_true == y_pred).sum() / len(y_true)

In [311]:
# Fraction of test samples whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

0.9333333333333333

# Broadcasting
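Si possono operare somme (e, in generale, operazioni elemento per elemento) fra matrici se, per ogni coppia di dimensioni corrispondenti:
1. Le dimensioni sono uguali, oppure
2. Una delle due dimensioni è uguale a 1 (in tal caso viene replicata fino a eguagliare l'altra)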

Si noti che una matrice avente dimensione (N,M) è trattata come una matrice avente dimensione (1,N,M), o (1,1,N,M), o (1,1,1,N,M), etc.; pertanto, affinché sia possibile operare somme fra matrici usando il broadcasting automatico, è sufficiente che le dimensioni delle due matrici, confrontate a partire dall'ultima, siano uguali oppure che una delle due sia 1.
#### Esempio
<ul>
<li>(A,B,N,M) + (A,B,N,M) --> Funziona</li>
<li>(A,B,N,M) + (N,M) --> Funziona</li>
<li>(A,B,N,M) + (A,1,N,1) --> Funziona</li>
<li>(A,B,N,M) + (N,1) --> Funziona</li>
</ul>

Se si vuole inserire una dimensione di lunghezza 1 in una posizione diversa dalla prima (e.g., (N,1,M)), usare la funzione <code>np.expand_dims(a, axis=\<posizione_inserimento\>)</code>

In [41]:
# b has shape (5, 1): it is aligned with the two trailing axes of a, i.e. it
# behaves like an array of shape (1, 1, 1, 5, 1) and is stretched to a's shape.
a = np.ones(shape=(2, 3, 4, 5, 6))
b = np.ones(shape=(5, 1))
a + b

array([[[[[2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.]],

         [[2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.]],

         [[2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.]],

         [[2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.]]],


        [[[2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.]],

         [[2., 2., 2., 2., 2., 2.],
          [2., 2., 2., 2., 2., 2.],
          [2., 2

In [27]:
a = np.ones(shape=(2, 5, 3))
b = np.ones(shape=(8, 5))

# a + b cannot be broadcast directly: the trailing axes (3 and 5) differ and
# neither is 1. Inserting length-1 axes lines the shapes up instead.
c = np.expand_dims(a, 1)  # (2, 5, 3) -> (2, 1, 5, 3)
d = np.expand_dims(b, 2)  # (8, 5)    -> (8, 5, 1)
print(c.shape, d.shape)
c + d

(2, 1, 5, 3) (8, 5, 1)


array([[[[2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.]],

        [[2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.]],

        [[2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.]],

        [[2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.]],

        [[2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.]],

        [[2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.]],

        [[2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.]],

        [[2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.],
         [2., 2., 2.]]],


       [[[2., 2., 2.],
         [2., 2., 2.],
         [2., 2.