Zaimplementować algorytm klasyfikacji binarnej kNN (k najbliższych sąsiadów).

Należy udostępnić metody train i predict.
Train buduje bazę przypadków uczących (przyjmuje przynajmniej wektory i prawidłowe odpowiedzi).
Wielokrotne wywołanie metody train powinno rozszerzać zbiór przypadków uczących.
Metoda predict przyjmuje wektor (opcjonalnie: większą liczbę wektorów naraz) i zwraca odpowiedź klasyfikatora.
Należy umożliwić wybór jednej z czterech funkcji odległości: euklidesowej, taksówkowej, maksimum i cosinusowej.

Porównać wyniki na podanych zbiorach dla przynajmniej 3 wartości k (ostatnia kolumna zawiera etykietę).

In [26]:
import csv
import numpy as np
from enum import Enum

class DistanceMetric(Enum):
    """Distance functions that :class:`KNNClassifier` can use."""

    EUCLIDEAN = 'euclidean'  # L2 norm of the difference vector
    MANHATTAN = 'manhattan'  # sum of absolute coordinate differences
    CHEBYSHEV = 'chebyshev'  # largest absolute coordinate difference
    COSINE = 'cosine'        # 1 - cosine similarity


class KNNClassifier:
    """k-nearest-neighbours classifier with a growable training set.

    Training samples are kept as a 2-D array ``x_train`` (one row per sample)
    and a 1-D array ``y_train`` of labels.  A query is classified by majority
    vote among the ``k`` training samples closest to it under the selected
    distance metric; on a tie in the vote the smallest label wins.
    """

    def __init__(self, k=3, distance_metric=DistanceMetric.EUCLIDEAN):
        """Create an empty classifier.

        Args:
            k: Number of neighbours taking part in the vote (at least 1).
            distance_metric: A :class:`DistanceMetric` member.

        Raises:
            ValueError: If ``distance_metric`` is not a ``DistanceMetric``
                or ``k`` is smaller than 1.
        """
        if not isinstance(distance_metric, DistanceMetric):
            raise ValueError(f"Invalid distance metric. Choose from: {list(DistanceMetric)}")
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")

        self.k = k
        self.distance_metric = distance_metric
        self.x_train = np.empty((0,))
        self.y_train = np.empty((0,), dtype=int)

    def train(self, x, y):
        """Add samples to the training set; repeated calls extend it.

        Args:
            x: One feature vector (1-D) or a 2-D array-like with one sample
                per row.
            y: The label of the single vector, or one label per row of ``x``.

        Raises:
            ValueError: If ``x`` is not 1-D/2-D, the number of labels does
                not match the number of samples, or the feature count differs
                from the data already stored.
        """
        x, y = np.array(x), np.array(y)
        if x.size == 0 and y.size == 0:
            return  # nothing to add

        # Always store samples as rows of a 2-D array so that shape[1] is the
        # feature count, even when the very first call passes a single vector.
        if x.ndim == 1:
            x = x.reshape(1, -1)
        elif x.ndim != 2:
            raise ValueError(f"Training data must be 1-D or 2-D, got {x.ndim}-D")
        y = y.reshape(-1)  # a scalar label becomes a length-1 array

        if x.shape[0] != y.shape[0]:
            raise ValueError(f"Got {x.shape[0]} training vectors but {y.shape[0]} labels")

        if self.x_train.size == 0:
            self.x_train, self.y_train = x, y
            return

        if x.shape[1] != self.x_train.shape[1]:
            raise ValueError(f"Input vector dimension {x.shape[1]} does not match training data dimension {self.x_train.shape[1]}")
        self.x_train = np.vstack((self.x_train, x))
        self.y_train = np.hstack((self.y_train, y))

    def _compute_distances(self, x):
        """Return the distance from ``x`` to every stored training sample."""
        if self.distance_metric == DistanceMetric.EUCLIDEAN:
            return np.linalg.norm(self.x_train - x, axis=1)

        elif self.distance_metric == DistanceMetric.MANHATTAN:
            return np.sum(np.abs(self.x_train - x), axis=1)

        elif self.distance_metric == DistanceMetric.CHEBYSHEV:
            return np.max(np.abs(self.x_train - x), axis=1)

        elif self.distance_metric == DistanceMetric.COSINE:
            dots = np.dot(self.x_train, x)
            denom = np.linalg.norm(self.x_train, axis=1) * np.linalg.norm(x)
            # A zero-length vector has no direction: treat its similarity as 0
            # (distance 1) instead of dividing by zero and producing NaN.
            similarity = np.divide(
                dots, denom, out=np.zeros(dots.shape, dtype=float), where=denom != 0
            )
            return 1 - similarity

        else:
            raise ValueError("Unsupported distance metric")

    def _predict_one(self, x):
        """Return the majority label among the k nearest neighbours of ``x``."""
        distances = self._compute_distances(x)
        n_samples = distances.shape[0]
        if self.k >= n_samples:
            # argpartition needs kth < n; with k >= n every sample votes.
            nearest = np.arange(n_samples)
        else:
            nearest = np.argpartition(distances, self.k)[:self.k]
        # np.unique sorts the labels, so argmax keeps the smallest label on a
        # tie, and negative or non-integer labels are handled as well.
        labels, counts = np.unique(self.y_train[nearest], return_counts=True)
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Classify one vector or a batch of vectors.

        Args:
            X: A 1-D feature vector, or a 2-D array-like with one query per
                row.

        Returns:
            A 0-d array holding the label for a single vector, or a 1-D array
            with one label per row for a batch.

        Raises:
            ValueError: If no training data is stored, ``X`` is not 1-D/2-D,
                or its feature count differs from the training data.
        """
        X = np.asarray(X)
        if self.x_train.ndim != 2 or self.x_train.shape[0] == 0:
            raise ValueError("Classifier has no training data; call train() first")
        # Dispatch on dimensionality, not element count, so vectors of any
        # length are recognised as a single query.
        if X.ndim not in (1, 2):
            raise ValueError(f"Expected a 1-D vector or a 2-D batch, got {X.ndim}-D input")
        if X.shape[-1] != self.x_train.shape[1]:
            raise ValueError(f"Input vector dimension {X.shape[-1]} does not match training data dimension {self.x_train.shape[1]}")

        if X.ndim == 1:
            return np.array(self._predict_one(X))
        return np.array([self._predict_one(row) for row in X])

In [27]:
def open_csv(file_path):
    """Load a space-separated dataset whose last column is the class label.

    Blank lines are skipped, and empty fields produced by repeated or
    trailing spaces are ignored.

    Args:
        file_path: Path of the file to read.

    Returns:
        A tuple ``(x, y)``: ``x`` is an array of float feature rows and ``y``
        an array of integer labels, one per non-blank line of the file.

    Raises:
        ValueError: If a feature is not a valid float or a label is not a
            valid integer.
    """
    x, y = [], []
    # newline='' is the mode the csv module documents for files it reads.
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=' '):
            # Consecutive or trailing delimiters yield '' fields, and a blank
            # line yields an empty row; neither carries data.
            fields = [field for field in row if field]
            if not fields:
                continue
            *features, label = fields
            x.append([float(value) for value in features])
            y.append(int(label))
    return np.array(x), np.array(y)

In [28]:
# Load dataset0 (features into x, labels into y) and show what was read.
x, y = open_csv('dataset0.csv')       
print(x)
print(y)
# Testing the classifier with its defaults (k=3, Euclidean distance).
knn = KNNClassifier()
knn.train(x, y)
new_data = np.array([1.0,2.0])
prediction = knn.predict(new_data)
print(f'Prediction for new data {new_data} is {prediction}') # expected: 1

# A second train() call with one more labelled vector extends the training set.
knn.train(np.array([1., 2.]), np.array(1))
new_data = np.array([14.0, 23.0])
prediction = knn.predict(new_data)
print(f'Prediction for new data {new_data} is {prediction}') # expected: 1

[[0. 0.]
 [0. 1.]
 [1. 1.]
 [1. 0.]]
[0 1 2 1]
Prediction for new data [1. 2.] is 1
Prediction for new data [14. 23.] is 1


In [36]:
# Reload dataset0 and start from a fresh classifier.
x, y = open_csv('dataset0.csv')       
# Testing the classifier on a batch: each row of new_data is one query vector.
knn = KNNClassifier()
knn.train(x, y)
new_data = np.array([[1.0,2.0], [1.0,3.0]])
prediction = knn.predict(new_data)
print(f'Prediction for new data {new_data} is {prediction}') # expected: [1 1]

Prediction for new data [[1. 2.]
 [1. 3.]] is [1 1]


In [None]:
# Load dataset1 (features into x, labels into y).
x, y = open_csv('dataset1.csv')
# Testing the classifier with the Chebyshev (maximum) distance and default k.
knn = KNNClassifier(distance_metric=DistanceMetric.CHEBYSHEV)
knn.train(x, y)
prediction = knn.predict(np.array([121.314024, 222.530757]))
print(f'Prediction: {prediction}') # expected: 1

# Add the vector just queried as a training sample labelled 1, then classify another point.
knn.train(np.array([121.314024, 222.530757]), np.array(1))
prediction = knn.predict(np.array([0.45540533, 6.420678]))
print(f'Prediction: {prediction}') # expected: 0

Prediction: 1
Prediction: 0


In [None]:
# Load dataset2; the queries below use four features per vector.
x, y = open_csv('dataset2.csv')
# Testing the classifier with the Manhattan (taxicab) distance and default k.
knn = KNNClassifier(distance_metric=DistanceMetric.MANHATTAN)
knn.train(x, y)
prediction = knn.predict(np.array([121.314024, 222.530757, 23.3578, 87.4542]))
print(f'Prediction: {prediction}') # expected: 1

# Add the vector just queried as a training sample labelled 1, then classify another point.
knn.train(np.array([121.314024, 222.530757, 23.3578, 87.4542]), np.array(1))
prediction = knn.predict(np.array([0.45540533, 6.420678, 86.3435, 2.3546]))
print(f'Prediction: {prediction}') # expected: 0

# Repeat the same scenario on a fresh classifier using the Euclidean distance.
x, y = open_csv('dataset2.csv')
# Testing the classifier
knn = KNNClassifier(distance_metric=DistanceMetric.EUCLIDEAN)
knn.train(x, y)
prediction = knn.predict(np.array([121.314024, 222.530757, 23.3578, 87.4542]))
print(f'Prediction: {prediction}') # expected: 1

# Add the vector just queried as a training sample labelled 1, then classify another point.
knn.train(np.array([121.314024, 222.530757, 23.3578, 87.4542]), np.array(1))
prediction = knn.predict(np.array([0.45540533, 6.420678, 86.3435, 2.3546]))
print(f'Prediction: {prediction}') # expected: 0



Prediction: 1
Prediction: 0
Prediction: 1
Prediction: 0
