## K-NN classifier

In [1]:
import numpy as np
from scipy.spatial import distance_matrix

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

# Keep only the first two feature columns. (Starting from a dataset that is
# already two-dimensional would make this slicing unnecessary.)
X = iris.data[:, :2]
y = iris.target
# Hold out 33% of the samples as a test set; the fixed random_state makes the
# shuffled split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)


In [3]:
class KNNClassifier:
    """Brute-force k-nearest-neighbours classifier with majority voting.

    The classifier keeps no fitted state: the training data is passed to
    ``predict`` on every call and the full train/test distance matrix is
    computed with ``scipy.spatial.distance_matrix`` (Euclidean by default).

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.
    """

    def __init__(self, k):
        self.k = k

    def predict(self, X_train, y_train, X_test):
        """Predict a label for every row of ``X_test``.

        Parameters
        ----------
        X_train : array-like of shape (n_train, n_features)
            Training samples.
        y_train : array-like of shape (n_train,)
            Training labels. They must be non-negative integers because the
            vote is counted with ``np.bincount``.
        X_test : array-like of shape (n_test, n_features)
            Samples to classify.

        Returns
        -------
        numpy.ndarray of shape (n_test,)
            Predicted labels. When several labels tie for the majority, the
            smallest label wins (first maximum returned by ``np.argmax``).

        Raises
        ------
        ValueError
            If ``k`` is not in ``[1, n_train]`` or if ``X_train`` and
            ``y_train`` disagree on the number of samples.
        """
        # Accept plain Python sequences as well as arrays; fancy indexing
        # below requires y_train to be an ndarray.
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        X_test = np.asarray(X_test)

        n_train = X_train.shape[0]
        if y_train.shape[0] != n_train:
            raise ValueError(
                f"X_train has {n_train} samples but y_train has "
                f"{y_train.shape[0]} labels")
        if not 1 <= self.k <= n_train:
            raise ValueError(
                f"k must be between 1 and the number of training samples "
                f"({n_train}), got {self.k}")

        # Nothing to classify: skip the distance computation, which cannot
        # handle an empty test set further down (np.max of an empty array).
        if X_test.shape[0] == 0:
            return np.empty(0, dtype=np.intp)

        # Row i holds the distances from test sample i to every training sample.
        dist = distance_matrix(X_train, X_test).T

        # After partitioning at position kth, the first k columns of each row
        # are the indices of the k smallest distances (in arbitrary order).
        # argpartition requires kth < n_train, so clamp it: when k == n_train
        # every training sample is a neighbour anyway. For k < n_train the
        # pivot stays at k, which keeps the neighbour choice among equidistant
        # samples the same as before.
        kth = min(self.k, n_train - 1)
        nn_idx = np.argpartition(dist, kth, axis=1)[:, :self.k]
        y_nn = y_train[nn_idx]    # labels of the k nearest training samples

        # Count votes per label for each test sample; minlength gives every
        # row the same number of bins so the result is a rectangular array.
        vote_counts = np.apply_along_axis(np.bincount,
                                          axis=1,
                                          arr=y_nn,
                                          minlength=np.max(y_train) + 1)
        return np.argmax(vote_counts, axis=1)    # majority voting

In [4]:
# Classify every test sample by majority vote among its 5 nearest training
# samples; the classifier takes the training data directly in predict().
knn_clf = KNNClassifier(k=5)
y_pred = knn_clf.predict(X_train, y_train, X_test)

In [5]:
# Micro-averaged F1 score of the predictions against the held-out test labels.
f1_score(y_test, y_pred, average="micro")

0.78