In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics

Загрузим датасет Breast Cancer из scikit-learn

In [3]:
# Load the breast cancer dataset that ships with scikit-learn; the returned
# object is indexed below by the 'data' and 'target' keys.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [5]:
# Pull the feature matrix and the target vector out of the loaded dataset,
# then display the feature matrix dimensions as the cell output.
X = data['data']
y = data['target']
X.shape

(569, 30)

Разобьем на тренировочные и тестовые выборки и рассмотрим базовую реализацию из scikit-learn

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# A fixed seed makes the split -- and therefore every score computed from
# X_test / y_test further down -- reproducible on "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=42
)

# Baseline: scikit-learn's k-NN with 3 neighbours, fitted on the features
# exactly as returned by the split (no scaling is applied in this cell).
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
preds = model.predict(X_test)
preds

array([1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1])

F1-score равен:

In [9]:
# F1 of the scikit-learn baseline predictions against the held-out labels
metrics.f1_score(y_true=y_test, y_pred=preds)

0.9473684210526315

Возьмем реализацию KNN из семинара и добавим новые метрики расстояния — манхэттенскую и косинусную (https://habr.com/ru/articles/801885/)

In [42]:
from sklearn.preprocessing import StandardScaler
class KNN:
    """k-nearest-neighbours classifier with a selectable distance function.

    Features are standardised with a ``StandardScaler`` fitted on the training
    data. A prediction is the mean label of the ``k`` closest training points
    thresholded at 0.5, so the vote is meaningful for 0/1 labels.
    """

    def __init__(self, k, type) -> None:
        """
        :param k: number of nearest neighbours taking part in the vote.
        :param type: distance selector: 'euclid' or 'manhattan'; any other
            value falls back to the cosine distance.
        """
        self.k = k
        # The parameter name shadows the builtin `type`; it is kept because
        # callers may pass it by keyword.
        self.type = type
        self.scaler = StandardScaler()

    def fit(self, X_train, y_train):
        """Fit the scaler on ``X_train`` and memorise the scaled training set."""
        self.scaler.fit(X_train)
        self.X_train = self.scaler.transform(X_train)
        # Kept as an ndarray so predict() can index it with a 2-D index matrix.
        self.y_train = np.asarray(y_train)

    def euclidian_distance(self, a, b):
        """Return the Euclidean (L2) distance between vectors ``a`` and ``b``."""
        return np.sqrt(((a - b)**2).sum())

    def manhattan_distance(self, a, b):
        """Return the Manhattan (L1) distance between vectors ``a`` and ``b``."""
        return (abs(a - b).sum())

    def cosine_distance(self, a, b):
        """Return one minus the cosine similarity of vectors ``a`` and ``b``."""
        return 1 - ((a * b).sum() / (np.sqrt(((a)**2).sum()) * np.sqrt(((b)**2).sum())))

    def predict(self, X_test):
        """Predict a 0/1 label for every row of ``X_test``.

        :param X_test: samples with the same feature layout as the data
            passed to ``fit``.
        :return: integer array with one predicted label per test row.
        """
        transformed_X_test = self.scaler.transform(X_test)

        # Choose the distance once instead of repeating the double loop for
        # every metric; unknown selectors keep the original cosine fallback.
        if self.type == 'euclid':
            distance = self.euclidian_distance
        elif self.type == 'manhattan':
            distance = self.manhattan_distance
        else:
            distance = self.cosine_distance

        # Size the matrix from the fitted training set stored on the instance,
        # not from a module-level X_train that may be absent or different.
        n_test = transformed_X_test.shape[0]
        n_train = self.X_train.shape[0]
        distance_matrix = np.zeros((n_test, n_train))

        for i in range(n_test):
            for j in range(n_train):
                distance_matrix[i][j] = distance(transformed_X_test[i], self.X_train[j])

        # Indices of the k closest training points for each test row.
        idx_matrix = np.argsort(distance_matrix, axis=1)[:, :self.k]
        y_pred_initial = self.y_train[idx_matrix]
        # Majority vote; a tie (mean exactly 0.5) resolves to class 1.
        y_pred = (y_pred_initial.mean(axis=1)>=0.5).astype(int)

        return y_pred

In [52]:
knn = KNN(3, 'euclid')
knn.fit(X_train, y_train)
# Predict once and reuse the result: every predict() call fills the full
# test-by-train distance matrix in nested Python loops.
knn_preds = knn.predict(X_test)
metrics.f1_score(y_test, knn_preds)

0.9842931937172774

Сравним с готовой реализацией, найденной на просторах интернета. P.S. Нас интересуют значения F1-score.

In [71]:
import numpy as np
from sklearn.preprocessing import StandardScaler

class KNN:
    """Nearest-neighbour classifier voting over the ``k`` closest training rows.

    Inputs are standardised with a ``StandardScaler`` fitted in ``fit``; the
    predicted label is the neighbours' mean label thresholded at 0.5.
    """

    def __init__(self, k, metric='euclidean') -> None:
        """
        :param k: how many neighbours vote on each prediction
        :param metric: distance name ('euclidean', 'manhattan', 'cosine')
        """
        self.scaler = StandardScaler()
        self.metric = metric
        self.k = k

    def fit(self, X_train, y_train):
        """Fit the scaler and keep the scaled training rows with their labels."""
        self.scaler.fit(X_train)
        self.y_train = y_train
        self.X_train = self.scaler.transform(X_train)

    def euclidean_distance(self, a, b):
        """Square root of the summed squared differences of ``a`` and ``b``."""
        delta = a - b
        return np.sqrt((delta ** 2).sum())

    def manhattan_distance(self, a, b):
        """Sum of the absolute coordinate differences of ``a`` and ``b``."""
        delta = a - b
        return np.abs(delta).sum()

    def cosine_distance(self, a, b):
        """One minus the cosine of the angle between ``a`` and ``b``."""
        similarity = np.dot(a, b) / (np.sqrt(np.dot(a, a)) * np.sqrt(np.dot(b, b)))
        return 1 - similarity

    def compute_distance(self, a, b):
        """Dispatch to the distance selected by ``self.metric``.

        :raises ValueError: if ``self.metric`` is not one of the known names.
        """
        chosen = self.metric
        if chosen == 'euclidean':
            return self.euclidean_distance(a, b)
        if chosen == 'manhattan':
            return self.manhattan_distance(a, b)
        if chosen == 'cosine':
            return self.cosine_distance(a, b)
        raise ValueError(f"Unknown metric: {self.metric}")

    def predict(self, X_test):
        """Return an integer 0/1 prediction for every row of ``X_test``."""
        queries = self.scaler.transform(X_test)

        # One row per test sample, one column per training sample.
        distance_matrix = np.zeros((X_test.shape[0], self.X_train.shape[0]))
        n_train = len(self.X_train)
        for row in range(len(X_test)):
            query = queries[row]
            for col in range(n_train):
                distance_matrix[row][col] = self.compute_distance(query, self.X_train[col])

        # Column indices of the k smallest distances in each row.
        nearest = np.argsort(distance_matrix, axis=1)[:, :self.k]
        neighbour_labels = self.y_train[nearest]
        vote_share = neighbour_labels.mean(axis=1)

        return (vote_share >= 0.5).astype(int)


In [73]:
knn = KNN(k=3, metric='manhattan')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Score the Manhattan-distance predictions instead of discarding them, so
# this variant can be compared on F1 as well.
metrics.f1_score(y_test, y_pred)

In [77]:
knn = KNN(3, metric='euclidean')
knn.fit(X_train, y_train)
# Predict once and reuse the result: every predict() call fills the full
# test-by-train distance matrix in nested Python loops.
knn_preds = knn.predict(X_test)
metrics.f1_score(y_test, knn_preds)

0.9842931937172774

Вывод: значения F1-score совпали — обе реализации KNN с евклидовой метрикой на одном и том же датасете дали одинаковый результат.