In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics

Скачаем дата-сет

In [3]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [4]:
X, y = data['data'], data['target']
X.shape

(569, 30)

Разобьем на тренировочные и тестовые выборки и рассмотрим базовую реализацию из scikit-learn

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=3)
preds = model.predict(X_test)
preds

array([0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1])

F1-score равен:

In [8]:
metrics.f1_score(y_test, preds)

0.9591836734693878

Возьмем реализвцию KNN из семинара и добавим новые метрики (https://habr.com/ru/articles/801885/) 

In [15]:
from sklearn.preprocessing import StandardScaler
class KNN:
    def __init__(self, k, type) -> None:
        self.k = k
        self.type = type
        self.scaler = StandardScaler()

    def fit(self, X_train, y_train):
        self.scaler.fit(X_train)
        self.X_train = self.scaler.transform(X_train)
        self.y_train = y_train

    def euclidian_distance(self, a, b):
        return np.sqrt(((a - b)**2).sum())
    
    def manhattan_distance(self, a, b):
        return (abs(a - b).sum())

    def cosine_distance(self, a, b):
        return 1 - ((a * b).sum() / (np.sqrt(((a)**2).sum()) * np.sqrt(((b)**2).sum())))
    
    def predict(self, X_test):

        transformed_X_test = self.scaler.transform(X_test)
        
        distance_matrix = np.zeros((X_test.shape[0], X_train.shape[0]))

        if self.type == 'euclid':
            for i in range(len(X_test)):
                for j in range(len(self.X_train)):
                    distance_matrix[i][j] = self.euclidian_distance(transformed_X_test[i], self.X_train[j])
                    
        elif self.type == 'manhattan':
            for i in range(len(X_test)):
                for j in range(len(self.X_train)):
                    distance_matrix[i][j] = self.manhattan_distance(transformed_X_test[i], self.X_train[j])

        else:
            for i in range(len(X_test)):
                for j in range(len(self.X_train)):
                    distance_matrix[i][j] = self.cosine_distance(transformed_X_test[i], self.X_train[j])
        
        idx_matrix = np.argsort(distance_matrix, axis=1)[:, :self.k]
        y_pred_initial = self.y_train[idx_matrix]
        y_pred = (y_pred_initial.mean(axis=1)>=0.5).astype(int)

        return y_pred

In [17]:
knn = KNN(3, 'euclid')
knn.fit(X_train, y_train)
knn.predict(X_test)
metrics.f1_score(y_test, knn.predict(X_test))

0.9637305699481866

Сравним с реализацией готового решения, подставляя в KNeighborsClassifier метрики P.S. интересует значения F1-score

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
model.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=3)
preds = model.predict(X_test)
metrics.f1_score(y_test, preds)

0.9456521739130435

In [24]:
knn = KNN(3, type='euclidean')
knn.fit(X_train, y_train)
knn.predict(X_test)
metrics.f1_score(y_test, knn.predict(X_test))

0.978021978021978

Аналогично проверим для других мтерик

In [36]:
model = KNeighborsClassifier(n_neighbors=3, metric='manhattan')
model.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=3)
preds = model.predict(X_test)
metrics.f1_score(y_test, preds)

0.937142857142857

In [38]:
knn = KNN(3, type='manhattan')
knn.fit(X_train, y_train)
knn.predict(X_test)
metrics.f1_score(y_test, knn.predict(X_test))

0.9704142011834319

In [42]:
model = KNeighborsClassifier(n_neighbors=3, metric='cosine')
model.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=3)
preds = model.predict(X_test)
metrics.f1_score(y_test, preds)

0.9479768786127167

In [44]:
knn = KNN(3, type='cosine')
knn.fit(X_train, y_train)
knn.predict(X_test)
metrics.f1_score(y_test, knn.predict(X_test))

0.970059880239521

Вывод: Значения скора сошлись, реализация метрик на одном и том же дата-сете дали аналогичные значения