In [1]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably so the project-local packages imported below resolve -- confirm
# against the repository layout).
_parent_dir = os.path.dirname(os.path.abspath('.'))
# Only append when missing, so re-running this cell does not keep growing
# sys.path with duplicate entries.
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

## 数据准备

In [2]:
import numpy as np
from datasets.dataset import load_breast_cancer
from model_selection.train_test_split import train_test_split

In [3]:
# Load the dataset object and pull out its `data` and `target` attributes
# (presumably the feature matrix and the class labels -- confirm against
# datasets.dataset.load_breast_cancer).
data = load_breast_cancer()
X = data.data
Y = data.target

# Split into train/test parts with test_size=0.2.
# NOTE(review): no seed is passed here, so the split (and the accuracy printed
# later) may differ between runs -- confirm whether train_test_split accepts one.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

## KNN
首先定义一个距离度量函数。

In [4]:
# Euclidean distance
def E_dist(a:list,b:list):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    Args:
        a: First point; any array-like accepted by ``np.asarray``
            (list, tuple, ndarray, ...).
        b: Second point; must be broadcast-compatible with ``a``.

    Returns:
        The norm of ``a - b`` as computed by ``np.linalg.norm``.
    """
    # Convert to float *before* subtracting: with the inputs' native dtype,
    # unsigned integers wrap around (uint8 0 - 1 == 255) and booleans cannot
    # be subtracted at all, both of which corrupt the distance.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    return np.linalg.norm(a - b)

KNN模型不需要显式的训练过程：它只是把训练数据存储起来；给定一个测试(验证)样本时，直接在训练样本库中逐一比较距离，搜索出最近的k个邻居，再由这些邻居的标签决定预测结果。

首先定义查找邻居的函数。

In [5]:
# # 返回单个测试样本的前k个邻居
# def get_nb_of_one(X_train, Y_train, x_test, k=3, dist=E_dist):
#     dists = []

#     for idx in range(len(X_train)):
#         cur_dist = dist(x_test, X_train[idx])
#         dists.append((Y_train[idx], cur_dist))    # 首位为标签，末位为距离
#     dists.sort(key=lambda x: x[1])    # 按照距离排序

#     return dists[:k]

返回前k个邻居之后，这些邻居可能带有不同的标签，还需要按距离加权投票(距离越近权重越大)，选出得票最高的标签。

In [6]:
# # 投票选出neighbors中出现次数最多的标签
# from collections import Counter


# def Vote(neighbors):
#     counter = Counter()
#     for idx in range(len(neighbors)):
#         dist=neighbors[idx][1]
#         label=neighbors[idx][0]
#         counter[label] += 1/(dist+1)    # 首位(标签)计数，权重为 1/(距离+1)
#     return counter.most_common(1)[0][0]

这样一来，就可以得到单个样本在训练数据库中最近邻的标签，即预测标签。接下来再定义一个具有批量预测功能的函数。

In [7]:
# def predict(X_test):
#     Y_pred=[]
#     for x_test in X_test:
#         neighbors=get_nb_of_one(X_train,Y_train,x_test)
#         Y_pred.append(Vote(neighbors))
#     return np.array(Y_pred)

## 封装
基本功能都完成之后，把这些模块都封装起来，实现一个接口类似 scikit-learn 的 KNN 类。

In [8]:
from collections import Counter


class KNN:
    """Brute-force k-nearest-neighbours classifier with distance-weighted voting.

    ``fit`` only stores the training data. ``predict`` compares every query
    sample against every stored sample with ``metric``, keeps the ``k``
    closest ones and lets them vote, each with weight ``1 / (distance + 1)``.

    Args:
        n_neighbors: Number of neighbours that take part in the vote (>= 1).
        metric: Callable ``metric(x, y)`` returning the distance between two
            samples.

    Raises:
        ValueError: If ``n_neighbors`` is smaller than 1.
    """

    def __init__(self, n_neighbors=5, metric=E_dist):
        # A non-positive k must be rejected up front: 0 leaves no voters, and a
        # negative value would turn the ``[:k]`` slice into "all but the last
        # |k| samples" instead of "the k nearest".
        if n_neighbors < 1:
            raise ValueError(
                'n_neighbors must be >= 1, got {}'.format(n_neighbors))
        self.X_train = None
        self.Y_train = None
        self.k = n_neighbors
        self.metric = metric

    def fit(self, X_train, Y_train):
        """Store the training samples and their labels.

        Args:
            X_train: Sequence of training samples, indexable by position.
            Y_train: Labels, with ``Y_train[i]`` belonging to ``X_train[i]``.

        Returns:
            ``self``, so that calls can be chained.

        Raises:
            ValueError: If the inputs are empty or differ in length.
        """
        if len(X_train) != len(Y_train):
            raise ValueError(
                'X_train and Y_train must have the same length, got {} and {}'
                .format(len(X_train), len(Y_train)))
        if len(X_train) == 0:
            raise ValueError('cannot fit KNN on an empty training set')
        # The model never mutates its inputs, so references are kept instead
        # of copies.
        self.X_train = X_train
        self.Y_train = Y_train
        return self

    def __get_nb_of_one(self, x_test):
        """Return up to ``k`` ``(label, distance)`` pairs nearest to ``x_test``."""
        dists = [
            (self.Y_train[idx], self.metric(x_test, self.X_train[idx]))
            for idx in range(len(self.X_train))
        ]
        # list.sort is stable, so equidistant samples keep their training order.
        dists.sort(key=lambda pair: pair[1])

        return dists[:self.k]

    def __vote(self, neighbors):
        """Return the label with the largest total weight among ``neighbors``."""
        counter = Counter()
        for label, dist in neighbors:
            # Weight is 1 / (dist + 1), not a plain inverse distance: for a
            # non-negative metric the +1 keeps the weight finite at distance 0.
            counter[label] += 1 / (dist + 1)
        return counter.most_common(1)[0][0]

    def predict(self, X_test):
        """Predict a label for every sample in ``X_test``.

        Args:
            X_test: Iterable of query samples.

        Returns:
            ``np.ndarray`` holding one predicted label per query sample.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.X_train is None or self.Y_train is None:
            raise RuntimeError('KNN.predict() called before fit()')
        return np.array(
            [self.__vote(self.__get_nb_of_one(x_test)) for x_test in X_test])

In [9]:
# Build a KNN classifier with its default arguments and hand it the training split.
knn = KNN()
knn.fit(X_train, Y_train)
# Predict the held-out samples, then print the fraction of predictions that
# equal the corresponding entry of Y_test.
Y_pred = knn.predict(X_test)
print('acc:{}'.format(np.sum(Y_pred == Y_test)/len(Y_test)))

acc:0.956140350877193


完整代码中使用了KD树进行优化。