In [1]:
# 载入数据集
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
iris = datasets.load_iris()

x = iris.data
y = iris.target.reshape(-1,1)
print(x.shape,y.shape)

(150, 4) (150, 1)


In [4]:
#############################核心代码实现#############################

# 欧氏距离
def distance_Euclidean(a, b):
    return np.sqrt(np.sum((a - b) ** 2, axis = 1))
# 分类器实现
class kNN():
    def __init__(self, n_neighbors = 1, dist_func = distance_Euclidean):
        self.n_neighbors = n_neighbors
        self.dist_func = dist_func
    def fit(self, x, y):
        self.x = x
        self.y = y
    def predict(self, x):
        # shape[0]为矩阵第一维长度
        y_pred = np.zeros((x.shape[0], 1), dtype = self.y.dtype)
        # enumerate多用于在for循环中得到计数，利用它可以同时获得索引和值
        for i, x_test in enumerate(x):
            distances = self.dist_func(self.x, x_test)
            n_index = np.argsort(distances)
            # 展开成一维数组
            n_y = self.y[n_index[:self.n_neighbors]].ravel()
            # numpy.bincount函数是统计列表中元素出现的个数
            y_pred[i] = np.argmax(np.bincount(n_y))
            print(np.bincount(n_y))
        # 将数组重新组成一列
        y_pred.reshape(-1)
        return y_pred

In [5]:
# 将数据分为训练集和测试集，用来测试模型分类正确率
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state = 1)

In [6]:
knn = kNN()
knn.fit(X_train,Y_train)

# 传入测试数据，做预测
Y_pred = knn.predict(X_test)
print('Prediction: ', Y_pred)

# 实际结果
print('Test value: ', Y_test)

# 统计预测正确的个数
num_correct = np.sum(Y_pred == Y_test)

# 计算准确率
accuracy = float(num_correct) / X_test.shape[0]
print('Got %d / %d correct => accuracy: %f' % (num_correct, X_test.shape[0], accuracy))

[1]
[0 1]
[0 1]
[1]
[0 0 1]
[0 1]
[0 0 1]
[1]
[1]
[0 0 1]
[0 1]
[1]
[0 0 1]
[0 1]
[0 1]
[1]
[0 1]
[0 1]
[1]
[1]
[0 1]
[0 1]
[0 1]
[1]
[0 0 1]
[0 1]
[1]
[1]
[0 1]
[0 0 1]
[0 1]
[0 0 1]
[0 1]
[0 0 1]
[0 0 1]
[1]
[0 1]
[1]
[0 1]
[0 0 1]
[0 0 1]
[1]
[0 1]
[0 0 1]
[0 1]
Prediction:  [[0]
 [1]
 [1]
 [0]
 [2]
 [1]
 [2]
 [0]
 [0]
 [2]
 [1]
 [0]
 [2]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [2]
 [1]
 [0]
 [0]
 [1]
 [2]
 [1]
 [2]
 [1]
 [2]
 [2]
 [0]
 [1]
 [0]
 [1]
 [2]
 [2]
 [0]
 [1]
 [2]
 [1]]
Test value:  [[0]
 [1]
 [1]
 [0]
 [2]
 [1]
 [2]
 [0]
 [0]
 [2]
 [1]
 [0]
 [2]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [2]
 [1]
 [0]
 [0]
 [1]
 [2]
 [1]
 [2]
 [1]
 [2]
 [2]
 [0]
 [1]
 [0]
 [1]
 [2]
 [2]
 [0]
 [2]
 [2]
 [1]]
Got 44 / 45 correct => accuracy: 0.977778


1. 不同k值对分类准确率的影响

In [9]:
# 定义一个knn实例
knn = kNN()

# 训练模型
knn.fit(X_train,Y_train)

result_list = []

#考虑不同k值
for k in range(1, 50, 5):
    knn.n_neighbors = k
    print(knn.n_neighbors)
    knn.dist_func = distance_Euclidean
    Y_pred = knn.predict(X_test)
    num_correct = np.sum(Y_pred == Y_test)
    accuracy = float(num_correct) / X_test.shape[0]
    result_list.append([k, accuracy])
pd.DataFrame(result_list, columns = ['k', '预测准确率'])

1
[1]
[0 1]
[0 1]
[1]
[0 0 1]
[0 1]
[0 0 1]
[1]
[1]
[0 0 1]
[0 1]
[1]
[0 0 1]
[0 1]
[0 1]
[1]
[0 1]
[0 1]
[1]
[1]
[0 1]
[0 1]
[0 1]
[1]
[0 0 1]
[0 1]
[1]
[1]
[0 1]
[0 0 1]
[0 1]
[0 0 1]
[0 1]
[0 0 1]
[0 0 1]
[1]
[0 1]
[1]
[0 1]
[0 0 1]
[0 0 1]
[1]
[0 1]
[0 0 1]
[0 1]
6
[6]
[0 6]
[0 6]
[6]
[0 0 6]
[0 4 2]
[0 0 6]
[6]
[6]
[0 0 6]
[0 6]
[6]
[0 0 6]
[0 6]
[0 5 1]
[6]
[0 6]
[0 6]
[6]
[6]
[0 6]
[0 6]
[0 3 3]
[6]
[0 0 6]
[0 6]
[6]
[6]
[0 5 1]
[0 2 4]
[0 5 1]
[0 0 6]
[0 6]
[0 0 6]
[0 0 6]
[6]
[0 6]
[6]
[0 5 1]
[0 0 6]
[0 0 6]
[6]
[0 3 3]
[0 0 6]
[0 6]
11
[11]
[ 0 11]
[ 0 10  1]
[11]
[ 0  0 11]
[0 6 5]
[ 0  0 11]
[11]
[11]
[ 0  0 11]
[ 0 11]
[11]
[ 0  0 11]
[0 8 3]
[0 8 3]
[11]
[ 0 11]
[0 9 2]
[11]
[11]
[ 0 11]
[0 8 3]
[0 4 7]
[11]
[ 0  0 11]
[ 0 11]
[11]
[11]
[0 8 3]
[0 2 9]
[0 8 3]
[ 0  0 11]
[ 0 11]
[ 0  0 11]
[ 0  1 10]
[11]
[ 0 11]
[11]
[0 8 3]
[ 0  0 11]
[ 0  0 11]
[11]
[0 4 7]
[ 0  1 10]
[0 9 2]
16
[16]
[ 0 16]
[ 0 13  3]
[16]
[ 0  0 16]
[0 9 7]
[ 0  1 15]
[16]
[16]
[ 0  0 16]
[ 0 16]
[1

Unnamed: 0,k,预测准确率
0,1,0.977778
1,6,0.977778
2,11,0.977778
3,16,0.977778
4,21,0.977778
5,26,0.977778
6,31,0.977778
7,36,0.955556
8,41,0.955556
9,46,0.933333


2. 采取另一种距离运算，是否会对分类结果造成影响(如曼哈顿距离)

曼哈顿距离𝑑𝑖𝑠𝑡=|𝑥1−𝑥2|+|𝑦1−𝑦2|

In [10]:
def distance_Manhattan(a, b):
    return np.sum(np.abs(a - b), axis = 1)

In [11]:
knn2 = kNN(n_neighbors = 3, dist_func = distance_Manhattan)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
print('Prediction:', Y_pred)
print(Y_pred.shape)
print('Test value:', Y_test)
num_correct = np.sum(Y_pred == Y_test)
accuracy = float(num_correct) / X_test.shape[0]
print('Got %d / %d correct => accuracy: %f' % (num_correct, X_test.shape[0], accuracy))

[36 10]
[21 24  1]
[ 0 27 19]
[36 10]
[ 0 11 35]
[ 0 22 24]
[ 0 14 32]
[36 10]
[36 10]
[ 0 10 36]
[ 0 31 15]
[36 10]
[ 0 10 36]
[ 0 25 21]
[ 0 29 17]
[36 10]
[ 0 32 14]
[ 0 28 18]
[36 10]
[36 10]
[ 0 31 15]
[ 0 28 18]
[ 0 17 29]
[36 10]
[ 0 11 35]
[ 0 32 14]
[36 10]
[36 10]
[ 0 29 17]
[ 0 19 27]
[ 0 27 19]
[ 0 10 36]
[ 0 32 14]
[ 0 14 32]
[ 0 17 29]
[36 10]
[ 0 31 15]
[36 10]
[ 0 25 21]
[ 0 11 35]
[ 0 12 34]
[36 10]
[ 0 25 21]
[ 0 14 32]
[ 0 25 21]
Prediction: [[0]
 [1]
 [1]
 [0]
 [2]
 [2]
 [2]
 [0]
 [0]
 [2]
 [1]
 [0]
 [2]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [2]
 [0]
 [2]
 [1]
 [0]
 [0]
 [1]
 [2]
 [1]
 [2]
 [1]
 [2]
 [2]
 [0]
 [1]
 [0]
 [1]
 [2]
 [2]
 [0]
 [1]
 [2]
 [1]]
(45, 1)
Test value: [[0]
 [1]
 [1]
 [0]
 [2]
 [1]
 [2]
 [0]
 [0]
 [2]
 [1]
 [0]
 [2]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [2]
 [1]
 [0]
 [0]
 [1]
 [2]
 [1]
 [2]
 [1]
 [2]
 [2]
 [0]
 [1]
 [0]
 [1]
 [2]
 [2]
 [0]
 [2]
 [2]
 [1]]
Got 42 / 45 correct => accuracy: 0.933333
