## k近邻算法

In [2]:
import numpy as np
import pandas as pd

# 这里直接引入sklearn里的数据集，iris鸢尾花
from sklearn.datasets import load_iris 
from sklearn.model_selection import train_test_split  # 切分数据集为训练集和测试集
from sklearn.metrics import accuracy_score # 计算分类预测的准确率

In [3]:
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()
iris # the returned object turns out to be dictionary-like (see the output below)

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

### 数据的提取

In [4]:
# Wrap the feature matrix in a DataFrame, using the dataset's feature names as column labels.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
df['class'] = iris.target  # add a column holding the class label
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
# Replace the integer class codes (0, 1, 2) with the corresponding species names.
df['class'] = df['class'].map(dict(enumerate(iris.target_names)))
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric feature columns.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [8]:
# Feature matrix and label column used by the rest of the notebook.
x = iris.data
y = iris.target.reshape(-1, 1)  # iris.target has shape (150,); reshape turns it into a (150, 1) column
print(x.shape, y.shape)
print(type(x))

(150, 4) (150, 1)
<class 'numpy.ndarray'>


In [14]:
# Split the data into training and test sets with the library function train_test_split.
# 30% of the samples go to the test set; stratify=y keeps the proportion of each class (0, 1, 2) the same in both splits.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=35, stratify=y)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(105, 4) (105, 1)
(45, 4) (45, 1)


In [15]:
# Worked example for np.sum(np.abs(a-b), axis=1): shows how the axis argument behaves.
# b has a single row, so a-b broadcasts it against every row of a.
a = np.array([[3,2,4,2],
             [2,1,4,23],
             [12,3,2,3],
             [2,3,15,23],
             [1,3,2,3],
             [13,3,2,2],
             [213,16,3,63],
             [23,62,23,23],
             [23,16,23,43]])
b = np.array([[1,1,1,1]])
a-b

array([[  2,   1,   3,   1],
       [  1,   0,   3,  22],
       [ 11,   2,   1,   2],
       [  1,   2,  14,  22],
       [  0,   2,   1,   2],
       [ 12,   2,   1,   1],
       [212,  15,   2,  62],
       [ 22,  61,  22,  22],
       [ 22,  15,  22,  42]])

In [16]:
np.sum(np.abs(a- b))  # without axis, sum collapses everything into one grand total instead of one value per row

628

In [17]:
np.sum(np.abs(a- b),axis=1)  # with axis=1 we get one sum per row, which is the result we need

array([  7,  26,  16,  39,   5,  16, 291, 127, 101])

### 核心算法的实现

In [18]:
# Distance function: Manhattan (L1) distance
def l1_distance(a, b):
    """Return the row-wise Manhattan distance between ``a`` and ``b``.

    ``a - b`` is evaluated with NumPy broadcasting, so ``b`` may be a single
    sample compared against every row of ``a``. The absolute differences are
    summed along axis 1, giving one distance per row.
    """
    abs_diff = np.abs(a - b)
    return abs_diff.sum(axis=1)

In [19]:
# Euclidean (L2) distance
def l2_distance(a, b):
    """Return the row-wise Euclidean distance between ``a`` and ``b``.

    ``a - b`` is evaluated with NumPy broadcasting, so ``b`` may be a single
    sample compared against every row of ``a``. The squared differences are
    summed along axis 1 and the square root of each row sum is returned.
    """
    squared_diff = np.square(a - b)
    return np.sqrt(squared_diff.sum(axis=1))

In [27]:
# Scratch check: allocate a zero-filled prediction column with one row per row of x,
# reusing the dtype of the training labels.
y_pred = np.zeros((x.shape[0], 1), dtype=y_train.dtype)
y_pred
# dtype=y_train.dtype
# dtype

dtype('int64')

In [29]:
# Try out argsort() on a small array.
a = np.array([1, 10, 4, 25, 23, 7, 6])
# b holds the indices that would sort a: array([0, 2, 6, 5, 1, 4, 3])
b = a.argsort()

array([0, 2, 6, 5, 1, 4, 3])

In [37]:
# Look up the training labels at the first (up to 9) sorted indices and flatten them to 1-D.
# b only has 7 entries here, so the slice b[:9] simply returns all of them.
c = y_train[b[:9]].ravel()
c

array([1, 2, 1, 1, 2, 1, 2])

In [39]:
# Majority vote: bincount tallies how often each label occurs, argmax picks the most frequent one.
d = np.argmax(np.bincount(c)) 
d

1

In [40]:
# Classifier definition
class kNN(object):
    """Brute-force k-nearest-neighbour classifier.

    Labels are expected to be non-negative integers because the majority
    vote is computed with ``np.bincount``.
    """

    def __init__(self, n_neighbor=1, dist_func=l1_distance):
        """Store the hyper-parameters.

        Args:
            n_neighbor: number of nearest neighbours that take part in the vote.
            dist_func: callable ``(x_train, sample) -> distances`` returning one
                distance per training row.
        """
        self.n_neighbor = n_neighbor
        self.dist_func = dist_func

    def fit(self, x, y):
        """Remember the training samples ``x`` and their labels ``y``."""
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Returns:
            An array of shape ``(x.shape[0], 1)`` with the dtype of the
            training labels.
        """
        # Use the labels stored by fit(); the original read a module-level
        # ``y_train`` here, which only worked by accident inside the notebook.
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)
        # enumerate yields the row index together with the sample itself
        for i, sample in enumerate(x):
            # distance from this sample to every training sample
            distances = self.dist_func(self.x_train, sample)
            # indices of the training samples, ordered from nearest to farthest
            nn_index = np.argsort(distances)
            # labels of the k nearest neighbours, flattened to 1-D for bincount
            nn_y = self.y_train[nn_index[:self.n_neighbor]].ravel()
            # majority vote; argmax returns the first maximum, so ties go to
            # the smallest label
            y_pred[i] = np.argmax(np.bincount(nn_y))
        return y_pred

### 测试

In [42]:
# Create a kNN instance
knn = kNN(n_neighbor=3)
# Hand over the training data
knn.fit(x_train, y_train)
# Classify the test data (distances are computed inside predict)
y_pred = knn.predict(x_test)

print(y_test.ravel())
print(y_pred.ravel())

[2 1 2 2 0 0 2 0 1 1 2 0 1 1 1 2 2 0 1 2 1 0 0 0 1 2 0 2 0 0 2 1 0 2 1 0 2
 1 2 2 1 1 1 0 0]
[2 1 2 2 0 0 2 0 1 1 1 0 1 1 1 2 2 0 1 2 1 0 0 0 1 2 0 2 0 0 2 1 0 2 1 0 2
 1 2 1 1 2 1 0 0]


In [44]:
# Compute the prediction accuracy with the library function accuracy_score; the logic is simple, so reusing an existing implementation is the convenient choice
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9333333333333333

### 测试不同参数下的准确率

In [49]:
# One classifier instance is reused; only its hyper-parameters change per run.
knn = kNN()
knn.fit(x_train, y_train)
# Collects one [k, distance-function name, accuracy] row per configuration
result = []

# Outer loop: the two distance functions (p = 1 -> L1, p = 2 -> L2)
for p, dist_func in enumerate((l1_distance, l2_distance), start=1):
    knn.dist_func = dist_func

    # Inner loop: odd values of k from 1 to 9 (step 2)
    for k in range(1, 10, 2):
        knn.n_neighbor = k
        # Predict on the test data and score the result
        y_pred = knn.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        result.append([k, dist_func.__name__, accuracy])

# Put the rows into a DataFrame for display
df = pd.DataFrame(result, columns=['k', '距离函数', '预测准确率'])
df

Unnamed: 0,k,距离函数,预测准确率
0,1,l1_distance,0.933333
1,3,l1_distance,0.933333
2,5,l1_distance,0.977778
3,7,l1_distance,0.955556
4,9,l1_distance,0.955556
5,1,l2_distance,0.933333
6,3,l2_distance,0.933333
7,5,l2_distance,0.977778
8,7,l2_distance,0.977778
9,9,l2_distance,0.977778
