# K近邻算法-自由练习

### 0. 引入依赖

In [158]:
import numpy as np
import pandas as pd


# 这里直接引入sklearn的数据集，用于红酒分类预测练习
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split    ## 将数据集切分为训练集和测试集
from sklearn.metrics import accuracy_score  #计算分类预测的准确率

### 1. 数据加载预处理

In [159]:
# Load the wine data set and view it as a table: one column per feature,
# with the class label appended as the last column.
wine = load_wine()
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df["class"] = wine.target

In [160]:
# Feature matrix, and the targets turned into a single column (shape (n, 1)).
x = wine.data
y = wine.target[:, np.newaxis]

In [161]:
# Split the data into a training set and a test set (30% held out for testing).
#
#   random_state: seed for the shuffle; any fixed value makes the split repeatable
#   stratify:     keep the class proportions of y the same in both subsets
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=35,
)

print(x_train.shape, y_train.shape)


(124, 13) (124, 1)


### 2. 核心算法实现

In [162]:
#距离函数定义 (寻找每个x_test 和 每个 x_train之间的距离)
def l1_distance(a, b):
    """Return the L1 (Manhattan) distance from ``b`` to every row of ``a``.

    Parameters:
        a: 2-D array of samples, one sample per row (e.g. the training set).
        b: a single sample; anything that broadcasts against the rows of
           ``a``, such as shape ``(n_features,)`` or ``(1, n_features)``.

    Returns:
        1-D array with one distance per row of ``a``.
    """
    abs_diff = np.abs(a - b)
    # Collapse the feature axis so one value remains per sample.
    return abs_diff.sum(axis=1)

def l2_distance(a, b):
    """Return the L2 (Euclidean) distance from ``b`` to every row of ``a``.

    Parameters:
        a: 2-D array of samples, one sample per row (e.g. the training set).
        b: a single sample; anything that broadcasts against the rows of
           ``a``, such as shape ``(n_features,)`` or ``(1, n_features)``.

    Returns:
        1-D array with one distance per row of ``a``.
    """
    magnitude = np.abs(a - b)
    # Sum the squared per-feature differences of each sample, then take the root.
    squared_sum = np.sum(magnitude ** 2, axis=1)
    return np.sqrt(squared_sum)



# 分类器实现
class kNN(object):
    """Brute-force k-nearest-neighbours classifier.

    Every prediction computes the distance from the query sample to all
    stored training samples and takes a majority vote among the closest
    ``n_neighbours`` of them.

    Attributes:
        n_neighbours: number of nearest training samples that vote.
        dist_func: callable ``dist_func(train, sample)`` returning one
            distance per training row.
    """

    def __init__(self, n_neighbours=1, dist_func=l1_distance):
        self.n_neighbours = n_neighbours
        self.dist_func = dist_func

    def fit(self, x, y):
        """Store the training data.

        kNN has no training phase; the samples are simply kept for use
        in ``predict``.

        Parameters:
            x: training samples, one per row.
            y: training labels, indexable by row position (1-D or a column).
        """
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Parameters:
            x: 2-D array of query samples, one per row.

        Returns:
            Array of shape ``(x.shape[0], 1)`` with the dtype of the
            training labels. When several labels tie for the most votes,
            the smallest label wins.

        Raises:
            ValueError: if ``n_neighbours`` is smaller than 1.
        """
        k = self.n_neighbours
        # A non-positive k would either leave no voters (k == 0) or silently
        # slice "all but the last |k|" neighbours (k < 0); reject both early.
        if k < 1:
            raise ValueError(
                "n_neighbours must be at least 1, got {!r}".format(k)
            )

        # One prediction per query sample, stored as a column.
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)
        for i, x_test in enumerate(x):
            # Distance from this query sample to every training sample.
            distances = self.dist_func(self.x_train, x_test)
            # Training indices ordered from nearest to farthest.
            nn_index = np.argsort(distances)
            # Labels of the k nearest training samples.
            nn_y = self.y_train[nn_index[:k]].ravel()
            # Majority vote. np.unique returns the labels sorted and argmax
            # picks the first maximum, so ties resolve to the smallest label.
            # Unlike np.bincount this also works for negative, floating-point
            # and string labels.
            labels, counts = np.unique(nn_y, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]
        return y_pred





### 3.测试

In [163]:
# Classify the test set with k = 6 and L2 distance, then report the accuracy.
knn = kNN(dist_func=l2_distance, n_neighbours=6)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("预测准确率：", accuracy)

预测准确率： 0.7592592592592593


In [164]:
# Create one classifier instance; its distance function and k are
# overwritten for every combination tried in the loop below.
knn = kNN()
# "Train" the model (kNN only stores the training set).
knn.fit(x_train, y_train)

# One row per combination: [k, distance-function name, accuracy].
result_list = []

# Try every combination of distance function and k.
for p in [1, 2]:
    # p selects the distance function: 1 -> L1, 2 -> L2.
    knn.dist_func = l1_distance if p == 1 else l2_distance
    # Step of 2: range(1, 5, 2) yields k = 1 and k = 3.
    for k in range(1, 5, 2):
        knn.n_neighbours = k
        # Predict the test set with the current settings.
        y_pred = knn.predict(x_test)
        # Score the predictions against the true test labels.
        accuracy = accuracy_score(y_test, y_pred)
        # Take the label from the function actually in use, so the recorded
        # name cannot drift from the function that produced the score.
        result_list.append([k, knn.dist_func.__name__, accuracy])

# NOTE: this rebinds `df`, which previously held the wine feature table.
df = pd.DataFrame(result_list, columns=['k', '距离函数', '预测准确率'])

df

Unnamed: 0,k,距离函数,预测准确率
0,1,l1_distance,0.87037
1,3,l1_distance,0.814815
2,1,l2_distance,0.851852
3,3,l2_distance,0.740741
