# k近邻（kNN）算法过程

## 一、自行设计算法过程

### 1.1、创建数据
    * 特征数据raw_data_X
    * 结果数据raw_data_y

### 1.2、将数据转化array数组格式的训练集
    * X_train
    * y_train

### 1.3、计算预测点到训练集各点的距离最小值
    * 欧氏距离（欧几里得距离）计算公式distance
    * distance排序：nearest = np.argsort(distance)
    * 统计最接近的k个值：topK_y = [y_train[i] for i in nearest[:k]]
    * 统计最接近的k个值中的结果分类：from collections import Counter, votes = Counter(topK_y)
    * 得出1个预测值：votes.most_common(1)[0][0]

In [None]:
import math
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Toy training data: ten samples with two feature values each.
raw_data_X = [[3.3, 2.3],
              [3.1, 1.7],
              [1.3, 3.3],
              [3.6, 4.7],
              [2.3, 2.9],
              [7.4, 4.7],
              [5.7, 3.6],
              [9.1, 2.5],
              [7.8, 3.4],
              [7.9, 0.8]
             ]
# Class label (0 or 1) of each sample, in the same order as raw_data_X.
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

In [None]:
# Convert the plain Python lists into NumPy arrays that form the training set.
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)

In [None]:
X_train

In [None]:
y_train

### 分别对y_train == 0和1的情况绘制X_train位置图

In [None]:
# Plot the training samples, one colour per class: class 0 in red, class 1 in blue.
plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color = 'red')
plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color = 'b')
# The query point must exist before it is plotted; without this assignment a
# top-to-bottom run of the notebook fails here with NameError on x.
x = np.array([8.1, 3.4])
# Mark the point to be classified with a green cross.
plt.scatter(x[0],x[1], color='g', marker = 'x')

In [None]:
# Worked example of the Euclidean distance (arithmetic between a NumPy array and a plain list)
a = np.array([3.3, 2.3])
b = [8.1, 3.4] 
# Display the difference between a and b.
a - b

In [None]:
(a - b) ** 2

In [None]:
sum = np.sum((a-b)**2)
sum

In [None]:
math.sqrt(sum)

In [None]:
# Point to classify.
x = np.array([8.1, 3.4])
# Euclidean distance from x to every training sample. The differences must be
# squared BEFORE they are summed: np.sum((d) ** 2), not np.sum(d) ** 2.
distance = [math.sqrt(np.sum((x_train - x) ** 2)) for x_train in X_train]
distance


In [None]:
# Indices of the training samples, ordered from smallest to largest distance.
nearest = np.argsort(distance)

In [None]:
k = 6

In [None]:
# Labels of the k training samples with the smallest distances.
topK_y = [y_train[i] for i in nearest[:k]]
topK_y

In [None]:
from collections import Counter
# Tally how many times each label occurs among the k nearest samples.
votes = Counter(topK_y)

In [None]:
votes.most_common(1)

In [None]:
votes.most_common(1)[0][0]  # first element of the first tuple

#### collections模块的Counter类
##### 类方法
    * Counter(list/tuple/dict/string等)创建
    * most_common(N):按出现次数从多到少返回前N个元素，返回列表，列表元素为tuple：[(元素, 次数), (元素, 次数), ...]
    * 解释页：http://www.pythoner.com/205.html

### 1.4、kNN函数封装

In [None]:
import numpy as np
import math
from collections import Counter
def kNN_classify(x, k, X_train, y_train):
    """Predict the label of ``x`` by majority vote among its k nearest samples.

    Args:
        x: Feature vector of the point to classify.
        k: Number of nearest training samples that take part in the vote.
            Must satisfy ``1 <= k <= len(X_train)``.
        X_train: Training feature matrix, one sample per row.
        y_train: Training labels, one per row of ``X_train``.

    Returns:
        The label that occurs most often among the ``k`` training samples
        with the smallest Euclidean distance to ``x``.

    Raises:
        ValueError: If ``X_train`` and ``y_train`` have different lengths, or
            if ``k`` is outside ``1..len(X_train)``.
    """
    n_samples = len(X_train)
    if n_samples != len(y_train):
        raise ValueError("X_train and y_train must contain the same number of samples")
    # Without this check k == 0 fails later with an opaque IndexError and a
    # negative k silently votes with the wrong set of neighbours.
    if not 1 <= k <= n_samples:
        raise ValueError("k must be between 1 and the number of training samples")

    # Euclidean distance from x to every training sample.
    distances = [math.sqrt(np.sum((x_train - x)**2)) for x_train in X_train]
    # Sample indices ordered from nearest to farthest.
    nearest = np.argsort(distances)
    topK_y = [y_train[i] for i in nearest[:k]]
    votes = Counter(topK_y)
    return votes.most_common(1)[0][0]

In [None]:
%run somelib/kNNFunction/kNN_Classify

#### 封装函数验证

In [None]:
# Rebuild the toy training set and call kNN_classify on one query point.
raw_data_X = [[3.3, 2.3],
              [3.1, 1.7],
              [1.3, 3.3],
              [3.6, 4.7],
              [2.3, 2.9],
              [7.4, 4.7],
              [5.7, 3.6],
              [9.1, 2.5],
              [7.8, 3.4],
              [7.9, 0.8]
             ]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
# Point to classify and the number of neighbours that vote.
x = np.array([8.1, 3.4])
k = 6
kNN_classify(x, k ,X_train, y_train)

## 二、使用scikit-learn中的kNN

In [None]:
#导入
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Create the classifier instance; n_neighbors plays the role of k
kNN_classifier = KNeighborsClassifier(n_neighbors=6)

In [None]:
# Fit: pass the training feature matrix and label vector to the instance
kNN_classifier.fit(X_train, y_train)
# The fitted model is now stored in this instance

In [None]:
x

In [None]:
# Reshape the vector into a matrix with a single row
X = x.reshape(1, -1)

In [None]:
X

In [None]:
# Use the model to predict the result for matrix X
kNN_classifier.predict(X)

In [None]:
# Store the prediction result
y_predict = kNN_classifier.predict(X)

### 2018-9-3

## 三、sklearn数据集拆分（训练集和测试集）
### 3.1实现方法

In [None]:
import numpy as np
import matplotlib.pyplot as plt
#引入sklearn中的datasets数据集
from sklearn import datasets

In [None]:
# Load the iris dataset
iris = datasets.load_iris()
# Feature values of the data (the feature matrix)
x = iris.data
# Target values of the data (the vector of result labels)
y = iris.target
x.shape, y.shape

### train_test_split将数据集拆分为训练数据集和测试数据集

#### 将原始数据乱序处理

In [None]:
# Approach 1: merge x and y into one matrix (feature matrix + label vector), shuffle it, then split it again
# Approach 2: shuffle only the indices of the data
shuffie_indexes = np.random.permutation(len(x))   # len(x) = 150: a random ordering of 0 to 149

In [None]:
# Fraction of the data to use as the test set
test_ratio = 0.2
# Number of samples in the test set
test_size = int(len(x) * test_ratio)
test_size

In [None]:
# Indices of the test samples
test_indexes = shuffie_indexes[:test_size]
# Indices of the training samples
train_indexes = shuffie_indexes[test_size:]
# Test features x and labels y
x_test = x[test_indexes]
y_test = y[test_indexes]
# Training features x and labels y
x_train = x[train_indexes]
y_train = y[train_indexes]
x_train.shape, y_train.shape

#### 3.2拆分函数封装
    将特征数据集x和对应的标签向量y，按一定的拆分比例ratio进行拆分

In [None]:
def train_test_split(x, y, ratio = 0.2, seed = None):
    """Randomly split features ``x`` and labels ``y`` into training and test sets.

    Args:
        x: Feature array, one sample per row.
        y: Label array with one entry per row of ``x``.
        ratio: Fraction of the samples placed in the test set, in ``[0.0, 1.0]``.
        seed: Optional seed for NumPy's global random generator. Any value
            other than ``None`` (including ``0``) makes the split reproducible.

    Returns:
        The tuple ``(x_train, y_train, x_test, y_test)``. Note that this order
        is the one used by this notebook's own helper.

    Raises:
        AssertionError: If ``x`` and ``y`` differ in length or ``ratio`` is
            outside ``[0.0, 1.0]``.
    """
    # Check that the arguments are valid.
    assert x.shape[0] == y.shape[0], \
        "x and y must contain the same number of samples"
    assert 0.0 <= ratio <= 1.0, \
        "ratio must be between 0.0 and 1.0"

    # Compare against None explicitly: a plain truthiness test would treat
    # seed=0 as "no seed" and silently produce a non-reproducible split.
    if seed is not None:
        np.random.seed(seed)
    # Shuffle the sample indices rather than the data itself.
    shuffle_indexes = np.random.permutation(len(x))

    test_size = int(len(x) * ratio)
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]

    # Test data
    x_test = x[test_indexes]
    y_test = y[test_indexes]

    # Training data
    x_train = x[train_indexes]
    y_train = y[train_indexes]

    return x_train, y_train, x_test, y_test


In [None]:
# Split the iris features and labels with the train_test_split helper,
# using its default test ratio.
x = iris.data
y = iris.target
x_train, y_train, x_test, y_test = train_test_split(x, y)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Create the classifier instance
kNN_Classifier = KNeighborsClassifier(n_neighbors = 6)
# Fit it on the training data
kNN_Classifier.fit(x_train, y_train)

# Predict labels for the test data
test_predict = kNN_Classifier.predict(x_test)
test_predict

In [None]:
y_test

In [None]:
# Number of correct predictions. np.sum counts the True entries of the boolean
# array and does not depend on the built-in name `sum`, which a notebook cell
# can accidentally rebind to a non-callable value.
np.sum(test_predict == y_test)

In [None]:
# Prediction accuracy: correct predictions divided by the number of test samples.
# np.sum is used so the cell does not depend on the built-in name `sum`, which
# a notebook cell can accidentally rebind to a non-callable value.
np.sum(test_predict == y_test) / len(y_test)

### 2018-9-4