# 算法介绍

KNN，即 K-Nearest Neighbors（K 近邻）算法：对于一个待分类的样本，在训练集中找出与它距离最近的 K 个邻居，并把这 K 个邻居中出现频率最高的标签（label）作为该样本的分类。其过程如下：<br>
![](https://tva1.sinaimg.cn/large/e6c9d24ely1h672xemk8nj214q0ek0wi.jpg)
![](https://tva1.sinaimg.cn/large/e6c9d24ely1h674jcvhu1j21os0a2myw.jpg)

# 算法构建

In [84]:
import pandas as pd 
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
from pyecharts.charts import *

In [5]:
def create_data_set():
    """Build the four-point toy data set used to demonstrate kNN.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 float array of
        sample coordinates and ``labels`` is the list of class labels,
        one per row of ``group``.
    """
    labelled_points = (
        ((1.0, 1.0), 'A'),
        ((1.0, 1.1), 'A'),
        ((0.0, 0.0), 'B'),
        ((0.0, 0.1), 'B'),
    )
    group = np.array([list(point) for point, _ in labelled_points])
    labels = [tag for _, tag in labelled_points]
    return group, labels

In [42]:
def classify0(inX,dataset,labels,K):
    """Classify one sample by majority vote among its K nearest neighbours.

    Distances are Euclidean. The query is broadcast against every row of
    ``dataset``, so no tiled copy of the query is built.

    Args:
        inX: Feature vector of the sample to classify: a flat sequence of
            ``n_features`` values or an array of shape ``(1, n_features)``.
        dataset: Training samples, array-like of shape
            ``(n_samples, n_features)``.
        labels: Sequence of labels; ``labels[i]`` is the label of
            ``dataset[i]``.
        K: Number of neighbours that vote. Values larger than the number
            of training samples are clamped to that number.

    Returns:
        The most frequent label among the K nearest neighbours. When
        several labels share the highest count, the one encountered first
        while walking the neighbours from nearest to farthest wins.

    Raises:
        ValueError: If ``K`` is smaller than 1 or ``dataset`` has no rows.
    """
    dataset = np.asarray(dataset)
    data_size = dataset.shape[0]
    if data_size == 0:
        raise ValueError("dataset must contain at least one sample")
    if K < 1:
        raise ValueError("K must be at least 1, got {}".format(K))
    # Asking for more neighbours than there are samples would index past
    # the end of the sorted distances; let every sample vote instead.
    K = min(K, data_size)

    diff = np.asarray(inX) - dataset
    distance = np.sum(diff ** 2, axis=1) ** 0.5
    sorted_dist_indicies = np.argsort(distance)

    label_count = {}
    for neighbour in sorted_dist_indicies[:K]:
        vote_label = labels[neighbour]
        label_count[vote_label] = label_count.get(vote_label, 0) + 1
    # max() returns the first key with the highest count, and dicts keep
    # insertion order, so ties resolve in favour of the nearest neighbour.
    return max(label_count, key=label_count.get)

In [43]:
# Smoke test: classify the point (0, 0) against the toy data set using its
# three nearest neighbours.
group,labels = create_data_set()
classify0([0,0],group,labels,3)

'B'

# 应用实战

## 使用 k-近邻算法改进约会网站的配对效果

![](https://tva1.sinaimg.cn/large/e6c9d24ely1h674my4e37j21pc0aqq6i.jpg)

### 读取文件

In [77]:
def file_to_matrix(file):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line must contain three numeric feature columns followed
    (as the last column) by an integer class label. Blank lines, such as a
    trailing empty line at the end of the file, are skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        A ``(matrix, labels)`` tuple: ``matrix`` is a float array of shape
        ``(n_rows, 3)`` holding the first three columns of each row, and
        ``labels`` is a list of ``int`` taken from the last column.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a number or a row has
            fewer than three feature columns.
    """
    # The context manager closes the file even when reading fails.
    with open(file, "r") as fo:
        rows = [line.strip().split("\t") for line in fo if line.strip()]

    matrix = np.zeros((len(rows), 3))
    labels = []
    for index, fields in enumerate(rows):
        matrix[index, :] = [float(value) for value in fields[0:3]]
        labels.append(int(fields[-1]))
    return matrix, labels

In [78]:
# Load the dating data set at module level; the plotting cell below reads
# dating_data_mat directly.
dating_data_mat,dating_labels = file_to_matrix('./datingTestSet2.txt')

### 特征缩放

In [93]:
# Draw one box per feature column so the value ranges of the three features
# can be compared side by side.
# Axis labels, listed in the same order as the matrix columns used below.
x_data = ['每年获得的飞行常客里程数', '玩视频游戏所耗时间百分比', '每周消费的冰淇淋公升数']
y_data = [dating_data_mat[:,0],dating_data_mat[:,1],dating_data_mat[:,2]]
Box = Boxplot()
Box.add_xaxis(x_data)
# NOTE(review): prepare_data presumably turns the raw samples into the
# per-box summary values pyecharts expects — confirm against pyecharts docs.
Box.add_yaxis("", Box.prepare_data(y_data))
Box.render_notebook()

各个特征间的数值差距较大：如果直接计算欧氏距离，数值范围大的特征（如每年获得的飞行常客里程数）会主导距离的结果，因此需要先进行特征缩放，把各特征归一化到相同的取值范围。

In [104]:
def auto_norm(dataSet):
    """Min-max scale every feature column of ``dataSet`` into [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max -
    column_min)``. A column whose values are all equal has a range of 0;
    it is mapped to 0 instead of producing NaN from a division by zero.

    Args:
        dataSet: Array-like of shape ``(n_samples, n_features)``.

    Returns:
        A ``(norm_data_set, ranges, min_values)`` tuple:
        ``norm_data_set`` is the scaled array with the same shape as
        ``dataSet``; ``ranges`` is the per-column ``max - min`` (it may
        contain zeros for constant columns); ``min_values`` is the
        per-column minimum. The last two let callers apply the same
        scaling to new samples.
    """
    dataSet = np.asarray(dataSet)
    # Per-feature minimum and maximum (axis 0 runs over the samples).
    min_values = dataSet.min(0)
    max_values = dataSet.max(0)
    ranges = max_values - min_values
    # Divide constant columns by 1 so they become 0 rather than NaN.
    safe_ranges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column values to every row.
    norm_data_set = (dataSet - min_values) / safe_ranges
    return norm_data_set, ranges, min_values

### 算法测试

In [125]:
def dating_class_test(ho_ratio=0.1, file='./datingTestSet2.txt', K=3):
    """Estimate the kNN error rate on the dating data with a hold-out split.

    The first ``ho_ratio`` fraction of the (normalised) rows is used as the
    test set and the remaining rows as the training set. Each prediction is
    printed next to the true label, followed by the overall error rate.

    Args:
        ho_ratio: Fraction of rows held out for testing; must lie strictly
            between 0 and 1.
        file: Path of the tab-separated data file.
        K: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If ``ho_ratio`` is not strictly between 0 and 1.
    """
    if not 0 < ho_ratio < 1:
        raise ValueError("ho_ratio must be strictly between 0 and 1, got {}".format(ho_ratio))

    dating_data_mat, dating_labels = file_to_matrix(file)
    # Only the normalised matrix is needed; ranges and minima are unused.
    norm_dataset, _, _ = auto_norm(dating_data_mat)
    m = norm_dataset.shape[0]
    number_of_tests = int(m * ho_ratio)
    if number_of_tests == 0:
        # Too few rows for the requested ratio; an error rate would be 0/0.
        print("no test samples available: {} rows with hold-out ratio {}".format(m, ho_ratio))
        return

    # The training split is the same for every test sample, so slice once.
    train_set = norm_dataset[number_of_tests:m, :]
    train_labels = dating_labels[number_of_tests:m]
    error_count = 0
    for i in range(number_of_tests):
        classify_result = classify0(norm_dataset[i, :], train_set, train_labels, K)
        print("the classify result is {}, the true result is {}".format(classify_result,dating_labels[i]))
        if classify_result != dating_labels[i]:
            error_count += 1
    print("the total error rate is:{}".format(error_count/number_of_tests))

In [127]:
# Run the hold-out evaluation; it prints each prediction next to the true label.
dating_class_test()

the classify result is 3, the true result is 3
the classify result is 2, the true result is 2
the classify result is 1, the true result is 1
the classify result is 1, the true result is 1
the classify result is 1, the true result is 1
the classify result is 1, the true result is 1
the classify result is 3, the true result is 3
the classify result is 3, the true result is 3
the classify result is 1, the true result is 1
the classify result is 3, the true result is 3
the classify result is 1, the true result is 1
the classify result is 1, the true result is 1
the classify result is 2, the true result is 2
the classify result is 1, the true result is 1
the classify result is 1, the true result is 1
the classify result is 1, the true result is 1
the classify result is 1, the true result is 1
the classify result is 1, the true result is 1
the classify result is 2, the true result is 2
the classify result is 3, the true result is 3
the classify result is 2, the true result is 2
the classify 

In [128]:
def classify_date_person():
    """Interactively classify one person against the dating data set.

    Prompts for the three feature values, scales them with the same
    min/range as the training data, runs ``classify0`` with K=3 and prints
    the textual description of the predicted class.
    """
    result_list = ['not at all','in small doses','in large doses']
    percentTats = float(input('玩视频游戏所耗时间百分比'))
    fly_miles = float(input('每年获得的飞行常客里程数'))
    ice_cream = float(input('每周消费的冰淇淋公升数'))
    # Use a local matrix so the function does not depend on a module-level
    # variable happening to exist.
    dating_data_mat, dating_labels = file_to_matrix('./datingTestSet2.txt')
    norm_mat, ranges, min_values = auto_norm(dating_data_mat)
    # The query must follow the column order of the data file:
    # flier miles, game-time percentage, ice cream.
    in_arr = np.array([[fly_miles, percentTats, ice_cream]])
    classify_result = classify0((in_arr-min_values)/ranges, norm_mat, dating_labels, 3)
    # Labels in the data are 1..3 while result_list is indexed from 0.
    # NOTE(review): assumes label 1 means 'not at all' and label 3 means
    # 'in large doses' — confirm against the data set description.
    print(result_list[classify_result - 1])

In [129]:
# Interactive demo: prompts for the three feature values and prints the prediction.
classify_date_person()

玩视频游戏所耗时间百分比10
每年获得的飞行常客里程数10000
每周消费的冰淇淋公升数0.5
in small doses
