In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

# 数据案例

## 加载数据

In [2]:
# Directory that holds diabetes.csv. Set the DIABETES_DATA_DIR environment
# variable to point elsewhere instead of editing this cell.
DATA_DIR = os.environ.get(
    "DIABETES_DATA_DIR",
    r"D:\Data\CDA\机器学习 课件与教材\机器学习 前四天课件\KNN\data",
)
# Read through an explicit path rather than os.chdir(), so the kernel's
# working directory is left untouched.
dat = pd.read_csv(os.path.join(DATA_DIR, "diabetes.csv"))
print("dataset shape {}".format(dat.shape))
dat.head()

dataset shape (768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


可以看到数据有768个观测(行,样本),8列x,1列y共9列属性(特征,维度):

- Pregnancies:怀孕的次数
- Glucose: 血浆葡萄糖浓度,取自口服葡萄糖耐量试验2小时后的测量值
- BloodPressure: 舒张压(毫米汞柱)
- SkinThickness: 肱三头肌皮肤褶皱厚度(毫米)
- Insulin: 两个小时血清胰岛素($\mu U$/毫升)
- BMI: 身体质量指数, 体重除以身高的平方
- Diabetes Pedigree Function: 糖尿病血统指数,糖尿病和家族遗传相关
- Age: 年龄
- Outcome: 标记值,0表示没有糖尿病,1表示有糖尿病

## 分数据

In [3]:
# Class balance of the label column (number of rows per Outcome value).
dat.loc[:, "Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

其中阴性样本500例,阳性样本268例,将数据分为训练集和测试集,x和y分离开

In [4]:
# Separate the label from the features without mutating `dat`.
# dat.pop('Outcome') removed the column in place, so running this cell a second
# time (or re-running any cell that reads dat['Outcome']) raised KeyError.
Y = dat['Outcome']
X = dat.drop(columns='Outcome')

In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=21,
    test_size=0.2,
)

## 尝试清洗、筛选数据和建模

### 标准化

In [7]:
# Scaler with centering and unit-variance scaling; both flags are the
# defaults and are spelled out here only for readability.
standard = StandardScaler(with_mean=True, with_std=True)

- 标准化, with_mean=True, with_std=True
- 处理稀疏矩阵的时候为了不改变稀疏性即不改变中心位置,设置with_mean=False
- 结果属性: mean_, var_

In [8]:
# Learn the per-feature statistics from the training set, then display the
# standardized training matrix (the result is shown, not stored).
standard.fit(X_train).transform(X_train)

array([[ 0.95832994,  1.2320822 ,  0.86098014, ..., -0.21528805,
        -0.91344731,  1.19854245],
       [-0.84662277,  0.20836164,  0.6528944 , ..., -0.57775204,
        -1.06469172, -0.9666484 ],
       [ 0.05585359,  0.33244898,  0.86098014, ..., -0.51525825,
        -0.14512567,  2.5842646 ],
       ...,
       [-0.24497187, -1.24966461,  0.86098014, ..., -0.57775204,
        -0.48693805, -0.9666484 ],
       [ 0.95832994, -0.56718424, -0.17944853, ...,  0.8721039 ,
        -0.37199229, -0.18717969],
       [ 0.35667904,  1.26310403,  1.79736593, ...,  0.69712128,
        -0.95579574,  1.63158062]])

In [9]:
# Per-feature mean and variance that the scaler learned from X_train.
standard.mean_, standard.var_

(array([  3.81433225, 121.28338762,  69.4495114 ,  20.37785016,
         79.6465798 ,  32.12247557,   0.4669772 ,  33.16123779]),
 array([1.10502180e+01, 1.03911839e+03, 3.69517809e+02, 2.63332799e+02,
        1.35983327e+04, 6.40127196e+01, 1.09290221e-01, 1.33317651e+02]))

### 特征选择

In [10]:
# Selector that keeps the k=2 highest-scoring features; score_func is not
# passed, so the library default scoring function is used.
selection = SelectKBest(k=2)

SelectKBest: 选择k个得分最高的特征,包含参数

- score_func: 数组X和y之间的函数,默认f_classif:方差分析,只用于分类y,其他更多见帮助文件
- k: 选择的特征数

In [11]:
# Score every feature against the label, then display only the columns
# of X_train that made the top k (the result is shown, not stored).
selection.fit(X_train, Y_train).transform(X_train)

array([[161. ,  30.4],
       [128. ,  27.5],
       [132. ,  28. ],
       ...,
       [ 81. ,  27.5],
       [103. ,  39.1],
       [162. ,  37.7]])

In [12]:
# Score of each input feature computed by the fit above, in column order.
selection.scores_

array([ 29.04380574, 183.92035023,   2.68945159,   3.32032559,
         9.72171222,  60.39735159,  19.69612969,  45.92170116])

In [13]:
# Pair each column name with its selector score and rank them, best first.
feature_scores = pd.DataFrame({
    'colnames': X_train.columns,
    'cor_score': selection.scores_,
})
feature_scores.sort_values(by='cor_score', ascending=False)

Unnamed: 0,colnames,cor_score
1,Glucose,183.92035
5,BMI,60.397352
7,Age,45.921701
0,Pregnancies,29.043806
6,DiabetesPedigreeFunction,19.69613
4,Insulin,9.721712
3,SkinThickness,3.320326
2,BloodPressure,2.689452


- 这里选择的两个特征是Glucose(血糖浓度)和BMI(身体质量指数)

### knn建模

In [14]:
# 10-nearest-neighbour classifier whose votes are weighted by distance
# (closer neighbours count more) rather than uniformly.
knn = KNeighborsClassifier(
    weights='distance',
    n_neighbors=10,
)

参数:
- n_neighbors: k值
- weights表示权重函数,默认是"uniform",可选项
    - "uniform":每个邻居点权重相等
    - "distance":权重取距离的倒数(距离越近的邻居影响越大)
    - $[$callable$]$: 用户自定义函数,输入距离数组,输出权重数组
- algorithm选择寻找最近邻的方法,"auto"表示根据实际的数据选择,"ball_tree"使用ball树,"kd_tree"使用kd树,"brute"暴力搜索
- leaf_size:树结构的叶节点包含样本数,影响构建树和索引的时间,以及内存使用
- metric:距离度量,默认闵可夫斯基距离,p=2这里等同于欧氏距离
- n_jobs,最近邻搜索时并行数,如果是-1表示并行数设置为cpu核数

In [15]:
# Fit on X_train exactly as produced by train_test_split: the scaler and
# selector outputs above were only displayed, never assigned, so no
# scaling or feature selection is applied here.
knn.fit(X_train, Y_train)

In [16]:
# Predicted class label for every row of the test set.
knn.predict(X_test)

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1],
      dtype=int64)

In [17]:
# Per-class probability estimates for every test row (one column per class).
knn.predict_proba(X_test)

array([[0.55393019, 0.44606981],
       [0.55549943, 0.44450057],
       [0.8087013 , 0.1912987 ],
       [0.13342321, 0.86657679],
       [0.38704912, 0.61295088],
       [0.8991374 , 0.1008626 ],
       [1.        , 0.        ],
       [0.8987104 , 0.1012896 ],
       [0.9177629 , 0.0822371 ],
       [0.81718573, 0.18281427],
       [0.80988921, 0.19011079],
       [0.7269019 , 0.2730981 ],
       [0.88946132, 0.11053868],
       [0.93058179, 0.06941821],
       [1.        , 0.        ],
       [0.53512239, 0.46487761],
       [0.53115033, 0.46884967],
       [1.        , 0.        ],
       [0.52192969, 0.47807031],
       [0.        , 1.        ],
       [0.7665017 , 0.2334983 ],
       [0.7727384 , 0.2272616 ],
       [0.34463568, 0.65536432],
       [0.23545814, 0.76454186],
       [0.17240912, 0.82759088],
       [0.90594793, 0.09405207],
       [0.81145898, 0.18854102],
       [0.81430082, 0.18569918],
       [0.70880378, 0.29119622],
       [0.50057787, 0.49942213],
       [1.

- 返回每个y分类对应的概率,列的顺序与`knn.classes_`一致(标签按升序排列),这里从左往右分别是0,1列

In [18]:
# Mean accuracy of the fitted kNN model on the held-out test set.
knn.score(X_test, Y_test)

0.7142857142857143

### RadiusNeighbors建模

RadiusNeighborsClassifier 寻找指定半径范围内的邻居

In [19]:
# Radius-based neighbour classifier with distance-weighted votes.
# NOTE(review): radius=130 is in the units of the raw features, because rn is
# fitted on the unscaled X_train below; it would need retuning after scaling.
rn = RadiusNeighborsClassifier(radius=130, weights='distance')

参数

- radius: 指定半径,找半径内所有点
- 其他参数类似knn

In [20]:
# Fit the radius-neighbours model on the training split (raw features).
rn.fit(X_train, Y_train)

RadiusNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                          metric_params=None, n_jobs=None, outlier_label=None,
                          p=2, radius=130, weights='distance')

In [21]:
# Predicted class label for every row of the test set.
rn.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=int64)

In [22]:
# Mean accuracy of the radius-neighbours model on the held-out test set.
rn.score(X_test, Y_test)

0.6233766233766234

## 标准流程：构建pipeline，调优模型

### knn

In [24]:

# Chain standardization -> univariate feature selection -> kNN into a single
# estimator, so the three steps are always fitted and applied together.
pipeline = Pipeline(steps=[
    ("std", StandardScaler()),
    ("sel", SelectKBest()),
    ("knn", KNeighborsClassifier()),
])

In [25]:
# Search space, addressed as "<step name>__<parameter>":
# 8 values of k for the selector x 30 neighbour counts x 2 weighting schemes
# = 480 candidate settings.
param_grid = dict(
    sel__k=range(1, 9),
    knn__n_neighbors=range(1, 31),
    knn__weights=["distance", "uniform"],
)

# Exhaustive search over the grid with 5-fold cross-validation and
# per-fit progress logging.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=10,
)

[GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html?highlight=gridsearch#sklearn.model_selection.GridSearchCV)
- 参数:
    - estimator:估算器
    - param_grid:字典或字典组成的列表,所有参数可取值
    - scoring:表示指标的字符串,详见: [scoring-parameter](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)
    - cv:折数; n_jobs:核数
    - refit: 是否根据最优参数重新拟合全数据, 默认True
    - error_score:某个参数组合在某一折上拟合出错时记录的分数.默认np.nan(出错时给出警告并继续;scikit-learn 0.22之前默认为'raise'),也可设为'raise'直接报错,或指定一个数值.不会影响refit
    - verbose: 整数,训练过程中显示的提示信息,越大(>1,>2,>3)信息越多.
- 结果属性(部分):
    - cv_results_:数组组成的字典: 其中:params存储所有可能的参数组合,mean_test_score,所有可能的验证集均分
    - best_score_:最高分
    - best_estimator_:最高分数对应的估算器
    - best_params_:最高分对应的参数组合
- 方法:
    - predict(),predict_proba(): 使用最优参数调用预测方法
    - score(): refit为True时,返回给定数据的分数

In [26]:
# Re-create the 80/20 split with the same arguments and random_state as the
# earlier split cell, so this section can be run on its own.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=21,
    test_size=0.2,
)

In [28]:
# Run the cross-validated grid search on the training split only; the test
# split is kept aside for the final evaluation.
grid_search.fit(X_train, Y_train)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
[CV 1/5; 1/480] START knn__n_neighbors=1, knn__weights=distance, sel__k=1.......
[CV 1/5; 1/480] END knn__n_neighbors=1, knn__weights=distance, sel__k=1;, score=0.618 total time=   0.0s
[CV 2/5; 1/480] START knn__n_neighbors=1, knn__weights=distance, sel__k=1.......
[CV 2/5; 1/480] END knn__n_neighbors=1, knn__weights=distance, sel__k=1;, score=0.675 total time=   0.0s
[CV 3/5; 1/480] START knn__n_neighbors=1, knn__weights=distance, sel__k=1.......
[CV 3/5; 1/480] END knn__n_neighbors=1, knn__weights=distance, sel__k=1;, score=0.626 total time=   0.0s
[CV 4/5; 1/480] START knn__n_neighbors=1, knn__weights=distance, sel__k=1.......
[CV 4/5; 1/480] END knn__n_neighbors=1, knn__weights=distance, sel__k=1;, score=0.707 total time=   0.0s
[CV 5/5; 1/480] START knn__n_neighbors=1, knn__weights=distance, sel__k=1.......
[CV 5/5; 1/480] END knn__n_neighbors=1, knn__weights=distance, sel__k=1;, score=0.648 total time=   0.0s
[CV 1/

In [29]:
# Parameter combination that achieved the highest mean cross-validation score.
grid_search.best_params_

{'knn__n_neighbors': 23, 'knn__weights': 'uniform', 'sel__k': 3}

In [30]:
# Mean cross-validation score of the best parameter combination.
grid_search.best_score_

0.7850059976009597

In [31]:
# Score of the refitted best pipeline on the held-out test set.
grid_search.score(X_test, Y_test)

0.7207792207792207

In [32]:
# Class labels predicted for the test set by the refitted best pipeline.
grid_search.predict(X_test)

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0],
      dtype=int64)

In [33]:
# Per-class probability estimates for the test set from the best pipeline.
grid_search.predict_proba(X_test)

array([[0.73913043, 0.26086957],
       [0.73913043, 0.26086957],
       [0.86956522, 0.13043478],
       [0.26086957, 0.73913043],
       [0.47826087, 0.52173913],
       [0.7826087 , 0.2173913 ],
       [0.95652174, 0.04347826],
       [0.91304348, 0.08695652],
       [0.95652174, 0.04347826],
       [0.82608696, 0.17391304],
       [0.7826087 , 0.2173913 ],
       [0.82608696, 0.17391304],
       [0.82608696, 0.17391304],
       [0.86956522, 0.13043478],
       [1.        , 0.        ],
       [0.82608696, 0.17391304],
       [0.69565217, 0.30434783],
       [0.65217391, 0.34782609],
       [0.60869565, 0.39130435],
       [0.13043478, 0.86956522],
       [0.86956522, 0.13043478],
       [0.86956522, 0.13043478],
       [0.26086957, 0.73913043],
       [0.13043478, 0.86956522],
       [0.2173913 , 0.7826087 ],
       [0.86956522, 0.13043478],
       [0.91304348, 0.08695652],
       [0.65217391, 0.34782609],
       [0.65217391, 0.34782609],
       [0.60869565, 0.39130435],
       [0.

In [34]:
# The pipeline refitted with the best parameter combination.
grid_search.best_estimator_

### 练习

**仿照knn的建模流程, 使用RadiusNeighbors建模**