<h1>超参数选择</h1>


In [1]:
import numpy as np
from sklearn import  datasets

In [2]:
# Handwritten digits dataset
digits = datasets.load_digits()
# Echo the loaded object so its fields (data, target, feature_names, ...) can be inspected.
digits

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'frame': None,
 'feature_names': ['pixel_0_0',
  'pixel_0_1',
  'pixel_0_2',
  'pixel_0_3',
  'pixel_0_4',
  'pixel_0_5',
  'pixel_0_6',
  'pixel_0_7',
  'pixel_1_0',
  'pixel_1_1',
  'pixel_1_2',
  'pixel_1_3',
  'pixel_1_4',
  'pixel_1_5',
  'pixel_1_6',
  'pixel_1_7',
  'pixel_2_0',
  'pixel_2_1',
  'pixel_2_2',
  'pixel_2_3',
  'pixel_2_4',
  'pixel_2_5',
  'pixel_2_6',
  'pixel_2_7',
  'pixel_3_0',
  'pixel_3_1',
  'pixel_3_2',
  'pixel_3_3',
  'pixel_3_4',
  'pixel_3_5',
  'pixel_3_6',
  'pixel_3_7',
  'pixel_4_0',
  'pixel_4_1',
  'pixel_4_2',
  'pixel_4_3',
  'pixel_4_4',
  'pixel_4_5',
  'pixel_4_6',
  'pixel_4_7',
  'pixel_5_0',
  'pixel_5_1',
 

In [3]:
# Pull the feature matrix and the label vector out of the loaded dataset.
x, y = digits.data, digits.target
(x, y)

(array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 array([0, 1, 2, ..., 8, 9, 8]))

In [8]:
# Hold out part of the data for evaluation with scikit-learn's train_test_split.
from sklearn.model_selection import train_test_split

# test_size=0.2 reserves 20% of the samples for testing;
# the fixed random_state keeps the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=666,
)

In [9]:
# Baseline: scikit-learn's k-nearest-neighbours classifier with k fixed at 3.
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training can be chained.
knn_classifier = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
# Accuracy on the held-out test split.
knn_classifier.score(x_test, y_test)

0.9916666666666667

1、超参数 k：寻找最合适的近邻个数 k

In [10]:
# Sweep k = 1..10 and remember the value with the highest accuracy on the test split.
# The comparison is strict, so on a tie the smaller k (seen first) is kept.
# NOTE(review): the winner is picked on the test split itself, so best_score is an
# optimistic estimate of generalisation accuracy.
best_k, best_score = -1, 0.0
for k in range(1, 11):
    candidate = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
    accuracy = candidate.score(x_test, y_test)
    if accuracy > best_score:
        best_k, best_score = k, accuracy
best_k, best_score

(3, 0.9916666666666667)

2、超参数 weights：控制投票时是否按距离加权（uniform 表示所有近邻等权，distance 表示按距离的倒数加权）

In [17]:
# Extend the sweep to the `weights` option: evaluate both weighting schemes
# for every k in 1..10 and keep the first combination that reaches the top test accuracy.
best_method, best_k, best_score = '', -1, 0.0
for weights in ('uniform', 'distance'):
    for k in range(1, 11):
        candidate = KNeighborsClassifier(n_neighbors=k, weights=weights).fit(x_train, y_train)
        accuracy = candidate.score(x_test, y_test)
        if accuracy > best_score:
            # Strict improvement only, so earlier combinations win ties.
            best_method, best_k, best_score = weights, k, accuracy
best_method, best_k, best_score

0.9833333333333333
0.9888888888888889
0.9916666666666667
0.9916666666666667
0.9888888888888889
0.9888888888888889
0.9861111111111112
0.9861111111111112
0.9833333333333333
0.9833333333333333
0.9833333333333333
0.9861111111111112
0.9916666666666667
0.9888888888888889
0.9888888888888889
0.9888888888888889
0.9888888888888889
0.9888888888888889
0.9861111111111112
0.9861111111111112


('uniform', 3, 0.9916666666666667)

3、超参数 p：闵可夫斯基距离的阶数，决定使用哪种距离（p=1 为曼哈顿距离，p=2 为欧氏距离）

In [19]:
# Full manual sweep over three hyperparameters: weighting scheme, Minkowski power p (1..5)
# and neighbour count k (1..10). Iteration order is weights -> p -> k, and only a strict
# improvement replaces the current winner, so the earliest best combination is reported.
best_method, best_k, best_score, best_p = '', -1, 0.0, -1
for weights in ('uniform', 'distance'):
    for p in range(1, 6):
        for k in range(1, 11):
            candidate = KNeighborsClassifier(n_neighbors=k, weights=weights, p=p)
            candidate.fit(x_train, y_train)
            accuracy = candidate.score(x_test, y_test)
            if accuracy > best_score:
                best_method, best_k, best_score, best_p = weights, k, accuracy, p
best_method, best_k, best_score, best_p

('uniform', 3, 0.9916666666666667, 2)

网格搜索（Grid Search）是一种超参数优化方法，用于系统地遍历给定的超参数组合，以找到模型的最佳超参数设置（就像在一张网格上逐个格子搜索）。它在预定义的超参数空间中进行穷举搜索，评估每个组合的性能，从而选择出最优的超参数组合。sklearn 的 GridSearchCV 在训练集上使用交叉验证来评估每个组合，因此选出的参数可能与上面直接用测试集挑选的结果不同。

In [20]:
from sklearn.model_selection import GridSearchCV

# Parameter grid: each dict is a separate sub-grid whose value lists are combined exhaustively.
# NOTE(review): p is only varied for weights="distance" here, while the manual sweep
# earlier in this notebook varies p for both weighting schemes — confirm this is intended.
paramGrid = [
    {
        "weights": ["uniform"],
        "n_neighbors": list(range(1, 11)),
    },
    {
        "weights": ["distance"],
        "n_neighbors": list(range(1, 11)),
        "p": list(range(1, 6)),
    },
]

# Run the search on the training split only, then score the winning estimator
# on the held-out test split.
grid_search = GridSearchCV(KNeighborsClassifier(), paramGrid)
grid_search.fit(x_train, y_train)
knn_classifier = grid_search.best_estimator_
knn_classifier.score(x_test, y_test)

0.9833333333333333

In [21]:
# Hyperparameter combination selected by the fitted grid search.
grid_search.best_params_

{'n_neighbors': 1, 'weights': 'uniform'}

In [22]:
# Display the GridSearchCV object itself.
grid_search