In [1]:
import numpy as np
from sklearn import datasets
digits = datasets.load_digits()

In [2]:
X = digits.data
y = digits.target

In [3]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=666)

In [4]:
from sklearn.neighbors import KNeighborsClassifier
sk_knn_clf = KNeighborsClassifier(n_neighbors=4,weights="uniform")
sk_knn_clf.fit(X_train,y_train)
sk_knn_clf.score(X_test,y_test)

0.9916666666666667

## Grid Search

In [5]:
param_grid = [
    {
        'weights':['uniform'],
        'n_neighbors':[i for i in range(1,11)]
    },
    {
        'weights':['distance'],
        'n_neighbors':[i for i in range(1,11)],
        'p':[i for i in range(1,6)]
    }   
]

**param_grid是一个数组，每一个元素是一个字典，每一个字典对应的就是我们要进行的一组网格搜索，每一组网格搜索（即每一个字典）要列出这组网格搜索要遍历的各个参数的取值范围。**

**字典中，键是参数的名称，值为一个列表，在列表中存储这个键对应的所有可能的范围。**

In [6]:
knn_clf = KNeighborsClassifier()

In [7]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn_clf,param_grid) #传入需要进行网格搜索的分类器的名字，和需要进行网格搜索的参数

In [8]:
%%time

grid_search.fit(X_train,y_train)



CPU times: user 1min 34s, sys: 334 µs, total: 1min 34s
Wall time: 1min 34s


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [10]:
#获得最佳超参数结果
grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=3,
                     weights='distance')

In [11]:
#查看对应最佳超参数结果的准确度
grid_search.best_score_

0.9853862212943633

In [12]:
#查看相对于我们的param_grid数组而言，最好的超参数
grid_search.best_params_

{'n_neighbors': 3, 'p': 3, 'weights': 'distance'}

In [13]:
#可以直接得到最佳的模型
knn_clf = grid_search.best_estimator_

In [15]:
knn_clf.score(X_test,y_test)

0.9833333333333333

In [22]:
#利用多核心计算，可以指定n核，也可以传入-1，代表使用全部核心.
#verbose参数可以使grid search输出一些信息，如果传入的值越大，输出的信息也就越详细
grid_search = GridSearchCV(knn_clf,param_grid,n_jobs=-1,verbose=6)

In [23]:
%%time
grid_search.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   21.3s


CPU times: user 539 ms, sys: 20 ms, total: 559 ms
Wall time: 37.2 s


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   37.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=3, p=3,
                                            weights='distance'),
             iid='warn', n_jobs=-1,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=6)