In [12]:
import numpy as np
from sklearn import datasets

In [13]:
digits = datasets.load_digits()
X = digits.data
y = digits.target

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 666)

In [15]:
from sklearn.neighbors import KNeighborsClassifier

best_score, best_p, best_k = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(weights = "distance", n_neighbors = k, p = p)
        knn_clf.fit(X_train, y_train)
        score = knn_clf.score(X_test, y_test)
        if score > best_score:
            best_score, best_p, best_k = score, p, k
print("Best score =", best_score)
print("Best k =", best_k)
print("Best p =", best_p)


Best score = 0.9866666666666667
Best k = 5
Best p = 2


### Using Cross Validation
1. 自动进行交叉验证：默认分成3份
2. 返回k个模型各自的准确率

In [16]:
from sklearn.model_selection import cross_val_score

knn_clf = KNeighborsClassifier()
cross_val_score(knn_clf, X_train, y_train)



array([0.98896247, 0.98210291, 0.98210291])

In [17]:
best_score, best_p, best_k = 0, 0, 0
for k in range(2, 11):
    for p in range(1, 6):
        knn_clf = KNeighborsClassifier(weights = "distance", n_neighbors = k, p = p)
        scores = cross_val_score(knn_clf, X_train, y_train)
        score = np.mean(scores)
        if score > best_score:
            best_score, best_p, best_k = score, p, k
print("Best score =", best_score)
print("Best k =", best_k)
print("Best p =", best_p)

'''结果：
Best score = 0.9866166891368011
Best k = 5
Best p = 3
'''





Best score = 0.9866166891368011
Best k = 5
Best p = 3


'结果：\nBest score = 0.9866166891368011\nBest k = 5\nBest p = 3\n'

### Cross Validation 
1. 准确率会低一点，不过更有可信度
2. 目的在于找到最好的超参数，这里分别是k=5, p=3

In [18]:
best_knn_clf = KNeighborsClassifier(weights = "distance", n_neighbors = 5, p = 3)
best_knn_clf.fit(X_train, y_train)
best_knn_clf.score(X_test, y_test)

0.9822222222222222

### Recap grid search

In [24]:
from sklearn.model_selection import GridSearchCV
knn_clf = KNeighborsClassifier()
#利用到了相关性，例如如果weights = uniform，就没有p这个参数
param_grid = [
    {
        'weights':['uniform'],
        'n_neighbors':[i for i in range(2,11)]
    },
    {
        'weights':['distance'],
        'n_neighbors':[i for i in range(2,11)],
        'p':[i for i in range(1,6)]
    }
]

grid_search = GridSearchCV(knn_clf, param_grid, verbose = 1)
grid_search.fit(X_train, y_train)
#Fitting 3 folds for each of 54 candidates, totalling 162 fits
#意味着：默认分成三层，

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=1)]: Done 162 out of 162 | elapsed:  1.4min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [27]:
grid_search.best_score_

0.9866369710467706

In [28]:
grid_search.best_params_

{'n_neighbors': 5, 'p': 3, 'weights': 'distance'}

In [26]:
best_knn_clf = grid_search.best_estimator_
best_knn_clf.score(X_test, y_test)

0.9822222222222222

#### grid_search的best_score_以及best_params_ 和cross_val_score() 中的best_knn_clf的一样

### 交叉验证：将数据集分成n份：

In [29]:
cross_val_score(knn_clf, X_train, y_train, cv = 5)

array([0.98540146, 0.99632353, 0.98507463, 0.97378277, 0.98120301])

In [30]:
GridSearchCV(knn_clf, param_grid, verbose = 1, cv=5)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)