In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [4]:
boston = datasets.load_boston()

X = boston.data
y = boston.target

X = X[y<50] #这里的X不再是simple lr中的”RM"一个维度，而是全部维度
y = y[y<50]

X.shape

(490, 13)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 666)

### Linear regression in scikit-learn

In [7]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [8]:
lin_reg.coef_ 
#与自己实现的myLinearRegression不同，因为我们的train,test切割方法不一样。
#所以如果调用我自己的my_train_test_split，coef_的结果会一致

array([-1.15625837e-01,  3.13179564e-02, -4.35662825e-02, -9.73281610e-02,
       -1.09500653e+01,  3.49898935e+00, -1.41780625e-02, -1.06249020e+00,
        2.46031503e-01, -1.23291876e-02, -8.79440522e-01,  8.31653623e-03,
       -3.98593455e-01])

In [9]:
lin_reg.intercept_

32.59756158869959

In [11]:
lin_reg.score(X_test, y_test)

0.81232090107018

In [10]:
import sys
sys.path.append("..")

from myML.model_selection import my_train_test_split
X_train, X_test, y_train, y_test = my_train_test_split(X, y, seed = 666)

my_lin_reg = LinearRegression()
my_lin_reg.fit(X_train, y_train)
print(my_lin_reg.coef_)
print(my_lin_reg.intercept_)

[-1.20354261e-01  3.64423279e-02 -3.61493155e-02  5.12978140e-02
 -1.15775825e+01  3.42740062e+00 -2.32311760e-02 -1.19487594e+00
  2.60101728e-01 -1.40219119e-02 -8.35430488e-01  7.80472852e-03
 -3.80923751e-01]
34.117399723229845


### kNN regressor （注意不是kNN classifier!）

In [12]:
from sklearn.neighbors import KNeighborsRegressor

knn_reg = KNeighborsRegressor() #创建算法的实例
knn_reg.fit(X_train, y_train) #fit
knn_reg.score(X_test, y_test) #你可以score(), 或者predict()

0.5865412198300899

In [16]:
from sklearn.model_selection import GridSearchCV

# param_grid = [
#     { aa : [yy], #记得有逗号！
#       bb : [xx]
#     },
#     { xx : [zz] }
# ]

param_grid = [
    {
        "weights" : ["uniform"],
        "n_neighbors" : [i for i in range(1, 11)]
    },
    {
        "weights" : ["distance"],
        "n_neighbors" : [i for i in range(1, 11)],
        "p" : [i for i in range(1,6)] #当p==2是欧拉距离。 p==1是曼哈顿距离
    }
]

knn_reg = KNeighborsRegressor() #回归器
grid_search = GridSearchCV(knn_reg, param_grid, n_jobs = -1, verbose = 1) #第一个参数：回归器， 第二个参数：grid
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    2.4s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [17]:
grid_search.best_params_ #使用曼哈顿距离， k = 5， 用distance

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [18]:
grid_search.best_score_ #这个是CV交叉验证下的score

0.6340477954176972

In [19]:
grid_search.best_estimator_.score(X_test, y_test) #score是KNeighborsRegressor的score的计算方式
#这个0.7044357727037996 结果，比没有进行grid_search中knn_reg.score()高
#但是比linear regression低
#但是不能说knn 比 linear regression 好
#因为grid_search.best_estimator_返回的best_estimator_（也就是KNeighborsRegressor）是在GridSearchCV计算score方法下分数最好的KNeighborsRegressor
#而不是KNeighborsRegressor的score的计算下的最高值

0.7044357727037996