 ### 实现多元线性回归模型

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the dataset, then keep only the samples whose target is below 50.0.
# Tuple assignment evaluates both filters against the unfiltered y before
# either name is rebound.
boston = datasets.load_boston()
X, y = boston.data, boston.target
X, y = X[y < 50.0], y[y < 50.0]

In [3]:
# Split with the project-local helper from ML.model_selection (seeded for
# reproducibility). The result is unpacked here as
# (X_train, y_train, X_test, y_test), which is a different order from the
# scikit-learn call used later in this notebook.
# NOTE(review): the helper's source is not shown here; confirm it really
# returns its values in this order.
from ML.model_selection import train_test_split
X_train,y_train,X_test,y_test = train_test_split(X,y,seed = 666)

### 使用正规方程自己封装多元回归模型

```python
import numpy as np
from .metrics import r2_score

class LinearRegression:
    """Multiple linear regression solved in closed form (normal equation).

    After ``fit_normal`` the learned parameters are available as:

    * ``coef_``          -- one coefficient per feature (``theta[1:]``)
    * ``interception_``  -- the intercept term (``theta[0]``)
    """

    def __init__(self):
        """Create an unfitted model; every learned parameter starts as None."""
        self.coef_ = None  # feature coefficients
        self.interception_ = None  # intercept
        self._theta = None  # full parameter vector: [intercept, *coefficients]

    def fit_normal(self, X_train, y_train):
        """Fit the model to X_train / y_train with the normal equation.

        Args:
            X_train: 2-D array with one row per sample and one column per feature.
            y_train: array of targets with one entry per row of X_train.

        Returns:
            self, so calls can be chained.

        Raises:
            AssertionError: if X_train and y_train have different sample counts.
        """
        assert X_train.shape[0] == y_train.shape[0],\
            "the size of the X_train be equal to the size of y_train"

        # Prepend a column of ones so theta[0] acts as the intercept.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])

        # pinv(X_b) equals (X_b^T X_b)^-1 X_b^T whenever X_b has full column
        # rank, so this is the same normal-equation solution. Unlike an
        # explicit inverse it does not raise on rank-deficient data (e.g. a
        # constant or duplicated feature) and it avoids squaring the condition
        # number by forming X_b^T X_b.
        self._theta = np.linalg.pinv(X_b).dot(y_train)
        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict(self, X_predict):
        """Return the vector of predictions for the samples in X_predict.

        Args:
            X_predict: 2-D array whose column count matches the training data.

        Returns:
            Array with one predicted value per row of X_predict.

        Raises:
            AssertionError: if the model is not fitted yet or the number of
                features differs from what was seen during fitting.
        """
        assert self.interception_ is not None and self.coef_ is not None,\
            "must fit before predict"
        assert X_predict.shape[1] == len(self.coef_),\
            "the feature number of X_predict must be equal to X_train"

        # Same ones-column augmentation as in fit_normal.
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Evaluate the model on X_test / y_test using r2_score.

        Returns:
            The value of r2_score(y_test, predictions for X_test).
        """
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
```

In [7]:
# Fit the hand-written LinearRegression with the normal equation and show the
# learned per-feature coefficients.
from ML.LinearRegression import LinearRegression
reg = LinearRegression()
print(X_train.shape)  # shape of the training split, printed before fitting
reg.fit_normal(X_train,y_train)
reg.coef_

(392, 13)


array([-1.18919477e-01,  3.63991462e-02, -3.56494193e-02,  5.66737830e-02,
       -1.16195486e+01,  3.42022185e+00, -2.31470282e-02, -1.19509560e+00,
        2.59339091e-01, -1.40112724e-02, -8.36521175e-01,  7.92283639e-03,
       -3.81966137e-01])

In [9]:
reg.interception_  # intercept term (theta[0]) learned by fit_normal

34.16143549624022

In [10]:
reg.score(X_test,y_test)  # r2_score of the hand-written model on the test split

0.8129802602658537

## 使用scikit-learn解决回归问题（以波士顿房价为例）

In [13]:
# Use scikit-learn's own train_test_split from here on. This import replaces
# the project-local helper of the same name, and the return order is
# (X_train, X_test, y_train, y_test).
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=666)

### 使用sklearn.linear_model.LinearRegression

In [15]:
# From this cell on, the name LinearRegression refers to scikit-learn's
# estimator; this import shadows the class imported from ML.LinearRegression.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [16]:
lin_reg.coef_  # fitted per-feature coefficients of the scikit-learn model

array([-1.14235739e-01,  3.12783163e-02, -4.30926281e-02, -9.16425531e-02,
       -1.09940036e+01,  3.49155727e+00, -1.40778005e-02, -1.06270960e+00,
        2.45307516e-01, -1.23179738e-02, -8.80618320e-01,  8.43243544e-03,
       -3.99667727e-01])

In [17]:
lin_reg.intercept_  # scikit-learn spells it intercept_; the hand-written class uses interception_

32.64566083965359

In [18]:
lin_reg.score(X_test,y_test)  # R^2 of the scikit-learn model on the test split

0.8008916199519112

### 使用KNN Regression解决回归问题

In [19]:
# Baseline: k-nearest-neighbours regressor with its default hyperparameters,
# trained and scored on the same split as the linear model.
from sklearn.neighbors import KNeighborsRegressor
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train,y_train)
knn_reg.score(X_test,y_test)

0.602674505080953

#### KNN超参数网格搜索调参

In [22]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: uniform weighting searches only n_neighbors; distance
# weighting additionally searches the Minkowski exponent p.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

# Fresh, unfitted estimator for the search; n_jobs=-1 uses every available core.
knn_reg = KNeighborsRegressor()
grid_search = GridSearchCV(
    estimator=knn_reg,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    2.7s finished


{'n_neighbors': 6, 'p': 1, 'weights': 'distance'}

In [23]:
grid_search.best_score_  # cross-validation score of the best parameter combination

0.6060327991735741

In [24]:
grid_search.best_estimator_.score(X_test,y_test)  # test-split score of the estimator chosen by the grid search

0.7354244906092771

#### 注：这个最佳超参数是由交叉验证（CV）的score决定的，因此best_score_不能和测试集上score方法的得分相提并论，也就不能仅凭它来评判knn与LinearRegression在回归问题上孰优孰劣。