# 波士顿房价预测学习目标
- 掌握sklearn中线性回归相关API的使用方法
- 掌握模型保存和家在的方法

---
```python
sklearn.linear_model.LinearRegression(fit_intercept=True)
# 参数：fit_intercept，是否计算偏置
# 属性：LinearRegression.coef_ （回归系数） LinearRegression.intercept_（偏置）

SGDRegressor
# SGDRegressor类实现了随机梯度下降学习，它支持不同的 损失函数和正则化惩罚项 来拟合线性回归模型。
# 参数：loss（损失函数类型），fit_intercept（是否计算偏置）learning_rate （学习率）
# 属性：SGDRegressor.coef_ （回归系数）SGDRegressor.intercept_ （偏置）
```



In [11]:
# 导入波士顿房价数据集
# from sklearn.datasets import load_boston
# 导入训练集测试集划分包
from sklearn.model_selection import train_test_split
# 导入特征缩放的工具
from sklearn.preprocessing import StandardScaler
# 导入均方误差包
from sklearn.metrics import mean_squared_error
# 导入随机梯度下降算法
from sklearn.linear_model import SGDRegressor
# 导入线性回归算法
from sklearn.linear_model import LinearRegression


In [35]:
import pandas as pd
import numpy as np

raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston", sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

In [61]:
def normal_equation():
    '''
    线性回归:正规方程
    :return: None
    '''
    # 1、加载数据
    
    # 2、数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=22)
    
    # 3、特征工程-标准化
    transfer = StandardScaler()
    print("x_train标准化前：", x_train[:1])
    print("x_test标准化前：", x_test[:1])
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)
    print("x_train标准化后：", x_train[:1])
    print("x_test标准化后：", x_train[:1])
    
    # 4、机器学习-线性回归（正规方程）
    estimator = LinearRegression()
    estimator.fit(x_train, y_train)
    
    # 5、模型评估
    y_predict = estimator.predict(x_test)
    print('预测值：', y_predict)
    print('模型中系数为：', estimator.coef_)
    print('模型中偏置为：', estimator.intercept_)
    
    # 6、评价（均方误差）
    error = mean_squared_error(y_test, y_predict)
    print('均方误差为：', error)
    
    return estimator

In [62]:
def gradient_desc():
    # 1、获取数据

    # 2、数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=22)
    
    # 3、特征工程-标准化
    transfer = StandardScaler()
    print("x_train标准化前：", x_train[:1])
    print("x_test标准化前：", x_test[:1])
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)
    print("x_train标准化后：", x_train[:1])
    print("x_test标准化后：", x_train[:1])
    
    # 4、机器学习-线性回归（特征方程）
    estimator = SGDRegressor(max_iter=1000)
    estimator.fit(x_train, y_train)
    
    # 5、模型评估
    y_predict = estimator.predict(x_test)
    print('预测值：', y_predict)
    print('模型中系数为：', estimator.coef_)
    print('模型中偏置为：', estimator.intercept_)
    
    # 6、评价（均方误差）
    error = mean_squared_error(y_test, y_predict)
    print('均方误差为：', error)

    return estimator

In [64]:
# 导入模型的保存和加载API
import joblib

# 保存正规方程模型
normal_equation_estimator = normal_equation()
joblib.dump(normal_equation_estimator, '../file/normal_equation_estimator.pkl')

x_train标准化前： [[  1.80028   0.       19.58      0.        0.605     5.877    79.2
    2.4259    5.      403.       14.7     227.61     12.14   ]]
x_test标准化前： [[6.0760e-02 0.0000e+00 1.1930e+01 0.0000e+00 5.7300e-01 6.9760e+00
  9.1000e+01 2.1675e+00 1.0000e+00 2.7300e+02 2.1000e+01 3.9690e+02
  5.6400e+00]]
x_train标准化后： [[-0.20918739 -0.52054534  1.24477931 -0.24824313  0.44910826 -0.55555519
   0.38286373 -0.66936702 -0.52661517 -0.03238741 -1.72807526 -1.35897227
  -0.08447646]]
x_test标准化后： [[-0.20918739 -0.52054534  1.24477931 -0.24824313  0.44910826 -0.55555519
   0.38286373 -0.66936702 -0.52661517 -0.03238741 -1.72807526 -1.35897227
  -0.08447646]]
预测值： [28.22944896 31.5122308  21.11612841 32.6663189  20.0023467  19.07315705
 21.09772798 19.61400153 19.61907059 32.87611987 20.97911561 27.52898011
 15.54701758 19.78630176 36.88641203 18.81202132  9.35912225 18.49452615
 30.66499315 24.30184448 19.08220837 34.11391208 29.81386585 17.51775647
 34.91026707 26.54967053 34.71035391 27.42

['../file/normal_equation_estimator.pkl']

In [65]:
# 导入模型的保存和加载API
import joblib

# 保存正规方程模型
gradient_desc_estimator = gradient_desc()
joblib.dump(gradient_desc_estimator, '../file/gradient_desc_estimator.pkl')

x_train标准化前： [[  1.80028   0.       19.58      0.        0.605     5.877    79.2
    2.4259    5.      403.       14.7     227.61     12.14   ]]
x_test标准化前： [[6.0760e-02 0.0000e+00 1.1930e+01 0.0000e+00 5.7300e-01 6.9760e+00
  9.1000e+01 2.1675e+00 1.0000e+00 2.7300e+02 2.1000e+01 3.9690e+02
  5.6400e+00]]
x_train标准化后： [[-0.20918739 -0.52054534  1.24477931 -0.24824313  0.44910826 -0.55555519
   0.38286373 -0.66936702 -0.52661517 -0.03238741 -1.72807526 -1.35897227
  -0.08447646]]
x_test标准化后： [[-0.20918739 -0.52054534  1.24477931 -0.24824313  0.44910826 -0.55555519
   0.38286373 -0.66936702 -0.52661517 -0.03238741 -1.72807526 -1.35897227
  -0.08447646]]
预测值： [28.18401656 31.46412391 20.81148312 31.41536913 19.22128414 17.9809865
 20.80860613 18.40545461 18.6163158  32.77433233 20.70354693 26.85375237
 14.68281017 19.18144663 37.01095686 18.19712605  7.6819858  17.67049455
 30.42553168 23.7588784  17.81852373 33.78762352 28.08284865 16.58813194
 34.72510267 26.23992769 34.76119524 26.831

['../file/gradient_desc_estimator.pkl']

In [66]:
# 导入模型的保存和加载API
import joblib

# 加载模型
estimator = joblib.load('../file/normal_equation_estimator.pkl')
x_train, x_test, y_train, y_test = train_test_split(data, target, random_state=22)
y_predict = estimator.predict(x_test)
print("预测值为:\n", y_predict)
print("模型中的系数为:\n", estimator.coef_)
print("模型中的偏置为:\n", estimator.intercept_)

预测值为:
 [ -191.1148787   -226.94333817  -425.50176713  -254.93015986
  -291.86242311  -906.06580172  -420.72510655  -121.13800065
  -241.79449404  -437.48229548  -507.3539583   -880.59039729
  -541.28650281  -527.7785217   -427.17455259  -308.54222035
  -540.54339166  -303.62620227  -128.54623467  -269.96002831
  -924.16428551  -151.64225755  -205.45231261  -949.5243263
   -39.08117215  -237.45422538   -47.0640677   -214.79525501
  -932.75938395  -403.16624173  -173.89677785  -131.70278001
  -218.73182129 -1303.23726262  -306.67274646  -987.89119858
  -987.51812736  -912.16157702   -52.20972761   -87.271636
  -216.71920406 -1021.14224249  -893.23231213  -994.9075714
  -898.89753028  -273.26115858  -487.88701195  -520.05874884
  -265.12851475  -902.43927033 -1117.72607315   -73.08321504
  -258.50011533  -947.79560234 -1294.41318198  -143.57962872
   -37.73657003  -515.37398447  -299.99561927  -188.8048154
  -440.89382826  -209.40281373  -265.96778581   -45.89146629
  -498.89365263  -920.

# 小结
- 正规方程：
```python 
sklearn.linear_model.LinearRegression() 
 ```
- 梯度下降：
```python 
sklearn.linear_model.SGDRegressor() 
 ```
- 线性回归性能评估：
``` python
# 均方误差
sklearn.metrics.mean_squared_error
```
- 模型加载与保存：
``` python
# 保存
joblib.dump(estimator, 'test.pkl')
# 加载
estimator = joblib.load('test.plk')
```