# 案例：波士顿房价预测

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

## 1、获取数据
### 1.1 波士顿房价数据在sklearn中已经内置，可以通过load_boston()方法获得

In [2]:
boston = load_boston()

##### 特征含义
CRIM：城镇人均犯罪率。<br/>
ZN：住宅用地超过 25000 sq.ft. 的比例。<br/>
INDUS：城镇非零售商用土地的比例。<br/>
CHAS：查理斯河空变量（如果边界是河流，则为1；否则为0）。<br/>
NOX：一氧化氮浓度。<br/>
RM：住宅平均房间数。<br/>
AGE：1940 年之前建成的自用房屋比例。<br/>
DIS：到波士顿五个中心区域的加权距离。<br/>
RAD：辐射性公路的接近指数。<br/>
TAX：每 10000 美元的全值财产税率。<br/>
PTRATIO：城镇师生比例。<br/>
B：1000（Bk-0.63）^ 2，其中 Bk 指代城镇中黑人的比例。<br/>
LSTAT：人口中地位低下者的比例。<br/>
MEDV：自住房的平均房价，以千美元计。<br/>

In [3]:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
print(boston)

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]]), 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 1

##### 取特征X和标签y

In [6]:
X = boston.data
y = boston.target

### 1.2 从文件读取
绝大多数情况，数据是存在文件中的，如excel。所以我们也可以从文件中读取数据。一般使用pandas读取。

In [6]:
import pandas as pd

In [8]:
df = pd.read_excel('data/boston.xls')
df

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [10]:
X = df[df.columns[0:-1]]
y = df[df.columns[-1]]
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: price, Length: 506, dtype: float64

## 2、数据预处理(数据清洗)


我们获取的数据有可能存在下面的一些情况：
  - 缺少数据值
  - 含有错误的数据值，如年龄=200
  - 数据不一致，等级编码有的是“1，2，3”有的却是“A，B，C ”
  - 重复的记录值
  
$\color{red}{注意：本门课程关注的是机器学习算法，而波士顿房价数据也是清理过得，所以该部分不用写代码进行处理}$

## 3、数据分析与可视化

$\color{red}{注意：本门课程关注的是机器学习算法，不是数据分析，因此忽略数据分析与可视化部分}$

## 4、选择合适的机器学习模型

该问题是房价预测问题，线性回归能很好的应用于预测问题，因此我们选择使用线性回归模型

In [11]:
model = linear_model.Ridge(alpha=0.1)
model.fit(X,y)
y_hat = model.predict(X)

我们如何选择参数alpha呢？

## 5、训练模型(使用交叉验证选择合适的参数)


In [21]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [22]:
from sklearn.model_selection import GridSearchCV


In [23]:
ridge_model = linear_model.Ridge()
param = {'alpha':[0.01,0.03,0.05,0.07,0.1,0.5,0.8,1],'normalize':[True,False]}
gsearch = GridSearchCV(estimator=ridge_model,param_grid=param,cv=5,scoring='neg_mean_squared_error')
gsearch.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': [0.01, 0.03, 0.05, 0.07, 0.1, 0.5, 0.8, 1],
                         'normalize': [True, False]},
             scoring='neg_mean_squared_error')

In [24]:
gsearch.best_params_,gsearch.best_score_

({'alpha': 0.01, 'normalize': True}, -23.667749508826866)

## 6、模型评价

In [25]:
final_model = linear_model.Ridge(alpha=0.01,normalize=True)
final_model.fit(X_train,y_train)
y_train_hat = final_model.predict(X_train)
y_test_hat = final_model.predict(X_test)
print("train-MSE=",mean_squared_error(y_train,y_train_hat))
print("test-MSE=",mean_squared_error(y_test,y_test_hat))

train-MSE= 21.592179680057335
test-MSE= 24.594713652261827


## 7、上线部署使用

1、模型保存

In [26]:
#from sklearn.externals import joblib
import joblib
joblib.dump(final_model,"house_train_model.m")

['house_train_model.m']

2、模型读取

In [27]:
load_model = joblib.load("house_train_model.m")

In [28]:
load_model.predict(X_test)

array([17.86463725, 23.84267258, 20.19918585, 21.09466789, 20.36408946,
       17.86919759, 15.99005279, 10.34518376, 24.33945686, 13.98422693,
       28.27735975, 22.13228209, 20.55986273, 20.99707196, 30.71331846,
       20.12778931, 19.72304131,  6.92146177, 18.78117991, 19.53089256,
        8.83737744, 13.73852819, 18.43750604, 18.1472457 , 37.16243527,
       24.98692809, 12.98047518, 32.419058  , 15.11458266, 22.23790681,
       25.50314304, 21.8806831 , 25.52704892, 22.74263156, 21.03634738,
       13.52798571, 17.40533019, 38.08739752, 13.58570282, 22.5967951 ,
       15.93151158, 15.19944237, 19.74609202, 20.3396185 , 21.05852779,
       38.7665506 , 24.81096903, 17.1301721 , 27.28942101, 26.58276158,
       23.2874117 , 18.1756785 , 36.35743345, 29.563364  , 19.0793563 ,
       14.15994171, 34.18734491, 20.87733264,  8.59590692,  9.16484832,
       18.55664037, 26.0235094 , 29.2890751 , 18.75369035, 32.20586898,
       19.98165252, 18.62106342, 19.49612124, 21.04515476, 30.50