In [1]:
# 从sklearn.datasets导入波士顿房价数据读取器。
from sklearn.datasets import load_boston

In [2]:
# 读取房价数据
boston = load_boston()
# 输出数据描述。
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
# 导入数据分割器。
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
X = boston.data
y = boston.target

# 随机采样25%的数据构建测试样本，其余作为训练样本。
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33, test_size=0.25)

# 分析回归目标值的差异。
print("The max target value is", np.max(boston.target))
print("The min target value is", np.min(boston.target))
print("The average target value is", np.mean(boston.target))

The max target value is 50.0
The min target value is 5.0
The average target value is 22.532806324110677


In [13]:
# 导入数据标准化模块。
from sklearn.preprocessing import StandardScaler

# 分别初始化对特征和目标值的标准化器。
ss_X = StandardScaler()
ss_y = StandardScaler()

# 分别对训练和测试数据的特征以及目标值进行标准化处理。
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)

y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

In [14]:
# 导入LinearRegression。
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)
# 对测试数据进行回归预测。
lr_y_predict = lr.predict(X_test)

In [20]:
# 导入SGDRegressor。
from sklearn.linear_model import SGDRegressor

sgdr = SGDRegressor()
sgdr.fit(X_train, y_train.ravel())
# 对测试数据进行回归预测。
sgdr_y_predict = sgdr.predict(X_test)

In [21]:
# 使用LinearRegression模型自带的评估模块，并输出评估结果。
print('The value of default measurement of LinearRegression is', lr.score(X_test, y_test))

# 导入r2_score、mean_squared_error以及mean_absoluate_error用于回归性能的评估。
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

The value of default measurement of LinearRegression is -5.826896537214896


In [24]:
# 使用r2_score、mean_squared_error、mean_absolute_error模块，并输出评估结果。
print('The value of R-squared of LinearRegression is',
      r2_score(y_test, lr_y_predict))

print('The mean squared error of LinearRegression is',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict))
     )

print('The mean absoluate error of LinearRegression is',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict))
     )

The value of R-squared of LinearRegression is -5.826896537214896
The mean squared error of LinearRegression is 529.3663953404998
The mean absoluate error of LinearRegression is 21.537789484481902


In [28]:
# 使用SGDRegressor模型自带的评估模块，并输出评估结果。
print('The value of default measurement of SGDRegressor is', 
      sgdr.score(X_test, y_test))

print('The value of R-squared of SGDRegressor is', 
      r2_score(y_test, sgdr_y_predict))

print('The mean squared error of SGDRegressor is', 
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))
     )

print('The mean absoluate error of SGDRegressor is', 
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgdr_y_predict))
     )

The value of default measurement of SGDRegressor is -5.821742931395504
The value of R-squared of SGDRegressor is -5.821742931395504
The mean squared error of SGDRegressor is 528.966778073599
The mean absoluate error of SGDRegressor is 21.525759320203047


In [31]:
# 从sklearn.svm中导入支持向量机（回归）模型。
from sklearn.svm import SVR

# 使用线性核函数配置的支持向量机进行回归训练，并且对测试样本进行预测。
linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, y_train.ravel())
linear_svr_y_predict = linear_svr.predict(X_test)

# 使用多项式核函数配置的支持向量机进行回归训练，并且对测试样本进行预测。
poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train.ravel())
poly_svr_y_predict = poly_svr.predict(X_test)

# 使用径向基核函数配置的支持向量机进行回归训练，并且对测试样本进行预测。
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train.ravel())
rbf_svr_y_predict = rbf_svr.predict(X_test)

In [32]:
# 使用R-squared、MSE和MAE指标对三种配置的支持向量机（回归）模型在相同测试集上进行性能评估。
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
print('R-squared value of linear SVR is',
      linear_svr.score(X_test, y_test))

print('The mean squared error of linear SVR is',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict))
     )
print('The mean absoluate error of linear SVR is',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict))
     )

R-squared value of linear SVR is -5.887962223053588
The mean squared error of linear SVR is 534.1015076737845
The mean absoluate error of linear SVR is 21.644361940770704


In [33]:
print('R-squared value of Poly SVR is',
      poly_svr.score(X_test, y_test))

print('The mean squared error of Poly SVR is',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict))
     )

print('The mean absoluate error of Poly SVR is',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict))
     )

R-squared value of Poly SVR is -5.830457473877632
The mean squared error of Poly SVR is 529.6425149791839
The mean absoluate error of Poly SVR is 21.575153797852472


In [35]:
print('R-squared value of RBF SVR is',
      rbf_svr.score(X_test, y_test))

print('The mean squared error of RBF SVR is',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict))
     )

print('The mean absoluate error of RBF SVR is',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict))
     )

R-squared value of RBF SVR is -5.861497287191408
The mean squared error of RBF SVR is 532.0493822865151
The mean absoluate error of RBF SVR is 21.582537351208934


In [37]:
# 导入KNeighborRegressor（K近邻回归器）。
from sklearn.neighbors import KNeighborsRegressor

# 初始化K近邻回归器，并且调整配置，使得预测的方式为平均回归：weights='uniform'。
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

# 初始化K近邻回归器，并且调整配置，使得预测的方式为根据距离加权回归：weights='distance'。
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

In [38]:
# 使用R-squared、MSE以及MAE三种指标对平均回归配置的K近邻模型在测试集上进行性能评估。
print('R-squared value of uniform-weighted KNeighorRegression:',
      uni_knr.score(X_test, y_test))

print('The mean squared error of uniform-weighted KNeighorRegression:',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(uni_knr_y_predict))
     )

print('The mean absoluate error of uniform-weighted KNeighorRegression',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(uni_knr_y_predict))
     )

R-squared value of uniform-weighted KNeighorRegression: -5.898574928779067
The mean squared error of uniform-weighted KNeighorRegression: 534.9244306145499
The mean absoluate error of uniform-weighted KNeighorRegression 21.6287851971816


In [39]:
# 使用R-squared、MSE以及MAE三种指标对根据距离加权回归配置的K近邻模型在测试集上进行性能评估。
print('R-squared value of distance-weighted KNeighorRegression:',
      dis_knr.score(X_test, y_test))

print('The mean squared error of distance-weighted KNeighorRegression:',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict))
     )
print('The mean absoluate error of distance-weighted KNeighorRegression:',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict))
     )

R-squared value of distance-weighted KNeighorRegression: -5.883085438592986
The mean squared error of distance-weighted KNeighorRegression: 533.7233554934066
The mean absoluate error of distance-weighted KNeighorRegression: 21.613676200534318


In [40]:
# 使用R-squared、MSE以及MAE三种指标对根据距离加权回归配置的K近邻模型在测试集上进行性能评估。
print('R-squared value of distance-weighted KNeighorRegression:',
      dis_knr.score(X_test, y_test))

print('The mean squared error of distance-weighted KNeighorRegression:',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict))
     )
print('The mean absoluate error of distance-weighted KNeighorRegression:',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict))
     )

R-squared value of distance-weighted KNeighorRegression: -5.883085438592986
The mean squared error of distance-weighted KNeighorRegression: 533.7233554934066
The mean absoluate error of distance-weighted KNeighorRegression: 21.613676200534318


In [41]:
# 导入DecisionTreeRegressor。
from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()
# 用波士顿房价的训练数据构建回归树。
dtr.fit(X_train, y_train)
# 使用默认配置的单一回归树对测试数据进行预测
dtr_y_predict = dtr.predict(X_test)

In [42]:
# 使用R-squared、MSE以及MAE指标对默认配置的回归树在测试集上进行性能评估。
print('R-squared value of DecisionTreeRegressor:',
      dtr.score(X_test, y_test))

print('The mean squared error of DecisionTreeRegressor:',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dtr_y_predict))
     )

print('The mean absoluate error of DecisionTreeRegressor:',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dtr_y_predict))
     )

R-squared value of DecisionTreeRegressor: -5.77194942743176
The mean squared error of DecisionTreeRegressor: 525.1057253154429
The mean absoluate error of DecisionTreeRegressor: 21.487237181064327


In [44]:
# 导入RandomForestRegressor、ExtraTreesGressor以及GradientBoostingRegressor。
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

# 使用RandomForestRegressor训练模型，并对测试数据做出预测
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train.ravel())
rfr_y_predict = rfr.predict(X_test)

# 使用ExtraTreesRegressor训练模型，并对测试数据做出预测
etr = ExtraTreesRegressor()
etr.fit(X_train, y_train.ravel())
etr_y_predict = etr.predict(X_test)

# 使用GradientBoostingRegressor训练模型，并对测试数据做出预测
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train.ravel())
gbr_y_predict = gbr.predict(X_test)

In [45]:
# 使用R-squared、MSE以及MAE指标对默认配置的随机回归森林在测试集上进行性能评估。
print('R-squared value of RandomForestRegressor:',
      rfr.score(X_test, y_test))

print('The mean squared error of RandomForestRegressor:',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict))
     )

print('The mean absoluate error of RandomForestRegressor:',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict))
     )

R-squared value of RandomForestRegressor: -5.7908852966890025
The mean squared error of RandomForestRegressor: 526.5740371313174
The mean absoluate error of RandomForestRegressor: 21.511180364848027


In [51]:
# 使用R-squared、MSE以及MAE指标对默认配置的极端回归森林在测试集上进行性能评估。
print('R-squared value of ExtraTreesRegessor:',
      etr.score(X_test, y_test))

print('The mean squared error of  ExtraTreesRegessor:',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict))
     )

print('The mean absoluate error of ExtraTreesRegessor:',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict))
     )

# 利用训练好的极端回归森林模型，输出每种特征对预测目标的贡献度。
print(np.sort( list(zip(etr.feature_importances_, boston.feature_names)), axis=0))

R-squared value of ExtraTreesRegessor: -5.816303779330491
The mean squared error of  ExtraTreesRegessor: 528.5450191811551
The mean absoluate error of ExtraTreesRegessor: 21.544871202445155
[['0.003874289354477088' 'AGE']
 ['0.014441301851180232' 'B']
 ['0.018696144984278157' 'CHAS']
 ['0.01905108020946717' 'CRIM']
 ['0.020306887755394583' 'DIS']
 ['0.024333857607445823' 'INDUS']
 ['0.02474345673493861' 'LSTAT']
 ['0.03677411448526542' 'NOX']
 ['0.037711407225922845' 'PTRATIO']
 ['0.0384296080688791' 'RAD']
 ['0.06273786959755033' 'RM']
 ['0.34394843935553565' 'TAX']
 ['0.35495154276966495' 'ZN']]


In [52]:
# 使用R-squared、MSE以及MAE指标对默认配置的梯度提升回归树在测试集上进行性能评估。
print('R-squared value of GradientBoostingRegressor:',
      gbr.score(X_test, y_test))

print('The mean squared error of GradientBoostingRegressor:',
      mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(gbr_y_predict))
     )

print('The mean absoluate error of GradientBoostingRegressor:',
      mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(gbr_y_predict))
     )

R-squared value of GradientBoostingRegressor: -5.793084100380008
The mean squared error of GradientBoostingRegressor: 526.744535216007
The mean absoluate error of GradientBoostingRegressor: 21.515870420485697
