# 线性回归之房价预测

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
# Import the California housing loader from the public package path.
# The old private module path `sklearn.datasets.california_housing` was
# deprecated and later removed from scikit-learn.
from sklearn.datasets import fetch_california_housing

# Load the dataset; the call returns an object exposing .data, .target and
# .feature_names (all used below).
housing = fetch_california_housing()
# Show the dimensions of the feature matrix and of the target vector
print(housing.data.shape , housing.target.shape)
# Summary statistics per feature column; `count` equals the number of rows
# for every column, i.e. there are no missing values.
pd.DataFrame(housing.data).describe()

(20640, 8) (20640,)


Unnamed: 0,0,1,2,3,4,5,6,7
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [2]:
# Wrap the feature matrix and the target vector in DataFrames and glue them
# together column-wise so features and target live in a single table.
df_data = pd.DataFrame(housing.data)
df_target = pd.DataFrame(housing.target)
df_all = pd.concat([df_data, df_target], axis=1)

# Replace the positional column labels with the feature names, plus
# 'Price' for the appended target column.
newlist = list(housing.feature_names) + ['Price']
df_all.columns = newlist

# Pairwise correlation between every feature and the target
df_all.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.688075
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,0.105623
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.151948
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.046701
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,-0.02465
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,-0.023737
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.14416
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.045967
Price,0.688075,0.105623,0.151948,-0.046701,-0.02465,-0.023737,-0.14416,-0.045967,1.0


In [3]:
# Split the data into a training part and a held-out test part
from sklearn.model_selection import train_test_split

# train_test_split returns four arrays that are unpacked in one go.
# test_size is the fraction of the WHOLE dataset reserved for testing
# (0.4 -> 40% test, 60% train); random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    housing.data,
    housing.target,
    test_size=0.4,
    random_state=2,
)

# Inspect the resulting shapes
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Peek at the first five training rows
data = pd.DataFrame(X_train)
data.head()

(12384, 8) (8256, 8) (12384,) (8256,)


Unnamed: 0,0,1,2,3,4,5,6,7
0,2.4886,8.0,4.37931,1.172414,728.0,4.183908,34.05,-118.2
1,5.015,37.0,5.578313,1.012048,826.0,3.317269,34.2,-118.58
2,2.9643,38.0,3.740035,1.008666,1061.0,1.838821,34.09,-118.36
3,3.5156,39.0,5.954545,1.0,288.0,2.181818,37.72,-122.14
4,6.7851,45.0,6.957746,1.023474,1067.0,2.504695,37.81,-122.2


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Create the linear model and fit it on the training split
# (fit() returns the estimator itself, so the two steps can be chained).
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test features so the predictions can be compared
# with y_test to judge model quality.
y_pred = model.predict(X_test)

# Mean squared error between the true and the predicted targets
linreg_mse = mean_squared_error(y_test, y_pred)
print("残差平方均值MSE: %.2f" % linreg_mse)

# R^2, the coefficient of determination (goodness of fit):
#   - 1.0 means perfect prediction;
#   - the closer to 1, the better the model;
#   - values near 0 (it can even go negative) indicate a poor model.
linreg_r2 = r2_score(y_test, y_pred)
print('拟合优度R2: %.2f' % linreg_r2)

# Fitted coefficient of each feature, labelled by feature name
pd.Series(model.coef_, index=housing.feature_names)

残差平方均值MSE: 0.53
拟合优度R2: 0.60


MedInc        0.443055
HouseAge      0.009191
AveRooms     -0.110162
AveBedrms     0.579041
Population   -0.000004
AveOccup     -0.003716
Latitude     -0.402303
Longitude    -0.413826
dtype: float64

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest with 80 trees on the same training split, for
# comparison with the linear model.
# random_state seeds the forest's internal randomness so the metrics printed
# below are reproducible on every run; without it each run gives slightly
# different numbers.
model2 = RandomForestRegressor(n_estimators=80, random_state=2)
model2.fit(X_train, y_train)

# Predict on the held-out test features
y_pred2 = model2.predict(X_test)

# Same two metrics as for the linear model: mean squared error and R^2
print("残差平方均值MSE: %.2f" % mean_squared_error(y_test, y_pred2))
print('拟合优度R2: %.2f' % r2_score(y_test,y_pred2))

残差平方均值MSE: 0.27
拟合优度R2: 0.80
