In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
#处理缺失值
from sklearn.model_selection import StratifiedShuffleSplit
#用于分割数据集
from sklearn.preprocessing import OrdinalEncoder
#处理文本数据，转换器
from sklearn.preprocessing import OneHotEncoder
#处理文本数据，独热编码
from sklearn.base import BaseEstimator,TransformerMixin
#自定义转换器
from sklearn.pipeline import Pipeline
#转换流水线
from sklearn.preprocessing import StandardScaler
#估算器， 就是转换器
from sklearn.compose import ColumnTransformer
#定义处理所有列的转换器
from sklearn.linear_model import LinearRegression
#普通最小二乘回归模型
from sklearn.metrics import mean_squared_error
#均方误差
from sklearn.tree import DecisionTreeRegressor
#决策树
from sklearn.ensemble import RandomForestRegressor
#随机森林
from sklearn.model_selection import cross_val_score
#K-折交叉验证
from sklearn.model_selection import GridSearchCV
#参数值所有可能的组合进行模型评估
from scipy import stats
#计算泛化误差的置信区间

In [3]:
def load_housing_data():
    csv_path = "./datasets/housing/housing.csv"
    return pd.read_csv(csv_path)
h_info = load_housing_data()

# 数据分析及可视化

In [4]:
h_info = load_housing_data()
h_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
h_info.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
h_info['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [7]:
h_info.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [3]:
h_info['income_cat'] = pd.cut(h_info['median_income'],bins=[0.,1.5,3.0,4.5,6.,np.inf],labels = [1,2,3,4,5])
#数据分割成离散的区间
split = StratifiedShuffleSplit(n_splits = 1,test_size = 0.2,random_state = 42)
#首相通过参数构建StratifiedShuffleSplit()对象.使用对象的split方法分割，返回的是分组后数在原数组中的索引
for train_index,test_index in split.split(h_info,h_info['income_cat']):
    strat_train_set = h_info.loc[train_index]
    #取出索引对应的值
    strat_test_set = h_info.loc[test_index]

In [4]:
h_info = strat_train_set.drop("median_house_value",axis = 1)
#axis=1删除指定列，axis=0删除指定行   drop 产生新的数据而不对原数据进行修改
h_labels = strat_train_set['median_house_value'].copy()

### 处理数据中的空值
   * 放弃这些相应的区域
   * 放弃整个属性
   * 将缺失的值设置为某个值（0，平均数或者中位数）

In [5]:
#median = h_info['total_bedrooms'].median
#h_info['total_bedrooms'].fillna(median,inplace = True)

In [6]:
h_info['total_bedrooms'].median

<bound method Series.median of 17606     351.0
18632     108.0
14650     471.0
3230      371.0
3555     1525.0
          ...  
6563      236.0
12053     294.0
13908     872.0
11159     380.0
15775     682.0
Name: total_bedrooms, Length: 16512, dtype: float64>

In [7]:
imputer = SimpleImputer(strategy="median")
housing_num = h_info.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
imputer.statistics_

array([-118.51  ,   34.26  ,   29.    , 2119.5   ,  433.    , 1164.    ,
        408.    ,    3.5409,    3.    ])

In [8]:
X = imputer.statistics_

### 处理文本数据（从文本转到文字）

In [9]:
housing_cat = h_info[['ocean_proximity']]
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]



array([[0.],
       [0.],
       [4.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [10]:
type(housing_cat)

pandas.core.frame.DataFrame

In [11]:
ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

### 自定义转换器

In [12]:
rooms_ix, bedrooms_ix, population_ix, households_ix = 3,4,5,6

class CombinedAttributesAdder(BaseEstimator,TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        rooms_per_household = X[:,rooms_ix] / X[:,households_ix]
        population_per_household = X[:,population_ix] / X[:,households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:,bedrooms_ix] / X[:,rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
#attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
#housing_extra_attribs = attr_adder.transform(h_info.values)

In [15]:
#housing_extra_attribs.shape

In [16]:
h_info.values.shape

(16512, 10)

### 转换流水线

In [17]:
num_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy = "median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num = h_info.drop("ocean_proximity",axis=1)
housing_num_tr = num_pipeline.fit_transform(housing_num)

### 定义处理所有列的转换器

In [18]:
housing_num = h_info.drop('ocean_proximity',axis = 1)
num_attribs = list(housing_num)
#所有数值列表
cat_attribs = ['ocean_proximity']
#文本列表

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    #数值转换流水线
    ('cat', OneHotEncoder(), cat_attribs),
    #文本独热编码处理方式
])
housing_prepared = full_pipeline.fit_transform(h_info)

# 训练和评估训练集

### 普通最小二乘线性回归

In [27]:
#交叉验证结果输出函数2
def display_scoores(scores):
    print("Scores:", scores)
    print("Mean", scores.mean())
    print("Standard deviation:", scores.std())

In [21]:
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, h_labels)
some_data = h_info.iloc[:5]
#选取前5行数据
some_labels = h_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_data_prepared))
print("Labels:", list(some_labels))

Predictions: [203682.37379543 326371.39370781 204218.64588245  58685.4770482
 194213.06443039]
Labels: [286600.0, 340600.0, 196900.0, 46300.0, 254500.0]


In [29]:
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(h_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)
#使用交叉验证评估模型
scores = cross_val_score(lin_reg, housing_prepared, h_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-scores)
display_scoores(lin_rmse_scores)

68376.64295459937
Scores: [66877.52325028 66608.120256   70575.91118868 74179.94799352
 67683.32205678 71103.16843468 64782.65896552 67711.29940352
 71080.40484136 67687.6384546 ]
Mean 68828.99948449328
Standard deviation: 2662.7615706103393


### 决策树

In [28]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, h_labels)
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(h_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)
#使用交叉验证评估模型
scores = cross_val_score(tree_reg,housing_prepared, h_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
display_scoores(tree_rmse_scores)

0.0
Scores: [68312.75391796 68230.27423689 72354.17942141 69086.15310947
 70561.89267187 74041.09806197 71552.64139197 70841.04352542
 77336.07317557 68823.80821187]
Mean 71113.99177243975
Standard deviation: 2737.0033247976735


### 随机森林

In [30]:
forest_reg = RandomForestRegressor()
forest_reg.fit(housing_prepared, h_labels)
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(h_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)
print(forest_rmse)
#使用交叉验证评估模型
scores = cross_val_score(forest_reg,housing_prepared, h_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
display_scoores(forest_rmse_scores)

18691.78182577601
Scores: [49407.17484375 47761.8517321  49965.2246639  52081.78625424
 49717.87489218 53703.86823605 48941.36796537 47800.25715747
 53106.14155404 50132.70119765]
Mean 50261.82484967395
Standard deviation: 1960.4540278799327


In [33]:
param_grid = [
    {'n_estimators':[3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap' : [False], 'n_estimators':[3,10], 'max_features':[2,3,4]},
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, h_labels)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [34]:
#最佳参数组合
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [35]:
#直接得到最好的估算器
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30)

In [39]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
Y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

48597.921098378596


### 计算泛化误差的95%置信区间

In [43]:
confidence = 0.95
squared_errors = (final_predictions - Y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors)-1,
                        loc=squared_errors.mean(),
                        scale = stats.sem(squared_errors)))

array([46578.92240675, 50536.32216134])

In [47]:
type(final_predictions)
fi

numpy.ndarray