# 随机森林模型的sklearn实现

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
# 样例数据读取
df = pd.read_excel('realestate_sample_preprocessed.xlsx')
# 根据共线性矩阵，保留与房价相关性最高的日间人口，将夜间人口和20-39岁夜间人口进行比例处理
def age_percent(row):
    if row['nightpop'] == 0:
        return 0
    else:
        return row['night20-39']/row['nightpop']
df['per_a20_39'] = df.apply(age_percent,axis=1)
df = df.drop(columns=['nightpop','night20-39'])
# 制作标签变量
price_median = df['average_price'].median()
print(price_median)
df['is_high'] = df['average_price'].map(lambda x: True if x>= price_median else False)
print(df['is_high'].value_counts())
# 数据集基本情况查看
print(df.shape)
print(df.dtypes)
print(df.isnull().sum())

30273.0
True     449
False    449
Name: is_high, dtype: int64
(898, 10)
id                 int64
complete_year      int64
average_price    float64
area             float64
daypop           float64
sub_kde          float64
bus_kde          float64
kind_kde         float64
per_a20_39       float64
is_high             bool
dtype: object
id               0
complete_year    0
average_price    0
area             0
daypop           0
sub_kde          0
bus_kde          0
kind_kde         0
per_a20_39       0
is_high          0
dtype: int64


In [20]:
# 留出法进行数据集划分
# 载入sklearn中数据集划分的方法
from sklearn.model_selection import train_test_split
# 将数据集划分成训练集和验证集：划分比例0.75训练，0.25验证
training, testing = train_test_split(df,test_size=0.25, random_state=1)
# 提取训练集中的x与y
x_train = training.copy()[['complete_year','area', 'daypop', 'sub_kde',
       'bus_kde', 'kind_kde','per_a20_39']]
y_train = training.copy()['is_high']
# 提取验证集中的x与y
x_test = testing.copy()[['complete_year','area', 'daypop', 'sub_kde',
       'bus_kde', 'kind_kde','per_a20_39']]
y_test = testing.copy()['is_high']
print(f'the shape of training set is: {training.shape}')
print(f'the shape of testing set is: {testing.shape}')

the shape of training set is: (673, 10)
the shape of testing set is: (225, 10)


In [21]:
from sklearn.preprocessing import PowerTransformer, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(criterion='gini',
                                  n_jobs=16,
                                  max_features = 'auto',
                                  n_estimators = 100,
                                  max_depth = None,
                                  random_state=133)
pipe_clf = Pipeline([
        ('sc',StandardScaler()),
        ('power_trans',PowerTransformer()),
        ('polynom_trans',PolynomialFeatures(degree=2)),
        ('rf_clf', rf_model)
        ])
print(pipe_clf)

Pipeline(memory=None,
     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('power_trans', PowerTransformer(copy=True, method='yeo-johnson', standardize=True)), ('polynom_trans', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('rf_clf', RandomForestClassifier(bootstrap=True,...mators=100, n_jobs=16,
            oob_score=False, random_state=133, verbose=0, warm_start=False))])


## 随机森林模型参数讲解

#### 特有参数：

**n_estimators**: 也就是子学习器的个数

**max_features**: 每棵树选择的特征的最大数量，默认是"auto"

**bootstrap**： 默认True，构建决策树的时候是否使用有放回的抽样方式构建训练数据

#### 决策树模型参数：

**criterion:**做划分时对特征的评价标准默认是基尼系数gini,另一个可选择的标准是信息增益。回归树默认是均方差mse，另一个可是绝对值差mae。

**max_depth**: 每棵树的最大深度，默认None

**min_samples_split**: 内部节点再划分所需最小样本数

**min_samples_leaf**: 叶子节点最少样本数

**max_leaf_nodes**:  最大叶子节点数

In [22]:
# 查看留出法验证集上模型的表现
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
warnings.filterwarnings('ignore')
pipe_clf.fit(x_train,y_train)
y_predict = pipe_clf.predict(x_test)
print(f'accuracy score is: {accuracy_score(y_test,y_predict)}')
print(f'precision score is: {precision_score(y_test,y_predict)}')
print(f'recall score is: {recall_score(y_test,y_predict)}')
print(f'auc: {roc_auc_score(y_test,y_predict)}')

accuracy score is: 0.8622222222222222
precision score is: 0.8761061946902655
recall score is: 0.853448275862069
auc: 0.862503954444796
