#### 随机森林模型的sklearn实现

In [None]:
#数据读取以及xy提取
import pandas as pd
import matplotlib as plt
# Load the sample data.
# NOTE(review): the path passed to read_excel is an empty string, presumably a
# placeholder -- supply the real workbook path before running this cell.
df = pd.read_excel('')
# Per the collinearity matrix, the daytime population (the feature most
# correlated with house price) is kept, while the night-time population and
# the night-time population aged 20-29 are combined into a ratio.
def age_percent(row):
    """Return the share of the night-time population that is aged 20-29.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) exposing the keys
            '夜间人口' and '夜间20-29岁人口'.

    Returns:
        0 when the night-time population is zero (avoids dividing by zero),
        otherwise the 20-29 night-time population divided by the total
        night-time population.
    """
    night_total = row['夜间人口']
    return 0 if night_total == 0 else row['夜间20-29岁人口'] / night_total
# Replace the two raw night-time columns with a single ratio feature.
df['per_a20-29'] = df.apply(age_percent, axis=1)
df = df.drop(columns=['夜间人口', '夜间20-29岁人口'])
# Build the binary label: True when the price is at or above the median.
# (The original line was missing the '.' before median(), a syntax error.)
price_median = df['房价'].median()
df['is_high'] = df['房价'] >= price_median
print(df['is_high'].value_counts())
# Basic overview of the dataset.
print(df.shape)
print(df.dtypes)
# Extract the feature matrix and the label.
# The raw night-time columns were dropped above, so selecting them raised a
# KeyError; the derived ratio column is used in their place.
# TODO: append the remaining feature column names here -- the template left
# three blank '' slots, which are not valid column names.
x = df[['日间人口', 'per_a20-29']]
print(x.shape)
y = df[['is_high']]
print(y.shape)
# Count missing values per column.
print(df.isnull().sum())

In [None]:
#留出法进行数据集划分
#载入sklearn中数据集划分的方法
from sklearn.model_selection import train_test_split
# Hold-out split: 75% training / 25% validation, with a fixed seed so the
# partition is reproducible.
training, testing = train_test_split(df, test_size=0.25, random_state=1)

# Feature columns fed to the model. The raw night-time columns no longer exist
# in df (they were replaced by the 'per_a20-29' ratio), so selecting them
# raised a KeyError.
# TODO: append the remaining feature column names here -- the template left
# three blank '' slots, which are not valid column names.
feature_cols = ['日间人口', 'per_a20-29']

# Features and label of the training set (select first, then copy, so only
# the needed columns are duplicated).
x_train = training[feature_cols].copy()
y_train = training['is_high'].copy()

# Features and label of the validation set.
x_test = testing[feature_cols].copy()
y_test = testing['is_high'].copy()
print(f'the shape of training set is : {training.shape}')
print(f'the shape of testing set is : {testing.shape}')

In [None]:
from sklearn.preprocessing import PowerTransformer, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
# Random-forest classifier used as the final estimator of the pipeline.
# max_features='sqrt' replaces the former 'auto' value: for classifiers the two
# were equivalent, but 'auto' was deprecated in scikit-learn 1.1 and removed in
# 1.3, where it is rejected as an invalid parameter.
rf_model = RandomForestClassifier(criterion='gini',
                                  n_jobs=16,
                                  max_features='sqrt',
                                  n_estimators=100,
                                  max_depth=None,
                                  random_state=133)
# Preprocessing + model pipeline: standardize, apply a power transform,
# expand to degree-2 polynomial features, then fit the random forest.
pipe_clf = Pipeline([
        ('sc', StandardScaler()),
        ('power_trans', PowerTransformer()),
        ('polynom_trans', PolynomialFeatures(degree=2)),
        ('rf_clf', rf_model)
        ])
print(pipe_clf)

##### 随机森林模型参数讲解
特有参数：
n_estimators:子学习器（决策树）的个数
max_features:每次节点划分时考虑的最大特征数；分类器中旧版本的默认值"auto"等价于"sqrt"，新版本（scikit-learn 1.3 起）已移除"auto"，默认为"sqrt"
bootstrap:默认True，构建决策树的时候是否使用有放回的抽样方式构建训练数据

决策树模型参数：
criterion:做划分时对特征的评价标准，分类树默认是基尼系数gini，另一个可选择的标准是信息增益（entropy）。回归树默认是均方差mse（新版本中名为squared_error），另一个是绝对值差mae（新版本中名为absolute_error）。
max_depth:每棵树的最大深度，默认None
min_samples_split:内部节点再划分所需的最小样本数
min_samples_leaf:叶子节点最少样本数
max_leaf_nodes:最大叶子节点数

In [None]:
#查看留出法验证集上模型的表现
import  warnings
from    sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score
# Silence all warnings for the rest of the session.
# NOTE(review): this is a global filter and also hides deprecation and
# convergence warnings; consider narrowing it.
warnings.filterwarnings('ignore')
# Fit the whole pipeline on the training split, then score the validation split.
pipe_clf.fit(x_train, y_train)
y_predict = pipe_clf.predict(x_test)
# Probability of the positive class. With a boolean label, classes_ is sorted
# as [False, True], so column 1 is the 'is_high' probability.
y_score = pipe_clf.predict_proba(x_test)[:, 1]
print(f'accuracy score is : {accuracy_score(y_test,y_predict)}')
print(f'precision score is : {precision_score(y_test,y_predict)}')
print(f'recall score is : {recall_score(y_test,y_predict)}')
# AUC must be computed from continuous scores: passing hard 0/1 predictions
# collapses the ROC curve to a single operating point.
print(f'auc  : {roc_auc_score(y_test,y_score)}')