# xgboost 模型的sklearn api调用

In [110]:
# 数据读取以及xy提取
import pandas as pd
import matplotlib.pyplot as plt
# 样例数据读取
df = pd.read_excel('realestate_sample_preprocessed.xlsx')
# 根据共线性矩阵，保留与房价相关性最高的日间人口，将夜间人口和20-39岁夜间人口进行比例处理
def age_percent(row):
    if row['nightpop'] == 0:
        return 0
    else:
        return row['night20-39']/row['nightpop']
df['per_a20_39'] = df.apply(age_percent,axis=1)
# 制作标签变量
price_median = df['average_price'].median()
df['is_high'] = df['average_price'].map(lambda x: True if x>= price_median else False)
# 数据集基本情况查看
# 划分数据集
x = df[['complete_year','area', 'daypop', 'sub_kde',
       'bus_kde', 'kind_kde','nightpop','night20-39','per_a20_39']]
y = df['is_high']

In [111]:
# 构建分类模型的交叉验证策略
from sklearn.model_selection import StratifiedKFold
k = 5
kf = StratifiedKFold(n_splits=k, shuffle=True)
kf.get_n_splits(x, y)
print(kf)

StratifiedKFold(n_splits=5, random_state=None, shuffle=True)


In [112]:
from sklearn.preprocessing import PowerTransformer, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
# pip install xgboost
import xgboost as xgb

xgb_model = xgb.XGBClassifier(objective='binary:hinge',
                             nthread=16,
                             booster='gbtree',
                             n_estimators=500,
                             learning_rate=0.05,
                             max_depth=9,
                             subsample=0.8,
                             colsample_bytree=0.8                          
)

pipe_clf = Pipeline([
        ('sc',StandardScaler()),
        ('power_trans',PowerTransformer()),
        ('polynom_trans',PolynomialFeatures(degree=2)),
        ('xgb_clf', xgb_model)
        ])
print(pipe_clf)

Pipeline(memory=None,
     steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('power_trans', PowerTransformer(copy=True, method='yeo-johnson', standardize=True)), ('polynom_trans', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('xgb_clf', XGBClassifier(base_score=0.5, booster...      reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8))])


## xgboost 模型参数讲解

##### 通用参数：

booster：默认gbtree

- gbtree：基于树的模型
- gbliner：线性模型

nthread：最大线程数

objective：任务类型

- 回归任务：reg:squarederror

- 二元分类任务：
  - binary:logistic（输出概率）
  - binary:hinge（输出分类结果）
- 其他任务类型详解：https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

##### 训练参数，以tree booster为例：

n_estimators： 子学习器数量

learning rate：训练步长

max_depth：树最大深度

max_leaf_nodes:树最大节点或叶子数量

subsample：控制每棵树，训练样本比例

colsample_bytree：控制每棵树，训练特征比例

lambda：L2正则系数

alpha：L1正则系数

其他参数详解：https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [113]:
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
warnings.filterwarnings('ignore')
acc = []
precision = []
recall = []
auc = []
for train_index, test_index in kf.split(x,y):  # 拆分
    x_traincv, x_testcv = x.loc[train_index], x.loc[test_index]
    y_traincv, y_testcv = y.loc[train_index], y.loc[test_index]
    pipe_clf.fit(x_traincv, y_traincv)  # 训练
    y_predictcv = pipe_clf.predict(x_testcv)  # 预测
    k_acc = accuracy_score(y_testcv,y_predictcv)
    print(f'accuracy score is: {k_acc}')
    acc.append(k_acc)
    k_precision = precision_score(y_testcv,y_predictcv)
    print(f'precision score is: {k_precision}')
    precision.append(k_precision)
    k_recall = recall_score(y_testcv,y_predictcv)
    print(f'recall score is: {k_recall}')
    recall.append(k_recall)
    k_auc = roc_auc_score(y_testcv,y_predictcv)
    print(f'auc: {k_auc}')
    auc.append(k_auc)
import numpy as np
print(f'-----------------------------------------------------')
print(f'average accuracy score is: {np.array(acc).mean()}')
print(f'average precision is: {np.array(precision).mean()}')
print(f'average recall is: {np.array(recall).mean()}')
print(f'average auc is: {np.array(auc).mean()}')

accuracy score is: 0.9060758082497213
precision score is: 0.900054914881933
recall score is: 0.9136008918617614
auc: 0.9060758082497213
accuracy score is: 0.9066071926400893
precision score is: 0.9028161236885699
recall score is: 0.9113712374581939
auc: 0.90660586412787
accuracy score is: 0.9157836029001674
precision score is: 0.9067103109656302
recall score is: 0.9269380925822643
auc: 0.9157836029001672
accuracy score is: 0.9079754601226994
precision score is: 0.8905499199145755
recall score is: 0.9302844394868934
auc: 0.9079754601226994
accuracy score is: 0.9012827663134412
precision score is: 0.9039865244244806
recall score is: 0.8979364194088121
auc: 0.9012827663134412
-----------------------------------------------------
average accuracy score is: 0.9075449660452237
average precision is: 0.900823558775038
average recall is: 0.9160262161595851
average auc is: 0.9075447003427799
