In [0]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pylab as plt
plt.xkcd()
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report,confusion_matrix

In [0]:
from sklearn.datasets.samples_generator import make_classification
# X为样本特征，y为样本类别输出，共10000个样本，每个样本20个特征，输出有2个类别，没有冗余特征，每个类别一个簇
X, y = make_classification(n_samples=10000,
               n_features=20,
               n_redundant=0,
               n_clusters_per_class=1,
               n_classes=2,
               flip_y=0.1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [0]:
dtrain = xgb.DMatrix(X_train,y_train)
dtest = xgb.DMatrix(X_test,y_test)

随机初始化了一个二分类的数据集，然后分成了训练集和验证集。使用训练集和验证集分别初始化了一个DMatrix，有了DMatrix，就可以做训练和预测了。

In [0]:
param = {'max_depth':5, 'eta':0.5, 'verbosity':1, 'objective':'binary:logistic'}
raw_model = xgb.train(param, dtrain, num_boost_round=20)
pred_train_raw = raw_model.predict(dtrain)
for i in range(len(pred_train_raw)):
    if pred_train_raw[i] > 0.5:
         pred_train_raw[i]=1
    else:
        pred_train_raw[i]=0               
print(classification_report(dtrain.get_label(), pred_train_raw))

              precision    recall  f1-score   support

         0.0       0.91      0.95      0.93      3723
         1.0       0.95      0.90      0.93      3777

    accuracy                           0.93      7500
   macro avg       0.93      0.93      0.93      7500
weighted avg       0.93      0.93      0.93      7500



训练集准确率0.93。

In [0]:
pred_test_raw = raw_model.predict(dtest)
for i in range(len(pred_test_raw)):
    if pred_test_raw[i] > 0.5:
         pred_test_raw[i]=1
    else:
        pred_test_raw[i]=0
print(classification_report(dtest.get_label(), pred_test_raw))   

              precision    recall  f1-score   support

         0.0       0.87      0.92      0.89      1244
         1.0       0.92      0.86      0.89      1256

    accuracy                           0.89      2500
   macro avg       0.89      0.89      0.89      2500
weighted avg       0.89      0.89      0.89      2500



测试机准确率0.89。

XGBoost原生风格接口。

In [0]:
sklearn_model_raw = xgb.XGBClassifier(**param)
sklearn_model_raw.fit(X_train,
           y_train,
           early_stopping_rounds=10,
           eval_metric="error",
           eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.1136
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.1112
[2]	validation_0-error:0.1048
[3]	validation_0-error:0.1056
[4]	validation_0-error:0.1044
[5]	validation_0-error:0.1044
[6]	validation_0-error:0.1056
[7]	validation_0-error:0.104
[8]	validation_0-error:0.1048
[9]	validation_0-error:0.1032
[10]	validation_0-error:0.1028
[11]	validation_0-error:0.1024
[12]	validation_0-error:0.1016
[13]	validation_0-error:0.102
[14]	validation_0-error:0.1032
[15]	validation_0-error:0.1024
[16]	validation_0-error:0.1028
[17]	validation_0-error:0.1016
[18]	validation_0-error:0.1012
[19]	validation_0-error:0.1016
[20]	validation_0-error:0.1028
[21]	validation_0-error:0.1024
[22]	validation_0-error:0.1024
[23]	validation_0-error:0.1024
[24]	validation_0-error:0.102
[25]	validation_0-error:0.1028
[26]	validation_0-error:0.1028
[27]	validation_0-error:0.1028
[28]	validation_0-error:0.1028
Stopping. Best iteration:
[18]	validation_0-erro

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.5, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

XGBoost的sklearn风格接口。

In [0]:
sklearn_model_new = xgb.XGBClassifier(max_depth=5,
                    learning_rate=0.5,
                    verbosity=1,
                    objective='binary:logistic',
                    random_state=1)
sklearn_model_new.fit(X_train,
           y_train,
           early_stopping_rounds=10,
           eval_metric="error",
           eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.1136
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.106
[2]	validation_0-error:0.1048
[3]	validation_0-error:0.104
[4]	validation_0-error:0.1048
[5]	validation_0-error:0.1064
[6]	validation_0-error:0.1052
[7]	validation_0-error:0.1064
[8]	validation_0-error:0.1064
[9]	validation_0-error:0.1072
[10]	validation_0-error:0.1088
[11]	validation_0-error:0.1096
[12]	validation_0-error:0.1076
[13]	validation_0-error:0.1088
Stopping. Best iteration:
[3]	validation_0-error:0.104



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.5, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

**使用sklearn的GridSearchCV调参**

一般固定步长，先调好框架参数n_estimators，再调弱学习器参数max_depth，min_child_weight,gamma等，接着调正则化相关参数subsample，colsample_byXXX, reg_alpha以及reg_lambda,最后固定前面调好的参数，来调步长learning_rate

In [0]:
gsCv = GridSearchCV(sklearn_model_new,
           {'max_depth':[4,5,6], 'n_estimators':[5,10,20]})
gsCv.fit(X_train,y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.5, max_delta_step=0,
                                     max_depth=5, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=1, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [4, 5, 6], 'n_estimators': [5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       

In [0]:
print(gsCv.best_params_)
print(gsCv.best_score_)

{'max_depth': 5, 'n_estimators': 10}
0.8928


In [0]:
sklearn_model_new2 = xgb.XGBClassifier(max_depth=5,
                     n_estimators=10,
                     verbosity=1,objective='binary:logistic',
                     random_state=1)
gsCv2 = GridSearchCV(sklearn_model_new2, 
            {'learning_rate ':[0.3,0.5,0.7]})
gsCv2.fit(X_train,y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=5, min_child_weight=1,
                                     missing=None, n_estimators=10, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=1, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'learning_rate ': [0.3, 0.5, 0.7]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None,

In [0]:
print(gsCv2.best_params_)
print(gsCv2.best_score_)

{'learning_rate ': 0.3}
0.8918666666666667


In [0]:
sklearn_model_new2 = xgb.XGBClassifier(max_depth=4,
                     learning_rate=0.3,
                     verbosity=1,
                     objective='binary:logistic',
                     n_estimators=10)
sklearn_model_new2.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="error",
        eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.1172
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.1112
[2]	validation_0-error:0.11
[3]	validation_0-error:0.1076
[4]	validation_0-error:0.1032
[5]	validation_0-error:0.1032
[6]	validation_0-error:0.1024
[7]	validation_0-error:0.1024
[8]	validation_0-error:0.1028
[9]	validation_0-error:0.1032


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
pred_test_new = sklearn_model_new2.predict(X_test)
print(classification_report(dtest.get_label(), pred_test_new))

              precision    recall  f1-score   support

         0.0       0.87      0.94      0.90      1244
         1.0       0.93      0.86      0.89      1256

    accuracy                           0.90      2500
   macro avg       0.90      0.90      0.90      2500
weighted avg       0.90      0.90      0.90      2500



可见准确率还是增加了的。