In [0]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,roc_auc_score,classification_report,confusion_matrix
import matplotlib.pyplot as plt
plt.xkcd()
%matplotlib inline

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# read data
train = pd.read_csv('/content/drive/My Drive/mine/dataset/GBDT_test.csv')
target = 'Disbursed' # label
IDcol = 'ID'
train['Disbursed'].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

类别0的占大多数。

In [0]:
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
y = train['Disbursed']

In [0]:
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X,y)
y_pred = gbm0.predict(X)
y_predprob = gbm0.predict_proba(X)[:,1]
print(classification_report(y.values, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     19680
           1       1.00      0.08      0.14       320

    accuracy                           0.99     20000
   macro avg       0.99      0.54      0.57     20000
weighted avg       0.99      0.99      0.98     20000



首先我们从步长(learning rate)和迭代次数(n_estimators)入手。一般来说,开始选择一个较小的步长来网格搜索最好的迭代次数。这里，我们将步长初始值设置为0.1。

In [0]:
param_test1 = {'n_estimators':range=GradientBoostingClassifier(learning_rate=0.1,
                                 min_samples_split=300,
                                 min_samples_leaf=20,
                                 max_depth=8,
                                 max_features='sqrt',
                                 subsample=0.8,
                                 random_state=10), 
         param_grid=param_test1,
         scoring='roc_auc',
         iid=False,
         cv=5)
gsearch1.fit(X,y)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

{'n_estimators': 60}
0.8192660696138212


输出如下，可见最好的迭代次数是60。

找到了一个合适的迭代次数，现在我们开始对决策树进行调参。首先我们对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索。

In [0]:
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(100,801,200)}
gsearch2 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1,
                                n_estimators=60,
                                min_samples_leaf=20, 
                                max_features='sqrt',
                                subsample=0.8,
                                random_state=10), 
             param_grid=param_test2,
             scoring='roc_auc',
             iid=False,
             cv=5)
gsearch2.fit(X,y)

{'n_estimators': 60}
0.8192660696138212


In [0]:
print(gsearch2.best_params_)
print(gsearch2.best_score_)

{'max_depth': 7, 'min_samples_split': 300}
0.8213724275914632


最好的最大树深度是7，内部节点再划分所需最小样本数是300。

　由于决策树深度7是一个比较合理的值，我们把它定下来，对于内部节点再划分所需最小样本数min_samples_split，我们暂时不能一起定下来，因为这个还和决策树其他的参数存在关联。下面我们再对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。

In [0]:
param_test3 = {'min_samples_split':range(800,1900,200), 'min_samples_leaf':range(60,101,10)}
gsearch3 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1,
                                n_estimators=60,
                                max_depth=7,
                                max_features='sqrt',
                                subsample=0.8,
                                random_state=10), 
             param_grid=param_test3,
             scoring='roc_auc',
             iid=False,
             cv=5)
gsearch3.fit(X,y)
print(gsearch3.best_params_)
print(gsearch3.best_score_)

{'min_samples_leaf': 60, 'min_samples_split': 1200}
0.8222032996697154


In [0]:
gbm1 = GradientBoostingClassifier(learning_rate=0.1,
                  n_estimators=60,
                  max_depth=7,
                  min_samples_leaf =60, 
                  min_samples_split =1200,
                  max_features='sqrt',
                  subsample=0.8,
                  random_state=10)
gbm1.fit(X,y)
y_pred = gbm1.predict(X)
print(classification_report(y.values, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     19680
           1       0.00      0.00      0.00       320

    accuracy                           0.98     20000
   macro avg       0.49      0.50      0.50     20000
weighted avg       0.97      0.98      0.98     20000



  'precision', 'predicted', average, warn_for)


对比我们最开始完全不调参的拟合效果，可见精确度稍有下降，主要原理是我们使用了0.8的子采样，20%的数据没有参与拟合。

对最大特征数max_features进行调参。

In [0]:
param_test4 = {'max_features':range(7,20,2)}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1,
                                  n_estimators=60,
                                  max_depth=7,
                                  min_samples_leaf =60, 
                                  min_samples_split =1200,
                                  subsample=0.8,
                                  random_state=10), 
              param_grid=param_test4,
              scoring='roc_auc',
              iid=False,
              cv=5)
gsearch4.fit(X,y)
print(gsearch4.best_params_)
print(gsearch4.best_score_)

{'max_features': 9}
0.822412506351626


最合适的参数为9。

In [35]:
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1, 
                                n_estimators=60,
                                max_depth=7,
                                min_samples_leaf=60, 
                                min_samples_split=1200,
                                max_features=9,
                                random_state=10), 
             param_grid=param_test5,
             scoring='roc_auc',
             iid=False,
             cv=5)
gsearch5.fit(X,y)
print(gsearch5.best_params_)
print(gsearch5.best_score_)

{'subsample': 0.7}
0.8234378969766262


最合适的参数为0.7。

In [36]:
gbm2 = GradientBoostingClassifier(learning_rate=0.1,
                  n_estimators=60,
                  max_depth=7,
                  min_samples_leaf=60, 
                  min_samples_split=1200,
                  max_features=9,
                  subsample=0.7,
                  random_state=10)
gbm2.fit(X,y)
y_pred = gbm2.predict(X)
print(classification_report(y.values, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     19680
           1       0.00      0.00      0.00       320

    accuracy                           0.98     20000
   macro avg       0.49      0.50      0.50     20000
weighted avg       0.97      0.98      0.98     20000



  'precision', 'predicted', average, warn_for)


可以看到AUC分数比起之前的版本稍有下降，这个原因是我们为了增加模型泛化能力，为防止过拟合而减半步长，最大迭代次数加倍，同时减小了子采样的比例，从而减少了训练集的拟合程度。