In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

import matplotlib.pyplot as plt
%matplotlib inline



In [5]:
train = pd.read_csv('train_modified.csv')
target = 'Disbursed' #Disbursed的值就是二元分类的输出
IDcol = 'ID'
train['Disbursed'].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

In [6]:
train.head()

Unnamed: 0,Disbursed,Existing_EMI,ID,Loan_Amount_Applied,Loan_Tenure_Applied,Monthly_Income,Var4,Var5,Age,EMI_Loan_Submitted_Missing,...,Var2_2,Var2_3,Var2_4,Var2_5,Var2_6,Mobile_Verified_0,Mobile_Verified_1,Source_0,Source_1,Source_2
0,0,0.0,ID000002C20,300000,5,20000,1,0,37,1,...,0,0,0,0,1,1,0,1,0,0
1,0,0.0,ID000004E40,200000,2,35000,3,13,30,0,...,0,0,0,0,1,0,1,1,0,0
2,0,0.0,ID000007H20,600000,4,22500,1,0,34,1,...,0,0,0,0,0,0,1,0,0,1
3,0,0.0,ID000008I30,1000000,5,35000,3,10,28,1,...,0,0,0,0,0,0,1,0,0,1
4,0,25000.0,ID000009J40,500000,2,100000,3,17,31,1,...,0,0,0,0,0,0,1,0,0,1


### 下面我们得到训练集

In [7]:
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
y = train['Disbursed']

In [19]:
type(y),type(y.values)

(pandas.core.series.Series, numpy.ndarray)

### 不管任何参数，都用默认的，我们拟合下数据看：

In [23]:
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X, y)
y_pred = gbm0.predict(X)
y_predprob = gbm0.predict_proba(X)[:, 1]

print "Accuracy : %.4g" % metrics.accuracy_score(y, y_pred)
print "AUC Score(Train) : %f" % metrics.roc_auc_score(y.values, y_predprob)

Accuracy : 0.9852
AUC Score(Train) : 0.900531


### 输出如上，可见拟合还可以，
### 我们下面看看怎么通过调参提高模型的泛化能力：

### 1. 首先我们从步长(learning rate)和迭代次数(n_estimators)入手。一般来说,开始选择一个较小的步长来网格搜索最好的迭代次数。这里，我们将步长初始值设置为0.1。对于迭代次数进行网格搜索如下：

In [24]:
param_test1 = {'n_estimators': range(20, 81, 10)}


gbdt1 = GradientBoostingClassifier(learning_rate=0.1,
                                   min_samples_split=300,
                                   min_samples_leaf=20,
                                   max_depth=8,
                                   max_features='sqrt',
                                   subsample=0.8,
                                   random_state=10)

gsearch1 = GridSearchCV(estimator = gbdt1, param_grid = param_test1,
                        scoring = 'roc_auc', iid = False,
                        cv = 5)

gsearch1.fit(X, y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.81285, std: 0.01967, params: {'n_estimators': 20},
  mean: 0.81438, std: 0.01947, params: {'n_estimators': 30},
  mean: 0.81404, std: 0.01970, params: {'n_estimators': 40},
  mean: 0.81593, std: 0.01868, params: {'n_estimators': 50},
  mean: 0.81927, std: 0.01596, params: {'n_estimators': 60},
  mean: 0.81722, std: 0.01750, params: {'n_estimators': 70},
  mean: 0.81485, std: 0.01732, params: {'n_estimators': 80}],
 {'n_estimators': 60},
 0.8192660696138212)

### 输出如下，可见最好的迭代次数是60。

### 2.找到了一个合适的迭代次数，现在我们开始对决策树进行调参。首先我们对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索。

In [25]:
param_test2 = {'max_depth': range(3, 14, 2),
               'min_samples_split': range(100, 801, 200)}

gbdt2 = GradientBoostingClassifier(learning_rate=0.1,
                                   n_estimators = 60,
                                   min_samples_leaf = 20,
                                   max_features = 'sqrt',
                                   subsample = 0.8,
                                   random_state = 10)
gsearch2 = GridSearchCV(estimator = gbdt2,
                        param_grid = param_test2,
                        scoring = 'roc_auc',
                        iid = False, cv = 5)

gsearch2.fit(X, y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.81199, std: 0.02073, params: {'min_samples_split': 100, 'max_depth': 3},
  mean: 0.81267, std: 0.01985, params: {'min_samples_split': 300, 'max_depth': 3},
  mean: 0.81238, std: 0.01937, params: {'min_samples_split': 500, 'max_depth': 3},
  mean: 0.80925, std: 0.02051, params: {'min_samples_split': 700, 'max_depth': 3},
  mean: 0.81846, std: 0.01843, params: {'min_samples_split': 100, 'max_depth': 5},
  mean: 0.81630, std: 0.01810, params: {'min_samples_split': 300, 'max_depth': 5},
  mean: 0.81315, std: 0.01898, params: {'min_samples_split': 500, 'max_depth': 5},
  mean: 0.81262, std: 0.02090, params: {'min_samples_split': 700, 'max_depth': 5},
  mean: 0.81826, std: 0.02030, params: {'min_samples_split': 100, 'max_depth': 7},
  mean: 0.82137, std: 0.01733, params: {'min_samples_split': 300, 'max_depth': 7},
  mean: 0.81703, std: 0.01773, params: {'min_samples_split': 500, 'max_depth': 7},
  mean: 0.81383, std: 0.02327, params: {'min_samples_split': 700, 'max_depth': 7},
  me

### 输出如上，可见最好的最大树深度是7，内部节点再划分所需最小样本数是300。

###  由于决策树深度7是一个比较合理的值，我们把它定下来，对于内部节点再划分所需最小样本数min_samples_split，我  们暂时不能一起定下来，因为这个还和决策树其他的参数存在关联。
### 3. 下面我们再对min_samples_split和min_samples_leaf一起调参。 

In [26]:
param_test3 = {'min_samples_split': range(800, 1900, 200),
               'min_samples_leaf': range(60, 101, 10)}

gbdt3 = GradientBoostingClassifier(learning_rate = 0.1,
                                   n_estimators = 60,
                                   max_depth = 7,
                                   max_features = 'sqrt',
                                   subsample = 0.8,
                                   random_state = 10)
gsearch3 = GridSearchCV(estimator = gbdt3,
                        param_grid = param_test3,
                        scoring = 'roc_auc',
                        iid = False, cv = 5)

gsearch3.fit(X, y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.81828, std: 0.02251, params: {'min_samples_split': 800, 'min_samples_leaf': 60},
  mean: 0.81731, std: 0.02344, params: {'min_samples_split': 1000, 'min_samples_leaf': 60},
  mean: 0.82220, std: 0.02250, params: {'min_samples_split': 1200, 'min_samples_leaf': 60},
  mean: 0.81447, std: 0.02125, params: {'min_samples_split': 1400, 'min_samples_leaf': 60},
  mean: 0.81495, std: 0.01626, params: {'min_samples_split': 1600, 'min_samples_leaf': 60},
  mean: 0.81528, std: 0.02140, params: {'min_samples_split': 1800, 'min_samples_leaf': 60},
  mean: 0.81590, std: 0.02517, params: {'min_samples_split': 800, 'min_samples_leaf': 70},
  mean: 0.81573, std: 0.02207, params: {'min_samples_split': 1000, 'min_samples_leaf': 70},
  mean: 0.82021, std: 0.02521, params: {'min_samples_split': 1200, 'min_samples_leaf': 70},
  mean: 0.81512, std: 0.01995, params: {'min_samples_split': 1400, 'min_samples_leaf': 70},
  mean: 0.81395, std: 0.02081, params: {'min_samples_split': 1600, 'min_samples_le

### 我们调了这么多参数了，终于可以都放到GBDT类里面去看看效果了。现在我们用新参数拟合数据：

In [37]:
gbm1 = GradientBoostingClassifier(learning_rate = 0.1,
                                  n_estimators = 60,
                                  max_depth = 7,
                                  min_samples_leaf = 60,
                                  min_samples_split = 1200,
                                  max_features = 'sqrt',
                                  subsample = 0.8,
                                  random_state = 10)
gbm1.fit(X, y)

y_pred = gbm1.predict(X)
y_predprob = gbm1.predict_proba(X)[:, 1]
y_predprob_test = gbm1.predict_proba(X)
print "Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred)
print "AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob)

Accuracy : 0.984
AUC Score (Train): 0.908099


In [31]:
y_predprob_test #是类别0 的概率

array([[ 0.99260559,  0.00739441],
       [ 0.93975463,  0.06024537],
       [ 0.99157703,  0.00842297],
       ..., 
       [ 0.99840288,  0.00159712],
       [ 0.99294406,  0.00705594],
       [ 0.99856449,  0.00143551]])

### 对比我们最开始完全不调参的拟合效果，可见精确度稍有下降，主要原理是我们使用了0.8的子采样，20%的数据没有参与拟合。

### 4. 现在我们再对最大特征数max_features进行网格搜索。

In [38]:
param_test4 = {'max_features': range(7, 20, 2)}

gbdt4 = GradientBoostingClassifier(learning_rate = 0.1,
                                   n_estimators = 60, 
                                   max_depth = 7,
                                   min_samples_leaf = 60,
                                   min_samples_split = 1200,
                                   subsample = 0.8,
                                   random_state = 10)

gsearch4 = GridSearchCV(estimator = gbdt4,
                        param_grid = param_test4,
                        scoring = 'roc_auc',
                        iid = False, cv = 5)

gsearch4.fit(X, y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.82220, std: 0.02250, params: {'max_features': 7},
  mean: 0.82241, std: 0.02421, params: {'max_features': 9},
  mean: 0.82108, std: 0.02302, params: {'max_features': 11},
  mean: 0.82064, std: 0.01900, params: {'max_features': 13},
  mean: 0.82198, std: 0.01514, params: {'max_features': 15},
  mean: 0.81355, std: 0.02053, params: {'max_features': 17},
  mean: 0.81877, std: 0.01863, params: {'max_features': 19}],
 {'max_features': 9},
 0.822412506351626)

### 5. 现在我们再对子采样的比例进行网格搜索：

In [40]:
param_test5 = {'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}

gbdt5 = GradientBoostingClassifier(learning_rate = 0.1,
                                   n_estimators = 60,
                                   max_depth = 7,
                                   min_samples_leaf = 60,
                                   min_samples_split = 1200,
                                   max_features = 9,
                                   random_state = 10)

gsearch5 = GridSearchCV(estimator = gbdt5,
                        param_grid = param_test5,
                        scoring = 'roc_auc',
                        iid = False, cv = 5)
gsearch5.fit(X, y)

gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

([mean: 0.81828, std: 0.02392, params: {'subsample': 0.6},
  mean: 0.82344, std: 0.02708, params: {'subsample': 0.7},
  mean: 0.81673, std: 0.02196, params: {'subsample': 0.75},
  mean: 0.82241, std: 0.02421, params: {'subsample': 0.8},
  mean: 0.82285, std: 0.02446, params: {'subsample': 0.85},
  mean: 0.81738, std: 0.02236, params: {'subsample': 0.9}],
 {'subsample': 0.7},
 0.8234378969766262)

# 现在我们基本已经得到我们所有调优的参数结果了。这时我们可以减半步长，最大迭代次数加倍来增加我们模型的泛化能力。再次拟合我们的模型：

In [41]:
gbm2 = GradientBoostingClassifier(learning_rate = 0.05,
                                  n_estimators = 120,
                                  max_depth = 7,
                                  min_samples_leaf = 60,
                                  min_samples_split = 1200,
                                  max_features = 9,
                                  subsample = 0.7,
                                  random_state = 10)
gbm2.fit(X, y)
y_pred = gbm2.predict(X)
y_predictprob = gbm2.predict_proba(X)[:, 1]
print "Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred)
print "AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob)

Accuracy : 0.984
AUC Score (Train): 0.908099


### 可以看到AUC分数比起之前的版本稍有下降，这个原因是我们为了增加模型泛化能力，为防止过拟合而减半步长，最大迭代次数加倍，同时减小了子采样的比例，从而减少了训练集的拟合程度

### 下面我们继续将步长缩小5倍， 最大迭代次数增加5倍，继续你和我么的模型

In [43]:
gbm3 = GradientBoostingClassifier(learning_rate = 0.01,
                                  n_estimators = 600,
                                  max_depth = 7,
                                  min_samples_leaf = 60,
                                  min_samples_split = 1200,
                                  max_features = 9,
                                  subsample = 0.7,
                                  random_state = 10)
gbm3.fit(X, y)
y_pred = gbm3.predict(X)
y_predprob = gbm3.predict_proba(X)[:, 1]

print "Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred)
print "AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob)

Accuracy : 0.984
AUC Score (Train): 0.908581


### 输出如下，可见减小步长增加迭代次数可以在保证泛化能力的基础上增加一些拟合程度。

In [44]:
gbm4 = GradientBoostingClassifier(learning_rate=0.005,
                                  n_estimators=1200,
                                  max_depth=7,
                                  min_samples_leaf =60, 
                                  min_samples_split =1200,
                                  max_features=9,
                                  subsample=0.7,
                                  random_state=10)
gbm4.fit(X,y)
y_pred = gbm4.predict(X)
y_predprob = gbm4.predict_proba(X)[:,1]

print "Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred)
print "AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob)

Accuracy : 0.984
AUC Score (Train): 0.908232


### 输出如上，此时由于步长实在太小，导致拟合效果反而变差，也就是说，步长不能设置的过小。