导入库

In [33]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

import matplotlib.pylab as plt
%matplotlib inline 

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

导入数据

In [21]:
train = pd.read_csv('train_modified.csv')
target='Disbursed' # Disbursed的值就是二元分类的输出
IDcol = 'ID'
train['Disbursed'].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

In [26]:
x_columns = [x for x in train.columns if x not in [target,IDcol]]  
X = train[x_columns]  
y = train['Disbursed']  
   

In [35]:
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X,y)
y_pred = gbm0.predict(X)
y_predprob = gbm0.predict_proba(X)[:,1]#给出带有概率值的结果。每个点在所有label（类别）的概率和为1. 
print ("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))
#AUC是指从一堆样本中随机抽一个，抽到正样本的概率比抽到负样本的概率大的可能性 

Accuracy : 0.9852
AUC Score (Train): 0.900531


找树的数量最佳值

In [28]:
param_gbdt = {'n_estimators':list(range(100,600,50))}
gbdt_search = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.1,
                            min_samples_split=300,min_samples_leaf=20,max_depth=8,max_features='sqrt',subsample=0.8,random_state=75),
                          param_grid=param_gbdt,scoring='neg_mean_squared_error',iid=False,cv=3)
gbdt_search.fit(X,y)
gbdt_search.grid_scores_,gbdt_search.best_params_,gbdt_search.best_score_

([mean: -0.01534, std: 0.00095, params: {'n_estimators': 100},
  mean: -0.01544, std: 0.00097, params: {'n_estimators': 150},
  mean: -0.01554, std: 0.00099, params: {'n_estimators': 200},
  mean: -0.01566, std: 0.00100, params: {'n_estimators': 250},
  mean: -0.01578, std: 0.00098, params: {'n_estimators': 300},
  mean: -0.01588, std: 0.00102, params: {'n_estimators': 350},
  mean: -0.01597, std: 0.00101, params: {'n_estimators': 400},
  mean: -0.01609, std: 0.00100, params: {'n_estimators': 450},
  mean: -0.01616, std: 0.00102, params: {'n_estimators': 500},
  mean: -0.01628, std: 0.00101, params: {'n_estimators': 550}],
 {'n_estimators': 100},
 -0.015341378286619639)

找树的深度最佳值

In [29]:
param_gbdt1 = {'max_depth':[6,7,8,9,10]}
gbdt_search1 = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.1,n_estimators = 200,min_samples_split=300,
                            min_samples_leaf=20,max_features='sqrt',subsample=0.8,random_state=75),
                            param_grid=param_gbdt1,scoring='neg_mean_squared_error',iid=False,cv=5)
gbdt_search1.fit(X,y)
print(gbdt_search1.grid_scores_)
print(gbdt_search1.best_params_)
print(gbdt_search1.best_score_)

[mean: -0.01549, std: 0.00118, params: {'max_depth': 6}, mean: -0.01550, std: 0.00117, params: {'max_depth': 7}, mean: -0.01556, std: 0.00116, params: {'max_depth': 8}, mean: -0.01566, std: 0.00122, params: {'max_depth': 9}, mean: -0.01571, std: 0.00116, params: {'max_depth': 10}]
{'max_depth': 6}
-0.015487898597004484


找min_samples_split，min_samples_leaf最佳值

In [30]:
param_gbdt2 = {'min_samples_split':[500,700,900,1100],
               'min_samples_leaf':[30,50,70,90]}
gbdt_search2 = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.1,n_estimators = 100,max_depth=6,
                                max_features='sqrt',subsample=0.8,random_state=75),n_jobs=3,
                            param_grid=param_gbdt2,scoring='neg_mean_squared_error',iid=False,cv=5)
gbdt_search2.fit(X,y)
print(gbdt_search2.grid_scores_)
print(gbdt_search2.best_params_)
print(gbdt_search2.best_score_)

[mean: -0.01533, std: 0.00112, params: {'min_samples_leaf': 30, 'min_samples_split': 500}, mean: -0.01534, std: 0.00110, params: {'min_samples_leaf': 30, 'min_samples_split': 700}, mean: -0.01533, std: 0.00110, params: {'min_samples_leaf': 30, 'min_samples_split': 900}, mean: -0.01534, std: 0.00112, params: {'min_samples_leaf': 30, 'min_samples_split': 1100}, mean: -0.01531, std: 0.00112, params: {'min_samples_leaf': 50, 'min_samples_split': 500}, mean: -0.01533, std: 0.00109, params: {'min_samples_leaf': 50, 'min_samples_split': 700}, mean: -0.01534, std: 0.00112, params: {'min_samples_leaf': 50, 'min_samples_split': 900}, mean: -0.01533, std: 0.00110, params: {'min_samples_leaf': 50, 'min_samples_split': 1100}, mean: -0.01530, std: 0.00109, params: {'min_samples_leaf': 70, 'min_samples_split': 500}, mean: -0.01534, std: 0.00109, params: {'min_samples_leaf': 70, 'min_samples_split': 700}, mean: -0.01534, std: 0.00112, params: {'min_samples_leaf': 70, 'min_samples_split': 900}, mean: -

对n_estimators和学习率进行调整

In [32]:
param_gbdt3 = {'learning_rate':[0.06,0.08,0.1],
               'n_estimators':[100,150,200,250]}
gbdt_search3 = GridSearchCV(estimator=GradientBoostingRegressor(min_samples_split=700,min_samples_leaf=70,
                                max_depth=9,max_features='sqrt',subsample=0.8,random_state=75),n_jobs=3,
                            param_grid=param_gbdt3,scoring='neg_mean_squared_error',iid=False,cv=5)
gbdt_search3.fit(X,y)
print(gbdt_search3.grid_scores_)
print(gbdt_search3.best_params_)
print(gbdt_search3.best_score_)

[mean: -0.01529, std: 0.00112, params: {'learning_rate': 0.06, 'n_estimators': 100}, mean: -0.01529, std: 0.00111, params: {'learning_rate': 0.06, 'n_estimators': 150}, mean: -0.01530, std: 0.00111, params: {'learning_rate': 0.06, 'n_estimators': 200}, mean: -0.01531, std: 0.00112, params: {'learning_rate': 0.06, 'n_estimators': 250}, mean: -0.01529, std: 0.00111, params: {'learning_rate': 0.08, 'n_estimators': 100}, mean: -0.01532, std: 0.00112, params: {'learning_rate': 0.08, 'n_estimators': 150}, mean: -0.01534, std: 0.00112, params: {'learning_rate': 0.08, 'n_estimators': 200}, mean: -0.01538, std: 0.00111, params: {'learning_rate': 0.08, 'n_estimators': 250}, mean: -0.01531, std: 0.00114, params: {'learning_rate': 0.1, 'n_estimators': 100}, mean: -0.01533, std: 0.00114, params: {'learning_rate': 0.1, 'n_estimators': 150}, mean: -0.01536, std: 0.00114, params: {'learning_rate': 0.1, 'n_estimators': 200}, mean: -0.01541, std: 0.00113, params: {'learning_rate': 0.1, 'n_estimators': 2

用调好的参数对数据进行拟合

In [37]:
gbm3 = GradientBoostingClassifier(learning_rate=0.06, n_estimators=100,max_depth=6, min_samples_leaf =90, 
               min_samples_split =500, max_features=9, subsample=0.7, random_state=10)
gbm3.fit(X,y)
y_pred = gbm3.predict(X)
y_predprob = gbm3.predict_proba(X)[:,1]
print("Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred))
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

Accuracy : 0.984
AUC Score (Train): 0.906571


参考

In [None]:
https://www.cnblogs.com/nxf-rabbit75/p/10593524.html