In [None]:
from  tunning import Tunning

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
import xgboost as xgb
from tianjikit.tunning import Tunning


data,label = datasets.make_classification(n_samples= 10000, n_features=20, n_informative= 6 ,
             n_classes=2, n_clusters_per_class=10,random_state=0)
dfdata = pd.DataFrame(data,columns = [u'f'+str(i) for i in range(data.shape[1])])
dfdata['label'] = label
dftrain,dftest = train_test_split(dfdata)

In [2]:
# 构造初始化参数
params_dict = dict()
# 以下为待调整参数
# booster参数
params_dict['learning_rate'] = 0.1        # 学习率，初始值为 0.1，通常越小越好。
params_dict['n_estimators'] = 60          # 加法模型树的数量，初始值为50，通常通过模型cv确认。
# tree参数
params_dict['max_depth'] = 3              # 树的深度，通常取值在[3,10]之间，初始值常取[3,6]之间
params_dict['min_child_weight']=10        # 最小叶子节点样本权重和，越大模型越保守。
params_dict['gamma']= 0                   # 节点分裂所需的最小损失函数下降值，越大模型越保守。
params_dict['subsample']= 0.8             # 横向采样，样本采样比例，通常取值在 [0.5，1]之间 
params_dict['colsample_bytree'] = 1.0     # 纵向采样，特征采样比例，通常取值在 [0.5，1]之间 
# regulazation参数 
# Omega(f) = gamma*T + reg_alpha* sum(abs(wj)) + reg_lambda* sum(wj**2) 
params_dict['reg_alpha'] = 0              #L1 正则化项的权重系数，越大模型越保守，通常取值在[0,1]之间。
params_dict['reg_lambda'] = 1             #L2 正则化项的权重系数，越大模型越保守，通常取值在[1,100]之间。
# 以下参数通常不需要调整
params_dict['objective'] = 'binary:logistic'
params_dict['tree_method'] = 'hist'       # 构建树的策略,可以是auto, exact, approx, hist
params_dict['eval_metric'] =  'auc'
params_dict['silent'] = 1
params_dict['scale_pos_weight'] = 1       #不平衡样本时设定为正值可以使算法更快收敛。
params_dict['seed'] = 0

In [3]:
# step0: 初始化
tune = Tunning(dftrain,dftest,score_func = 'ks',score_gap_limit = 0.05,params_dict=params_dict,n_jobs=4)

# step1: tune n_estimators for relatively high learning_rate
params_test1 = {'learning_rate': [0.1],'n_estimators':[50]} 
tune.gridsearch_cv(params_test1,cv = 5,verbose_eval = 10)

# step2：tune max_depth & min_child_weight 
params_test2 = { 'max_depth': [3], 'min_child_weight': [50,100,200] } 
tune.gridsearch_cv(params_test2,cv = 5,verbose_eval = 10)

'''
# step3：tune gamma
params_test3 = {'gamma': [0.1,0.5,1]}
tune.gridsearch_cv(params_test3,cv = 5,verbose_eval = 10)


# step4：tune subsample & colsample_bytree 
params_test4 = { 'subsample': [0.9,1.0],'colsample_bytree': [1.0] } 
tune.gridsearch_cv(params_test4,cv = 5,verbose_eval = 10)


# step5: tune reg_alpha 
params_test5 = { 'reg_alpha': [0.1,1] } 
tune.gridsearch_cv(params_test5,cv = 5,verbose_eval = 10)


# step6: tune reg_lambda 
params_test6 = { 'reg_lambda': [0,0.1] }
tune.gridsearch_cv(params_test6,cv = 5,verbose_eval = 10)


# step7: lower learning_rate and rise n_estimators
params_test7 = { 'learning_rate':[0.08,0.09], 'n_estimators':[100]}
tune.gridsearch_cv(params_test7,cv = 5)
'''


train set size: 7500
test set size: 2500
feature number: 21
score func: ks
score gap limit: 0.05
n_jobs: 4

{'n_estimators': 50, 'learning_rate': 0.1}

k = 1
[16:43:07] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.712797	valid-auc:0.71662	test-auc:0.706823	train-ks:0.325276	valid-ks:0.340163	test-ks:0.306497
[10]	train-auc:0.770709	valid-auc:0.768178	test-auc:0.75719	train-ks:0.412317	valid-ks:0.414517	test-ks:0.396818
[20]	train-auc:0.795999	valid-auc:0.784003	test-auc:0.776054	train-ks:0.441397	valid-ks:0.437233	test-ks:0.425031
[30]	train-auc:0.811413	valid-auc:0.800145	test-auc:0.787526	train-ks:0.460373	valid-ks:0.450506	test-ks:0.440213
[40]	train-auc:0.818274	valid-auc:0.805402	test-auc:0.792287	train-ks:0.470122	valid-ks:0.463854	test-ks:0.450657
[49]	train-auc:0.830972	valid-auc:0.812583	test-auc:0.799177	train-ks:0.493016	valid-ks:0.469223	test-ks:0.458621

k = 2
[16:43:09] Tree method is selected to be 'hist', which 

[50]	train-auc:0.820546	valid-auc:0.772431	test-auc:0.789761	train-ks:0.477091	valid-ks:0.40395	test-ks:0.439043
[59]	train-auc:0.825052	valid-auc:0.775653	test-auc:0.793248	train-ks:0.487131	valid-ks:0.418383	test-ks:0.4403

k = 2
[16:43:14] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.715571	valid-auc:0.720818	test-auc:0.703468	train-ks:0.327346	valid-ks:0.351441	test-ks:0.296329
[10]	train-auc:0.769359	valid-auc:0.759166	test-auc:0.755923	train-ks:0.403639	valid-ks:0.387046	test-ks:0.386718
[20]	train-auc:0.77983	valid-auc:0.773302	test-auc:0.766262	train-ks:0.414389	valid-ks:0.413855	test-ks:0.401092
[30]	train-auc:0.790664	valid-auc:0.787598	test-auc:0.776213	train-ks:0.43319	valid-ks:0.43334	test-ks:0.418105
[40]	train-auc:0.800313	valid-auc:0.792794	test-auc:0.783106	train-ks:0.449419	valid-ks:0.44678	test-ks:0.433272
[50]	train-auc:0.807605	valid-auc:0.798825	test-auc:0.787085	train-ks:0.45605	valid-ks:0.453322	test-ks:0

[59]	train-auc:0.795347	valid-auc:0.766704	test-auc:0.770228	train-ks:0.448183	valid-ks:0.392479	test-ks:0.406107

k = 5
[16:43:23] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.694864	valid-auc:0.685244	test-auc:0.688649	train-ks:0.285533	valid-ks:0.254566	test-ks:0.263336
[10]	train-auc:0.758896	valid-auc:0.757029	test-auc:0.746242	train-ks:0.395153	valid-ks:0.382132	test-ks:0.374582
[20]	train-auc:0.770821	valid-auc:0.765119	test-auc:0.757321	train-ks:0.412463	valid-ks:0.40036	test-ks:0.392892
[30]	train-auc:0.779512	valid-auc:0.772499	test-auc:0.76484	train-ks:0.418857	valid-ks:0.422908	test-ks:0.401282
[40]	train-auc:0.785021	valid-auc:0.776283	test-auc:0.768297	train-ks:0.430157	valid-ks:0.428264	test-ks:0.411469
[50]	train-auc:0.790381	valid-auc:0.777412	test-auc:0.771346	train-ks:0.438223	valid-ks:0.428248	test-ks:0.412328
[59]	train-auc:0.794896	valid-auc:0.779628	test-auc:0.771854	train-ks:0.442681	valid-ks:0.434903	tes

+---+----------+---------------+--------------+-----------+------------------+-------+-----------+------------------+-----------+------------+
|   | model_id | learning_rate | n_estimators | max_depth | min_child_weight | gamma | subsample | colsample_bytree | reg_alpha | reg_lambda |
+---+----------+---------------+--------------+-----------+------------------+-------+-----------+------------------+-----------+------------+
| 0 |    0     |      0.1      |     49.0     |     3     |        10        |   0   |    0.8    |       1.0        |     0     |     1      |
+---+----------+---------------+--------------+-----------+------------------+-------+-----------+------------------+-----------+------------+
Best score so far:
+---+----------+---------------------+----------------+----------------------+--------------------+
|   | model_id |     train_score     | validate_score |      score_gap       |     test_score     |
+---+----------+---------------------+----------------+-----------

"\n# step3\xef\xbc\x9atune gamma\nparams_test3 = {'gamma': [0.1,0.5,1]}\ntune.gridsearch_cv(params_test3,cv = 5,verbose_eval = 10)\n\n\n# step4\xef\xbc\x9atune subsample & colsample_bytree \nparams_test4 = { 'subsample': [0.9,1.0],'colsample_bytree': [1.0] } \ntune.gridsearch_cv(params_test4,cv = 5,verbose_eval = 10)\n\n\n# step5: tune reg_alpha \nparams_test5 = { 'reg_alpha': [0.1,1] } \ntune.gridsearch_cv(params_test5,cv = 5,verbose_eval = 10)\n\n\n# step6: tune reg_lambda \nparams_test6 = { 'reg_lambda': [0,0.1] }\ntune.gridsearch_cv(params_test6,cv = 5,verbose_eval = 10)\n\n\n# step7: lower learning_rate and rise n_estimators\nparams_test7 = { 'learning_rate':[0.08,0.09], 'n_estimators':[100]}\ntune.gridsearch_cv(params_test7,cv = 5)\n"

In [4]:
# step8: train model with tuned parameters and fully train dataset.
bst,dfimportance = tune.train_best()
bst.save_model('./bst.model')
dfimportance.to_csv('./dfimportance.csv',sep = '\t')

[16:43:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.71455	test-auc:0.707313	train-ks:0.325309	test-ks:0.303291
[10]	train-auc:0.771947	test-auc:0.754901	train-ks:0.411981	test-ks:0.395233
[20]	train-auc:0.794886	test-auc:0.773506	train-ks:0.438577	test-ks:0.41971
[30]	train-auc:0.805639	test-auc:0.78079	train-ks:0.446658	test-ks:0.415492
[40]	train-auc:0.813594	test-auc:0.788359	train-ks:0.457454	test-ks:0.425266
[50]	train-auc:0.825328	test-auc:0.797107	train-ks:0.476633	test-ks:0.444702
[59]	train-auc:0.834259	test-auc:0.802028	train-ks:0.495239	test-ks:0.452769


In [5]:
dfimportance

Unnamed: 0,feature,importance
14,feature10,64
4,feature2,61
11,feature15,49
19,feature19,44
9,feature9,42
2,feature4,38
7,feature1,38
17,feature13,13
5,feature3,10
18,feature18,5


In [None]:
%run runpipeline.py

In [None]:
%run runeverything.py