# tianjikit模块使用指南

### 一，模块整体架构

![](readme.png)

#### outliers_analysis的输出：

In [None]:
'med', #中位数
'seg_25', #1/4分位数
'seg_75', #3/4分位数
'up_limit',  #离群值判定上边界
'low_limit', #离群值判定下边界
'up_ratio',  #超上边界离群值比例
'low_ratio';  #超下边界离群值比例


#### basic_analysis的输出：

In [None]:
#------覆盖率------------------------#
'not_nan_ratio',  #非空比例，通常覆盖率coverage即指它
'not_zero_ratio', #非零比例，非零值不含空值
'not_outlier_ratio', #非离群值比例，非离群值不含空值

#------统计值------------------------#
'class_num', #数据类别数目
'value_num', #非空数据数目
'min', #最小值
'mean',#均值
'med', #中位数
'most', #众数
'max', #最大值

#------有效性----------------------#
'ks', #ks统计量，适合连续特征
'ks_pvalue', #ks统计量的p值
'chi2', #chi2统计量，适合离散特征
'chi2_pvalue', #chi2统计量的p值
'iv'; #iv统计量，适合连续和离散特征，iv>0.1有效，iv>0.2强有效


#### psi_analysis的输出：

In [None]:
'psi', #psi指标，仅当 train_data和 test_data 有效数据数量 >10时才取值，否则为 nan值
'is_stable', #是否稳定，psi<0.2判定为稳定
'train_class_num', # train_data中数据类别数目
'test_class_num' , # test_data中数据类别数目
'train_value_num', #train_data中有效数据数目
'test_value_num';#test_data中有效数据数目


#### ks_analysis的输出：

In [None]:
'feature_interval',#特征取值区间
'order_num', #订单数量
'order_ratio', #订单占比
'overdue_num', #逾期订单数量
'overdue_ratio', #逾期订单占比
'normal_num', #正常订单数量
'normal_ratio', #正常订单占比
'overdue_cum_ratio', #累计逾期订单比例
'normal_cum_ratio', #累计正常订单比例
'ks_value'; #ks统计值


#### iv_analysis的输出：

In [None]:
'feature_interval',#区间
'order_num', #订单数量
'order_ratio', #订单占比
'overdue_num', #逾期订单数量
'overdue_ratio', #逾期订单比例
'overdue_interval_ratio', #区间逾期订单占总逾期订单比例
'normal_num', #正常订单数量
'normal_ratio', #正常订单占比
'normal_interval_ratio', #区间正常订单占总正常订单比例
'iv_value'; #iv检验值，列重复


#### chi2_analysis的输出：

In [None]:
'TP', #feature为1的逾期样本数量
'FP', #feature为1的正常样本数量
'TN', #feature为0的正常样本数量
'FN', #feature为0的逾期的样本数量
'TPR', #TP/(TP+FN),逾期样本中feature取1比例
'FPR',#FP/(FP+TN),正常样本中feature取1比例
'overdue_ratio_0',# feature为0样本的逾期率
'overdue_ratio_1',# feature为1样本的逾期率
'precision',#精度
'accuracy',#准确度
'chi2', #卡方统计量
'chi2_pvalue'; #卡方统计量的p值


### 二，单特征分析示范

In [None]:
import numpy as np
import pandas as pd
from tianjikit.analysisfeature import AnalysisFeature

# Sample inputs: one feature column (contains a NaN and the value 100) and
# a 0/1 label list of the same length.
data = [1.0, 2, 3, 4, 5, 6, 4, 3, 2, 1, 2, 9, 10, 100, np.nan, 0, 7, 8, 10, 6]
label = [0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1]
assert len(data) == len(label)

af = AnalysisFeature()

# Outlier analysis.
dfoutliers = af.outliers_analysis(data, alpha=2)

# Drop outliers.
# NOTE(review): `data` is passed twice -- confirm against the
# drop_outliers signature which argument plays which role.
data_clean = af.drop_outliers(data, data, alpha=2)

# Basic analysis of the feature against the label.
dfbasic = af.basic_analysis(data, label)

# PSI stability analysis between `data` and a second sample.
test_data = [10, 9, 5, 3, 4, 3, 2, 1, 6, 7, 5, np.nan, 10, 100]
dfpsi = af.psi_analysis(data, test_data)

# KS analysis: mainly for continuous features, also usable on discrete ones.
dfks = af.ks_analysis(data, label)

# IV analysis: mainly for discrete features, also usable on continuous ones.
dfiv = af.iv_analysis(data, label)

# Chi-square / recall style analysis: mainly for discrete features.
dfchi2 = af.chi2_analysis(data, label)

### 三，多特征分析示范

In [None]:
# 多特征分析示范
import numpy as np
import pandas as pd
from tianjikit.analysisfeatures import AnalysisFeatures

# Column order shared by the training and test frames.
# Training-set frame: 12 rows with an id column, a loan date, a 0/1 label
# and two feature columns (feature2 is float).
dftrain = pd.DataFrame(
    {
        'phone': ['x1', 'x2', 'x3', 'x4', 'x5', 'x6',
                  'x7', 'x8', 'x9', 'x10', 'x11', 'x12'],
        'loan_dt': ['2018-01-01'] * 12,
        'label': [0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0],
        'feature1': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1],
        'feature2': [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    },
    # Explicit column list keeps the column order independent of dict ordering.
    columns=['phone', 'loan_dt', 'label', 'feature1', 'feature2'])


# Test-set frame: 10 rows with the same columns as the training frame.
dftest = pd.DataFrame(
    {
        'phone': ['y1', 'y2', 'y3', 'y4', 'y5',
                  'y6', 'y7', 'y8', 'y9', 'y10'],
        'loan_dt': ['2018-02-01'] * 10,
        'label': [1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
        'feature1': [1, 0, 0, 1, 0, 0, 1, 0, 1, 0],
        'feature2': [10.0, 9, 8, 7, 6, 5, 4, 3, 2, 1],
    },
    columns=['phone', 'loan_dt', 'label', 'feature1', 'feature2'])

# Build the multi-feature analyser from the training and test frames.
afs = AnalysisFeatures(dftrain, dftest)

# Basic analysis of the features.
dfbasic = afs.basic_analysises()

# Stability (PSI) analysis of the features.
dfpsi = afs.psi_analysises()

# KS analysis of the features.
dfks = afs.ks_analysises()

# IV analysis of the features.
dfiv = afs.iv_analysises()

### 四，训练模型示范

In [1]:
# 准备训练数据
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Synthetic two-class data set: 10000 samples, 20 features, fixed seed.
data, label = datasets.make_classification(
    n_samples=10000,
    n_features=20,
    n_classes=2,
    random_state=0)

# Wrap the feature matrix in a frame with columns feature0..featureN-1 and
# append the label column.
dfdata = pd.DataFrame(
    data,
    columns=['feature' + str(i) for i in range(data.shape[1])])
dfdata['label'] = label

# Split with train_test_split defaults, then continue with copies of the two
# parts and give each a fresh 0..n-1 index.
dftrain, dftest = train_test_split(dfdata)
dftrain = dftrain.copy()
dftest = dftest.copy()
dftrain.index = range(len(dftrain))
dftest.index = range(len(dftest))

# Inject a few missing values into the first training row.
dftrain.loc[0, ['feature0', 'feature1', 'feature2']] = np.nan

In [3]:
# 训练逻辑回归模型
from tianjikit.trainmodel import TrainModel

# Configure the trainer, fit a logistic-regression model, run the trainer's
# test step on it, then read the entry stored under 'lr' in dfimportances.
model = TrainModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method='most',
    scale_method=None)
lr = model.train_lr(cv=None, model_idx=5)
model.test(lr)
dfimportance = model.dfimportances['lr']



start data preprocessing ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
fillna method:  most
scale method:  None
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23


start train logistic model ...



train: ks = 0.6536 	 auc = 0.896132045947 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |  [0.0005,0.03866) |    750    |     0.1     |      27     |     0.036     | 0.18565  |
| 1 | [0.03866,0.09814) |    750    |     0.1     |      59     |    0.07867    | 0.35424  |
| 2 | [0.09814,0.20025) |    750    |     0.1     |      96     |     0.128     | 0.50309  |
| 3 |  [0.20025,0.3525) |    750    |     0.1     |     173

In [4]:
# 训练随机森林模型
from tianjikit.trainmodel import TrainModel
# Configure the trainer, fit a random-forest model, run the trainer's test
# step on it, then read the entry stored under 'rf' in dfimportances.
model = TrainModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method='most',
    scale_method=None)
rf = model.train_rf(
    cv=None,
    model_idx=5,
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    n_jobs=4)
model.test(rf)
dfimportance = model.dfimportances['rf']



start data preprocessing ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
fillna method:  most
scale method:  None
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23


start train randomforest model ...



train: ks = 0.8904 	 auc = 0.989788941496 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00737,0.02208) |    750    |     0.1     |      0      |      0.0      | 0.20005  |
| 1 | [0.02208,0.03121) |    750    |     0.1     |      0      |      0.0      | 0.40011  |
| 2 | [0.03121,0.04837) |    750    |     0.1     |      0      |      0.0      | 0.60016  |
| 3 | [0.04837,0.18638) |    750    |     0.1     |    

In [5]:
# 训练GBDT模型
from tianjikit.trainmodel import TrainModel
# Configure the trainer, fit a GBDT model with cv=5, run the trainer's test
# step on it, then read the entry stored under 'gbdt' in dfimportances.
model = TrainModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method='most',
    scale_method=None)
gbdt = model.train_gbdt(
    cv=5,
    model_idx=5,
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=3,
    min_samples_split=50,
    min_samples_leaf=5,
    subsample=0.7,
    max_features='sqrt',
    random_state=0)
model.test(gbdt)
dfimportance = model.dfimportances['gbdt']



start data preprocessing ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
fillna method:  most
scale method:  None
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23


start train gbdt model ...

2018-12-20 19:33:51: k = 1

train: ks = 0.85336 	 auc = 0.978788626014 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.01106,0.02236) |    600    |     0.1     |      1      |    0.00167    | 0.19934  |
| 1 | [0.02236,0.02681) |    600    |     0.1     |      2      |    0.00333    |  0.398   |
| 2 | [0.02681,0.03669) |    600    |     0.1     |      5      |    0.00833    | 0.59466  |
| 3 | [0.03669,0.19121) |    600    


train: ks = 0.851 	 auc = 0.979704775523 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00896,0.02061) |    600    |     0.1     |      0      |      0.0      | 0.20007  |
| 1 |  [0.02061,0.0256) |    600    |     0.1     |      1      |    0.00167    | 0.39947  |
| 2 |  [0.0256,0.03439) |    600    |     0.1     |      4      |    0.00667    | 0.59686  |
| 3 | [0.03439,0.17944) |    600    |     0.1     |      18     |      0.03     | 0.78494  |
| 4 | [0.17944,0.59099) |    600    |     0.1     |     201     |     0.335     |  0.851   |
| 5 |  [0.59099,0.8239) |    600    |     0.1     |     457     |    0.76167    |  0.7464  |
| 6 |  [0.8239,0.93021) |    600    |     0.1     |     533     |    0.88833    | 0.59114  |
| 7 | [0.93021,0.94837) |  

In [6]:
# 训练XGBOOST模型
from tianjikit.trainmodel import TrainModel
# Configure the trainer (note fillna_method=None in this cell), fit an
# XGBoost model with cv=5, run the trainer's test step on it, then read the
# entry stored under 'xgb' in dfimportances.
model = TrainModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method=None,
    scale_method=None)
xgb = model.train_xgb(
    cv=5,
    learning_rate=0.1,
    model_idx=5,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    n_jobs=4,
    seed=10)
model.test(xgb)
dfimportance = model.dfimportances['xgb']



start data preprocessing ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
fillna method:  None
scale method:  None
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  20


start train xgboost model ...

2018-12-20 19:34:56: k = 1

train: ks = 1.0 	 auc = 1.0 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |    [0.0,4e-05)    |    600    |     0.1     |      0      |      0.0      |   0.2    |
| 1 |  [4e-05,0.00013)  |    600    |     0.1     |      0      |      0.0      |   0.4    |
| 2 | [0.00013,0.00047) |    600    |     0.1     |      0      |      0.0      |   0.6    |
| 3 | [0.00047,0.00489) |    600    |     0.1   


train: ks = 0.99967 	 auc = 1.0 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |    [0.0,5e-05)    |    600    |     0.1     |      0      |      0.0      | 0.20007  |
| 1 |  [5e-05,0.00016)  |    600    |     0.1     |      0      |      0.0      | 0.40013  |
| 2 | [0.00016,0.00062) |    600    |     0.1     |      0      |      0.0      |  0.6002  |
| 3 | [0.00062,0.00571) |    600    |     0.1     |      0      |      0.0      | 0.80027  |
| 4 | [0.00571,0.94702) |    600    |     0.1     |      1      |    0.00167    | 0.99967  |
| 5 | [0.94702,0.99337) |    600    |     0.1     |     600     |      1.0      | 0.79973  |
| 6 |  [0.99337,0.9977) |    600    |     0.1     |     600     |      1.0      |  0.5998  |
| 7 |  [0.9977,0.99927) |    600    

In [7]:
# 训练神经网络模型
from tianjikit.trainmodel import TrainModel
# Configure the trainer, fit a neural-network model with hidden layers of
# 100 and 20 units, then run the trainer's test step on it.
model = TrainModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method='most',
    scale_method=None)
nn = model.train_nn(
    cv=None,
    model_idx=5,
    hidden_layer_sizes=(100, 20),
    activation='relu',
    alpha=0.0001,
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=200,
    tol=0.0001,
    early_stopping=False,
    validation_fraction=0.1,
    warm_start=False,
    random_state=None)
model.test(nn)



start data preprocessing ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
fillna method:  most
scale method:  None
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23


start train neural network model ...



train: ks = 0.98426 	 auc = 0.999018808819 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |     [0.0,0.0)     |    750    |     0.1     |      1      |    0.00133    | 0.19952  |
| 1 |    [0.0,1e-05)    |    750    |     0.1     |      1      |    0.00133    | 0.39904  |
| 2 |  [1e-05,0.00067)  |    750    |     0.1     |      0      |      0.0      |  0.5991  |
| 3 | [0.00067,0.02748) |    750    |     0.1     | 

### 五，训练xgboost示范

In [24]:
# 准备训练数据
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Synthetic two-class data set: 10000 samples, 20 features, fixed seed.
data, label = datasets.make_classification(
    n_samples=10000,
    n_features=20,
    n_classes=2,
    random_state=0)

# Wrap the feature matrix in a frame with columns feature0..featureN-1 and
# append the label column.
dfdata = pd.DataFrame(
    data,
    columns=['feature' + str(i) for i in range(data.shape[1])])
dfdata['label'] = label

# Split with train_test_split defaults, then continue with copies of the two
# parts and give each a fresh 0..n-1 index.
dftrain, dftest = train_test_split(dfdata)
dftrain = dftrain.copy()
dftest = dftest.copy()
dftrain.index = range(len(dftrain))
dftest.index = range(len(dftest))

# Inject a few missing values into the first training row.
dftrain.loc[0, ['feature0', 'feature1', 'feature2']] = np.nan

In [25]:
# XGBoost model parameters.
params_dict = {
    # ---- Parameters intended to be tuned ----
    # Booster parameters.
    'learning_rate': 0.1,       # learning rate; start at 0.1, smaller is usually better
    'n_estimators': 60,         # number of trees in the additive model

    # Tree parameters.
    'max_depth': 3,             # tree depth; usually in [3, 10], initial value often in [3, 6]
    'min_child_weight': 30,     # minimum sum of sample weights in a leaf; larger is more conservative
    'gamma': 0,                 # minimum loss reduction required to split; larger is more conservative
    'subsample': 0.8,           # row sampling ratio, usually in [0.5, 1]
    'colsample_bytree': 1.0,    # column (feature) sampling ratio, usually in [0.5, 1]

    # Regularization parameters.
    # Omega(f) = gamma*T + reg_alpha* sum(abs(wj)) + reg_lambda* sum(wj**2)
    'reg_alpha': 0,             # L1 penalty weight; larger is more conservative, usually in [0, 1]
    'reg_lambda': 1,            # L2 penalty weight; larger is more conservative, usually in [1, 100]

    # ---- Parameters that usually do not need tuning ----
    'objective': 'binary:logistic',
    'tree_method': 'hist',      # tree construction strategy: auto, exact, approx or hist
    'eval_metric': 'auc',
    'silent': 1,
    'nthread': 2,
    'scale_pos_weight': 1,      # a positive value can speed up convergence on imbalanced samples
    'seed': 0,
}

In [27]:
# 训练xgboost模型
from tianjikit.trainxgboost import TrainXgboost
# Configure the XGBoost trainer, train with 5-fold cv using params_dict,
# run the trainer's test step against dftest, then read model.dfimportance.
model = TrainXgboost(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0,
    ks_th=0,
    outliers_th=None,
    selected_features=None)
bst = model.train(
    cv=5,
    model_idx=1,
    params_dict=params_dict,
    n_jobs=4,
    verbose_eval=10)
model.test(bst, dftest)
dfimportance = model.dfimportance



start data preprocessing ...

train set size:  7500
test set size:  2500
coverage threshold:  0
outlier threshold:  None
ks threshold:  0
original feature number:  20
feature number remain after dropfeature:  20
start train xgboost model ...


k = 1
[19:43:08] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.918914	valid-auc:0.913578	train-ks:0.696842	valid-ks:0.690084
[10]	train-auc:0.961386	valid-auc:0.953098	train-ks:0.783584	valid-ks:0.77402
[20]	train-auc:0.968356	valid-auc:0.960964	train-ks:0.824487	valid-ks:0.823428
[30]	train-auc:0.970742	valid-auc:0.963777	train-ks:0.827593	valid-ks:0.835508
[40]	train-auc:0.971632	valid-auc:0.963925	train-ks:0.831287	valid-ks:0.838097
[50]	train-auc:0.972871	valid-auc:0.964169	train-ks:0.837219	valid-ks:0.839487
[59]	train-auc:0.973739	valid-auc:0.963908	train-ks:0.843751	valid-ks:0.838113

train: ks = 0.8344 	 auc = 0.973739422524 
+---+-------------------+-----------+-------------+----

[10]	train-auc:0.961099	valid-auc:0.957802	train-ks:0.790668	valid-ks:0.781528
[20]	train-auc:0.968075	valid-auc:0.964086	train-ks:0.829244	valid-ks:0.804649
[30]	train-auc:0.969611	valid-auc:0.964586	train-ks:0.836008	valid-ks:0.811307
[40]	train-auc:0.970574	valid-auc:0.96553	train-ks:0.836768	valid-ks:0.812614
[50]	train-auc:0.971457	valid-auc:0.966421	train-ks:0.840964	valid-ks:0.815387
[59]	train-auc:0.972324	valid-auc:0.966137	train-ks:0.843778	valid-ks:0.819432

train: ks = 0.83437 	 auc = 0.972324334343 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.01223,0.01569) |    600    |     0.1     |      1      |    0.00167    | 0.20061  |
| 1 | [0.01569,0.01778) |    599    |     0.1     |      3      |    0.00501    | 0.39955  |
| 2 | [0.01778,0.

### 六，xgboost调参示范

In [10]:
from __future__ import print_function
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
import xgboost as xgb
from tianjikit.tunning import Tunning

# Synthetic two-class data set: 10000 samples, 20 features of which 6 are
# informative, 10 clusters per class, fixed seed.
data, label = datasets.make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=6,
    n_classes=2,
    n_clusters_per_class=10,
    random_state=0)

# Frame with columns f0..fN-1 plus the label column, split with
# train_test_split defaults.
dfdata = pd.DataFrame(
    data,
    columns=[u'f' + str(i) for i in range(data.shape[1])])
dfdata['label'] = label
dftrain, dftest = train_test_split(dfdata)

In [11]:
# Initial parameter set for tuning.
params_dict = {
    # ---- Parameters intended to be tuned ----
    # Booster parameters.
    'learning_rate': 0.1,       # learning rate; start at 0.1, smaller is usually better
    'n_estimators': 60,         # number of trees in the additive model; usually settled via cv

    # Tree parameters.
    'max_depth': 3,             # tree depth; usually in [3, 10], initial value often in [3, 6]
    'min_child_weight': 10,     # minimum sum of sample weights in a leaf; larger is more conservative
    'gamma': 0,                 # minimum loss reduction required to split; larger is more conservative
    'subsample': 0.8,           # row sampling ratio, usually in [0.5, 1]
    'colsample_bytree': 1.0,    # column (feature) sampling ratio, usually in [0.5, 1]

    # Regularization parameters.
    # Omega(f) = gamma*T + reg_alpha* sum(abs(wj)) + reg_lambda* sum(wj**2)
    'reg_alpha': 0,             # L1 penalty weight; larger is more conservative, usually in [0, 1]
    'reg_lambda': 1,            # L2 penalty weight; larger is more conservative, usually in [1, 100]

    # ---- Parameters that usually do not need tuning ----
    'objective': 'binary:logistic',
    'tree_method': 'hist',      # tree construction strategy: auto, exact, approx or hist
    'eval_metric': 'auc',
    'silent': 1,
    'scale_pos_weight': 1,      # a positive value can speed up convergence on imbalanced samples
    'seed': 0,
}

In [12]:
# step0: initialise the tuner with both data sets, the scoring settings and
# the starting parameters.
tune = Tunning(
    dftrain,
    dftest,
    score_func='ks',
    score_gap_limit=0.05,
    params_dict=params_dict,
    n_jobs=4)



train set size: 7500
test set size: 2500
feature number: 21
score func: ks
score gap limit: 0.05
n_jobs: 4


In [13]:
# step1: tune n_estimators for a relatively high learning_rate
params_test1 = {
    'learning_rate': [0.1],
    'n_estimators': [50],
}
tune.gridsearch_cv(params_test1, cv=5, verbose_eval=10)



{'n_estimators': 50, 'learning_rate': 0.1}


k = 1
[19:37:47] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.709871	valid-auc:0.698727	test-auc:0.699309	train-ks:0.284998	valid-ks:0.268532	test-ks:0.287241
[10]	train-auc:0.780741	valid-auc:0.754787	test-auc:0.765433	train-ks:0.4252	valid-ks:0.403468	test-ks:0.404045
[20]	train-auc:0.801374	valid-auc:0.768653	test-auc:0.774947	train-ks:0.450554	valid-ks:0.418268	test-ks:0.41771
[30]	train-auc:0.817829	valid-auc:0.777606	test-auc:0.787248	train-ks:0.471061	valid-ks:0.423623	test-ks:0.434926
[40]	train-auc:0.828872	valid-auc:0.785102	test-auc:0.791737	train-ks:0.492443	valid-ks:0.436007	test-ks:0.442275
[49]	train-auc:0.837389	valid-auc:0.790257	test-auc:0.796351	train-ks:0.509463	valid-ks:0.436188	test-ks:0.444443


k = 2
[19:37:48] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.719489	valid-auc:0.696846	test-auc:0.709177	tra

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
0,0,0.490514,0.446326,0.0441876,0.438377


In [14]:
# step2: tune max_depth & min_child_weight
params_test2 = {
    'max_depth': [3],
    'min_child_weight': [50, 100, 200],
}
tune.gridsearch_cv(params_test2, cv=5, verbose_eval=10)



{'max_depth': 3, 'min_child_weight': 50}


k = 1
[19:38:00] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.693154	valid-auc:0.676538	test-auc:0.689948	train-ks:0.278883	valid-ks:0.240303	test-ks:0.262982
[10]	train-auc:0.764474	valid-auc:0.756202	test-auc:0.752266	train-ks:0.40039	valid-ks:0.368513	test-ks:0.389452
[20]	train-auc:0.784925	valid-auc:0.769633	test-auc:0.768522	train-ks:0.434387	valid-ks:0.396785	test-ks:0.407128
[30]	train-auc:0.79386	valid-auc:0.775461	test-auc:0.773346	train-ks:0.449785	valid-ks:0.407676	test-ks:0.414261
[40]	train-auc:0.810383	valid-auc:0.787281	test-auc:0.786982	train-ks:0.4796	valid-ks:0.429042	test-ks:0.447319
[50]	train-auc:0.817279	valid-auc:0.790333	test-auc:0.788266	train-ks:0.491684	valid-ks:0.43686	test-ks:0.441669
[59]	train-auc:0.825754	valid-auc:0.794683	test-auc:0.792714	train-ks:0.508013	valid-ks:0.444338	test-ks:0.451472


k = 2
[19:38:02] Tree method is selected to be 'hist', wh



k = 4
[19:38:11] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.698542	valid-auc:0.666228	test-auc:0.686928	train-ks:0.286287	valid-ks:0.22907	test-ks:0.253038
[10]	train-auc:0.758417	valid-auc:0.727064	test-auc:0.745989	train-ks:0.402934	valid-ks:0.354	test-ks:0.385885
[20]	train-auc:0.769007	valid-auc:0.741742	test-auc:0.755897	train-ks:0.409484	valid-ks:0.361872	test-ks:0.39428
[30]	train-auc:0.776921	valid-auc:0.752031	test-auc:0.763946	train-ks:0.426308	valid-ks:0.382113	test-ks:0.415321
[40]	train-auc:0.785722	valid-auc:0.763419	test-auc:0.76991	train-ks:0.445098	valid-ks:0.390252	test-ks:0.421065
[50]	train-auc:0.796357	valid-auc:0.77184	test-auc:0.775901	train-ks:0.454747	valid-ks:0.394321	test-ks:0.435414
[59]	train-auc:0.800758	valid-auc:0.772249	test-auc:0.77736	train-ks:0.463565	valid-ks:0.402033	test-ks:0.433772


k = 5
[19:38:13] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
0,0,0.490514,0.446326,0.0441876,0.438377


In [15]:
# step3: tune gamma
params_test3 = {
    'gamma': [0.1, 0.5, 1],
}
tune.gridsearch_cv(params_test3, cv=5, verbose_eval=10)



{'gamma': 0.1}


k = 1
[19:38:25] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.714395	valid-auc:0.69331	test-auc:0.707116	train-ks:0.324808	valid-ks:0.280207	test-ks:0.304898
[10]	train-auc:0.780542	valid-auc:0.746806	test-auc:0.765518	train-ks:0.426488	valid-ks:0.388817	test-ks:0.397499
[20]	train-auc:0.801416	valid-auc:0.76137	test-auc:0.778633	train-ks:0.453722	valid-ks:0.397041	test-ks:0.406373
[30]	train-auc:0.815703	valid-auc:0.773204	test-auc:0.786834	train-ks:0.465273	valid-ks:0.418087	test-ks:0.422031
[40]	train-auc:0.827013	valid-auc:0.777784	test-auc:0.793099	train-ks:0.486744	valid-ks:0.418956	test-ks:0.435245
[50]	train-auc:0.836273	valid-auc:0.783645	test-auc:0.794278	train-ks:0.503108	valid-ks:0.429068	test-ks:0.438826
[59]	train-auc:0.843831	valid-auc:0.786375	test-auc:0.795936	train-ks:0.516003	valid-ks:0.428775	test-ks:0.44204


k = 2
[19:38:26] Tree method is selected to be 'hist', which uses a single update



k = 4
[19:38:37] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.712487	valid-auc:0.697886	test-auc:0.699677	train-ks:0.300249	valid-ks:0.287402	test-ks:0.293914
[10]	train-auc:0.772551	valid-auc:0.756878	test-auc:0.760915	train-ks:0.427061	valid-ks:0.411425	test-ks:0.407212
[20]	train-auc:0.798253	valid-auc:0.782069	test-auc:0.779714	train-ks:0.44853	valid-ks:0.42901	test-ks:0.423547
[30]	train-auc:0.811004	valid-auc:0.793507	test-auc:0.788527	train-ks:0.46679	valid-ks:0.446082	test-ks:0.435227
[40]	train-auc:0.82117	valid-auc:0.800321	test-auc:0.794605	train-ks:0.474644	valid-ks:0.455111	test-ks:0.447798
[50]	train-auc:0.835056	valid-auc:0.809947	test-auc:0.803222	train-ks:0.507133	valid-ks:0.47588	test-ks:0.462908
[59]	train-auc:0.841347	valid-auc:0.81266	test-auc:0.804728	train-ks:0.519466	valid-ks:0.475666	test-ks:0.470866


k = 5
[19:38:38] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmake

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
5,5,0.508529,0.465199,0.0433304,0.445301


In [16]:
# step4: tune subsample & colsample_bytree
params_test4 = {
    'subsample': [0.9, 1.0],
    'colsample_bytree': [1.0],
}
tune.gridsearch_cv(params_test4, cv=5, verbose_eval=100)



{'subsample': 0.9, 'colsample_bytree': 1.0}


k = 1
[19:38:55] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.712823	valid-auc:0.692992	test-auc:0.696334	train-ks:0.286713	valid-ks:0.254953	test-ks:0.282338
[59]	train-auc:0.852518	valid-auc:0.793449	test-auc:0.804482	train-ks:0.547981	valid-ks:0.434092	test-ks:0.454428


k = 2
[19:38:57] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.720014	valid-auc:0.695277	test-auc:0.70413	train-ks:0.317346	valid-ks:0.280026	test-ks:0.283685
[59]	train-auc:0.847994	valid-auc:0.798596	test-auc:0.800967	train-ks:0.524199	valid-ks:0.460519	test-ks:0.445855


k = 3
[19:38:58] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.710736	valid-auc:0.683678	test-auc:0.689641	train-ks:0.276818	valid-ks:0.250863	test-ks:0.264986
[59]	train-auc:0.839769	valid-auc:0.814873	test-auc:0.800478	train-ks:

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
7,7,0.518623,0.46863,0.0499922,0.45218


In [17]:
# step5: tune reg_alpha
params_test5 = {
    'reg_alpha': [0.1, 1],
}
tune.gridsearch_cv(params_test5, cv=5, verbose_eval=10)



{'reg_alpha': 0.1}


k = 1
[19:39:13] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.713185	valid-auc:0.710291	test-auc:0.704179	train-ks:0.318638	valid-ks:0.311178	test-ks:0.296321
[10]	train-auc:0.774836	valid-auc:0.771998	test-auc:0.760691	train-ks:0.421221	valid-ks:0.422359	test-ks:0.392962
[20]	train-auc:0.799288	valid-auc:0.792846	test-auc:0.781728	train-ks:0.453937	valid-ks:0.443692	test-ks:0.425429
[30]	train-auc:0.81132	valid-auc:0.797759	test-auc:0.788144	train-ks:0.466229	valid-ks:0.445431	test-ks:0.440027
[40]	train-auc:0.822499	valid-auc:0.803761	test-auc:0.792899	train-ks:0.479776	valid-ks:0.452455	test-ks:0.44326
[50]	train-auc:0.831337	valid-auc:0.809415	test-auc:0.795793	train-ks:0.491837	valid-ks:0.464365	test-ks:0.444902
[59]	train-auc:0.839508	valid-auc:0.814819	test-auc:0.798879	train-ks:0.512338	valid-ks:0.47516	test-ks:0.45349


k = 2
[19:39:14] Tree method is selected to be 'hist', which uses a single upd



k = 4
[19:39:26] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.713009	valid-auc:0.723433	test-auc:0.707139	train-ks:0.324463	valid-ks:0.317317	test-ks:0.292154
[10]	train-auc:0.766886	valid-auc:0.767483	test-auc:0.754057	train-ks:0.418897	valid-ks:0.387404	test-ks:0.373934
[20]	train-auc:0.791396	valid-auc:0.784196	test-auc:0.772444	train-ks:0.436844	valid-ks:0.419697	test-ks:0.405847
[30]	train-auc:0.813761	valid-auc:0.799181	test-auc:0.787777	train-ks:0.462579	valid-ks:0.446183	test-ks:0.422787
[40]	train-auc:0.825738	valid-auc:0.807304	test-auc:0.794446	train-ks:0.481901	valid-ks:0.461229	test-ks:0.438066
[50]	train-auc:0.834438	valid-auc:0.811322	test-auc:0.797921	train-ks:0.502977	valid-ks:0.467805	test-ks:0.447146
[59]	train-auc:0.844928	valid-auc:0.817538	test-auc:0.803442	train-ks:0.521224	valid-ks:0.491592	test-ks:0.463077


k = 5
[19:39:28] Tree method is selected to be 'hist', which uses a single updater grow_fast_hi

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
7,7,0.518623,0.46863,0.0499922,0.45218


In [18]:
# step6: tune reg_lambda
params_test6 = {
    'reg_lambda': [0, 0.1],
}
tune.gridsearch_cv(params_test6, cv=5, verbose_eval=10)



{'reg_lambda': 0}


k = 1
[19:39:44] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.714053	valid-auc:0.718232	test-auc:0.704556	train-ks:0.307274	valid-ks:0.306789	test-ks:0.283244
[10]	train-auc:0.780388	valid-auc:0.769086	test-auc:0.765888	train-ks:0.429719	valid-ks:0.450989	test-ks:0.406302
[20]	train-auc:0.800656	valid-auc:0.780645	test-auc:0.776835	train-ks:0.451111	valid-ks:0.442956	test-ks:0.41678
[30]	train-auc:0.817524	valid-auc:0.793616	test-auc:0.789462	train-ks:0.476198	valid-ks:0.455874	test-ks:0.436024
[40]	train-auc:0.828543	valid-auc:0.800297	test-auc:0.792326	train-ks:0.489572	valid-ks:0.459773	test-ks:0.446573
[50]	train-auc:0.836738	valid-auc:0.806627	test-auc:0.796787	train-ks:0.504705	valid-ks:0.467837	test-ks:0.45691
[59]	train-auc:0.842663	valid-auc:0.809225	test-auc:0.79825	train-ks:0.519798	valid-ks:0.475805	test-ks:0.46012


k = 2
[19:39:47] Tree method is selected to be 'hist', which uses a single upda



k = 4
[19:39:58] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.713923	valid-auc:0.717427	test-auc:0.707594	train-ks:0.308781	valid-ks:0.321301	test-ks:0.288415
[10]	train-auc:0.773717	valid-auc:0.769187	test-auc:0.761013	train-ks:0.425212	valid-ks:0.432588	test-ks:0.394482
[20]	train-auc:0.796154	valid-auc:0.790051	test-auc:0.777292	train-ks:0.448166	valid-ks:0.448781	test-ks:0.411774
[30]	train-auc:0.814433	valid-auc:0.806906	test-auc:0.79023	train-ks:0.472312	valid-ks:0.476232	test-ks:0.43556
[40]	train-auc:0.828011	valid-auc:0.821448	test-auc:0.799304	train-ks:0.487425	valid-ks:0.493421	test-ks:0.446136
[50]	train-auc:0.836157	valid-auc:0.823105	test-auc:0.802559	train-ks:0.500471	valid-ks:0.486461	test-ks:0.457469
[59]	train-auc:0.841082	valid-auc:0.826082	test-auc:0.80353	train-ks:0.509784	valid-ks:0.488173	test-ks:0.453987


k = 5
[19:40:00] Tree method is selected to be 'hist', which uses a single updater grow_fast_histm

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
7,7,0.518623,0.46863,0.0499922,0.45218


In [19]:
# step7: lower learning_rate and raise n_estimators
params_test7 = {
    'learning_rate': [0.08, 0.09],
    'n_estimators': [100],
}
tune.gridsearch_cv(params_test7, cv=5, verbose_eval=50)



{'n_estimators': 100, 'learning_rate': 0.08}


k = 1
[19:40:12] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.711077	valid-auc:0.700197	test-auc:0.69378	train-ks:0.289095	valid-ks:0.274724	test-ks:0.272963
[50]	train-auc:0.828442	valid-auc:0.793295	test-auc:0.794278	train-ks:0.500175	valid-ks:0.433511	test-ks:0.441224
[99]	train-auc:0.857283	valid-auc:0.803188	test-auc:0.802253	train-ks:0.551535	valid-ks:0.454215	test-ks:0.451077


k = 2
[19:40:14] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.713809	valid-auc:0.683028	test-auc:0.691969	train-ks:0.285815	valid-ks:0.272431	test-ks:0.27377
[50]	train-auc:0.826517	valid-auc:0.783285	test-auc:0.791693	train-ks:0.490539	valid-ks:0.436231	test-ks:0.435921
[99]	train-auc:0.854963	valid-auc:0.801231	test-auc:0.800147	train-ks:0.545089	valid-ks:0.46173	test-ks:0.454813


k = 3
[19:40:15] Tree method is selected to be 'hist', which

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
7,7,0.518623,0.46863,0.0499922,0.45218


In [21]:
# step8: train the model with the tuned parameters on the full training set.
bst, dfimportance = tune.train_best()
# Optional: persist the booster and the importance table.
#bst.save_model('./bst.model')
#dfimportance.to_csv('./dfimportance.csv',sep = '\t')
# Bare expression so the notebook displays the importance table.
dfimportance

[19:40:48] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.710117	test-auc:0.699553	train-ks:0.289606	test-ks:0.283075
[10]	train-auc:0.774623	test-auc:0.759156	train-ks:0.42566	test-ks:0.398719
[20]	train-auc:0.795348	test-auc:0.775993	train-ks:0.440485	test-ks:0.40717
[30]	train-auc:0.812076	test-auc:0.788683	train-ks:0.465988	test-ks:0.434368
[40]	train-auc:0.823783	test-auc:0.794913	train-ks:0.48733	test-ks:0.448323
[50]	train-auc:0.830651	test-auc:0.798521	train-ks:0.495128	test-ks:0.456366
[59]	train-auc:0.839132	test-auc:0.800106	train-ks:0.517323	test-ks:0.456296


Unnamed: 0,feature,importance
10,feature10,68
2,feature2,58
7,feature15,56
6,feature9,45
15,feature19,44
4,feature1,36
1,feature4,35
13,feature13,19
3,feature3,6
14,feature18,4


### 七，自动化调参 runtunning.py 示范

打开文件修改以下配置即可直接运行。

In [None]:
# %load runtunning.py
#!/usr/bin/python2.7
from __future__ import print_function

#================================================================================
# You can change the code here below! Edit the configuration in this section to
# adjust the hyper-parameter optimization target and search ranges.
#================================================================================
# 1. Optimization target and conditions

task_name = 'example'
score_func = 'ks'                                # metric to optimize: 'ks' or 'auc'
score_gap_limit = 0.03                           # max acceptable train/validate score gap
train_data_path = './xx_train_data'              # location of the training data
test_data_path = './xx_test_data'                # location of the test data
outputdir = './aa_tunning_result_' + task_name   # name of the output folder
n_jobs = 16                                      # number of parallel jobs

#--------------------------------------------------------------------------------
# 2. Initial hyper-parameter values

params_dict = {
    # ---- parameters to be tuned ----
    # booster parameters
    'learning_rate': 0.1,       # learning rate; starts at 0.1, smaller is usually better
    'n_estimators': 50,         # number of trees in the additive model; starts at 50

    # tree parameters
    'max_depth': 3,             # tree depth; usually in [3,10], initial value often in [3,6]
    'min_child_weight': 30,     # min sum of sample weights in a leaf; larger = more conservative
    'gamma': 0,                 # min loss reduction needed to split a node; larger = more conservative
    'subsample': 0.8,           # row sampling: fraction of samples, usually in [0.5,1]
    'colsample_bytree': 1.0,    # column sampling: fraction of features, usually in [0.5,1]

    # regularization parameters
    # Omega(f) = gamma*T + reg_alpha* sum(abs(wj)) + reg_lambda* sum(wj**2)
    'reg_alpha': 0,             # L1 regularization weight; larger = more conservative, usually in [0,1]
    'reg_lambda': 1,            # L2 regularization weight; larger = more conservative, usually in [1,100]

    # ---- parameters that usually do not need tuning ----
    'objective': 'binary:logistic',
    'tree_method': 'hist',      # tree construction strategy: auto, exact, approx or hist
    'eval_metric': 'auc',
    'silent': 1,
    'scale_pos_weight': 1,      # for imbalanced samples a positive value can speed up convergence
    'seed': 0,
}

#--------------------------------------------------------------------------------
# 3. Hyper-parameter search ranges

# use a relatively large learning_rate here
params_test1 = {
    'learning_rate': [0.1],
    'n_estimators': [50],
}

params_test2 = {
    'max_depth': [3],
    'min_child_weight': [50, 100, 200],
}

params_test3 = {
    'gamma': [0.1, 0.5, 1],
}

params_test4 = {
    'subsample': [0.9, 1.0],
    'colsample_bytree': [1.0],
}

params_test5 = {
    'reg_alpha': [0.1, 1],
}

params_test6 = {
    'reg_lambda': [0, 0.1],
}

# use a smaller learning_rate here
params_test7 = {
    'learning_rate': [0.09, 0.08],
    'n_estimators': [100],
}
#===============================================================================








#================================================================================
#Don't change the code below!!! 以下代码请勿轻易改动。
#================================================================================
#
#
#
####
###
##
#
    


### 八，跑流水线 runpipeline.py 示范

打开文件修改以下配置即可直接运行。

In [None]:
# %load runpipeline.py
#!/usr/bin/python2.7
from __future__ import print_function

#================================================================================
# You can change the code here below! Edit the configuration in this section to
# adjust the hyper-parameter optimization target and search ranges.
#================================================================================
# 1. Optimization target and conditions

task_name = 'example'
score_func = 'ks'                                 # metric to optimize: 'ks' or 'auc'
score_gap_limit = 0.03                            # max acceptable train/validate score gap
train_data_path = './xx_train_data'               # location of the training data
test_data_path = './xx_test_data'                 # location of the test data
outputdir = './aa_pipeline_result_' + task_name   # name of the output folder
n_jobs = 16                                       # number of parallel jobs

#--------------------------------------------------------------------------------
# 2. Initial hyper-parameter values

params_dict = {
    # ---- parameters to be tuned ----
    # booster parameters
    'learning_rate': 0.1,       # learning rate; starts at 0.1, smaller is usually better
    'n_estimators': 50,         # number of trees in the additive model; starts at 50

    # tree parameters
    'max_depth': 3,             # tree depth; usually in [3,10], initial value often in [3,6]
    'min_child_weight': 30,     # min sum of sample weights in a leaf; larger = more conservative
    'gamma': 0,                 # min loss reduction needed to split a node; larger = more conservative
    'subsample': 0.8,           # row sampling: fraction of samples, usually in [0.5,1]
    'colsample_bytree': 1.0,    # column sampling: fraction of features, usually in [0.5,1]

    # regularization parameters
    # Omega(f) = gamma*T + reg_alpha* sum(abs(wj)) + reg_lambda* sum(wj**2)
    'reg_alpha': 0,             # L1 regularization weight; larger = more conservative, usually in [0,1]
    'reg_lambda': 1,            # L2 regularization weight; larger = more conservative, usually in [1,100]

    # ---- parameters that usually do not need tuning ----
    'objective': 'binary:logistic',
    'tree_method': 'hist',      # tree construction strategy: auto, exact, approx or hist
    'eval_metric': 'auc',
    'silent': 1,
    'scale_pos_weight': 1,      # for imbalanced samples a positive value can speed up convergence
    'seed': 0,
}

#--------------------------------------------------------------------------------
# 3. Hyper-parameter search ranges

# use a relatively large learning_rate here
params_test1 = {
    'learning_rate': [0.1],
    'n_estimators': [50],
}

params_test2 = {
    'max_depth': [3],
    'min_child_weight': [50, 100, 200],
}

params_test3 = {
    'gamma': [0.1, 0.5, 1],
}

params_test4 = {
    'subsample': [0.9, 1.0],
    'colsample_bytree': [1.0],
}

params_test5 = {
    'reg_alpha': [0.1, 1],
}

params_test6 = {
    'reg_lambda': [0, 0.1],
}

# use a smaller learning_rate here
params_test7 = {
    'learning_rate': [0.09, 0.08],
    'n_estimators': [100],
}
#===============================================================================








#================================================================================
#Don't change the code below!!! 以下代码请勿轻易改动。
#================================================================================
#
#
#
    
####
###
##
#
    

    
