# tianjikit模块使用指南

### 一，模块整体架构

![](readme.png)

#### outliers_analysis的输出：

In [None]:
'med', #中位数
'seg_25', #1/4分位数
'seg_75', #3/4分位数
'up_limit',  #离群值判定上边界
'low_limit', #离群值判定下边界
'up_ratio',  #超上边界离群值比例
'low_ratio';  #超下边界离群值比例


#### basic_analysis的输出：

In [None]:
#------覆盖率------------------------#
'not_nan_ratio',  #非空比例，通常覆盖率coverage即指它
'not_zero_ratio', #非零比例，非零值不含空值
'not_outlier_ratio', #非离群值比例，非离群值不含空值

#------统计值------------------------#
'class_num', #数据类别数目
'value_num', #非空数据数目
'min', #最小值
'mean',#均值
'med', #中位数
'most', #众数
'max', #最大值

#------有效性----------------------#
'ks(continous feature)', #ks统计量，适合连续特征
'ks_pvalue', #ks统计量的p值
'chi2(discrete feature)', #chi2统计量，适合离散特征
'chi2_pvalue', #chi2统计量的p值
't(for mean)', #均值t检验,仅对连续特征适用
't_pvalue' ,#均值t检验的p值
'z(for coverage)',#覆盖率z检验，适合连续和离散特征，coverage指 not_nan_ratio
'z_pvalue', #覆盖率z检验的p值
'iv'; #iv统计量，适合连续和离散特征，iv>0.1有效，iv>0.2强有效


#### psi_analysis的输出：

In [None]:
'psi', #psi指标，仅当 train_data和 test_data 有效数据数量 >10时才取值，否则为 nan值
'is_stable', #是否稳定，psi<0.2判定为稳定
'train_class_num', # train_data中数据类别数目
'test_class_num' , # test_data中数据类别数目
'train_value_num', #train_data中有效数据数目
'test_value_num';#test_data中有效数据数目


#### ks_analysis的输出：

In [None]:
'feature_interval',#特征取值区间
'order_num', #订单数量
'order_ratio', #订单占比
'overdue_num', #逾期订单数量
'overdue_ratio', #逾期订单占比
'normal_num', #正常订单数量
'normal_ratio', #正常订单占比
'overdue_cum_ratio', #累计逾期订单比例
'normal_cum_ratio', #累计正常订单比例
'ks_value'; #ks统计值


#### iv_analysis的输出：

In [None]:
'feature_interval',#区间
'order_num', #订单数量
'order_ratio', #订单占比
'overdue_num', #逾期订单数量
'overdue_ratio', #逾期订单比例
'overdue_interval_ratio', #区间逾期订单占总逾期订单比例
'normal_num', #正常订单数量
'normal_ratio', #正常订单占比
'normal_interval_ratio', #区间正常订单占总正常订单比例
'iv_value'; #iv检验值，列重复


#### chi2_analysis的输出：

In [None]:
'TP', #feature为1的逾期样本数量
'FP', #feature为1的正常样本数量
'TN', #feature为0的正常样本数量
'FN', #feature为0的逾期的样本数量
'TPR', #TP/(TP+FN),逾期样本中feature取1比例
'FPR',#FP/(FP+TN),正常样本中feature取1比例
'overdue_ratio_0',# feature为0样本的逾期率
'overdue_ratio_1',# feature为1样本的逾期率
'precision',#精度
'accuracy',#准确度
'chi2', #卡方统计量
'chi2_pvalue'; #卡方统计量的p值


### 二，单特征分析示范

In [5]:
import numpy as np
import pandas as pd
from tianjikit.analysisfeature import AnalysisFeature

# 准备数据
data = [1.0,2,3,4,5,6,4,3,2,1,2,9,10,100,np.nan,0,7,8,10,6]
label = [0,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,1]
assert len(data)==len(label)

af = AnalysisFeature()
# 离群值分析
dfoutliers = af.outliers_analysis(data,alpha = 2)

# 去除离群值
data_clean = af.drop_outliers(data,data,alpha = 2)

# 基本分析
dfbasic = af.basic_analysis(data,label)

# psi稳定性分析
test_data = [10,9,5,3,4,3,2,1,6,7,5,np.nan,10,100]
dfpsi = af.psi_analysis(data,test_data)

# ks有效性分析,主要对连续特征，对离散特征也可分析
dfks = af.ks_analysis(data,label)

# iv有效性分析，主要针对离散特征，对连续特征也适用
dfiv = af.iv_analysis(data,label)

# 卡方及召回率等分析，主要针对离散特征
dfchi2 = af.chi2_analysis(data,label)

### 三，多特征分析示范

In [1]:
# 多特征分析示范
import numpy as np
import pandas as pd
from tianjikit.analysisfeatures import AnalysisFeatures

# 构造dftrain 训练集特征数据
dftrain = pd.DataFrame()
dftrain['phone'] = ['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10','x11','x12']
dftrain['loan_dt'] = ['2018-01-01']*12
dftrain['label'] = [0,1,1,0,1,0,0,0,0,0,1,0]
dftrain['feature1'] = [1,0,1,0,1,0,1,0,1,0,1,1]
dftrain['feature2'] = [1.0,2,3,4,5,6,7,8,9,10,11,12]


# 构造dftest测试集特征
dftest = pd.DataFrame()
dftest['phone'] = ['y1','y2','y3','y4','y5','y6','y7','y8','y9','y10']
dftest['loan_dt'] = ['2018-02-01']*10
dftest['label'] = [1,0,0,1,0,0,0,1,0,0]
dftest['feature1'] = [1,0,0,1,0,0,1,0,1,0]
dftest['feature2'] = [10.0,9,8,7,6,5,4,3,2,1]

afs = AnalysisFeatures(dftrain,dftest)

#特征基本分析
dfbasic = afs.basic_analysises()

#特征稳定性分析
dfpsi = afs.psi_analysises()

#特征ks分析
dfks = afs.ks_analysises()

#特征iv分析
dfiv = afs.iv_analysises()

start basic_analysises...
[total|done|todo]
[2|2|0]
start psi_analysises...
[total|done|todo]
[2|2|0]
start ks_analysis...
[total|done|todo]
[2|2|0]
start IvAnalysis...
[total|done|todo]
[2|2|0]


### 四，跑模型评分示范

In [2]:
# 准备训练数据
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

data,label = datasets.make_classification(n_samples= 10000, n_features=20,n_classes=2, random_state=0)
dfdata = pd.DataFrame(data,columns = ['feature'+str(i) for i in range(data.shape[1])])
dfdata['label'] = label
dftrain,dftest = train_test_split(dfdata)
dftrain,dftest = dftrain.copy(),dftest.copy()
dftrain.index,dftest.index  = range(len(dftrain)),range(len(dftest))
dftrain.loc[0,['feature0','feature1','feature2']] = np.nan #构造若干缺失值

In [4]:
# 训练逻辑回归模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0, chi2_th=0, 
                 outliers_th=None, fillna_method='most', scale_method= None)
lr = model.train_lr(cv=None, model_idx=5)
model.test(lr)
dfimportance = model.dfimportances['lr']

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  most
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23
------------------------------------------------------------------------
START TRAIN LR MODEL ...

2018-10-26 12:35:04:

train: ks = 0.65334 	 auc = 0.898266528728 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00037,0.03416) |    750    |     0.1     |      28     |    0.03733    | 0.18433  |
| 1 |  [0.03416,0.091)  |    750    |     0.1     |      49     |    

In [5]:
# 训练随机森林模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0, chi2_th=0, 
                 outliers_th=None, fillna_method='most', scale_method= None)
rf = model.train_rf(cv=None, model_idx=5,
      n_estimators=100, max_depth=10, min_samples_split=2,
      min_samples_leaf=1, min_weight_fraction_leaf=0.0,
      max_features='auto', max_leaf_nodes=None, n_jobs = 4)
model.test(rf)
dfimportance = model.dfimportances['rf']

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  most
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23
------------------------------------------------------------------------
START TRAIN RANDOMFOREST MODEL ...

2018-10-26 12:35:17:

train: ks = 0.90188 	 auc = 0.990171738571 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00343,0.02105) |    750    |     0.1     |      0      |      0.0      | 0.19926  |
| 1 | [0.02105,0.03165) |    750    |     0.1     |      0 

In [6]:
# 训练GBDT模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0, chi2_th=0, 
                 outliers_th=None, fillna_method='most', scale_method= None)
gbdt = model.train_gbdt(cv=5, model_idx=5,
       learning_rate=0.01, n_estimators=1000, max_depth= 3, min_samples_split= 50, 
       min_samples_leaf= 5, subsample=0.7, max_features='sqrt',random_state= 0) 
model.test(gbdt)
dfimportance = model.dfimportances['gbdt']

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  most
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23
------------------------------------------------------------------------
START TRAIN GBDT MODEL ...

2018-10-26 12:35:26: k = 1

train: ks = 0.85266 	 auc = 0.980178991178 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00453,0.02046) |    600    |     0.1     |      0      |      0.0      | 0.19927  |
| 1 |  [0.02046,0.0244) |    600    |     0.1     |      1   


train: ks = 0.84301 	 auc = 0.978524489051 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00606,0.02095) |    600    |     0.1     |      1      |    0.00167    | 0.19861  |
| 1 | [0.02095,0.02553) |    600    |     0.1     |      2      |    0.00333    | 0.39654  |
| 2 | [0.02553,0.03448) |    600    |     0.1     |      2      |    0.00333    | 0.59448  |
| 3 | [0.03448,0.17386) |    600    |     0.1     |      19     |    0.03167    | 0.78108  |
| 4 | [0.17386,0.59024) |    600    |     0.1     |     206     |    0.34333    | 0.84301  |
| 5 | [0.59024,0.81136) |    600    |     0.1     |     438     |      0.73     | 0.75027  |
| 6 | [0.81136,0.92612) |    600    |     0.1     |     538     |    0.89667    | 0.59087  |
| 7 | [0.92612,0.94773) |

In [7]:
# 训练XGBOOST模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0, chi2_th=0, 
                 outliers_th=None, fillna_method= None, scale_method= None)
xgb = model.train_xgb(cv=5,learning_rate=0.1, model_idx=5,
      n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8,
      colsample_bytree=0.8,scale_pos_weight=1, n_jobs=4, seed=10) 
model.test(xgb)
dfimportance = model.dfimportances['xgb']

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  None
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  20
------------------------------------------------------------------------
START TRAIN XGBOOST MODEL ...

2018-10-26 12:36:13: k = 1

train: ks = 0.99601 	 auc = 1.0 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |    [0.0,4e-05)    |    600    |     0.1     |      0      |      0.0      | 0.19927  |
| 1 |  [4e-05,0.00012)  |    600    |     0.1     |      0      |    


train: ks = 0.99635 	 auc = 1.0 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |    [0.0,3e-05)    |    600    |     0.1     |      0      |      0.0      | 0.19927  |
| 1 |  [3e-05,0.00013)  |    600    |     0.1     |      0      |      0.0      | 0.39854  |
| 2 | [0.00013,0.00049) |    600    |     0.1     |      0      |      0.0      | 0.59781  |
| 3 | [0.00049,0.00608) |    600    |     0.1     |      0      |      0.0      | 0.79708  |
| 4 | [0.00608,0.04446) |    600    |     0.1     |      0      |      0.0      | 0.99635  |
| 5 | [0.04446,0.99215) |    600    |     0.1     |     589     |    0.98167    | 0.80294  |
| 6 | [0.99215,0.99748) |    600    |     0.1     |     600     |      1.0      | 0.60221  |
| 7 | [0.99748,0.99925) |    600    

In [8]:
# 训练神经网络模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0, chi2_th=0, 
             outliers_th=None, fillna_method='most', scale_method= None)
nn = model.train_nn(cv = None, model_idx = 5,
     hidden_layer_sizes=(100,20), activation='relu', alpha=0.0001, 
     learning_rate='constant', learning_rate_init=0.001, max_iter=200,tol=0.0001, 
     early_stopping=False, validation_fraction=0.1, warm_start=False, random_state = None)
model.test(nn)

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  most
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23
------------------------------------------------------------------------
START TRAIN NEURAL NETWORK MODEL ...

2018-10-26 12:37:30:

train: ks = 0.98241 	 auc = 0.999362408891 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |     [0.0,0.0)     |    750    |     0.1     |      0      |      0.0      | 0.19926  |
| 1 |    [0.0,1e-05)    |    750    |     0.1     |      

### 五，xgboost调参示范

In [9]:
from __future__ import print_function
import numpy as np
import pandas as pd
import xgboost
from sklearn import datasets
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier

from tianjikit.tunning import Tunning

data,label = datasets.make_classification(n_samples= 10000, n_features=20, n_informative= 6 ,
             n_classes=2, n_clusters_per_class=10,random_state=0)
dfdata = pd.DataFrame(data,columns = [u'f'+str(i) for i in range(data.shape[1])])
dfdata['label'] = label
dftrain,dftest = train_test_split(dfdata)

In [10]:
params_dict = dict()

# 以下为待调整参数
# booster参数
params_dict['learning_rate'] = 0.1        # 学习率，初始值为 0.1，通常越小越好。
params_dict['n_estimators'] = 50          # 加法模型树的数量，初始值为50，通常通过xgboost自带模型cv确认。

# tree参数
params_dict['max_depth'] = 5              # 树的深度，通常取值在[3,10]之间，初始值常取[3,6]之间
params_dict['min_child_weight']=1         # 最小叶子节点样本权重和，越大模型越保守。
params_dict['gamma']= 0                   # 节点分裂所需的最小损失函数下降值，越大模型越保守。
params_dict['subsample']= 0.8             # 横向采样，样本采样比例，通常取值在 [0.5，1]之间 
params_dict['colsample_bytree'] = 0.8     # 纵向采样，特征采样比例，通常取值在 [0.5，1]之间 

# regularization参数 
# Omega(f) = gamma*T + reg_alpha*sum(abs(wj)) + 0.5*reg_lambda*sum(wj**2)
params_dict['reg_alpha'] = 0              #L1 正则化项的权重系数，越大模型越保守，通常取值在[0,1]之间。
params_dict['reg_lambda'] = 1             #L2 正则化项的权重系数，越大模型越保守，通常取值在[1,100]之间。

# 以下参数通常不需要调整
params_dict['objective'] = 'binary:logistic'
params_dict['n_jobs'] = 4
params_dict['scale_pos_weight'] = 1       #不平衡样本时设定为正值可以使算法更快收敛。
params_dict['seed'] = 0

In [11]:
# step0: 初始化
model = XGBClassifier()
tune = Tunning(model = model,dftrain = dftrain,dftest = dftest,params_dict = params_dict,n_jobs = 4)
tune.dfscore

Unnamed: 0,model_id,train_score,validate_score,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.711867,0.524533,0.538377,0.1,50,5,1,0,0.8,0.8,0,1


In [13]:
# step1: tune n_estimators for relatively high learning_rate (here: 0.3)
param_test1 = { 'learning_rate': 0.3, 'n_estimators':1000}
tune.params_dict.update(param_test1)
tune.model.set_params(**tune.params_dict)
tune.xgboost_cv(cv= 5, early_stopping_rounds= 100,n_jobs = 4,seed = 0)
tune.dfscore

[0]	train-auc:0.650211	train-ks:0.215644	test-auc:0.617175	test-ks:0.172071
Multiple eval metrics have been passed: 'test-ks' will be used for early stopping.

Will train until test-ks hasn't improved in 100 rounds.
[1]	train-auc:0.801364	train-ks:0.449576	test-auc:0.762951	test-ks:0.396161
[2]	train-auc:0.825735	train-ks:0.502388	test-auc:0.7807	test-ks:0.427575
[3]	train-auc:0.841881	train-ks:0.526476	test-auc:0.788658	test-ks:0.444223
[4]	train-auc:0.85548	train-ks:0.547742	test-auc:0.799637	test-ks:0.455096
[5]	train-auc:0.864493	train-ks:0.566979	test-auc:0.807322	test-ks:0.474325
[6]	train-auc:0.872816	train-ks:0.582781	test-auc:0.814585	test-ks:0.488015
[7]	train-auc:0.878036	train-ks:0.592275	test-auc:0.816817	test-ks:0.489873
[8]	train-auc:0.88316	train-ks:0.600612	test-auc:0.817964	test-ks:0.49168
[9]	train-auc:0.889966	train-ks:0.613248	test-auc:0.821236	test-ks:0.495217
[10]	train-auc:0.893615	train-ks:0.622356	test-auc:0.822257	test-ks:0.49535
[11]	train-auc:0.897611	train

[106]	train-auc:0.998308	train-ks:0.964614	test-auc:0.826603	test-ks:0.497703
[107]	train-auc:0.998439	train-ks:0.965949	test-auc:0.826222	test-ks:0.499508
[108]	train-auc:0.99853	train-ks:0.96634	test-auc:0.826275	test-ks:0.500965
[109]	train-auc:0.998629	train-ks:0.967404	test-auc:0.826465	test-ks:0.501421
[110]	train-auc:0.998725	train-ks:0.968384	test-auc:0.826455	test-ks:0.501277
[111]	train-auc:0.998766	train-ks:0.968726	test-auc:0.82664	test-ks:0.500594
[112]	train-auc:0.998851	train-ks:0.970536	test-auc:0.826719	test-ks:0.500608
[113]	train-auc:0.998913	train-ks:0.971598	test-auc:0.826901	test-ks:0.500324
[114]	train-auc:0.999002	train-ks:0.972679	test-auc:0.826583	test-ks:0.499342
[115]	train-auc:0.999067	train-ks:0.973674	test-auc:0.826368	test-ks:0.499197
[116]	train-auc:0.999144	train-ks:0.97493	test-auc:0.82632	test-ks:0.502528
[117]	train-auc:0.999207	train-ks:0.975947	test-auc:0.826206	test-ks:0.501606
[118]	train-auc:0.999231	train-ks:0.976743	test-auc:0.826484	test-ks:

Unnamed: 0,model_id,train_score,validate_score,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.711867,0.524533,0.538377,0.1,50,5,1,0,0.8,0.8,0,1
1,1,0.8214,0.498133,0.524126,0.3,46,5,1,0,0.8,0.8,0,1


In [16]:
# step2：tune max_depth & min_child_weight 
param_test2 = { 'max_depth': range(3, 6), 'min_child_weight': [1,10,20,50,100] } 
best_param = tune.gridsearch_cv(param_test2,n_jobs = 4)
tune.dfscore

CV Results: 
+----+-------------------------------------------+------------------+-----------------+
|    |                   params                  | mean_train_score | mean_test_score |
+----+-------------------------------------------+------------------+-----------------+
| 0  |  {'max_depth': 3, 'min_child_weight': 1}  |  0.595866666667  |  0.458133333333 |
| 1  |  {'max_depth': 3, 'min_child_weight': 10} |  0.579733333333  |  0.458666666667 |
| 2  |  {'max_depth': 3, 'min_child_weight': 20} |  0.565133333333  |      0.456      |
| 3  |  {'max_depth': 3, 'min_child_weight': 50} |  0.538666666667  |      0.4528     |
| 4  | {'max_depth': 3, 'min_child_weight': 100} |  0.499733333333  |  0.423466666667 |
| 5  |  {'max_depth': 4, 'min_child_weight': 1}  |  0.703533333333  |      0.4776     |
| 6  |  {'max_depth': 4, 'min_child_weight': 10} |  0.674133333333  |      0.488      |
| 7  |  {'max_depth': 4, 'min_child_weight': 20} |  0.637133333333  |  0.470133333333 |
| 8  |  {'max_depth

Unnamed: 0,model_id,train_score,validate_score,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.711867,0.524533,0.538377,0.1,50,5,1,0,0.8,0.8,0,1
1,1,0.8214,0.498133,0.524126,0.3,46,5,1,0,0.8,0.8,0,1
2,2,0.746467,0.4992,0.506889,0.3,46,5,10,0,0.8,0.8,0,1
3,3,0.746467,0.4992,0.506889,0.3,46,5,10,0,0.8,0.8,0,1
4,4,0.746467,0.4992,0.506889,0.3,46,5,10,0,0.8,0.8,0,1


In [17]:
# step3：tune gamma
param_test3 = {'gamma': [i / 10.0 for i in range(0, 6)]}
best_param = tune.gridsearch_cv(param_test3,n_jobs = 4)
tune.dfscore

CV Results: 
+---+----------------+------------------+-----------------+
|   |     params     | mean_train_score | mean_test_score |
+---+----------------+------------------+-----------------+
| 0 | {'gamma': 0.0} |  0.746466666667  |      0.4992     |
| 1 | {'gamma': 0.1} |      0.7514      |  0.499466666667 |
| 2 | {'gamma': 0.2} |      0.7498      |  0.494666666667 |
| 3 | {'gamma': 0.3} |  0.749466666667  |  0.487466666667 |
| 4 | {'gamma': 0.4} |  0.752733333333  |  0.493066666667 |
| 5 | {'gamma': 0.5} |  0.752733333333  |  0.494133333333 |
+---+----------------+------------------+-----------------+
Best Params: 
{'gamma': 0.1}
Best Score: 
0.4994666666666666


Unnamed: 0,model_id,train_score,validate_score,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.711867,0.524533,0.538377,0.1,50,5,1,0.0,0.8,0.8,0,1
1,1,0.8214,0.498133,0.524126,0.3,46,5,1,0.0,0.8,0.8,0,1
2,2,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
3,3,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
4,4,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
5,5,0.7514,0.499467,0.524069,0.3,46,5,10,0.1,0.8,0.8,0,1


In [18]:
# step4：tune subsample & colsample_bytree 
param_test4 = { 'subsample': [i / 10.0 for i in range(6, 11)],
               'colsample_bytree': [i / 10.0 for i in range(6, 11)] } 
best_param = tune.gridsearch_cv(param_test4,n_jobs = 4)
tune.dfscore

CV Results: 
+----+---------------------------------------------+------------------+-----------------+
|    |                    params                   | mean_train_score | mean_test_score |
+----+---------------------------------------------+------------------+-----------------+
| 0  | {'subsample': 0.6, 'colsample_bytree': 0.6} |      0.7136      |      0.472      |
| 1  | {'subsample': 0.7, 'colsample_bytree': 0.6} |      0.7204      |  0.469333333333 |
| 2  | {'subsample': 0.8, 'colsample_bytree': 0.6} |      0.7344      |  0.490666666667 |
| 3  | {'subsample': 0.9, 'colsample_bytree': 0.6} |  0.752666666667  |      0.5048     |
| 4  | {'subsample': 1.0, 'colsample_bytree': 0.6} |  0.753533333333  |  0.509866666667 |
| 5  | {'subsample': 0.6, 'colsample_bytree': 0.7} |      0.7044      |  0.462933333333 |
| 6  | {'subsample': 0.7, 'colsample_bytree': 0.7} |      0.7288      |      0.4776     |
| 7  | {'subsample': 0.8, 'colsample_bytree': 0.7} |  0.739533333333  |  0.490933333333

Unnamed: 0,model_id,train_score,validate_score,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.711867,0.524533,0.538377,0.1,50,5,1,0.0,0.8,0.8,0,1
1,1,0.8214,0.498133,0.524126,0.3,46,5,1,0.0,0.8,0.8,0,1
2,2,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
3,3,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
4,4,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
5,5,0.7514,0.499467,0.524069,0.3,46,5,10,0.1,0.8,0.8,0,1
6,6,0.7674,0.510933,0.543003,0.3,46,5,10,0.1,0.9,1.0,0,1


In [20]:
# step5: tune reg_alpha 
param_test5 = { 'reg_alpha': [1e-5, 1e-2, 0.1, 1,10,100] } 
best_param = tune.gridsearch_cv(param_test5,n_jobs = 4)
tune.dfscore

CV Results: 
+---+----------------------+------------------+-----------------+
|   |        params        | mean_train_score | mean_test_score |
+---+----------------------+------------------+-----------------+
| 0 | {'reg_alpha': 1e-05} |      0.7674      |  0.510933333333 |
| 1 | {'reg_alpha': 0.01}  |  0.766733333333  |      0.5104     |
| 2 |  {'reg_alpha': 0.1}  |  0.759066666667  |      0.4992     |
| 3 |   {'reg_alpha': 1}   |  0.777066666667  |  0.531733333333 |
| 4 |  {'reg_alpha': 10}   |  0.703866666667  |  0.525866666667 |
| 5 |  {'reg_alpha': 100}  |      0.3606      |  0.335733333333 |
+---+----------------------+------------------+-----------------+
Best Params: 
{'reg_alpha': 1}
Best Score: 
0.5317333333333334


Unnamed: 0,model_id,train_score,validate_score,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.711867,0.524533,0.538377,0.1,50,5,1,0.0,0.8,0.8,0,1
1,1,0.8214,0.498133,0.524126,0.3,46,5,1,0.0,0.8,0.8,0,1
2,2,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
3,3,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
4,4,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
5,5,0.7514,0.499467,0.524069,0.3,46,5,10,0.1,0.8,0.8,0,1
6,6,0.7674,0.510933,0.543003,0.3,46,5,10,0.1,0.9,1.0,0,1
7,7,0.777067,0.531733,0.537952,0.3,46,5,10,0.1,0.9,1.0,1,1
8,8,0.777067,0.531733,0.537952,0.3,46,5,10,0.1,0.9,1.0,1,1


In [21]:
# step6: tune reg_lambda 
param_test6 = { 'reg_lambda': [1e-5, 1e-2, 0.1, 1,10,100] }
best_param = tune.gridsearch_cv(param_test6,n_jobs = 4)
tune.dfscore

CV Results: 
+---+-----------------------+------------------+-----------------+
|   |         params        | mean_train_score | mean_test_score |
+---+-----------------------+------------------+-----------------+
| 0 | {'reg_lambda': 1e-05} |      0.777       |      0.5032     |
| 1 |  {'reg_lambda': 0.01} |  0.776133333333  |      0.5064     |
| 2 |  {'reg_lambda': 0.1}  |      0.775       |  0.506933333333 |
| 3 |   {'reg_lambda': 1}   |  0.777066666667  |  0.531733333333 |
| 4 |   {'reg_lambda': 10}  |  0.741666666667  |  0.518666666667 |
| 5 |  {'reg_lambda': 100}  |  0.654866666667  |  0.511733333333 |
+---+-----------------------+------------------+-----------------+
Best Params: 
{'reg_lambda': 1}
Best Score: 
0.5317333333333334


Unnamed: 0,model_id,train_score,validate_score,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.711867,0.524533,0.538377,0.1,50,5,1,0.0,0.8,0.8,0,1
1,1,0.8214,0.498133,0.524126,0.3,46,5,1,0.0,0.8,0.8,0,1
2,2,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
3,3,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
4,4,0.746467,0.4992,0.506889,0.3,46,5,10,0.0,0.8,0.8,0,1
5,5,0.7514,0.499467,0.524069,0.3,46,5,10,0.1,0.8,0.8,0,1
6,6,0.7674,0.510933,0.543003,0.3,46,5,10,0.1,0.9,1.0,0,1
7,7,0.777067,0.531733,0.537952,0.3,46,5,10,0.1,0.9,1.0,1,1
8,8,0.777067,0.531733,0.537952,0.3,46,5,10,0.1,0.9,1.0,1,1
9,9,0.777067,0.531733,0.537952,0.3,46,5,10,0.1,0.9,1.0,1,1


In [None]:
# step7: lower learning_rate and raise n_estimators
param_test7 = { 'learning_rate': 0.01, 'n_estimators':1000}
tune.params_dict.update(param_test7)
tune.model.set_params(**tune.params_dict)
tune.xgboost_cv(cv= 5, early_stopping_rounds= 100,n_jobs = 4)
tune.dfscore 

[0]	train-auc:0.778448	train-ks:0.410749	test-auc:0.743775	test-ks:0.369227
Multiple eval metrics have been passed: 'test-ks' will be used for early stopping.

Will train until test-ks hasn't improved in 100 rounds.
[1]	train-auc:0.793301	train-ks:0.434408	test-auc:0.760236	test-ks:0.383839
[2]	train-auc:0.797672	train-ks:0.449583	test-auc:0.765614	test-ks:0.400716
[3]	train-auc:0.802649	train-ks:0.457103	test-auc:0.771631	test-ks:0.411271
[4]	train-auc:0.805567	train-ks:0.459226	test-auc:0.773887	test-ks:0.415588
[5]	train-auc:0.809329	train-ks:0.467562	test-auc:0.777389	test-ks:0.423882
[6]	train-auc:0.812722	train-ks:0.472734	test-auc:0.780796	test-ks:0.42692
[7]	train-auc:0.813672	train-ks:0.476692	test-auc:0.781283	test-ks:0.426884
[8]	train-auc:0.814211	train-ks:0.475902	test-auc:0.781616	test-ks:0.432436
[9]	train-auc:0.814063	train-ks:0.476714	test-auc:0.780347	test-ks:0.428695
[10]	train-auc:0.814979	train-ks:0.477943	test-auc:0.780269	test-ks:0.43131
[11]	train-auc:0.816292	t

[106]	train-auc:0.85543	train-ks:0.539545	test-auc:0.810552	test-ks:0.470361
[107]	train-auc:0.855699	train-ks:0.540127	test-auc:0.810619	test-ks:0.470587
[108]	train-auc:0.856104	train-ks:0.540858	test-auc:0.810984	test-ks:0.471464
[109]	train-auc:0.856355	train-ks:0.541059	test-auc:0.811094	test-ks:0.471448
[110]	train-auc:0.85679	train-ks:0.541987	test-auc:0.811449	test-ks:0.472046
[111]	train-auc:0.857258	train-ks:0.543114	test-auc:0.811741	test-ks:0.471728
[112]	train-auc:0.857497	train-ks:0.543648	test-auc:0.811951	test-ks:0.472007
[113]	train-auc:0.857828	train-ks:0.543985	test-auc:0.812298	test-ks:0.472286
[114]	train-auc:0.858212	train-ks:0.543982	test-auc:0.812615	test-ks:0.472421
[115]	train-auc:0.858392	train-ks:0.544064	test-auc:0.81265	test-ks:0.47378
[116]	train-auc:0.858837	train-ks:0.544973	test-auc:0.813192	test-ks:0.473476
[117]	train-auc:0.859217	train-ks:0.54578	test-auc:0.813529	test-ks:0.474268
[118]	train-auc:0.859516	train-ks:0.54642	test-auc:0.813738	test-ks:0

[212]	train-auc:0.881304	train-ks:0.591488	test-auc:0.827986	test-ks:0.499399
[213]	train-auc:0.881446	train-ks:0.591957	test-auc:0.828055	test-ks:0.500147
[214]	train-auc:0.881613	train-ks:0.592224	test-auc:0.828168	test-ks:0.500332
[215]	train-auc:0.881737	train-ks:0.592555	test-auc:0.82827	test-ks:0.499849
[216]	train-auc:0.881888	train-ks:0.592959	test-auc:0.82831	test-ks:0.500088
[217]	train-auc:0.882013	train-ks:0.593094	test-auc:0.828433	test-ks:0.500234
[218]	train-auc:0.882164	train-ks:0.593243	test-auc:0.828519	test-ks:0.500945
[219]	train-auc:0.882327	train-ks:0.593307	test-auc:0.828619	test-ks:0.501158
[220]	train-auc:0.882448	train-ks:0.593245	test-auc:0.828723	test-ks:0.501702
[221]	train-auc:0.882568	train-ks:0.593294	test-auc:0.828781	test-ks:0.501949
[222]	train-auc:0.882722	train-ks:0.593568	test-auc:0.828943	test-ks:0.502049
[223]	train-auc:0.882932	train-ks:0.593913	test-auc:0.829121	test-ks:0.503273
[224]	train-auc:0.883129	train-ks:0.594424	test-auc:0.829296	test-

[318]	train-auc:0.895569	train-ks:0.623598	test-auc:0.835528	test-ks:0.516632
[319]	train-auc:0.895731	train-ks:0.623749	test-auc:0.835515	test-ks:0.516482
[320]	train-auc:0.895868	train-ks:0.623998	test-auc:0.835569	test-ks:0.516495
[321]	train-auc:0.896002	train-ks:0.624343	test-auc:0.835617	test-ks:0.51664
[322]	train-auc:0.89609	train-ks:0.624875	test-auc:0.835651	test-ks:0.517164
[323]	train-auc:0.896244	train-ks:0.625475	test-auc:0.835748	test-ks:0.516895
[324]	train-auc:0.896312	train-ks:0.625865	test-auc:0.835783	test-ks:0.516636
[325]	train-auc:0.896459	train-ks:0.626398	test-auc:0.835854	test-ks:0.516362
[326]	train-auc:0.896601	train-ks:0.626806	test-auc:0.835946	test-ks:0.51699
[327]	train-auc:0.896723	train-ks:0.627274	test-auc:0.836024	test-ks:0.516258
[328]	train-auc:0.89684	train-ks:0.627277	test-auc:0.836084	test-ks:0.51728
[329]	train-auc:0.89695	train-ks:0.627336	test-auc:0.836116	test-ks:0.517543
[330]	train-auc:0.897066	train-ks:0.627461	test-auc:0.836182	test-ks:0

[424]	train-auc:0.90651	train-ks:0.651188	test-auc:0.839344	test-ks:0.52421
[425]	train-auc:0.90663	train-ks:0.651586	test-auc:0.839357	test-ks:0.524748
[426]	train-auc:0.906766	train-ks:0.652119	test-auc:0.839358	test-ks:0.525544
[427]	train-auc:0.906883	train-ks:0.652519	test-auc:0.839375	test-ks:0.525273
[428]	train-auc:0.906963	train-ks:0.652649	test-auc:0.839404	test-ks:0.525282
[429]	train-auc:0.907049	train-ks:0.652861	test-auc:0.839419	test-ks:0.524717
[430]	train-auc:0.907139	train-ks:0.652926	test-auc:0.839441	test-ks:0.524146
[431]	train-auc:0.907205	train-ks:0.652856	test-auc:0.839455	test-ks:0.523323
[432]	train-auc:0.907299	train-ks:0.652858	test-auc:0.839463	test-ks:0.524514
[433]	train-auc:0.907367	train-ks:0.653182	test-auc:0.839483	test-ks:0.524879
[434]	train-auc:0.907452	train-ks:0.653516	test-auc:0.839517	test-ks:0.524491
[435]	train-auc:0.907571	train-ks:0.653522	test-auc:0.839591	test-ks:0.525205
[436]	train-auc:0.907678	train-ks:0.653793	test-auc:0.839571	test-k

[530]	train-auc:0.915417	train-ks:0.673039	test-auc:0.841612	test-ks:0.527512
[531]	train-auc:0.915498	train-ks:0.673109	test-auc:0.841632	test-ks:0.527968
[532]	train-auc:0.91559	train-ks:0.673376	test-auc:0.841664	test-ks:0.528219
[533]	train-auc:0.91565	train-ks:0.67364	test-auc:0.841665	test-ks:0.528158
[534]	train-auc:0.915709	train-ks:0.673635	test-auc:0.841676	test-ks:0.528515
[535]	train-auc:0.915844	train-ks:0.673558	test-auc:0.841652	test-ks:0.528528
[536]	train-auc:0.915934	train-ks:0.67384	test-auc:0.841715	test-ks:0.528767
[537]	train-auc:0.916022	train-ks:0.67394	test-auc:0.841753	test-ks:0.529295
[538]	train-auc:0.916098	train-ks:0.674104	test-auc:0.84175	test-ks:0.529353
[539]	train-auc:0.916151	train-ks:0.674134	test-auc:0.841738	test-ks:0.529089
[540]	train-auc:0.916246	train-ks:0.674446	test-auc:0.841784	test-ks:0.529369
[541]	train-auc:0.916307	train-ks:0.674674	test-auc:0.841817	test-ks:0.529187
[542]	train-auc:0.91638	train-ks:0.674541	test-auc:0.841825	test-ks:0.

[636]	train-auc:0.923343	train-ks:0.692267	test-auc:0.842941	test-ks:0.529272
[637]	train-auc:0.923429	train-ks:0.692474	test-auc:0.842949	test-ks:0.529486
[638]	train-auc:0.92349	train-ks:0.692541	test-auc:0.842982	test-ks:0.53054
[639]	train-auc:0.923531	train-ks:0.692739	test-auc:0.842973	test-ks:0.530281
[640]	train-auc:0.923598	train-ks:0.692677	test-auc:0.842998	test-ks:0.529229
[641]	train-auc:0.923648	train-ks:0.69281	test-auc:0.843036	test-ks:0.529482
[642]	train-auc:0.923714	train-ks:0.69288	test-auc:0.843019	test-ks:0.528954
[643]	train-auc:0.923759	train-ks:0.692931	test-auc:0.843026	test-ks:0.529198
[644]	train-auc:0.923829	train-ks:0.693354	test-auc:0.843061	test-ks:0.528444
[645]	train-auc:0.923883	train-ks:0.693268	test-auc:0.843091	test-ks:0.529254
[646]	train-auc:0.923959	train-ks:0.693541	test-auc:0.843079	test-ks:0.529712
[647]	train-auc:0.924024	train-ks:0.693612	test-auc:0.843106	test-ks:0.529973
[648]	train-auc:0.924123	train-ks:0.693806	test-auc:0.843095	test-ks