# tianjikit模块使用指南

### 一，模块整体架构

![tianjikit模块整体架构图](readme.png)

#### outliers_analysis的输出：

In [None]:
'med', #中位数
'seg_25', #1/4分位数
'seg_75', #3/4分位数
'up_limit',  #离群值判定上边界
'low_limit', #离群值判定下边界
'up_ratio',  #超上边界离群值比例
'low_ratio';  #超下边界离群值比例


#### basic_analysis的输出：

In [None]:
#------覆盖率------------------------#
'not_nan_ratio',  #非空比例，通常覆盖率coverage即指它
'not_zero_ratio', #非零比例，非零值不含空值
'not_outlier_ratio', #非离群值比例，非离群值不含空值

#------统计值------------------------#
'class_num', #数据类别数目
'value_num', #非空数据数目
'min', #最小值
'mean',#均值
'med', #中位数
'most', #众数
'max', #最大值

#------有效性----------------------#
'ks', #ks统计量，适合连续特征
'ks_pvalue', #ks统计量的p值
'chi2', #chi2统计量，适合离散特征
'chi2_pvalue', #chi2统计量的p值
'iv'; #iv统计量，适合连续和离散特征，iv>0.1有效，iv>0.2强有效


#### psi_analysis的输出：

In [None]:
'psi', #psi指标，仅当 train_data和 test_data 有效数据数量 >10时才取值，否则为 nan值
'is_stable', #是否稳定，psi<0.2判定为稳定
'train_class_num', # train_data中数据类别数目
'test_class_num' , # test_data中数据类别数目
'train_value_num', #train_data中有效数据数目
'test_value_num';#test_data中有效数据数目


#### ks_analysis的输出：

In [None]:
'feature_interval',#特征取值区间
'order_num', #订单数量
'order_ratio', #订单占比
'overdue_num', #逾期订单数量
'overdue_ratio', #逾期订单占比
'normal_num', #正常订单数量
'normal_ratio', #正常订单占比
'overdue_cum_ratio', #累计逾期订单比例
'normal_cum_ratio', #累计正常订单比例
'ks_value'; #ks统计值


#### iv_analysis的输出：

In [None]:
'feature_interval',#区间
'order_num', #订单数量
'order_ratio', #订单占比
'overdue_num', #逾期订单数量
'overdue_ratio', #逾期订单比例
'overdue_interval_ratio', #区间逾期订单占总逾期订单比例
'normal_num', #正常订单数量
'normal_ratio', #正常订单占比
'normal_interval_ratio', #区间正常订单占总正常订单比例
'iv_value'; #iv检验值，列重复


#### chi2_analysis的输出：

In [None]:
'TP', #feature为1的逾期样本数量
'FP', #feature为1的正常样本数量
'TN', #feature为0的正常样本数量
'FN', #feature为0的逾期的样本数量
'TPR', #TP/(TP+FN),逾期样本中feature取1比例
'FPR',#FP/(FP+TN),正常样本中feature取1比例
'overdue_ratio_0',# feature为0样本的逾期率
'overdue_ratio_1',# feature为1样本的逾期率
'precision',#精度
'accuracy',#准确度
'chi2', #卡方统计量，适合离散特征
'chi2_pvalue'; #卡方统计量的p值


### 二，单特征分析示范

In [5]:
import numpy as np
import pandas as pd
from tianjikit.analysisfeature import AnalysisFeature

# 准备数据
data = [1.0,2,3,4,5,6,4,3,2,1,2,9,10,100,np.nan,0,7,8,10,6]
label = [0,1,1,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,1]
assert len(data)==len(label)

af = AnalysisFeature()
# 离群值分析
dfoutliers = af.outliers_analysis(data,alpha = 2)

# 去除离群值
data_clean = af.drop_outliers(data,data,alpha = 2)

# 基本分析
dfbasic = af.basic_analysis(data,label)

# psi稳定性分析
test_data = [10,9,5,3,4,3,2,1,6,7,5,np.nan,10,100]
dfpsi = af.psi_analysis(data,test_data)

# ks有效性分析,主要对连续特征，对离散特征也可分析
dfks = af.ks_analysis(data,label)

# iv有效性分析，主要针对离散特征，对连续特征也适用
dfiv = af.iv_analysis(data,label)

# 卡方及召回率等分析，主要针对离散特征
dfchi2 = af.chi2_analysis(data,label)

### 三，多特征分析示范

In [1]:
# 多特征分析示范
import numpy as np
import pandas as pd
from tianjikit.analysisfeatures import AnalysisFeatures

# 构造dftrain 训练集特征数据
dftrain = pd.DataFrame()
dftrain['phone'] = ['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10','x11','x12']
dftrain['loan_dt'] = ['2018-01-01']*12
dftrain['label'] = [0,1,1,0,1,0,0,0,0,0,1,0]
dftrain['feature1'] = [1,0,1,0,1,0,1,0,1,0,1,1]
dftrain['feature2'] = [1.0,2,3,4,5,6,7,8,9,10,11,12]


# 构造dftest测试集特征
dftest = pd.DataFrame()
dftest['phone'] = ['y1','y2','y3','y4','y5','y6','y7','y8','y9','y10']
dftest['loan_dt'] = ['2018-02-01']*10
dftest['label'] = [1,0,0,1,0,0,0,1,0,0]
dftest['feature1'] = [1,0,0,1,0,0,1,0,1,0]
dftest['feature2'] = [10.0,9,8,7,6,5,4,3,2,1]

afs = AnalysisFeatures(dftrain,dftest)

#特征基本分析
dfbasic = afs.basic_analysises()

#特征稳定性分析
dfpsi = afs.psi_analysises()

#特征ks分析
dfks = afs.ks_analysises()

#特征iv分析
dfiv = afs.iv_analysises()

start basic_analysises...
[total|done|todo]
[2|2|0]
start psi_analysises...
[total|done|todo]
[2|2|0]
start ks_analysis...
[total|done|todo]
[2|2|0]
start IvAnalysis...
[total|done|todo]
[2|2|0]


### 四，跑模型评分示范

In [2]:
# 准备训练数据
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

data,label = datasets.make_classification(n_samples= 10000, n_features=20,n_classes=2, random_state=0)
dfdata = pd.DataFrame(data,columns = ['feature'+str(i) for i in range(data.shape[1])])
dfdata['label'] = label
dftrain,dftest = train_test_split(dfdata)
dftrain,dftest = dftrain.copy(),dftest.copy()
dftrain.index,dftest.index  = range(len(dftrain)),range(len(dftest))
dftrain.loc[0,['feature0','feature1','feature2']] = np.nan #构造若干缺失值

In [4]:
# 训练逻辑回归模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0,
                 outliers_th=None, fillna_method='most', scale_method= None)
lr = model.train_lr(cv=None, model_idx=5)
model.test(lr)
dfimportance = model.dfimportances['lr']

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  most
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23
------------------------------------------------------------------------
START TRAIN LR MODEL ...

2018-10-26 12:35:04:

train: ks = 0.65334 	 auc = 0.898266528728 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00037,0.03416) |    750    |     0.1     |      28     |    0.03733    | 0.18433  |
| 1 |  [0.03416,0.091)  |    750    |     0.1     |      49     |    

In [5]:
# 训练随机森林模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0, 
                 outliers_th=None, fillna_method='most', scale_method= None)
rf = model.train_rf(cv=None, model_idx=5,
      n_estimators=100, max_depth=10, min_samples_split=2,
      min_samples_leaf=1, min_weight_fraction_leaf=0.0,
      max_features='auto', max_leaf_nodes=None, n_jobs = 4)
model.test(rf)
dfimportance = model.dfimportances['rf']

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  most
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23
------------------------------------------------------------------------
START TRAIN RANDOMFOREST MODEL ...

2018-10-26 12:35:17:

train: ks = 0.90188 	 auc = 0.990171738571 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00343,0.02105) |    750    |     0.1     |      0      |      0.0      | 0.19926  |
| 1 | [0.02105,0.03165) |    750    |     0.1     |      0 

In [6]:
# 训练GBDT模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0,  
                 outliers_th=None, fillna_method='most', scale_method= None)
gbdt = model.train_gbdt(cv=5, model_idx=5,
       learning_rate=0.01, n_estimators=1000, max_depth= 3, min_samples_split= 50, 
       min_samples_leaf= 5, subsample=0.7, max_features='sqrt',random_state= 0) 
model.test(gbdt)
dfimportance = model.dfimportances['gbdt']

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  most
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23
------------------------------------------------------------------------
START TRAIN GBDT MODEL ...

2018-10-26 12:35:26: k = 1

train: ks = 0.85266 	 auc = 0.980178991178 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00453,0.02046) |    600    |     0.1     |      0      |      0.0      | 0.19927  |
| 1 |  [0.02046,0.0244) |    600    |     0.1     |      1   


train: ks = 0.84301 	 auc = 0.978524489051 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 | [0.00606,0.02095) |    600    |     0.1     |      1      |    0.00167    | 0.19861  |
| 1 | [0.02095,0.02553) |    600    |     0.1     |      2      |    0.00333    | 0.39654  |
| 2 | [0.02553,0.03448) |    600    |     0.1     |      2      |    0.00333    | 0.59448  |
| 3 | [0.03448,0.17386) |    600    |     0.1     |      19     |    0.03167    | 0.78108  |
| 4 | [0.17386,0.59024) |    600    |     0.1     |     206     |    0.34333    | 0.84301  |
| 5 | [0.59024,0.81136) |    600    |     0.1     |     438     |      0.73     | 0.75027  |
| 6 | [0.81136,0.92612) |    600    |     0.1     |     538     |    0.89667    | 0.59087  |
| 7 | [0.92612,0.94773) |

In [7]:
# 训练XGBOOST模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0, 
                 outliers_th=None, fillna_method= None, scale_method= None)
xgb = model.train_xgb(cv=5,learning_rate=0.1, model_idx=5,
      n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8,
      colsample_bytree=0.8,scale_pos_weight=1, n_jobs=4, seed=10) 
model.test(xgb)
dfimportance = model.dfimportances['xgb']

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  None
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  20
------------------------------------------------------------------------
START TRAIN XGBOOST MODEL ...

2018-10-26 12:36:13: k = 1

train: ks = 0.99601 	 auc = 1.0 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |    [0.0,4e-05)    |    600    |     0.1     |      0      |      0.0      | 0.19927  |
| 1 |  [4e-05,0.00012)  |    600    |     0.1     |      0      |    


train: ks = 0.99635 	 auc = 1.0 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |    [0.0,3e-05)    |    600    |     0.1     |      0      |      0.0      | 0.19927  |
| 1 |  [3e-05,0.00013)  |    600    |     0.1     |      0      |      0.0      | 0.39854  |
| 2 | [0.00013,0.00049) |    600    |     0.1     |      0      |      0.0      | 0.59781  |
| 3 | [0.00049,0.00608) |    600    |     0.1     |      0      |      0.0      | 0.79708  |
| 4 | [0.00608,0.04446) |    600    |     0.1     |      0      |      0.0      | 0.99635  |
| 5 | [0.04446,0.99215) |    600    |     0.1     |     589     |    0.98167    | 0.80294  |
| 6 | [0.99215,0.99748) |    600    |     0.1     |     600     |      1.0      | 0.60221  |
| 7 | [0.99748,0.99925) |    600    

In [8]:
# 训练神经网络模型
from tianjikit.runmodel import RunModel
model = RunModel(dftrain = dftrain,dftest = dftest,coverage_th=0.1, ks_th=0, 
             outliers_th=None, fillna_method='most', scale_method= None)
nn = model.train_nn(cv = None, model_idx = 5,
     hidden_layer_sizes=(100,20), activation='relu', alpha=0.0001, 
     learning_rate='constant', learning_rate_init=0.001, max_iter=200,tol=0.0001, 
     early_stopping=False, validation_fraction=0.1, warm_start=False, random_state = None)
model.test(nn)

START DATA PREPROCESSING ...

train set size:  7500
test set size:  2500
coverage threshold:  0.1
outlier threshold:  None
ks threshold:  0
chi2 threshold:  0
fillna method:  most
scale method:  None
------------------------------------------------------------------------
original feature number:  20
feature number remain after dropfeature:  20
feature number increased to after fill_na:  23
------------------------------------------------------------------------
START TRAIN NEURAL NETWORK MODEL ...

2018-10-26 12:37:30:

train: ks = 0.98241 	 auc = 0.999362408891 
+---+-------------------+-----------+-------------+-------------+---------------+----------+
|   |  feature_interval | order_num | order_ratio | overdue_num | overdue_ratio | ks_value |
+---+-------------------+-----------+-------------+-------------+---------------+----------+
| 0 |     [0.0,0.0)     |    750    |     0.1     |      0      |      0.0      | 0.19926  |
| 1 |    [0.0,1e-05)    |    750    |     0.1     |      

### 五，xgboost调参示范

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import xgboost
from sklearn import datasets
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier

from tianjikit.tunning import Tunning

data,label = datasets.make_classification(n_samples= 10000, n_features=20, n_informative= 6 ,
             n_classes=2, n_clusters_per_class=10,random_state=0)
dfdata = pd.DataFrame(data,columns = [u'f'+str(i) for i in range(data.shape[1])])
dfdata['label'] = label
dftrain,dftest = train_test_split(dfdata)

In [6]:
params_dict = dict()

# 以下为待调整参数
# booster参数
params_dict['learning_rate'] = 0.1        # 学习率，初始值为 0.1，通常越小越好。
params_dict['n_estimators'] = 50          # 加法模型树的数量，初始值为50，通常通过xgboost自带模型cv确认。

# tree参数
params_dict['max_depth'] = 3              # 树的深度，通常取值在[3,10]之间，初始值常取[3,6]之间
params_dict['min_child_weight']=10        # 最小叶子节点样本权重和，越大模型越保守。
params_dict['gamma']= 0                   # 节点分裂所需的最小损失函数下降值，越大模型越保守。
params_dict['subsample']= 0.8             # 横向采样，样本采样比例，通常取值在 [0.5，1]之间 
params_dict['colsample_bytree'] = 0.8     # 纵向采样，特征采样比例，通常取值在 [0.5，1]之间 

# regularization（正则化）参数
# Omega(f) = gamma*T + reg_alpha* sum(abs(wj)) + reg_lambda* sum(wj**2) 
params_dict['reg_alpha'] = 0              #L1 正则化项的权重系数，越大模型越保守，通常取值在[0,1]之间。
params_dict['reg_lambda'] = 1             #L2 正则化项的权重系数，越大模型越保守，通常取值在[1,100]之间。

# 以下参数通常不需要调整
params_dict['objective'] = 'binary:logistic'
params_dict['n_jobs'] = 4
params_dict['scale_pos_weight'] = 1       #不平衡样本时设定为正值可以使算法更快收敛。
params_dict['seed'] = 0

In [7]:
# step0: 初始化
model = XGBClassifier()
tune = Tunning(model = model,dftrain = dftrain,dftest = dftest,params_dict = params_dict,
               score_func='ks',score_gap_limit=0.05,n_jobs = 4)
tune.dfscore

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.489346,0.435681,0.0536647,0.459991,0.1,50,3,10,0,0.8,0.8,0,1


In [8]:
# step1: tune n_estimators for relatively high learning_rate (here: 0.3)
param_test1 = { 'learning_rate': 0.3, 'n_estimators':1000}
tune.params_dict.update(param_test1)
tune.model.set_params(**tune.params_dict)
tune.xgboost_cv(cv= 5, early_stopping_rounds= 100,n_jobs = -1,seed = 0)
tune.dfscore

Multiple eval metrics have been passed: 'test-ks' will be used for early stopping.

Will train until test-ks hasn't improved in 100 rounds.
Stopping. Best iteration:
[174]	train-auc:0.950073+0.0015029	train-ks:0.762365+0.00437117	test-auc:0.817742+0.0183046	test-ks:0.501224+0.0395334

Best n_estimators considered score_gap_limit:  17


Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.489346,0.435681,0.0536647,0.459991,0.1,50,3,10,0,0.8,0.8,0,1
1,1,0.476062,0.412215,0.0638464,0.419036,0.3,17,3,10,0,0.8,0.8,0,1


In [9]:
# step2：tune max_depth & min_child_weight 
param_test2 = { 'max_depth': [3,4,5], 'min_child_weight': [10,20,50,100] } 
best_param = tune.gridsearch_cv(param_test2,n_jobs = 4)
tune.dfscore

CV results: 
+----+-------------------------------------------+------------------+-----------------+-----------------+
|    |                   params                  | mean_train_score | mean_test_score |    score_gap    |
+----+-------------------------------------------+------------------+-----------------+-----------------+
| 0  |  {'max_depth': 3, 'min_child_weight': 10} |  0.476061875717  |  0.412215474635 | 0.0638464010817 |
| 1  |  {'max_depth': 3, 'min_child_weight': 20} |  0.464172494568  |  0.414904895949 | 0.0492675986189 |
| 2  |  {'max_depth': 3, 'min_child_weight': 50} |  0.457931093414  |  0.419410493056 | 0.0385206003584 |
| 3  | {'max_depth': 3, 'min_child_weight': 100} |  0.432854689114  |  0.398331722308 | 0.0345229668063 |
| 4  |  {'max_depth': 4, 'min_child_weight': 10} |  0.544564219493  |  0.461077017618 | 0.0834872018742 |
| 5  |  {'max_depth': 4, 'min_child_weight': 20} |  0.529860188672  |  0.453836419739 | 0.0760237689329 |
| 6  |  {'max_depth': 4, 'min_chi

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.489346,0.435681,0.0536647,0.459991,0.1,50,3,10,0,0.8,0.8,0,1
1,1,0.476062,0.412215,0.0638464,0.419036,0.3,17,3,10,0,0.8,0.8,0,1
2,2,0.457931,0.41941,0.0385206,0.503123,0.3,17,3,50,0,0.8,0.8,0,1


In [10]:
# step3：tune gamma
param_test3 = {'gamma': [0.3,0.4,0.5]}
best_param = tune.gridsearch_cv(param_test3,n_jobs = -1)
tune.dfscore

CV results: 
+---+----------------+------------------+-----------------+-----------------+
|   |     params     | mean_train_score | mean_test_score |    score_gap    |
+---+----------------+------------------+-----------------+-----------------+
| 0 | {'gamma': 0.3} |  0.457931093414  |  0.419410493056 | 0.0385206003584 |
| 1 | {'gamma': 0.4} |  0.457931093414  |  0.419410493056 | 0.0385206003584 |
| 2 | {'gamma': 0.5} |  0.457931093414  |  0.419410493056 | 0.0385206003584 |
+---+----------------+------------------+-----------------+-----------------+
Best params this step: 
{'gamma': 0.3}
Best score this step: 
0.41941049305569134


Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.489346,0.435681,0.0536647,0.459991,0.1,50,3,10,0.0,0.8,0.8,0,1
1,1,0.476062,0.412215,0.0638464,0.419036,0.3,17,3,10,0.0,0.8,0.8,0,1
2,2,0.457931,0.41941,0.0385206,0.503123,0.3,17,3,50,0.0,0.8,0.8,0,1
3,3,0.457931,0.41941,0.0385206,0.415738,0.3,17,3,50,0.3,0.8,0.8,0,1


In [11]:
# step4：tune subsample & colsample_bytree 
param_test4 = { 'subsample': [0.6,0.7,0.8,0.9,1],
               'colsample_bytree': [0.6,0.7,0.8,0.9,1] } 
best_param = tune.gridsearch_cv(param_test4,n_jobs = -1)
tune.dfscore

CV results: 
+----+---------------------------------------------+------------------+-----------------+-----------------+
|    |                    params                   | mean_train_score | mean_test_score |    score_gap    |
+----+---------------------------------------------+------------------+-----------------+-----------------+
| 0  | {'subsample': 0.6, 'colsample_bytree': 0.6} |  0.433241607532  |  0.39779906785  | 0.0354425396816 |
| 1  | {'subsample': 0.7, 'colsample_bytree': 0.6} |  0.44355353485   |  0.400196267964 | 0.0433572668858 |
| 2  | {'subsample': 0.8, 'colsample_bytree': 0.6} |  0.452299740046  |  0.403656182381 |  0.048643557665 |
| 3  | {'subsample': 0.9, 'colsample_bytree': 0.6} |  0.451351054228  |  0.407937339148 | 0.0434137150797 |
| 4  |  {'subsample': 1, 'colsample_bytree': 0.6}  |  0.466306276316  |  0.425301289187 | 0.0410049871289 |
| 5  | {'subsample': 0.6, 'colsample_bytree': 0.7} |  0.443683311441  |  0.397289966382 | 0.0463933450592 |
| 6  | {'subsam

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.489346,0.435681,0.0536647,0.459991,0.1,50,3,10,0.0,0.8,0.8,0,1
1,1,0.476062,0.412215,0.0638464,0.419036,0.3,17,3,10,0.0,0.8,0.8,0,1
2,2,0.457931,0.41941,0.0385206,0.503123,0.3,17,3,50,0.0,0.8,0.8,0,1
3,3,0.457931,0.41941,0.0385206,0.415738,0.3,17,3,50,0.3,0.8,0.8,0,1
4,4,0.466306,0.425301,0.041005,0.445563,0.3,17,3,50,0.3,1.0,0.6,0,1


In [12]:
# step5: tune reg_alpha 
param_test5 = { 'reg_alpha': [0, 1e-2, 0.1, 1,10,100] } 
best_param = tune.gridsearch_cv(param_test5,n_jobs = 4)
tune.dfscore

CV results: 
+---+---------------------+------------------+-----------------+-----------------+
|   |        params       | mean_train_score | mean_test_score |    score_gap    |
+---+---------------------+------------------+-----------------+-----------------+
| 0 |   {'reg_alpha': 0}  |  0.466306276316  |  0.425301289187 | 0.0410049871289 |
| 1 | {'reg_alpha': 0.01} |  0.466239631864  |  0.425301289187 |  0.040938342677 |
| 2 |  {'reg_alpha': 0.1} |  0.46364256351   |  0.424233557274 | 0.0394090062357 |
| 3 |   {'reg_alpha': 1}  |  0.462613060698  |  0.413835737071 | 0.0487773236273 |
| 4 |  {'reg_alpha': 10}  |  0.456024808104  |  0.415668605515 | 0.0403562025897 |
| 5 |  {'reg_alpha': 100} |  0.364629537171  |   0.3447064489  | 0.0199230882706 |
+---+---------------------+------------------+-----------------+-----------------+
Best params this step: 
{'reg_alpha': 0}
Best score this step: 
0.4253012891874771


Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.489346,0.435681,0.0536647,0.459991,0.1,50,3,10,0.0,0.8,0.8,0,1
1,1,0.476062,0.412215,0.0638464,0.419036,0.3,17,3,10,0.0,0.8,0.8,0,1
2,2,0.457931,0.41941,0.0385206,0.503123,0.3,17,3,50,0.0,0.8,0.8,0,1
3,3,0.457931,0.41941,0.0385206,0.415738,0.3,17,3,50,0.3,0.8,0.8,0,1
4,4,0.466306,0.425301,0.041005,0.445563,0.3,17,3,50,0.3,1.0,0.6,0,1
5,5,0.466306,0.425301,0.041005,0.445563,0.3,17,3,50,0.3,1.0,0.6,0,1


In [13]:
# step6: tune reg_lambda 
param_test6 = { 'reg_lambda': [0, 0.01, 0.1, 1,10,100] }
best_param = tune.gridsearch_cv(param_test6,n_jobs = 4)
tune.dfscore

CV results: 
+---+----------------------+------------------+-----------------+-----------------+
|   |        params        | mean_train_score | mean_test_score |    score_gap    |
+---+----------------------+------------------+-----------------+-----------------+
| 0 |  {'reg_lambda': 0}   |  0.465849586845  |  0.415102833605 | 0.0507467532404 |
| 1 | {'reg_lambda': 0.01} |  0.465849586845  |  0.415102833605 | 0.0507467532404 |
| 2 | {'reg_lambda': 0.1}  |  0.466115964586  |  0.415102477575 | 0.0510134870114 |
| 3 |  {'reg_lambda': 1}   |  0.466306276316  |  0.425301289187 | 0.0410049871289 |
| 4 |  {'reg_lambda': 10}  |  0.466173364101  |  0.419133170133 | 0.0470401939684 |
| 5 | {'reg_lambda': 100}  |  0.445391687288  |  0.409556935035 | 0.0358347522531 |
+---+----------------------+------------------+-----------------+-----------------+
Best params this step: 
{'reg_lambda': 1}
Best score this step: 
0.4253012891874771


Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.489346,0.435681,0.0536647,0.459991,0.1,50,3,10,0.0,0.8,0.8,0,1
1,1,0.476062,0.412215,0.0638464,0.419036,0.3,17,3,10,0.0,0.8,0.8,0,1
2,2,0.457931,0.41941,0.0385206,0.503123,0.3,17,3,50,0.0,0.8,0.8,0,1
3,3,0.457931,0.41941,0.0385206,0.415738,0.3,17,3,50,0.3,0.8,0.8,0,1
4,4,0.466306,0.425301,0.041005,0.445563,0.3,17,3,50,0.3,1.0,0.6,0,1
5,5,0.466306,0.425301,0.041005,0.445563,0.3,17,3,50,0.3,1.0,0.6,0,1
6,6,0.466306,0.425301,0.041005,0.445563,0.3,17,3,50,0.3,1.0,0.6,0,1


In [15]:
# step7: lower learning_rate and raise n_estimators
param_test7 = { 'learning_rate': 0.01, 'n_estimators':2000}
tune.params_dict.update(param_test7)
tune.model.set_params(**tune.params_dict)
tune.xgboost_cv(cv= 5, early_stopping_rounds= 100,n_jobs = -1)
tune.dfscore 

Multiple eval metrics have been passed: 'test-ks' will be used for early stopping.

Will train until test-ks hasn't improved in 100 rounds.
Stopping. Best iteration:
[1087]	train-auc:0.837759+0.00282261	train-ks:0.510076+0.00779805	test-auc:0.789208+0.020775	test-ks:0.44171+0.0363255

Best n_estimators considered score_gap_limit:  751


Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score,learning_rate,n_estimators,max_depth,min_child_weight,gamma,subsample,colsample_bytree,reg_alpha,reg_lambda
0,0,0.489346,0.435681,0.0536647,0.459991,0.1,50,3,10,0.0,0.8,0.8,0,1
1,1,0.476062,0.412215,0.0638464,0.419036,0.3,17,3,10,0.0,0.8,0.8,0,1
2,2,0.457931,0.41941,0.0385206,0.503123,0.3,17,3,50,0.0,0.8,0.8,0,1
3,3,0.457931,0.41941,0.0385206,0.415738,0.3,17,3,50,0.3,0.8,0.8,0,1
4,4,0.466306,0.425301,0.041005,0.445563,0.3,17,3,50,0.3,1.0,0.6,0,1
5,5,0.466306,0.425301,0.041005,0.445563,0.3,17,3,50,0.3,1.0,0.6,0,1
6,6,0.466306,0.425301,0.041005,0.445563,0.3,17,3,50,0.3,1.0,0.6,0,1
7,7,0.485296,0.425567,0.0597288,0.470695,0.01,751,3,50,0.3,1.0,0.6,0,1
8,8,0.485296,0.425567,0.0597288,0.470695,0.01,751,3,50,0.3,1.0,0.6,0,1
