# tianjikit模块使用指南

### 一，模块整体架构

![](readme.png)

#### outliers_analysis的输出：

In [None]:
'med', #中位数
'seg_25', #1/4分位数
'seg_75', #3/4分位数
'up_limit',  #离群值判定上边界
'low_limit', #离群值判定下边界
'up_ratio',  #超上边界离群值比例
'low_ratio';  #超下边界离群值比例


#### basic_analysis的输出：

In [None]:
#------覆盖率------------------------#
'not_nan_ratio',  #非空比例，通常覆盖率coverage即指它
'not_zero_ratio', #非零比例，非零值不含空值
'not_outlier_ratio', #非离群值比例，非离群值不含空值

#------统计值------------------------#
'class_num', #数据类别数目
'value_num', #非空数据数目
'min', #最小值
'mean',#均值
'med', #中位数
'most', #众数
'max', #最大值

#------有效性----------------------#
'ks', #ks统计量，适合连续特征
'ks_pvalue', #ks统计量的p值
'chi2', #chi2统计量，适合离散特征
'chi2_pvalue', #chi2统计量的p值
'iv'; #iv统计量，适合连续和离散特征，iv>0.1有效，iv>0.2强有效


#### psi_analysis的输出：

In [None]:
'psi', #psi指标，仅当 train_data和 test_data 有效数据数量 >10时才取值，否则为 nan值
'is_stable', #是否稳定，psi<0.2判定为稳定
'train_class_num', # train_data中数据类别数目
'test_class_num' , # test_data中数据类别数目
'train_value_num', #train_data中有效数据数目
'test_value_num';#test_data中有效数据数目


#### ks_analysis的输出：

In [None]:
'feature_interval',#特征取值区间
'order_num', #订单数量
'order_ratio', #订单占比
'overdue_num', #逾期订单数量
'overdue_ratio', #逾期订单占比
'normal_num', #正常订单数量
'normal_ratio', #正常订单占比
'overdue_cum_ratio', #累计逾期订单比例
'normal_cum_ratio', #累计正常订单比例
'ks_value'; #ks统计值


#### iv_analysis的输出：

In [None]:
'feature_interval',#区间
'order_num', #订单数量
'order_ratio', #订单占比
'overdue_num', #逾期订单数量
'overdue_ratio', #逾期订单比例
'overdue_interval_ratio', #区间逾期订单占总逾期订单比例
'normal_num', #正常订单数量
'normal_ratio', #正常订单占比
'normal_interval_ratio', #区间正常订单占总正常订单比例
'iv_value'; #iv检验值，列重复


#### chi2_analysis的输出：

In [None]:
'TP', #feature为1的逾期样本数量
'FP', #feature为1的正常样本数量
'TN', #feature为0的正常样本数量
'FN', #feature为0的逾期的样本数量
'TPR', #TP/(TP+FN),逾期样本中feature取1比例
'FPR',#FP/(FP+TN),正常样本中feature取1比例
'overdue_ratio_0',# feature为0样本的逾期率
'overdue_ratio_1',# feature为1样本的逾期率
'precision',#精度
'accuracy',#准确度
'chi2', #卡方统计量
'chi2_pvalue'; #卡方统计量的p值


### 二，单特征分析示范

In [None]:
import numpy as np
import pandas as pd
from tianjikit.analysisfeature import AnalysisFeature

# Sample data: 20 feature values (one of them NaN) and 20 binary labels.
data = [
    1.0, 2, 3, 4, 5, 6, 4, 3, 2, 1,
    2, 9, 10, 100, np.nan, 0, 7, 8, 10, 6,
]
label = [
    0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
    0, 1, 0, 1, 0, 0, 0, 0, 1, 1,
]
assert len(data) == len(label)

af = AnalysisFeature()

# Outlier analysis.
dfoutliers = af.outliers_analysis(data, alpha=2)

# Drop outliers.
# NOTE(review): `data` is passed for both positional arguments; the
# drop_outliers signature is not visible here -- confirm this is intended.
data_clean = af.drop_outliers(data, data, alpha=2)

# Basic analysis.
dfbasic = af.basic_analysis(data, label)

# PSI stability analysis against a second sample.
test_data = [10, 9, 5, 3, 4, 3, 2, 1, 6, 7, 5, np.nan, 10, 100]
dfpsi = af.psi_analysis(data, test_data)

# KS effectiveness analysis: mainly for continuous features, but discrete
# features can be analysed as well.
dfks = af.ks_analysis(data, label)

# IV effectiveness analysis: mainly for discrete features, but also
# applicable to continuous ones.
dfiv = af.iv_analysis(data, label)

# Chi-square / recall style analysis: mainly for discrete features.
dfchi2 = af.chi2_analysis(data, label)

### 三，多特征分析示范

In [None]:
# Multi-feature analysis demo.
import numpy as np
import pandas as pd
from tianjikit.analysisfeatures import AnalysisFeatures

# Build dftrain: the training-set feature frame (12 rows).
dftrain = pd.DataFrame()
dftrain['phone'] = ['x%d' % i for i in range(1, 13)]
dftrain['loan_dt'] = ['2018-01-01'] * 12
dftrain['label'] = [0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0]
dftrain['feature1'] = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]
dftrain['feature2'] = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

# Build dftest: the test-set feature frame (10 rows).
dftest = pd.DataFrame()
dftest['phone'] = ['y%d' % i for i in range(1, 11)]
dftest['loan_dt'] = ['2018-02-01'] * 10
dftest['label'] = [1, 0, 0, 1, 0, 0, 0, 1, 0, 0]
dftest['feature1'] = [1, 0, 0, 1, 0, 0, 1, 0, 1, 0]
dftest['feature2'] = [10.0, 9, 8, 7, 6, 5, 4, 3, 2, 1]

afs = AnalysisFeatures(dftrain, dftest)

# Basic analysis of the features.
dfbasic = afs.basic_analysises()

# Stability (PSI) analysis of the features.
dfpsi = afs.psi_analysises()

# KS analysis of the features.
dfks = afs.ks_analysises()

# IV analysis of the features.
dfiv = afs.iv_analysises()

### 四，跑模型评分示范

In [None]:
# Prepare the training data.
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Synthetic binary-classification data set; seeded so it is reproducible.
data, label = datasets.make_classification(
    n_samples=10000, n_features=20, n_classes=2, random_state=0)
dfdata = pd.DataFrame(
    data, columns=['feature' + str(i) for i in range(data.shape[1])])
dfdata['label'] = label

# Seed the split too: with an unseeded split the train/test partition (and
# therefore every score computed from it) changes on each run even though
# the generated data is fixed.
dftrain, dftest = train_test_split(dfdata, random_state=0)

# reset_index returns fresh frames with a 0..n-1 index, so the assignment
# below works on an independent object and row label 0 exists.
dftrain = dftrain.reset_index(drop=True)
dftest = dftest.reset_index(drop=True)

# Inject a few missing values into the first training row.
dftrain.loc[0, ['feature0', 'feature1', 'feature2']] = np.nan

In [None]:
# Train a logistic-regression model.
from tianjikit.runmodel import RunModel

model = RunModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method='most',
    scale_method=None,
)
lr = model.train_lr(cv=None, model_idx=5)
model.test(lr)

# Importance table stored by the model under the 'lr' key.
dfimportance = model.dfimportances['lr']

In [None]:
# Train a random-forest model.
from tianjikit.runmodel import RunModel

model = RunModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method='most',
    scale_method=None,
)
# max_features: 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
# For classifiers it meant sqrt(n_features), so 'sqrt' selects the same
# behaviour and is accepted by old and new releases alike.
# NOTE(review): assumes train_rf forwards these keyword arguments to
# sklearn's RandomForestClassifier -- confirm against tianjikit.runmodel.
rf = model.train_rf(
    cv=None,
    model_idx=5,
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    n_jobs=4,
)
model.test(rf)

# Importance table stored by the model under the 'rf' key.
dfimportance = model.dfimportances['rf']

In [None]:
# Train a GBDT model.
from tianjikit.runmodel import RunModel

model = RunModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method='most',
    scale_method=None,
)
gbdt = model.train_gbdt(
    cv=5,
    model_idx=5,
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=3,
    min_samples_split=50,
    min_samples_leaf=5,
    subsample=0.7,
    max_features='sqrt',
    random_state=0,
)
model.test(gbdt)

# Importance table stored by the model under the 'gbdt' key.
dfimportance = model.dfimportances['gbdt']

In [None]:
# Train an XGBoost model.
from tianjikit.runmodel import RunModel

# Unlike the other model demos, no missing-value filling is requested here
# (fillna_method=None).
model = RunModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method=None,
    scale_method=None,
)
xgb = model.train_xgb(
    cv=5,
    learning_rate=0.1,
    model_idx=5,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    n_jobs=4,
    seed=10,
)
model.test(xgb)

# Importance table stored by the model under the 'xgb' key.
dfimportance = model.dfimportances['xgb']

In [None]:
# Train a neural-network model.
from tianjikit.runmodel import RunModel

model = RunModel(
    dftrain=dftrain,
    dftest=dftest,
    coverage_th=0.1,
    ks_th=0,
    outliers_th=None,
    fillna_method='most',
    scale_method=None,
)
nn = model.train_nn(
    cv=None,
    model_idx=5,
    hidden_layer_sizes=(100, 20),
    activation='relu',
    alpha=0.0001,
    learning_rate='constant',
    learning_rate_init=0.001,
    max_iter=200,
    tol=0.0001,
    early_stopping=False,
    validation_fraction=0.1,
    warm_start=False,
    random_state=None,
)
model.test(nn)

### 五，xgboost调参示范

In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from tianjikit.tunning import Tunning

data,label = datasets.make_classification(n_samples= 10000, n_features=20, n_informative= 6 ,
             n_classes=2, n_clusters_per_class=10,random_state=0)
dfdata = pd.DataFrame(data,columns = [u'f'+str(i) for i in range(data.shape[1])])
dfdata['label'] = label
dftrain,dftest = train_test_split(dfdata)

In [4]:
# Initial xgboost parameters used as the starting point for tuning.
#
# Regularisation term referred to by the comments below:
# Omega(f) = gamma*T + reg_alpha* sum(abs(wj)) + reg_lambda* sum(wj**2)
params_dict = {
    # ---- parameters to be tuned ----
    # booster parameters
    'learning_rate': 0.1,        # learning rate; start at 0.1, smaller is usually better
    'n_estimators': 60,          # number of trees in the additive model; usually settled via CV
    # tree parameters
    'max_depth': 3,              # tree depth; usually within [3, 10], start within [3, 6]
    'min_child_weight': 10,      # minimum sum of sample weights in a leaf; larger is more conservative
    'gamma': 0,                  # minimum loss reduction required to split a node; larger is more conservative
    'subsample': 0.8,            # row sampling ratio; usually within [0.5, 1]
    'colsample_bytree': 1.0,     # column (feature) sampling ratio; usually within [0.5, 1]
    # regularization parameters
    'reg_alpha': 0,              # L1 regularisation weight; larger is more conservative, usually within [0, 1]
    'reg_lambda': 1,             # L2 regularisation weight; larger is more conservative, usually within [1, 100]
    # ---- parameters that normally do not need tuning ----
    'objective': 'binary:logistic',
    'tree_method': 'hist',       # tree construction strategy: auto, exact, approx or hist
    'eval_metric': 'auc',
    'silent': 1,
    'scale_pos_weight': 1,       # for imbalanced samples a positive value can speed up convergence
    'seed': 0,
}

In [6]:
# Step 0: initialise the tuner with the train/test frames and the
# starting parameters.
tune = Tunning(
    dftrain,
    dftest,
    score_func='auc',
    score_gap_limit=0.05,
    params_dict=params_dict,
    n_jobs=3,
)


train set size: 7500
test set size: 2500
score func: auc
score gap limit: 0.05
n_jobs: 3


In [7]:
# Step 1: tune n_estimators for a relatively high learning_rate (e.g. 0.1).
params_test1 = {
    'learning_rate': [0.1],
    'n_estimators': [60],
}
tune.gridsearch_cv(params_test1, cv=5, verbose_eval=True)


{'n_estimators': 60, 'learning_rate': 0.1}

k = 1
[16:29:20] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.710861	valid-auc:0.711456	test-auc:0.706099	train-ks:0.305389	valid-ks:0.304632	test-ks:0.289003
[1]	train-auc:0.71757	valid-auc:0.717531	test-auc:0.711302	train-ks:0.309382	valid-ks:0.316611	test-ks:0.290443
[2]	train-auc:0.732287	valid-auc:0.737994	test-auc:0.72552	train-ks:0.344755	valid-ks:0.371487	test-ks:0.328559
[3]	train-auc:0.746008	valid-auc:0.739996	test-auc:0.735794	train-ks:0.380065	valid-ks:0.370179	test-ks:0.347503
[4]	train-auc:0.753641	valid-auc:0.749818	test-auc:0.743165	train-ks:0.389394	valid-ks:0.380908	test-ks:0.360387
[5]	train-auc:0.757914	valid-auc:0.752813	test-auc:0.745162	train-ks:0.391416	valid-ks:0.391523	test-ks:0.369331
[6]	train-auc:0.763992	valid-auc:0.755236	test-auc:0.75174	train-ks:0.396091	valid-ks:0.394145	test-ks:0.378614
[7]	train-auc:0.765679	valid-auc:0.756296	test-auc:0.751774	tra

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
0,0,0.838425,0.800741,0.037684,0.801644


In [9]:
# Step 2: tune max_depth and min_child_weight.
params_test2 = {
    'max_depth': [3, 4],
    'min_child_weight': [10, 30, 50, 100, 120],
}
tune.gridsearch_cv(params_test2, cv=5)


{'max_depth': 3, 'min_child_weight': 10}

k = 1
[16:30:24] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 2
[16:30:25] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 3
[16:30:26] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 4
[16:30:26] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 5
[16:30:27] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

{'max_depth': 3, 'min_child_weight': 30}

k = 1
[16:30:28] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 2
[16:30:28] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 3
[16:30:29] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 4
[16:30:30] Tree method is selected to be 'hist', which uses a single 


{'max_depth': 4, 'min_child_weight': 50}

k = 1
[16:30:48] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 2
[16:30:49] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 3
[16:30:50] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 4
[16:30:50] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 5
[16:30:51] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

{'max_depth': 4, 'min_child_weight': 100}

k = 1
[16:30:52] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 2
[16:30:52] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 3
[16:30:53] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 4
[16:30:54] Tree method is selected to be 'hist', which uses a single

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
11,11,0.867573,0.817668,0.049905,0.82025


In [8]:
# Step 3: tune gamma.
params_test3 = {
    'gamma': [0, 0.1, 0.5, 1, 10],
}
tune.gridsearch_cv(params_test3, cv=5)


{'gamma': 0}

k = 1
[16:30:01] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 2
[16:30:01] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 3
[16:30:02] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 4
[16:30:03] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 5
[16:30:04] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

{'gamma': 0.1}

k = 1
[16:30:05] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 2
[16:30:05] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 3
[16:30:06] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 4
[16:30:07] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 5
[16:30:08] Tree me

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
0,0,0.838425,0.800741,0.037684,0.801644


In [10]:
# Step 4: tune subsample and colsample_bytree.
# NOTE(review): the 0.1 candidate for colsample_bytree is far below the
# other candidate (0.9); possibly 1.0 was intended -- confirm before reuse.
params_test4 = {
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.9, 0.1],
}
tune.gridsearch_cv(params_test4, cv=5)


{'subsample': 0.8, 'colsample_bytree': 0.9}

k = 1
[16:31:59] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 2
[16:32:00] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 3
[16:32:01] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 4
[16:32:01] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 5
[16:32:02] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

{'subsample': 0.8, 'colsample_bytree': 0.1}

k = 1
[16:32:03] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 2
[16:32:04] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 3
[16:32:04] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.

k = 4
[16:32:05] Tree method is selected to be 'hist', which uses a s

Unnamed: 0,model_id,train_score,validate_score,score_gap,test_score
20,20,0.866698,0.818911,0.047787,0.820993


In [None]:
# Step 5: tune reg_alpha.
params_test5 = {
    'reg_alpha': [0, 0.1, 1, 10, 100],
}
tune.gridsearch_cv(params_test5, cv=5)

In [None]:
# Step 6: tune reg_lambda.
params_test6 = {
    'reg_lambda': [0, 0.1, 1, 10, 100],
}
tune.gridsearch_cv(params_test6, cv=5)

In [None]:
# Step 7: lower learning_rate and raise n_estimators.
params_test7 = {
    'learning_rate': [0.05, 0.02],
    'n_estimators': [300],
}
tune.gridsearch_cv(params_test7, cv=5)