In [146]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
# `sklearn.cross_validation` was superseded by `sklearn.model_selection` and is
# absent from current scikit-learn releases; keep the legacy path only as a
# fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## KS测评

In [53]:
from sklearn import metrics
def ks(y_predicted, y_true):
    """Return the KS statistic of a set of scores as a ``('ks', value)`` tuple.

    The value is the largest absolute gap between the false-positive rate and
    the true-positive rate over all thresholds returned by
    ``metrics.roc_curve``, with label ``1`` treated as the positive class.

    Note the argument order: scores first, ground-truth labels second.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_true, y_predicted, pos_label=1)
    rate_gap = abs(false_pos_rate - true_pos_rate)
    return 'ks', rate_gap.max()

In [89]:
# Read the merged feature table (GB2312-encoded CSV) and drop the '放款时间'
# column in one step. No one-hot encoding is applied to the categorical
# user columns here.
dataset = (
    pd.read_csv("../feature/特征汇总.csv", encoding="gb2312")
    .drop(['放款时间'], axis=1)
)

dataset.shape

(69495, 389)

In [90]:
# Rows with a negative label go to the (online) test set; rows with a
# non-negative label make up the labelled training pool.
tests = dataset[dataset['标签'] < 0]
trains = dataset[dataset['标签'] >= 0]
print('trains:',trains.shape)
print('tests:',tests.shape)

online_tests = tests.copy()

# Hold out 20% of the labelled rows as the offline validation set.
train_xy, offline_test = train_test_split(trains, test_size=0.2, random_state=21)

# Model inputs are every column except the label and the user identifier.
X_train = train_xy.drop(['标签', '用户标识'], axis=1)
y_train = train_xy['标签']

X_offline_test = offline_test.drop(['标签', '用户标识'], axis=1)
y_offline_test = offline_test['标签']

X_online_test = online_tests.drop(['标签', '用户标识'], axis=1)

trains: (55596, 389)
tests: (13899, 389)


## XGBoost 计算特征重要性

In [35]:
# model = XGBClassifier(objective='rank:pairwise',max_depth=5,learning_rate=0.02)
# model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='rank:pairwise', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [44]:

# # plot feature importance

# fig, ax = plt.subplots(1,1,figsize=(10,50))
# xgb.plot_importance(model, ax=ax)
# plt.show()


In [91]:
# Booster configuration for the offline XGBoost run.
params = dict(
    booster='gbtree',
    objective='rank:pairwise',
    eval_metric='auc',
    eta=0.02,              # acts like a learning rate
    max_depth=5,           # tree depth; deeper trees overfit more easily
    colsample_bytree=0.7,  # column subsampling when growing each tree
    subsample=0.7,         # random subsampling of the training rows
    min_child_weight=1,
    silent=1,              # 1 suppresses run-time messages
)

# Fit on the training split, then score the offline hold-out split.
dtrain = xgb.DMatrix(X_train, label=y_train)
xgb_model = xgb.train(params, dtrain, num_boost_round=800)
dvali = xgb.DMatrix(X_offline_test)
xgb_predict = xgb_model.predict(dvali)

In [92]:
# Offline evaluation of the XGBoost scores on the hold-out split:
# KS statistic (via the notebook's ks helper) and ROC AUC.
print('xgb线下ks得分：',ks(xgb_predict,y_offline_test))
print('valid auc',roc_auc_score(y_offline_test,xgb_predict))

xgb线下ks得分： ('ks', 0.47445946689252155)
valid auc 0.8029523660373838


In [96]:
# Rank features by the score reported by Booster.get_fscore() and show the
# top 30. The 'gbm_' column prefix is kept unchanged even though the scores
# come from xgb_model.
df = pd.DataFrame(
    list(xgb_model.get_fscore().items()),
    columns=['gbm_feature', 'gbm_importance'],
).sort_values(["gbm_importance"], ascending=False)
df.head(30)

Unnamed: 0,gbm_feature,gbm_importance
85,用户性别,454
48,放款后账单还款差额,372
33,放款前浏览子行为编号_10,328
335,放款后浏览行为数据sum,281
273,放款前浏览行为数据std,276
52,放款前浏览子行为编号_6,269
216,放款后该用户账单还款金额最大值汇总(去重),254
34,放款前浏览行为数据mean,238
329,整体信用卡额度max与整体本期账单余额max差值,237
199,x1,233


## LightGBM 计算特征重要性

In [97]:
# Training data and the offline hold-out split used as the validation set.
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_offline_test, y_offline_test, reference=lgb_train, free_raw_data=False)

# LightGBM configuration; one seed drives both feature and bagging sampling.
seed = 13
params2 = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.7,
    bagging_fraction=1,
    bagging_freq=10,
    verbose=0,
    tree_learner='serial',
    min_data_in_leaf=10,
    feature_fraction_seed=seed,
    bagging_seed=seed,
    metric_freq=1,
)

# Train with early stopping on the validation AUC.
# NOTE(review): the early-stopping set is the same hold-out split that the
# notebook later scores, so the reported offline metrics may be optimistic.
gbm = lgb.train(
    params2,
    lgb_train,
    num_boost_round=5000,
    valid_sets=lgb_eval,
    early_stopping_rounds=500,
)

[1]	valid_0's auc: 0.767359
Training until validation scores don't improve for 500 rounds.
[2]	valid_0's auc: 0.769781
[3]	valid_0's auc: 0.771893
[4]	valid_0's auc: 0.779274
[5]	valid_0's auc: 0.782686
[6]	valid_0's auc: 0.784831
[7]	valid_0's auc: 0.78648
[8]	valid_0's auc: 0.786554
[9]	valid_0's auc: 0.787497
[10]	valid_0's auc: 0.787947
[11]	valid_0's auc: 0.78736
[12]	valid_0's auc: 0.787924
[13]	valid_0's auc: 0.788875
[14]	valid_0's auc: 0.789136
[15]	valid_0's auc: 0.789081
[16]	valid_0's auc: 0.789552
[17]	valid_0's auc: 0.789723
[18]	valid_0's auc: 0.789623
[19]	valid_0's auc: 0.789927
[20]	valid_0's auc: 0.78986
[21]	valid_0's auc: 0.790601
[22]	valid_0's auc: 0.791318
[23]	valid_0's auc: 0.791884
[24]	valid_0's auc: 0.792033
[25]	valid_0's auc: 0.792034
[26]	valid_0's auc: 0.791968
[27]	valid_0's auc: 0.791906
[28]	valid_0's auc: 0.79229
[29]	valid_0's auc: 0.79216
[30]	valid_0's auc: 0.792319
[31]	valid_0's auc: 0.79262
[32]	valid_0's auc: 0.79301
[33]	valid_0's auc: 0.793

[279]	valid_0's auc: 0.804253
[280]	valid_0's auc: 0.804124
[281]	valid_0's auc: 0.80411
[282]	valid_0's auc: 0.804116
[283]	valid_0's auc: 0.804005
[284]	valid_0's auc: 0.804016
[285]	valid_0's auc: 0.804054
[286]	valid_0's auc: 0.80406
[287]	valid_0's auc: 0.804079
[288]	valid_0's auc: 0.804211
[289]	valid_0's auc: 0.804235
[290]	valid_0's auc: 0.80423
[291]	valid_0's auc: 0.804332
[292]	valid_0's auc: 0.804363
[293]	valid_0's auc: 0.80441
[294]	valid_0's auc: 0.804363
[295]	valid_0's auc: 0.804386
[296]	valid_0's auc: 0.804458
[297]	valid_0's auc: 0.804501
[298]	valid_0's auc: 0.80446
[299]	valid_0's auc: 0.804337
[300]	valid_0's auc: 0.80425
[301]	valid_0's auc: 0.804254
[302]	valid_0's auc: 0.804343
[303]	valid_0's auc: 0.804371
[304]	valid_0's auc: 0.804305
[305]	valid_0's auc: 0.804278
[306]	valid_0's auc: 0.804297
[307]	valid_0's auc: 0.804293
[308]	valid_0's auc: 0.804446
[309]	valid_0's auc: 0.804483
[310]	valid_0's auc: 0.804358
[311]	valid_0's auc: 0.804368
[312]	valid_0's 

[553]	valid_0's auc: 0.804748
[554]	valid_0's auc: 0.80483
[555]	valid_0's auc: 0.804886
[556]	valid_0's auc: 0.804833
[557]	valid_0's auc: 0.804871
[558]	valid_0's auc: 0.804854
[559]	valid_0's auc: 0.804887
[560]	valid_0's auc: 0.804857
[561]	valid_0's auc: 0.804866
[562]	valid_0's auc: 0.804821
[563]	valid_0's auc: 0.804774
[564]	valid_0's auc: 0.804853
[565]	valid_0's auc: 0.804872
[566]	valid_0's auc: 0.804898
[567]	valid_0's auc: 0.804918
[568]	valid_0's auc: 0.804925
[569]	valid_0's auc: 0.804897
[570]	valid_0's auc: 0.804881
[571]	valid_0's auc: 0.804819
[572]	valid_0's auc: 0.804806
[573]	valid_0's auc: 0.804751
[574]	valid_0's auc: 0.804753
[575]	valid_0's auc: 0.804744
[576]	valid_0's auc: 0.804778
[577]	valid_0's auc: 0.804738
[578]	valid_0's auc: 0.804687
[579]	valid_0's auc: 0.804632
[580]	valid_0's auc: 0.804664
[581]	valid_0's auc: 0.804687
[582]	valid_0's auc: 0.804651
[583]	valid_0's auc: 0.804658
[584]	valid_0's auc: 0.804663
[585]	valid_0's auc: 0.804671
[586]	valid

[830]	valid_0's auc: 0.802483
[831]	valid_0's auc: 0.802445
[832]	valid_0's auc: 0.802467
[833]	valid_0's auc: 0.802472
[834]	valid_0's auc: 0.802464
[835]	valid_0's auc: 0.802458
[836]	valid_0's auc: 0.802469
[837]	valid_0's auc: 0.802458
[838]	valid_0's auc: 0.802409
[839]	valid_0's auc: 0.802475
[840]	valid_0's auc: 0.802404
[841]	valid_0's auc: 0.802434
[842]	valid_0's auc: 0.802468
[843]	valid_0's auc: 0.802416
[844]	valid_0's auc: 0.802371
[845]	valid_0's auc: 0.802356
[846]	valid_0's auc: 0.802381
[847]	valid_0's auc: 0.80239
[848]	valid_0's auc: 0.802404
[849]	valid_0's auc: 0.802367
[850]	valid_0's auc: 0.802397
[851]	valid_0's auc: 0.802411
[852]	valid_0's auc: 0.802442
[853]	valid_0's auc: 0.802488
[854]	valid_0's auc: 0.802448
[855]	valid_0's auc: 0.802376
[856]	valid_0's auc: 0.80235
[857]	valid_0's auc: 0.802362
[858]	valid_0's auc: 0.80238
[859]	valid_0's auc: 0.802404
[860]	valid_0's auc: 0.802346
[861]	valid_0's auc: 0.802288
[862]	valid_0's auc: 0.802304
[863]	valid_0

In [98]:
# Score the offline hold-out split with the LightGBM model, using the
# iteration selected by early stopping (gbm.best_iteration).
gbm_preds_offline = gbm.predict(X_offline_test, num_iteration=gbm.best_iteration)

# Report the KS statistic and ROC AUC of those scores.
print('lgb线下ks得分：',ks(gbm_preds_offline,y_offline_test))
print('valid auc',roc_auc_score(y_offline_test,gbm_preds_offline))

lgb线下ks得分： ('ks', 0.4807491301255882)
valid auc 0.8049871760875975


In [99]:

# Pair every LightGBM feature name with its importance, order by importance
# (highest first) and show the top 30.
df = pd.DataFrame({
    'lgb_feature': gbm.feature_name(),
    'lgb_importance': gbm.feature_importance(),
}).sort_values(["lgb_importance"], ascending=False)
df.head(30)

Unnamed: 0,lgb_feature,lgb_importance
187,整体信用卡额度max与整体本期账单余额max差值,175
6,放款前浏览行为数据mean,171
22,放款前浏览子行为编号_10,167
7,放款前浏览行为数据std,160
18,放款前浏览子行为编号_6,155
235,整体信用卡额度median与整体本期账单余额median差值,133
186,整体上期还款金额max与整体上期账单金额max差值,132
0,用户性别,128
178,整体本期账单最低还款额max,122
20,放款前浏览子行为编号_8,119


# 预测

In [105]:
# Full labelled data for the final models: features and label.
X = trains.drop(['用户标识','标签'],axis=1)
y = trains['标签']

# Features of the rows to be predicted.
X_test = tests.drop(['用户标识','标签'],axis=1)
# First two columns of the test rows, selected by position.
# `.ix` has been removed from pandas; `.iloc` is the explicit positional form.
y_predict = tests.iloc[:, :2]

In [136]:
def xgb_feature(X_train, y_train, X_test, y_test=None):
    """Train an XGBoost booster and return min-max scaled scores for ``X_test``.

    Args:
        X_train: training features, wrapped in an ``xgb.DMatrix``.
        y_train: training labels.
        X_test: features to score.
        y_test: unused; present so the function keeps the four-argument
            signature it is registered with as an estimator.

    Returns:
        1-D float array of predictions rescaled to the range [0, 1]. When all
        predictions are identical the result is all zeros instead of the
        NaNs a division by zero would produce.
    """
    # Model parameters.
    params = {'booster': 'gbtree',  # 'gbtree' = tree-based model, 'gblinear' = linear model
              'objective': 'rank:pairwise',  # loss function to minimise
              'eval_metric': 'auc',
              'eta': 0.02,  # learning-rate-like shrinkage; typical values 0.01-0.2
              'max_depth': 5,  # maximum tree depth; larger values overfit more easily
              'colsample_bytree': 0.7,  # column subsampling per tree, usually 0.5-1
              'subsample': 0.7,  # row subsampling; too small a value underfits
              'min_child_weight': 1,  # minimum sum of instance weight in a leaf; too high underfits
              'seed': 1111,  # random seed
              'silent': 1  # 1 suppresses run-time messages
              }
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvali = xgb.DMatrix(X_test)
    model = xgb.train(params, dtrain, num_boost_round=800)

    # Vectorised min-max scaling (replaces a per-element np.vectorize lambda).
    predict = np.asarray(model.predict(dvali), dtype=float)
    lowest = predict.min()
    span = predict.max() - lowest
    if span == 0:
        return np.zeros_like(predict)
    return (predict - lowest) / span

def lgb_feature(X_train, y_train, X_test, y_test=None):
    """Train a LightGBM model and return min-max scaled scores for ``X_test``.

    Args:
        X_train: training features, wrapped in an ``lgb.Dataset``.
        y_train: training labels.
        X_test: features to score.
        y_test: unused; present so the function keeps the four-argument
            signature it is registered with as an estimator.

    Returns:
        1-D float array of predictions rescaled to the range [0, 1]. When all
        predictions are identical the result is all zeros instead of the
        NaNs a division by zero would produce.
    """
    lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.7,
        'bagging_fraction': 1,
        'bagging_freq': 10,
        'verbose': 0,
        'tree_learner': 'serial',
        'min_data_in_leaf': 10,
        'feature_fraction_seed': 13,
        'bagging_seed': 13,
        'metric_freq': 1
    }
    # Fixed number of rounds: no validation set is available inside this function.
    gbm = lgb.train(params, lgb_train, num_boost_round=2000)

    # Vectorised min-max scaling (replaces a per-element np.vectorize lambda).
    predict = np.asarray(gbm.predict(X_test), dtype=float)
    lowest = predict.min()
    span = predict.max() - lowest
    if span == 0:
        return np.zeros_like(predict)
    return (predict - lowest) / span

def gbdt_model(X_train, y_train, X_test, y_test=None):
    """Fit a scikit-learn GBDT classifier and return min-max scaled scores.

    Args:
        X_train: training features.
        y_train: training labels.
        X_test: features to score.
        y_test: unused; present so the function keeps the four-argument
            signature it is registered with as an estimator.

    Returns:
        1-D float array holding the second ``predict_proba`` column, rescaled
        to the range [0, 1]. When all predictions are identical the result is
        all zeros instead of the NaNs a division by zero would produce.
    """
    # NOTE(review): no random_state is passed while max_features=0.7 is set,
    # so repeated fits may differ - confirm whether that is intended.
    model = GradientBoostingClassifier(learning_rate=0.02, max_features=0.7, n_estimators=700, max_depth=5)
    model.fit(X_train, y_train)

    # Vectorised min-max scaling (replaces a per-element np.vectorize lambda).
    predict = np.asarray(model.predict_proba(X_test)[:, 1], dtype=float)
    lowest = predict.min()
    span = predict.max() - lowest
    if span == 0:
        return np.zeros_like(predict)
    return (predict - lowest) / span

In [137]:
# Smoke run: fit the LightGBM estimator on the full labelled data and display
# its scaled scores for the test rows.
lgb_feature(X,y,X_test)

array([0.23190207, 0.00277535, 0.04431269, ..., 0.0044371 , 0.1691354 ,
       0.41424784])

In [140]:
# heamy dataset built from the full labelled data and the unlabelled test
# features; no test labels are supplied and caching is turned off.
dataset1 = Dataset(X_train=X,y_train=y,X_test=X_test,y_test=None,use_cache=False)

# One heamy Regressor per custom estimator function defined in this notebook.
model_xgb = Regressor(dataset=dataset1, estimator=xgb_feature,name='xgb',use_cache=False)
model_lgb = Regressor(dataset=dataset1, estimator=lgb_feature,name='lgb',use_cache=False)
model_gbdt = Regressor(dataset=dataset1, estimator=gbdt_model,name='gbdt',use_cache=False)
# The three first-level models are combined into one pipeline.
pipeline = ModelsPipeline(model_xgb,model_lgb,model_gbdt)

In [None]:
# Stack the first-level models with k=10 (presumably the number of folds -
# confirm against the heamy documentation) and a fixed seed.
stack_ds = pipeline.stack(k=10, seed=111)#, add_diff=False, full_test=True)
# Second level: a linear regression without intercept fitted on the stacked dataset.
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression,parameters={'fit_intercept': False})
stacking_predict_result = stacker.predict()

In [147]:
# Blend the first-level models with proportion=0.2 (presumably the hold-out
# share used to fit the blender - confirm against the heamy documentation).
blend_ds = pipeline.blend(proportion=0.2, seed=111)
# Second level: a linear regression fitted on the blended dataset.
blender = Regressor(dataset=blend_ds, estimator=LinearRegression)
blending_predict_result = blender.predict()

In [None]:
# Submission frame: user id plus a probability column, one row per test user.
blend_output = tests[['用户标识','标签']].rename(index=str, columns={"用户标识": "userid", "标签": "probability"})
# MinMaxScaler only accepts 2-D input, so the predictions are presented as a
# single column and the scaled result is flattened back to one value per row.
blend_output['probability'] = MinMaxScaler().fit_transform(
    np.asarray(blending_predict_result, dtype=float).reshape(-1, 1)
).ravel()
# Write without the index column, UTF-8 encoded.
blend_output.to_csv("../output/blend_result.csv",index=False,encoding='utf-8')


In [None]:
# Create an empty booster configured with nthread=4, then load a previously
# saved model file into it.
bst = xgb.Booster({'nthread':4}) #init model
bst.load_model('model/20170119_D.model')

In [4]:
#save feature score and feature information:  feature,score,min,max,n_null,n_gt1w
feature_score = bst.get_fscore()
feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True)

In [8]:
# One "feature,score" CSV line per entry of feature_score.
fs = ["{0},{1}\n".format(key, value) for (key, value) in feature_score]

# Write the header followed by all rows.
# NOTE(review): no encoding is specified, so non-ASCII feature names are
# written in the platform default encoding - confirm that is what readers expect.
with open('feature_score/feature_score_{0}.csv'.format(6),'w') as f:
    # A single string belongs in write(); writelines() would iterate it
    # character by character.
    f.write("feature,score\n")
    f.writelines(fs)
