In [8]:
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline
import pandas as pd
import xgboost as xgb
import datetime
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [3]:
def xgb_feature(X_train, y_train, X_test, y_test=None):
    """Train an XGBoost pairwise-ranking model and return min-max scaled scores.

    Args:
        X_train: Training features, handed to ``xgb.DMatrix``.
        y_train: Training labels, used as the ``label`` of the training matrix.
        X_test: Features to score.
        y_test: Unused; kept so the signature matches the other estimator
            functions in this notebook.

    Returns:
        Array with one score per row of ``X_test``, rescaled so the smallest
        raw score maps to 0 and the largest to 1. When all raw scores are
        equal the result is all zeros, and an empty prediction array is
        returned as is (the previous implementation produced NaN / raised in
        those two cases).
    """
    # Model parameters
    params = {'booster': 'gbtree',  # gbtree: tree-based model; gblinear: linear model
              'objective': 'rank:pairwise',  # loss function to minimise
              'eval_metric': 'auc',
              'eta': 0.02,  # acts like a learning rate; shrinking each step makes the model more robust. Typical values: 0.01-0.2
              'max_depth': 5,  # maximum tree depth (default 6); larger values overfit more easily
              'colsample_bytree': 0.7,  # column subsampling when building each tree, range 0.5-1
              'subsample': 0.7,  # row subsampling; lower values reduce overfitting, too low underfits; range 0.5-1
              'min_child_weight': 1,  # minimum sum of instance weights in a leaf (default 1); too high underfits
              'seed': 1111,  # random seed
              'silent': 1  # 1 suppresses run-time messages (default 0)
              }
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvali = xgb.DMatrix(X_test)
    model = xgb.train(params, dtrain, num_boost_round=800)
    predict = np.asarray(model.predict(dvali))
    if predict.size == 0:
        return predict
    # Vectorised min-max scaling; dividing by a zero range would yield NaN.
    centered = predict - predict.min()
    score_range = centered.max()
    if score_range == 0:
        return centered
    return centered / score_range

def xgb_feature2(X_train, y_train, X_test, y_test=None):
    """Train an XGBoost pairwise-ranking model (eta 0.015, 1200 rounds, seed 11).

    Args:
        X_train: Training features, handed to ``xgb.DMatrix``.
        y_train: Training labels, used as the ``label`` of the training matrix.
        X_test: Features to score.
        y_test: Unused; kept so the signature matches the other estimator
            functions in this notebook.

    Returns:
        Array with one score per row of ``X_test``, min-max rescaled to
        [0, 1]. When all raw scores are equal the result is all zeros, and an
        empty prediction array is returned as is (the previous implementation
        produced NaN / raised in those two cases).
    """
    # Model parameters
    params = {'booster': 'gbtree',
              'objective': 'rank:pairwise',
              'eval_metric': 'auc',
              'eta': 0.015,
              'max_depth': 5,  # author's note: 4 3 (presumably other values tried)
              'colsample_bytree': 0.7,  # author's note: 0.8
              'subsample': 0.7,
              'min_child_weight': 1,  # author's note: 2 3
              'seed': 11,
              'silent': 1
              }
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvali = xgb.DMatrix(X_test)
    model = xgb.train(params, dtrain, num_boost_round=1200)
    predict = np.asarray(model.predict(dvali))
    if predict.size == 0:
        return predict
    # Vectorised min-max scaling; dividing by a zero range would yield NaN.
    centered = predict - predict.min()
    score_range = centered.max()
    if score_range == 0:
        return centered
    return centered / score_range

def xgb_feature3(X_train, y_train, X_test, y_test=None):
    """Train an XGBoost pairwise-ranking model (eta 0.01, 2000 rounds, seed 1).

    Args:
        X_train: Training features, handed to ``xgb.DMatrix``.
        y_train: Training labels, used as the ``label`` of the training matrix.
        X_test: Features to score.
        y_test: Unused; kept so the signature matches the other estimator
            functions in this notebook.

    Returns:
        Array with one score per row of ``X_test``, min-max rescaled to
        [0, 1]. When all raw scores are equal the result is all zeros, and an
        empty prediction array is returned as is (the previous implementation
        produced NaN / raised in those two cases).
    """
    # Model parameters
    params = {'booster': 'gbtree',
              'objective': 'rank:pairwise',
              'eval_metric': 'auc',
              'eta': 0.01,
              'max_depth': 5,  # author's note: 4 3 (presumably other values tried)
              'colsample_bytree': 0.7,  # author's note: 0.8
              'subsample': 0.7,
              'min_child_weight': 1,  # author's note: 2 3
              'seed': 1,
              'silent': 1
              }
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvali = xgb.DMatrix(X_test)
    model = xgb.train(params, dtrain, num_boost_round=2000)
    predict = np.asarray(model.predict(dvali))
    if predict.size == 0:
        return predict
    # Vectorised min-max scaling; dividing by a zero range would yield NaN.
    centered = predict - predict.min()
    score_range = centered.max()
    if score_range == 0:
        return centered
    return centered / score_range

In [5]:
def et_model(X_train, y_train, X_test, y_test=None):
    """Fit a 1000-tree ExtraTreesClassifier and score ``X_test``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to score.
        y_test: Unused; kept so the signature matches the other estimators.

    Returns:
        Column 1 of ``predict_proba(X_test)``.
    """
    forest = ExtraTreesClassifier(n_estimators=1000, max_features='log2', n_jobs=-1)
    fitted = forest.fit(X_train, y_train)
    class_probabilities = fitted.predict_proba(X_test)
    return class_probabilities[:, 1]

def gbdt_model(X_train, y_train, X_test, y_test=None):
    """Fit a GradientBoostingClassifier and return min-max scaled scores.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to score.
        y_test: Unused; kept so the signature matches the other estimators.

    Returns:
        Column 1 of ``predict_proba(X_test)``, rescaled so the smallest value
        maps to 0 and the largest to 1. When all values are equal the result
        is all zeros, and an empty prediction array is returned as is (the
        previous implementation produced NaN / raised in those two cases).
    """
    model = GradientBoostingClassifier(learning_rate=0.02, max_features=0.7, n_estimators=700, max_depth=5)
    model.fit(X_train, y_train)
    predict = np.asarray(model.predict_proba(X_test)[:, 1])
    if predict.size == 0:
        return predict
    # Vectorised min-max scaling; dividing by a zero range would yield NaN.
    centered = predict - predict.min()
    score_range = centered.max()
    if score_range == 0:
        return centered
    return centered / score_range

def logistic_model(X_train, y_train, X_test, y_test=None):
    """Fit an L2-penalised LogisticRegression and score ``X_test``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to score.
        y_test: Unused; kept so the signature matches the other estimators.

    Returns:
        Column 1 of ``predict_proba(X_test)``.
    """
    classifier = LogisticRegression(penalty='l2')
    fitted = classifier.fit(X_train, y_train)
    class_probabilities = fitted.predict_proba(X_test)
    return class_probabilities[:, 1]

In [6]:
def lgb_feature(X_train, y_train, X_test, y_test=None):
    """Train a LightGBM binary classifier and return min-max scaled scores.

    Args:
        X_train: Training features. The categorical column names below are
            hard-coded, so this presumably needs to be a DataFrame that
            contains them -- verify against the caller.
        y_train: Training labels.
        X_test: Features to score; passed straight to ``Booster.predict``.
        y_test: Unused; kept so the signature matches the other estimators.

    Returns:
        Array with one score per row of ``X_test``, rescaled so the smallest
        raw score maps to 0 and the largest to 1. When all raw scores are
        equal the result is all zeros, and an empty prediction array is
        returned as is (the previous implementation produced NaN / raised in
        those two cases).
    """
    # Passed as a list (the type LightGBM documents for this argument) rather
    # than a set literal. 'merriage' is spelled as in the data's column name.
    categorical_cols = ['sex', 'merriage', 'income', 'qq_bound', 'degree',
                        'wechat_bound', 'account_grade', 'industry']
    # Only a training Dataset is needed: prediction below takes X_test
    # directly, so the unused test Dataset of the earlier version is gone.
    lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_cols)
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 25,
        'learning_rate': 0.01,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'min_data_in_leaf': 5,
        'max_bin': 200,
        'verbose': 0,
    }
    gbm = lgb.train(params, lgb_train, num_boost_round=2000)
    predict = np.asarray(gbm.predict(X_test))
    if predict.size == 0:
        return predict
    # Vectorised min-max scaling; dividing by a zero range would yield NaN.
    centered = predict - predict.min()
    score_range = centered.max()
    if score_range == 0:
        return centered
    return centered / score_range

In [26]:
# Load the raw training table; the CSV is read with the GB2312 encoding.
train_data = pd.read_csv('AI_risk_train_V3.0/train_data.csv',encoding='gb2312')

## 利用验证集计算AUC（验证集为从训练数据中按贷款月份划分出的留出部分）

In [39]:
# NOTE(review): in the visible notebook valid_train_x1 is only assigned in a
# later cell, so this cell fails on a fresh top-to-bottom run.
valid_train_x1.shape

(56007, 105)

In [53]:
# Disabled cell: one-hot (pd.get_dummies) variant of the validation split.
# NOTE(review): if this is ever re-enabled, the names do not line up -- it
# defines dummy_fea1 / dummy_df1 but then reads dummy_fea / dummy_df.
# train_data = pd.read_csv('AI_risk_train_V3.0/train_data.csv',encoding='gb2312')
# dummy_fea1 = ['sex', 'merriage', 'income', 'qq_bound', 'degree', 'wechat_bound','account_grade','industry']
# dummy_df1 = pd.get_dummies(train_data.loc[:,dummy_fea])
# train_data_copy1 = pd.concat([train_data,dummy_df],axis=1)
# train_data_copy1 = train_data_copy1.drop(['appl_sbm_tm','id','auth_time','phone','birthday','hobby','id_card'],axis=1)
# train_data_copy1 = train_data_copy1.fillna(0)
# valid_train_data1 = train_data_copy1.drop(dummy_fea,axis=1)
# valid_train_train1 = valid_train_data1[(valid_train_data1.loan_year <= 2017) & (valid_train_data1.loan_month < 4)]
# valid_train_test1 = valid_train_data1[(valid_train_data1.loan_year >= 2017) & (valid_train_data1.loan_month >= 4)]
# valid_train_x1 = valid_train_train1.drop(['target'],axis=1)
# valid_test_x1 = valid_train_test1.drop(['target'],axis=1)

# xgb_dataset1 = Dataset(X_train=valid_train_x1,y_train=valid_train_train1['target'],X_test=valid_test_x1,y_test=None,use_cache=False)


In [57]:
# Logistic-regression baseline on the hold-out split.
# NOTE(review): valid_train_x1, valid_train_train1, valid_test_x1 and
# valid_train_test1 are only assigned in a later cell of the visible notebook,
# so that cell has to run before this one.
lr_redict_result = logistic_model(valid_train_x1, valid_train_train1['target'].values, valid_test_x1, None)
# Score against the labels of the frame the scored features were taken from.
# The earlier version read `valid_train_test`, a name that is not assigned
# anywhere in the visible notebook.
print('LR valid auc', roc_auc_score(valid_train_test1['target'].values, lr_redict_result))

LR valid auc 0.6895657003902818


In [58]:
# Build the hold-out validation split from the training table.
train_data = pd.read_csv('AI_risk_train_V3.0/train_data.csv', encoding='gb2312')

# Text-typed columns are replaced by integer codes from LabelEncoder.
dummy_fea = ['sex', 'merriage', 'income', 'qq_bound', 'degree', 'wechat_bound','account_grade','industry']
for _fea in dummy_fea:
    le = LabelEncoder()
    train_data[_fea] = le.fit_transform(train_data[_fea].tolist())

train_data_copy1 = train_data.copy()
valid_train_data1 = (
    train_data_copy1
    .drop(['appl_sbm_tm','id','auth_time','phone','birthday','hobby','id_card'], axis=1)
    .fillna(0)
)

# Split by loan date: month < 4 trains, month >= 4 is held out.
# NOTE(review): the year and month tests are AND-ed independently, so a row
# from a year before 2017 with loan_month >= 4 lands in neither part --
# confirm the data only covers 2017.
fit_rows = (valid_train_data1.loan_year <= 2017) & (valid_train_data1.loan_month < 4)
holdout_rows = (valid_train_data1.loan_year >= 2017) & (valid_train_data1.loan_month >= 4)
valid_train_train1 = valid_train_data1[fit_rows]
valid_train_test1 = valid_train_data1[holdout_rows]

valid_train_x1 = valid_train_train1.drop(['target'], axis=1)
valid_test_x1 = valid_train_test1.drop(['target'], axis=1)

dataset1 = Dataset(valid_train_x1, valid_train_train1['target'], valid_test_x1, use_cache=False)


In [59]:
# LightGBM alone on the hold-out split, scored with AUC.
lgb_redict_result = lgb_feature(
    valid_train_x1,
    valid_train_train1['target'].values,
    valid_test_x1,
    None,
)
lgb_valid_auc = roc_auc_score(valid_train_test1['target'].values, lgb_redict_result)
print('lgb valid auc：', lgb_valid_auc)

lgb valid auc： 0.8195550244976295


In [60]:
# First layer: five base models, all fed from the label-encoded hold-out dataset.
model_xgb = Regressor(dataset=dataset1, estimator=xgb_feature, name='xgb', use_cache=False)
model_xgb2 = Regressor(dataset=dataset1, estimator=xgb_feature2, name='xgb2', use_cache=False)
model_xgb3 = Regressor(dataset=dataset1, estimator=xgb_feature3, name='xgb3', use_cache=False)
model_lgb = Regressor(dataset=dataset1, estimator=lgb_feature, name='lgb', use_cache=False)
model_gbdt = Regressor(dataset=dataset1, estimator=gbdt_model, name='gbdt', use_cache=False)

# Stack the 5 models
first_layer_models = [model_xgb, model_xgb2, model_xgb3, model_lgb, model_gbdt]
pipeline = ModelsPipeline(*first_layer_models)
stack_ds = pipeline.stack(k=5, seed=111, add_diff=False, full_test=True)

# Second layer: a linear regression (no intercept) over the stacked outputs
stacker = Regressor(
    dataset=stack_ds,
    estimator=LinearRegression,
    parameters={'fit_intercept': False},
)
stacking_predict_result = stacker.predict()

In [62]:
# Hold-out AUC of the stacked ensemble.
stacking_valid_auc = roc_auc_score(valid_train_test1['target'].values, stacking_predict_result)
print('stacking valid auc：', stacking_valid_auc)

stacking valid auc： 0.8134186440056409


In [63]:
# Blending variant: same first-layer pipeline, combined by a default LinearRegression.
blend_ds = pipeline.blend(proportion=0.2, seed=111)
blender = Regressor(dataset=blend_ds, estimator=LinearRegression)
blending_predict_result = blender.predict()

# Hold-out AUC of the blended ensemble.
blending_valid_auc = roc_auc_score(valid_train_test1['target'].values, blending_predict_result)
print('blending valid auc：', blending_valid_auc)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x0000022813EC8908>>
Traceback (most recent call last):
  File "c:\software\python\programs\python\python35\lib\site-packages\xgboost\core.py", line 482, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


blending valid auc： 0.8139588698988814


## 开始预测

In [19]:
# LightGBM dataset preparation:
# text-typed columns are converted with LabelEncoder().

train_data = pd.read_csv('AI_risk_train_V3.0/train_data.csv',encoding='gb2312')
train_data = train_data.drop(['appl_sbm_tm','id','auth_time','phone','birthday','hobby','id_card'],axis=1)
train_data = train_data.fillna(0)

test_data = pd.read_csv('AI_risk_test_V3.0/test_data.csv',encoding='gb2312')
test_data = test_data.drop(['appl_sbm_tm','id','auth_time','phone','birthday','hobby','id_card'],axis=1)
test_data = test_data.fillna(0)

# Concatenate so both parts end up with the same columns, then split again by row count.
train_test_data = pd.concat([train_data,test_data],axis=0,ignore_index = True)
train_test_data = train_test_data.fillna(0)
n_train_rows = train_data.shape[0]
# .copy() makes each part an independent frame. The columns assigned below
# would otherwise be written onto slices of train_test_data (chained
# assignment), which the global warnings filter was hiding.
train_data = train_test_data.iloc[:n_train_rows,:].copy()
test_data = train_test_data.iloc[n_train_rows:,:].copy()
dummy_fea = ['sex', 'merriage', 'income', 'qq_bound', 'degree', 'wechat_bound','account_grade','industry']
for _fea in dummy_fea:
    le = LabelEncoder()
    # Fit on train + test values together so both parts share one code table;
    # the combined list is built and encoded once.
    tmp = le.fit_transform(train_data[_fea].tolist() + test_data[_fea].tolist())
    train_data[_fea] = tmp[:n_train_rows]
    test_data[_fea] = tmp[n_train_rows:]
train_x = train_data.drop(['target'],axis=1)
test_x = test_data.drop(['target'],axis=1)
lgb_dataset = Dataset(train_x,train_data['target'],test_x,use_cache=False)


In [20]:
# XGBoost dataset preparation:
# text-typed columns are one-hot encoded.
# NOTE(review): pd.get_dummies only expands object/category columns. If any of
# the dummy_fea columns is numeric it passes through unchanged and is then
# removed by the drop(dummy_fea) below -- confirm the column dtypes.

train_data = (
    pd.read_csv('AI_risk_train_V3.0/train_data.csv', encoding='gb2312')
    .drop(['appl_sbm_tm','id','auth_time','phone','birthday','hobby','id_card'], axis=1)
    .fillna(0)
)
test_data = (
    pd.read_csv('AI_risk_test_V3.0/test_data.csv', encoding='gb2312')
    .drop(['appl_sbm_tm','id','auth_time','phone','birthday','hobby','id_card'], axis=1)
    .fillna(0)
)
dummy_fea = ['sex', 'merriage', 'income', 'qq_bound', 'degree', 'wechat_bound','account_grade','industry']

# Encode train and test together so both get the same indicator columns.
train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True).fillna(0)
dummy_df = pd.get_dummies(train_test_data[dummy_fea])
train_test_data = pd.concat([train_test_data, dummy_df], axis=1).drop(dummy_fea, axis=1)

# Split back into the two parts by the training row count.
train_train = train_test_data.iloc[:train_data.shape[0], :]
test_test = train_test_data.iloc[train_data.shape[0]:, :]
train_train_x = train_train.drop(['target'], axis=1)
test_test_x = test_test.drop(['target'], axis=1)
xgb_dataset = Dataset(
    X_train=train_train_x,
    y_train=train_train['target'],
    X_test=test_test_x,
    y_test=None,
    use_cache=False,
)


In [21]:
# heamy stacking on the full training data.
# Author's note: 1.5h (presumably the run time of this cell).
# The LightGBM model reads the label-encoded dataset; the other four read the
# one-hot dataset.
model_xgb = Regressor(dataset=xgb_dataset, estimator=xgb_feature, name='xgb', use_cache=False)
model_xgb2 = Regressor(dataset=xgb_dataset, estimator=xgb_feature2, name='xgb2', use_cache=False)
model_xgb3 = Regressor(dataset=xgb_dataset, estimator=xgb_feature3, name='xgb3', use_cache=False)
model_lgb = Regressor(dataset=lgb_dataset, estimator=lgb_feature, name='lgb', use_cache=False)
model_gbdt = Regressor(dataset=xgb_dataset, estimator=gbdt_model, name='gbdt', use_cache=False)

first_layer_models = [model_xgb, model_xgb2, model_xgb3, model_lgb, model_gbdt]
pipeline = ModelsPipeline(*first_layer_models)
stack_ds = pipeline.stack(k=5, seed=111, add_diff=False, full_test=True)

# Second layer: a linear regression (no intercept) over the stacked outputs.
stacker = Regressor(
    dataset=stack_ds,
    estimator=LinearRegression,
    parameters={'fit_intercept': False},
)
predict_result = stacker.predict()


In [23]:
# Build the submission table: one PROB value per row of test_list.csv.
# appl_sbm_tm is dropped right away, so it is no longer parsed as dates on load.
# NOTE(review): this assumes predict_result is in the same row order as
# test_list.csv -- confirm.
ans = pd.read_csv('AI_risk_test_V3.0/test_list.csv')
ans['PROB'] = predict_result
ans = ans.drop(['appl_sbm_tm'],axis=1)

# Min-max scale to [0, 1] with vectorised Series operations. A zero range
# (all predictions equal) would divide by zero, so it maps to 0.0 instead.
minmin, maxmax = ans['PROB'].min(), ans['PROB'].max()
prob_range = maxmax - minmin
if prob_range == 0:
    ans['PROB'] = 0.0
else:
    ans['PROB'] = (ans['PROB'] - minmin) / prob_range
# Written out as text with four decimals.
ans['PROB'] = ans['PROB'].map(lambda x:'%.4f' % x)

In [25]:
# Write the submission without the row index; the result/ directory must already exist.
ans.to_csv('result/ans_stacking.csv', index=False)