In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
# Every source table is a ';'-separated text file decoded with the GBK codec.
read_opts = {'encoding': 'gbk', 'sep': ';'}

loan = pd.read_csv('loan.txt', **read_opts)
client = pd.read_csv('client.txt', **read_opts)
account = pd.read_csv('account.txt', **read_opts)
order = pd.read_csv('order.txt', **read_opts)
disp = pd.read_csv('disp.txt', **read_opts)
district = pd.read_csv('district.txt', **read_opts)
card = pd.read_csv('card.txt', **read_opts)
# The 'bank' column is forced to object dtype instead of being inferred.
trans = pd.read_csv('trans.txt', dtype={"bank": object}, **read_opts)

In [4]:
# Look at the distinct loan status codes (the value is discarded here because
# it is not the last expression of the cell).
loan['status'].unique()
# Encode each status code as a numeric label stored in a new column.
bad_good = {'A': 0, 'B': 1, 'C': 2, 'D': 1}
loan['bad_good'] = loan['status'].map(bad_good)

In [5]:
# Borrower attributes (original note: age and gender): attach the disposition
# and client records to every loan, then keep only the account owner's row.
data = (
    loan
    .merge(disp, on='account_id', how='left')
    .merge(client, on='client_id', how='left')
)
data = data[data['type'] == 'OWNER']

In [6]:
# Economic indicators of the borrower's district: district.A1 is the key that
# matches data.district_id.
data = data.merge(district, how='left', left_on='district_id', right_on='A1')

In [7]:
# Convert the loan date to a datetime: cast to text, prepend the century
# ('19') and let pandas parse the resulting string.
data['date'] = pd.to_datetime('19' + data['date'].astype('str'))

In [8]:
# Input for the per-account balance mean / standard deviation: pair every loan
# (account + loan date) with all transactions of the same account.
loan_dates = loan[['account_id', 'date']]
account_trans = trans[['account_id', 'type', 'amount', 'balance', 'date']]
data_4temp1 = loan_dates.merge(account_trans, on='account_id')

In [9]:
# Positional rename: the loan's date keeps the name 'date', the transaction's
# date (the right-hand 'date' of the merge) becomes 't_date'.
data_4temp1.columns = ['account_id', 'date', 'type', 'amount', 'balance', 't_date']
# Order transactions chronologically within each account.
data_4temp1 = data_4temp1.sort_values(['account_id', 't_date'])

In [10]:
# Convert the date and t_date columns to datetime dtype.
def convert(L, frame=None):
    """Parse two-digit-year date columns into datetimes, modifying the frame in place.

    Each named column is cast to string, prefixed with '19' and passed to
    ``pd.to_datetime``.

    Args:
        L: iterable of column names to convert.
        frame: DataFrame to modify. Defaults to the notebook-level
            ``data_4temp1`` so existing ``convert(L)`` calls keep working.

    Returns:
        None. The columns are replaced inside ``frame``.
    """
    if frame is None:
        frame = data_4temp1
    for column in L:
        # Skip columns that are already datetimes so re-running the cell does
        # not prepend '19' to an already formatted timestamp.
        if pd.api.types.is_datetime64_any_dtype(frame[column]):
            continue
        frame[column] = pd.to_datetime('19' + frame[column].astype('str'))
# Columns of data_4temp1 to parse: the loan date and the transaction date.
L = ['date','t_date']
convert(L)

In [11]:
data_4temp1.head()

Unnamed: 0,account_id,date,type,amount,balance,t_date
10020,2,1994-01-05,PRIJEM,1100.0,1100.0,1993-02-26
10021,2,1994-01-05,PRIJEM,20236.0,21336.0,1993-03-12
10022,2,1994-01-05,PRIJEM,3700.0,25036.0,1993-03-28
10023,2,1994-01-05,PRIJEM,13.5,25049.5,1993-03-31
10024,2,1994-01-05,PRIJEM,20236.0,45285.5,1993-04-12


In [None]:
# # 将balance列和amount列四舍五入
# data_4temp1['balance'] = round(data_4temp1['balance'])
# data_4temp1['amount'] = round(data_4temp1['amount'])
# # 将balance列和amount列转换数据类型为int
# data_4temp1[['balance','amount']] = data_4temp1[['balance','amount']].astype(int)

In [12]:
# Extract the transactions made strictly before the loan date and less than
# 365 days before it.
import datetime
one_year = datetime.timedelta(days=365)
before_loan = data_4temp1['date'] > data_4temp1['t_date']
within_year = data_4temp1['date'] < data_4temp1['t_date'] + one_year
# .copy() makes data_4temp2 an independent frame: a later cell assigns a
# column on it, which would otherwise raise SettingWithCopyWarning.
data_4temp2 = data_4temp1[before_loan & within_year].copy()

In [None]:
# 计算变异系数
# data_4temp3 = data_4temp2.groupby('account_id')['balance'].agg({'avg_balance':'mean', 'stdev_balance':'std'})

In [13]:
# Balance statistics used for the coefficient of variation.
def f(data_4temp2):
    """Return the mean and standard deviation of one group's ``balance`` column.

    Args:
        data_4temp2: DataFrame (one groupby group) with a ``balance`` column.

    Returns:
        pd.Series indexed by ``avg_balance`` and ``stdev_balance``.
    """
    balances = data_4temp2['balance']
    return pd.Series(
        {'avg_balance': balances.mean(), 'stdev_balance': balances.std()}
    )
# Per-account balance statistics, one row per account_id.
data_4temp3 = data_4temp2.groupby('account_id').apply(f)
# Coefficient of variation = standard deviation / mean. Column-wise division
# replaces the row-wise lambda, whose positional x[1] / x[0] lookups on a
# label-indexed Series are deprecated in recent pandas.
data_4temp3['cv_balance'] = data_4temp3['stdev_balance'] / data_4temp3['avg_balance']

In [14]:
data_4temp3.head()

Unnamed: 0_level_0,avg_balance,stdev_balance,cv_balance
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,32590.624074,12061.705682,0.370097
19,25871.173684,15057.448942,0.582016
25,60792.815504,21315.682316,0.350628
37,38528.082243,22177.51255,0.575619
38,31383.449091,10950.811533,0.348936


In [15]:
# Ratio of spending to income: first relabel the transaction types.
# Any type value that is not a key of type_dict becomes NaN under .map and is
# therefore left out of the later groupby on 'type'.
type_dict = {'VYDAJ':'out','PRIJEM':'income'}
# assign() builds a new frame instead of writing into data_4temp2, which was
# produced by boolean filtering (avoids SettingWithCopyWarning).
data_4temp2 = data_4temp2.assign(type=data_4temp2['type'].map(type_dict))

In [16]:
# Total transaction amount per (account, direction) pair.
group_keys = ['account_id', 'type']
data_4temp4 = data_4temp2.groupby(group_keys)[['amount']].sum()

In [17]:
data_4temp4.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,amount
account_id,type,Unnamed: 2_level_1
2,income,228685.3
2,out,153017.6
19,income,225946.0
19,out,198016.8
25,income,659633.6


In [19]:
# Reshape to one row per account with one column per transaction type.
data_4temp5 = data_4temp4.pivot_table(
    values='amount',
    index='account_id',
    columns='type',
)

In [20]:
data_4temp5.head()

type,income,out
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,228685.3,153017.6
19,225946.0,198016.8
25,659633.6,629103.2
37,344702.0,328537.0
38,148403.7,105091.0


In [21]:
data_4temp5.isnull().any()

type
income    False
out        True
dtype: bool

In [22]:
# Accounts missing one transaction type have NaN after the pivot; treat the
# missing total as 0.
data_4temp5 = data_4temp5.fillna(0)
# Spending-to-income ratio, computed column-wise. The previous row-wise lambda
# used positional x[0] / x[1] lookups on a label-indexed Series, which recent
# pandas deprecates.
data_4temp5['r_out_in'] = data_4temp5['out'] / data_4temp5['income']

In [23]:
data_4temp5.head()

type,income,out,r_out_in
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,228685.3,153017.6,0.669119
19,225946.0,198016.8,0.87639
25,659633.6,629103.2,0.953716
37,344702.0,328537.0,0.953104
38,148403.7,105091.0,0.708143


In [24]:
# Attach the per-account balance statistics (indexed by account_id) to the loans.
data3 = data.merge(data_4temp3, how='left', left_on='account_id', right_index=True)

In [25]:
# Attach the per-account income / spending totals (indexed by account_id).
data4 = data3.merge(data_4temp5, how='left', left_on='account_id', right_index=True)

In [26]:
# Loan-to-balance and loan-to-income ratios, computed column-wise. The previous
# row-wise lambdas used positional x[0] / x[1] lookups on a label-indexed
# Series, which recent pandas deprecates.
data4['r_lb'] = data4['amount'] / data4['avg_balance']
data4['r_lincome'] = data4['amount'] / data4['income']

In [27]:
# Remove identifier-like columns that are not needed from here on.
unused_columns = ['A1', 'A2', 'A3', 'loan_id', 'account_id', 'date', 'type']
data4 = data4.drop(columns=unused_columns)

In [28]:
# Loans with status C are held out for prediction; all remaining loans form
# the labelled set used for training and validation.
# Loan status codes:
#   A: contract finished, no problems
#   B: contract finished, loan not paid
#   C: contract running, OK so far
#   D: contract running, client in debt
# They were encoded earlier as A=0, B/D=1, C=2.
is_running_ok = data4['status'] == 'C'
data_model = data4[~is_running_ok]
for_predict = data4[is_running_ok]

In [29]:
# Pick the most influential columns to build the training and test sets.
# candidates = ['A4', 'A10', 'A11', 'A12','amount', 'duration',
#        'A13', 'A14', 'A15', 'A16', 'avg_balance', 'stdev_balance',
#        'cv_balance', 'income', 'out', 'r_out_in', 'r_lb', 'r_lincome']

# Author's finding: combining the coefficient of variation with the
# loan-to-income ratio gave the best results for both logistic regression and
# LightGBM  # 0.8858 (without rounding)
candidates = ['cv_balance', 'r_lincome']
X_select = data_model.loc[:, candidates]

In [30]:
# Labels of the training rows (the bad_good encoding), as a NumPy array.
label_column = data_model['bad_good']
y_train = label_column.values

In [31]:
# Ratio of positive (label 1) to negative (label 0) samples; the original
# note judges the imbalance to be mild.
(y_train == 1).sum() / (y_train == 0).sum()

0.37438423645320196

In [32]:
# Trainable sample matrix; it is split later by cross-validation.
# X[X=='?'] = 0
X = X_select.values.astype(np.float64)

In [33]:
X.shape

(279, 2)

In [34]:
# Feature matrix of the status-C loans, restricted to the selected columns.
for_predict = for_predict.loc[:, candidates]
x_predict = for_predict.values
# x_predict[x_predict=='?'] = 0
X_predict = x_predict.astype(np.float64)

In [35]:
X_predict.shape

(403, 2)

In [36]:
# Logistic model (SGD classifier with log loss), scored with 10-fold
# stratified cross-validation; fold models are averaged on X_predict.
from sklearn.linear_model import SGDClassifier as SGD
from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
# random_state is intentionally omitted: with shuffle=False it has no effect on
# the splits, and scikit-learn >= 0.24 raises ValueError for that combination.
folds = StratifiedKFold(n_splits=10, shuffle=False)
predictions = np.zeros(X_predict.shape[0])
a = 0  # running mean of the per-fold validation AUC
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y_train)):
    print("Fold :{}".format(fold_ + 1))
    trn_data, trn_label = X[trn_idx], y_train[trn_idx]
    val_data, val_label = X[val_idx], y_train[val_idx]
    # NOTE(review): newer scikit-learn releases rename loss='log' to
    # 'log_loss' — confirm against the installed version.
    model_SGD = SGD(alpha=0.00001, random_state=2, shuffle=True, loss='log',max_iter=1000,tol=1e-3)
    model_SGD.fit(trn_data, trn_label)  # Fit the model.
    # predict_proba returns one column per class; column 1 is the probability
    # of label 1. Compute the fold AUC once and reuse it.
    val_auc = metrics.roc_auc_score(val_label, model_SGD.predict_proba(val_data)[:, 1])
    print("auc score: {:<8.5f}".format(val_auc))
    a += val_auc / folds.n_splits
    # Average the fold models' predictions over the folds.
    predictions += model_SGD.predict_proba(X_predict)[:, 1] / folds.n_splits

Fold :1
auc score: 0.65476 
Fold :2
auc score: 0.90476 
Fold :3
auc score: 0.94643 
Fold :4
auc score: 0.81875 
Fold :5
auc score: 0.83125 
Fold :6
auc score: 0.90938 
Fold :7
auc score: 0.97857 
Fold :8
auc score: 0.89286 
Fold :9
auc score: 0.97143 
Fold :10
auc score: 0.91429 


In [37]:
print(a,predictions.shape)

0.8822470238095239 (403,)


In [None]:
# !pip install lightgbm
# !pip install bayesian-optimization

In [38]:
#bayesian optimization to find hyperparameter for lightgbm
#  本例数据量较小，调参后结果只适合本例的数据
import lightgbm as lgb
from sklearn.model_selection import KFold,StratifiedKFold
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [39]:
def LGB_CV(
          min_data_in_leaf,
          feature_fraction,
          bagging_fraction,
         ):
    """Cross-validated objective for the Bayesian hyper-parameter search.

    Runs a shuffled 5-fold split over the notebook-level ``X`` / ``y_train``,
    trains one LightGBM binary classifier per fold with the given
    hyper-parameters, and collects the out-of-fold predictions.

    Args:
        min_data_in_leaf: value for LightGBM's ``min_data_in_leaf``; truncated
            to an int before use.
        feature_fraction: value for LightGBM's ``feature_fraction``.
        bagging_fraction: value for LightGBM's ``bagging_fraction``.

    Returns:
        ROC AUC of the pooled out-of-fold predictions against ``y_train``.
    """
    
    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    # Out-of-fold predictions, one slot per training row.
    oof = np.zeros(X.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y_train)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(X[trn_idx],
                               label=y_train[trn_idx],
                               )
        val_data = lgb.Dataset(X[val_idx],
                               label=y_train[val_idx],
                               )
    
        # Only the three searched hyper-parameters vary; the rest is fixed.
        param = {
            'max_depth': -1,
            'min_data_in_leaf': int(min_data_in_leaf), 
            'objective':'binary',
            'bagging_fraction':bagging_fraction,
            'feature_fraction':feature_fraction,
            'learning_rate': 0.005,
            "boosting": "gbdt",
            "bagging_freq": 5,
            "bagging_seed": 11,
            "metric": 'auc',
            "verbosity": -1
        }
    
        # Up to 8000 boosting rounds, stopping once the validation metric has
        # not improved for 500 rounds; progress is logged every 500 rounds.
        clf = lgb.train(param,
                        trn_data,
                        8000,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds = 500)
        
        # Predict the held-out rows with the best iteration found.
        oof[val_idx] = clf.predict(X[val_idx],
                                   num_iteration=clf.best_iteration)
        
        del clf, trn_idx, val_idx
        
    return metrics.roc_auc_score(y_train,oof)

In [40]:
# Search space for the three tuned LightGBM hyper-parameters. A fixed
# random_state makes the sampled points, and therefore the search result,
# reproducible across kernel restarts.
LGB_BO = BayesianOptimization(
    LGB_CV,
    {
        'min_data_in_leaf': (2, 40),
        'bagging_fraction': (0.01, 0.999),
        'feature_fraction': (0.01, 0.999),
    },
    random_state=2019,
)

In [41]:
# Run the search: 1 initial point followed by 2 optimization iterations, each
# of which calls LGB_CV (a full 5-fold LightGBM training) once.
LGB_BO.maximize(init_points=1,n_iter=2)

|   iter    |  target   | baggin... | featur... | min_da... |
-------------------------------------------------------------
fold n°0
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.918128	valid_1's auc: 0.916327
Early stopping, best iteration is:
[23]	training's auc: 0.900866	valid_1's auc: 0.927891
fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.944242	valid_1's auc: 0.798374
Early stopping, best iteration is:
[43]	training's auc: 0.932706	valid_1's auc: 0.83252
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.927747	valid_1's auc: 0.869106
Early stopping, best iteration is:
[30]	training's auc: 0.914086	valid_1's auc: 0.873984
fold n°3
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.918598	valid_1's auc: 0.876768
Early stopping, best iteration is:
[108]	training's auc: 0.906378	valid_1's auc: 0.886869
fold n°4
Tra

In [42]:
LGB_BO.max['target']

0.7732045631319678

In [43]:
LGB_BO.max['params']

{'bagging_fraction': 0.7351778705348716,
 'feature_fraction': 0.11287860341482422,
 'min_data_in_leaf': 20.779283696886267}

In [44]:
X_predict.shape

(403, 2)

In [45]:
# Final LightGBM model: 5-fold CV with fixed hyper-parameters (hand-rounded
# from the LGB_BO.max['params'] values shown above); fold models are averaged
# on X_predict.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof = np.zeros(X.shape[0])
predictions = np.zeros(X_predict.shape[0])
fold_auc = []  # validation AUC of each fold at its best iteration

# The parameters do not depend on the fold, so build them once.
param = {
    'max_depth': -1,
    'min_data_in_leaf': 21,
    'objective': 'binary',
    'bagging_fraction': 0.73,
    'feature_fraction': 0.11,
    'learning_rate': 0.005,
    "boosting": "gbdt",
    "bagging_freq": 5,
    "bagging_seed": 11,
    "metric": 'auc',
    "verbosity": -1
}

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y_train)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(X[trn_idx], label=y_train[trn_idx])
    val_data = lgb.Dataset(X[val_idx], label=y_train[val_idx])

    # Up to 8000 boosting rounds, stopping once the validation metric has not
    # improved for 500 rounds.
    clf = lgb.train(param,
                    trn_data,
                    8000,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=500,
                    early_stopping_rounds=500)

    oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
    fold_auc.append(metrics.roc_auc_score(y_train[val_idx], oof[val_idx]))
    # Average the fold models' predictions for the held-out (status C) loans.
    predictions += clf.predict(X_predict, num_iteration=clf.best_iteration) / folds.n_splits

# Report the CV score from the actual predictions rather than copying numbers
# out of the training log by hand.
print("mean fold auc: {:.5f}".format(np.mean(fold_auc)))
print("oof auc: {:.5f}".format(metrics.roc_auc_score(y_train, oof)))

fold n°0
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.916937	valid_1's auc: 0.917007
Early stopping, best iteration is:
[26]	training's auc: 0.901732	valid_1's auc: 0.927891
fold n°1
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.942724	valid_1's auc: 0.8
Early stopping, best iteration is:
[23]	training's auc: 0.920664	valid_1's auc: 0.835772
fold n°2
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.92542	valid_1's auc: 0.86748
Early stopping, best iteration is:
[30]	training's auc: 0.910089	valid_1's auc: 0.873984
fold n°3
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.917575	valid_1's auc: 0.872727
Early stopping, best iteration is:
[115]	training's auc: 0.905307	valid_1's auc: 0.885859
fold n°4
Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.921396	valid_1's auc: 0.898955
[1000]	training

In [46]:
# Hand-computed mean of the five per-fold best validation AUCs, typed in
# (rounded) from the training log above.
# NOTE(review): the fifth value cannot be checked against the truncated log;
# prefer computing this from the fold predictions instead of literals.
(0.92+0.8357+0.874+0.886+0.91)/5

0.88514

In [47]:
predictions.shape

(403,)