In [1]:
import numpy as np
import pandas as pd

In [2]:
# Every source table is a ';'-separated text file stored in GBK encoding,
# so the shared read options are declared once.
READ_OPTS = {'encoding': 'gbk', 'sep': ';'}

loan = pd.read_csv('loan.txt', **READ_OPTS)
client = pd.read_csv('client.txt', **READ_OPTS)
account = pd.read_csv('account.txt', **READ_OPTS)
order = pd.read_csv('order.txt', **READ_OPTS)
disp = pd.read_csv('disp.txt', **READ_OPTS)
district = pd.read_csv('district.txt', **READ_OPTS)
card = pd.read_csv('card.txt', **READ_OPTS)
# `bank` is forced to object dtype -- presumably to avoid mixed-type
# inference on that column; TODO confirm against the raw file.
trans = pd.read_csv('trans.txt', dtype={"bank": object}, **READ_OPTS)

In [3]:
# Look up the distinct loan status codes. (Nothing is displayed: this is not
# the last expression of the cell.)
loan['status'].unique()
# Derive a numeric label from the status code:
# A -> 0, B and D -> 1, C -> 2.
bad_good = {'A': 0, 'B': 1, 'C': 2, 'D': 1}
loan['bad_good'] = loan['status'].map(bad_good)

In [4]:
# Borrower age / gender: attach the disposition and client records to each
# loan (left joins keep every loan row), then keep only the rows whose
# disposition type is 'OWNER'.
data = (
    loan
    .merge(disp, how='left', on='account_id')
    .merge(client, how='left', on='client_id')
)
data = data[data['type'] == 'OWNER']

In [5]:
# Economic situation of the borrower's place of residence: join the district
# table on its `A1` key, matched against the row's `district_id`.
data = data.merge(district, how='left', left_on='district_id', right_on='A1')
data.head()

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status,bad_good,disp_id,client_id,...,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,5314,1787,930705,96396,12,8033.0,B,1,2166,2166,...,8,2,10,81.8,9650,3.38,3.67,100,2985,2804
1,5316,1801,930711,165960,36,4610.0,A,0,2181,2181,...,7,3,10,73.5,8369,1.79,2.31,117,2854,2618
2,6863,9188,930728,127080,60,2118.0,A,0,11006,11314,...,6,1,5,53.5,8390,2.28,2.89,132,2080,2122
3,5325,1843,930803,105804,36,2939.0,A,0,2235,2235,...,10,1,9,74.8,10045,1.42,1.71,135,6604,6295
4,7240,11013,930906,274740,60,4579.0,A,0,13231,13539,...,5,1,5,50.5,8288,3.79,4.52,110,1562,1460


In [6]:
# Convert `date` to a datetime column. The raw values look like YYMMDD
# (e.g. 930705), so the century prefix '19' is prepended before parsing.
data['date'] = pd.to_datetime('19' + data['date'].astype('str'))

In [7]:
# Mean / standard deviation of each account's balance: start by pairing every
# loan (account and loan date) with all transactions of the same account.
data_4temp1 = loan[['account_id', 'date']].merge(
    trans[['account_id', 'type', 'amount', 'balance', 'date']],
    on='account_id',
)

In [8]:
# Positional rename: both inputs of the merge carried a `date` column, so the
# first one (from `loan`) stays `date` and the last one (from `trans`)
# becomes `t_date`.
data_4temp1.columns = [
    'account_id', 'date',
    'type', 'amount', 'balance', 't_date',
]
# Order the transactions chronologically within each account.
data_4temp1 = data_4temp1.sort_values(['account_id', 't_date'])

In [9]:
# Convert `date` and `t_date` to datetime columns.
def convert(L):
    """Parse the named columns of the global `data_4temp1` as dates, in place.

    Each column is cast to string, prefixed with the century '19' and then
    handed to `pd.to_datetime`.

    Parameters
    ----------
    L : iterable of str
        Names of the `data_4temp1` columns to convert.
    """
    for col in L:
        data_4temp1[col] = pd.to_datetime('19' + data_4temp1[col].astype('str'))
# Columns holding the loan date and the transaction date.
L = ['date', 't_date']
convert(L)

In [10]:
# Preview the first five loan/transaction pairs.
data_4temp1.head(5)

Unnamed: 0,account_id,date,type,amount,balance,t_date
10020,2,1994-01-05,PRIJEM,1100.0,1100.0,1993-02-26
10021,2,1994-01-05,PRIJEM,20236.0,21336.0,1993-03-12
10022,2,1994-01-05,PRIJEM,3700.0,25036.0,1993-03-28
10023,2,1994-01-05,PRIJEM,13.5,25049.5,1993-03-31
10024,2,1994-01-05,PRIJEM,20236.0,45285.5,1993-04-12


In [11]:
# # 将balance列和amount列四舍五入
# data_4temp1['balance'] = round(data_4temp1['balance'])
# data_4temp1['amount'] = round(data_4temp1['amount'])
# # 将balance列和amount列转换数据类型为int
# data_4temp1[['balance','amount']] = data_4temp1[['balance','amount']].astype(int)

In [12]:
# Extract the relevant transactions: keep only those dated strictly before
# the loan date and less than 365 days before it.
import datetime

data_4temp2 = data_4temp1[
    (data_4temp1.t_date < data_4temp1.date)
    & (data_4temp1.date < data_4temp1.t_date + datetime.timedelta(days=365))
]

In [13]:
# Balance statistics that feed the coefficient of variation.
def f(data_4temp2):
    """Summarise the `balance` column of one group of transactions.

    Parameters
    ----------
    data_4temp2 : pandas.DataFrame
        Frame with a `balance` column (one group when used with groupby.apply).

    Returns
    -------
    pandas.Series
        Two entries, `avg_balance` (mean) and `stdev_balance` (sample
        standard deviation, pandas' default `std`).
    """
    balances = data_4temp2['balance']
    stats = {
        'avg_balance': balances.mean(),
        'stdev_balance': balances.std(),
    }
    return pd.Series(stats)
# One row per account with the mean and standard deviation of its balance.
data_4temp3 = data_4temp2.groupby('account_id').apply(f)
# Coefficient of variation = stdev / mean. Column-wise division replaces the
# row-wise `apply(lambda x: x[1]/x[0])`, which indexed a label-indexed Series
# with integer positions (deprecated in recent pandas) and was slower.
data_4temp3['cv_balance'] = data_4temp3['stdev_balance'] / data_4temp3['avg_balance']

In [14]:
# Preview the per-account balance statistics.
data_4temp3.head(5)

Unnamed: 0_level_0,avg_balance,stdev_balance,cv_balance
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,32590.624074,12061.705682,0.370097
19,25871.173684,15057.448942,0.582016
25,60792.815504,21315.682316,0.350628
37,38528.082243,22177.51255,0.575619
38,31383.449091,10950.811533,0.348936


In [15]:
# Ratio of expenditure to income: first recode the transaction type.
# Any type code other than the two keys below is mapped to NaN by `.map`.
type_dict = {'VYDAJ':'out','PRIJEM':'income'}
# `data_4temp2` is a filtered slice of `data_4temp1`; building a new frame
# with `.assign` avoids the SettingWithCopyWarning that assigning a column
# on the slice would raise.
data_4temp2 = data_4temp2.assign(type=data_4temp2['type'].map(type_dict))

In [16]:
# Total transaction amount per (account, transaction type).
data_4temp4 = data_4temp2.groupby(['account_id', 'type'])[['amount']].agg('sum')

In [17]:
# Reshape to one row per account with one column per transaction type.
# Each (account, type) pair occurs once in `data_4temp4`, so pivot_table's
# default mean aggregation leaves the sums unchanged.
data_4temp5 = data_4temp4.pivot_table(
    index='account_id',
    columns='type',
    values='amount',
)

In [18]:
# Which of the pivoted columns contain missing values?
data_4temp5.isna().any()

type
income    False
out        True
dtype: bool

In [19]:
# An account with no transactions of a given type has no total for it;
# treat that as 0.
data_4temp5 = data_4temp5.fillna(0)
# Ratio of total expenditure to total income. Column-wise division replaces
# the row-wise `apply(lambda x: x[0]/x[1])`, which indexed a label-indexed
# Series with integer positions (deprecated in recent pandas).
# A zero `income` still yields inf (or NaN for 0/0), exactly as before.
data_4temp5['r_out_in'] = data_4temp5['out'] / data_4temp5['income']

In [20]:
# Preview the per-account income / expenditure totals and their ratio.
data_4temp5.head(5)

type,income,out,r_out_in
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,228685.3,153017.6,0.669119
19,225946.0,198016.8,0.87639
25,659633.6,629103.2,0.953716
37,344702.0,328537.0,0.953104
38,148403.7,105091.0,0.708143


In [21]:
# Attach the balance statistics (indexed by account_id) to each loan row.
data3 = data.merge(data_4temp3, how='left', left_on='account_id', right_index=True)

In [22]:
# Attach the income / expenditure totals (indexed by account_id) as well.
data4 = data3.merge(data_4temp5, how='left', left_on='account_id', right_index=True)

In [23]:
# Loan-to-balance and loan-to-income ratios.
# Column-wise division replaces the row-wise `apply(lambda x: x[0]/x[1])`,
# which indexed a label-indexed Series with integer positions (deprecated in
# recent pandas) and was slower; the resulting values are the same.
data4['r_lb'] = data4['amount'] / data4['avg_balance']
data4['r_lincome'] = data4['amount'] / data4['income']

In [24]:
# Remove identifier / helper columns that are not needed downstream.
# NOTE: not idempotent -- running this cell twice raises a KeyError.
data4 = data4.drop(
    ['A1', 'A2', 'A3', 'loan_id', 'account_id', 'date', 'type'],
    axis=1,
)

In [25]:
# Preview the assembled feature table.
data4.head(5)

Unnamed: 0,amount,duration,payments,status,bad_good,disp_id,client_id,birth_number,district_id,A4,...,A15,A16,avg_balance,stdev_balance,cv_balance,income,out,r_out_in,r_lb,r_lincome
0,96396,12,8033.0,B,1,2166,2166,475722,30,94812,...,2985,2804,12250.0,8330.866301,0.680071,20100.0,0.0,0.0,7.869061,4.795821
1,165960,36,4610.0,A,0,2181,2181,680722,46,112709,...,2854,2618,52083.859459,29122.059454,0.559138,229893.7,164002.8,0.713385,3.1864,0.721899
2,127080,60,2118.0,A,0,11006,11314,360602,45,77917,...,2080,2122,30060.954167,11520.184451,0.383228,75146.0,54873.0,0.730219,4.227411,1.691108
3,105804,36,2939.0,A,0,2235,2235,405420,14,177686,...,6604,6295,41297.48,14151.260443,0.342666,120309.8,86017.2,0.714964,2.561997,0.87943
4,274740,60,4579.0,A,0,13231,13539,780907,63,86513,...,1562,1460,57188.211111,25256.665817,0.441641,276327.1,235214.0,0.851216,4.804137,0.994256


In [26]:
# Loans with status 'C' are set aside for prediction; all other loans form
# the labelled set from which the training and test sets are built.
#
# Repayment status codes in the loan table:
#   A: contract finished, no problems
#   B: contract finished, loan not paid
#   C: contract still running, OK so far
#   D: contract still running, client in debt
# They were encoded earlier as A=0, B and D=1, C=2.
data_model = data4.loc[data4['status'] != 'C']
for_predict = data4.loc[data4['status'] == 'C']

In [27]:
# Features used by the model.
candidates = [
    'cv_balance',
    'income',
    'out',
    'r_out_in',
    'r_lb',
    'r_lincome',
]
X_select = data_model.loc[:, candidates]

In [28]:
# Labels (the numeric `bad_good` encoding) as a NumPy array.
ytrain = data_model.loc[:, 'bad_good'].values

In [29]:
# Ratio of label-1 samples to label-0 samples.
(ytrain == 1).sum() / (ytrain == 0).sum()

0.37438423645320196

In [30]:
# Labelled feature matrix as float64; it is split further with
# cross-validation in the modelling cells.
X = X_select.values.astype(np.float64)

In [31]:
# Restrict the status-'C' loans to the model features and build the float64
# matrix that will be scored.
for_predict = for_predict.loc[:, candidates]
x_predict = for_predict.values
X_predict = x_predict.astype(np.float64)

In [32]:
# Aliases used by the modelling cell.
# NOTE: `data` previously named the merged loan DataFrame; from here on it
# refers to the feature matrix.
data, target = X, ytrain

In [41]:
# !pip3 install --upgrade scikit-learn

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already up-to-date: scikit-learn in d:\anaconda\lib\site-packages (0.21.2)


In [33]:
"""模型评估"""
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier as SGD

# Base learners whose predictions become the input features of the blender.
clfs = [RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
#         SVC(gamma='auto',probability=True),
#         SGD(alpha=0.00001, random_state=2, shuffle=True, loss='log', max_iter=1000,tol=1e-3),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]

# Hold out 33% of the labelled samples as a validation set.
Xc, Xc_predict, yc, yc_predict = train_test_split(data, target, test_size=0.33, random_state=2017)

# Meta-features: one column per base learner.
dataset_blend_train = np.zeros((Xc.shape[0], len(clfs)))
dataset_blend_test = np.zeros((Xc_predict.shape[0], len(clfs)))

# Stratified K-fold stacking.
skf = StratifiedKFold(n_splits=3)
# The per-fold buffer must have exactly one column per fold. It used to be
# hard-coded to 5 while only 3 folds ran, so the row mean below was diluted
# by two all-zero columns and the validation meta-features ended up on a
# different scale (3/5) from the out-of-fold training meta-features.
n_folds = skf.get_n_splits()
for j, clf in enumerate(clfs):
    # Train each base learner in turn.
    dataset_blend_test_j = np.zeros((Xc_predict.shape[0], n_folds))
    for i, (train_index, test_index) in enumerate(skf.split(Xc, yc)):
        # Fit on all folds but the i-th; the predictions on fold i become that
        # fold's meta-feature for learner j.
        X_train, y_train, X_test, y_test = Xc[train_index], yc[train_index], Xc[test_index], yc[test_index]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test_index, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(Xc_predict)[:, 1]
    # For the validation set, the meta-feature is the mean of the per-fold
    # model predictions.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("val auc Score: %f" % roc_auc_score(yc_predict, dataset_blend_test[:, j]))

# Blender fitted on the out-of-fold meta-features.
# NOTE(review): newer scikit-learn releases renamed loss='log' to
# loss='log_loss'; adjust when upgrading.
# clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
clf = SGD(alpha=0.00001, random_state=2, shuffle=True, loss='log', max_iter=1000,tol=1e-3)
clf.fit(dataset_blend_train, yc)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]
# print("Linear stretch of predictions to [0,1]")
# y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
# print("blend result")
print("val auc Score: %f" % (roc_auc_score(yc_predict, y_submission)))

  from numpy.core.umath_tests import inner1d


val auc Score: 0.836395
val auc Score: 0.805396
val auc Score: 0.784443
val auc Score: 0.831228
val auc Score: 0.829219
val auc Score: 0.821470


In [42]:
# Model prediction: refit the stack on every labelled sample (X, ytrain) and
# score the held-out status-'C' loans (X_predict).
clfs = [RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
#         SVC(gamma='auto',probability=True),
#         SGD(alpha=0.00001, random_state=2, shuffle=True, loss='log', max_iter=1000,tol=1e-3),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=5)]

# Meta-features for the labelled samples: one column per base learner.
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
# Meta-features for the samples to be predicted.
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))

# Stratified K-fold stacking.
skf = StratifiedKFold(n_splits=3)
# The per-fold buffer must have exactly one column per fold. It used to be
# hard-coded to 5 while only 3 folds ran, so the row mean below was diluted
# by two all-zero columns and the prediction-set meta-features ended up on a
# different scale (3/5) from the meta-features the blender is fitted on.
n_folds = skf.get_n_splits()
for j, clf in enumerate(clfs):
    # Train each base learner in turn.
    dataset_blend_test_j = np.zeros((X_predict.shape[0], n_folds))
    for i, (train_index, test_index) in enumerate(skf.split(X, ytrain)):
        # Fit on all folds but the i-th; the predictions on fold i become that
        # fold's meta-feature for learner j.
        X_train, y_train, X_test, y_test = X[train_index], ytrain[train_index], X[test_index], ytrain[test_index]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test_index, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    # For the prediction set, the meta-feature is the mean of the per-fold
    # model predictions.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

# Blender fitted on the out-of-fold meta-features.
# NOTE(review): newer scikit-learn releases renamed loss='log' to
# loss='log_loss'; adjust when upgrading.
# clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
clf = SGD(alpha=0.00001, random_state=2, shuffle=True, loss='log', max_iter=1000,tol=1e-3)
clf.fit(dataset_blend_train, ytrain)
predictions = clf.predict_proba(dataset_blend_test)[:, 1]

In [43]:
# Number of scored loans (one probability per row of X_predict).
np.shape(predictions)

(403,)

In [None]:
# # SVM
# from sklearn.svm import SVC
# from sklearn.model_selection import StratifiedKFold
# from sklearn import metrics
# folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=2019)
# predictions2 = np.zeros((X_predict.shape[0],2))
# b = 0
# for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y_train)):
#     print("Fold :{}".format(fold_ + 1))
#     trn_data = X[trn_idx]
#     trn_label = y_train[trn_idx]
#     val_data = X[val_idx]
#     val_label = y_train[val_idx]
#     clf = SVC(gamma='auto',probability=True)
#     clf.fit(trn_data, trn_label)  # Fit the model.
#     print("auc score: {:<8.5f}".format(metrics.roc_auc_score(val_label, clf.predict_proba(val_data)[:, 1])))
#     b += metrics.roc_auc_score(val_label, clf.predict_proba(val_data)[:, 1])/ folds.n_splits
#    # 所有预测值相加再除以folds.n_splits(10)
#     predictions22 += clf.predict_proba(val_data) / folds.n_splits
#     predictions2 += clf.predict_proba(X_predict) / folds.n_splits

In [None]:
# # 决策树
# from sklearn import tree 
# from sklearn.model_selection import StratifiedKFold
# from sklearn import metrics
# folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=2019)
# predictions3 = np.zeros((X_predict.shape[0],2))
# c = 0
# for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y_train)):
#     print("Fold :{}".format(fold_ + 1))
#     trn_data = X[trn_idx]
#     trn_label = y_train[trn_idx]
#     val_data = X[val_idx]
#     val_label = y_train[val_idx]
#     model = tree.DecisionTreeClassifier(max_depth=5,random_state=2019)
#     model.fit(trn_data, trn_label)  # Fit the model.
#     print("auc score: {:<8.5f}".format(metrics.roc_auc_score(val_label, model.predict_proba(val_data)[:, 1])))
#     c += metrics.roc_auc_score(val_label, model.predict_proba(val_data)[:, 1])/ folds.n_splits
#    # 所有预测值相加再除以folds.n_splits(10)
#     predictions3 += model.predict_proba(X_predict) / folds.n_splits
