In [None]:
# https://www.datafountain.cn/competitions/530

In [1]:
import warnings
import pandas as pd
import numpy as np
import datetime

from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

train_bank = pd.read_csv('./train_public.csv')
train_internet = pd.read_csv('./train_internet.csv')
test_bank = pd.read_csv('./test_public.csv')

print(list(train_bank))
print(list(train_internet))



['loan_id', 'user_id', 'total_loan', 'year_of_loan', 'interest', 'monthly_payment', 'class', 'employer_type', 'industry', 'work_year', 'house_exist', 'censor_status', 'issue_date', 'use', 'post_code', 'region', 'debt_loan_ratio', 'del_in_18month', 'scoring_low', 'scoring_high', 'known_outstanding_loan', 'known_dero', 'pub_dero_bankrup', 'recircle_b', 'recircle_u', 'initial_list_status', 'app_type', 'earlies_credit_mon', 'title', 'policy_code', 'f0', 'f1', 'f2', 'f3', 'f4', 'early_return', 'early_return_amount', 'early_return_amount_3mon', 'isDefault']
['loan_id', 'user_id', 'total_loan', 'year_of_loan', 'interest', 'monthly_payment', 'class', 'sub_class', 'work_type', 'employer_type', 'industry', 'work_year', 'house_exist', 'house_loan_status', 'censor_status', 'marriage', 'offsprings', 'issue_date', 'use', 'post_code', 'region', 'debt_loan_ratio', 'del_in_18month', 'scoring_low', 'scoring_high', 'pub_dero_bankrup', 'early_return', 'early_return_amount', 'early_return_amount_3mon', 're

# Steps
1. Preprocessing
2. Feature Engineering
3. Feature Selection
4. Building Model

### Preprocessing

In [2]:
import sys
sys.path.append(f'/Users/doradong/pyTools')
from data_preprocessing import mem_usage,compress_df,decompress_df 

In [4]:
# compress
train_bank = compress_df(train_bank, ['int64', 'float64'])
train_internet = compress_df(train_internet, ['int64', 'float64'])
test_bank = compress_df(test_bank, ['int64', 'float64'])

pre compression: 0 MB
aft compression:  0 MB
pre compression: 51 MB
aft compression:  51 MB
pre compression: 0 MB
aft compression:  0 MB


In [5]:
train_bank.columns = [col.replace('isDefault', 'is_default') for col in train_bank.columns]

In [6]:
common_cols = list(set(list(train_bank)) & set(list(train_internet)))
print(common_cols)

['work_year', 'monthly_payment', 'year_of_loan', 'initial_list_status', 'house_exist', 'user_id', 'class', 'f4', 'interest', 'use', 'recircle_b', 'early_return', 'region', 'f3', 'industry', 'earlies_credit_mon', 'issue_date', 'post_code', 'loan_id', 'debt_loan_ratio', 'f1', 'early_return_amount_3mon', 'employer_type', 'early_return_amount', 'f2', 'del_in_18month', 'scoring_high', 'recircle_u', 'is_default', 'pub_dero_bankrup', 'title', 'f0', 'scoring_low', 'total_loan', 'censor_status', 'policy_code']


In [7]:
print('The common # of features are: ', len(common_cols))
print('public csv contains # of features: ', len(list(train_bank)))
print('internet csv contains # of features: ', len(list(train_internet.columns)))
print('public csv test contains # of features: ', len(list(test_bank.columns)))

The common # of features are:  36
public csv contains # of features:  39
internet csv contains # of features:  42
public csv test contains # of features:  38


In [8]:
train_bank_left = list(set(list(train_bank.columns)) - set(common_cols))
train_internet_left = list(set(list(train_internet.columns)) - set(common_cols))

print('public csv features not in common: ', train_bank_left)
print('internet csv features not in common: ', train_internet_left)

public csv features not in common:  ['app_type', 'known_outstanding_loan', 'known_dero']
internet csv features not in common:  ['marriage', 'work_type', 'offsprings', 'f5', 'sub_class', 'house_loan_status']


In [9]:
train_internet = train_internet[common_cols]
train_bank = train_bank[common_cols]
test_bank = test_bank[list(set(common_cols)-set(['is_default']))]

In [10]:
train_internet.select_dtypes(include=['object', 'category']).columns

Index(['work_year', 'class', 'industry', 'earlies_credit_mon', 'issue_date',
       'employer_type'],
      dtype='object')

['post_code', 'region', 'title', 'use', 
'user_id', 'loan_id', 'is_default', 'issue_date', 
'policy_code',  ]

### Feature Engineering

In [11]:
month_name_list = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
def split_month(string):
    for i in month_name_list:
        if i in string:
            return month_name_list.index(i)+1 
industry_1_list = ['农、林、牧、渔业']
industry_2_list = ['制造业', '采矿业', '建筑业', '批发和零售业', '电力、热力生产供应业']
industry_3_list = ['金融业', '交通运输、仓储和邮政业', '公共服务、社会组织', '信息传输、软件和信息技术服务业', '房地产业', '文化和体育业', '住宿和餐饮业']
def cate_preprocess(df):
    # 字段处理
    #"""
    df['work_year'] = df['work_year'].apply(str).apply(lambda x: x.split('year')[0].replace('10+', '10').replace('< 1', '0')).apply(np.float)
    df['earlies_credit_mon'] = df['earlies_credit_mon'].apply(split_month)
    df['is_censor_status_1'] = df['censor_status'].apply(lambda x: 1 if x==1 else 0)
    df['is_censor_status_2'] = df['censor_status'].apply(lambda x: 1 if x==2 else 0)
    df['is_censor_status_0'] = df['censor_status'].apply(lambda x: 1 if x==0 else 0)
    df = df.drop('censor_status', axis=1)
    
    df['is_industry_1'] = df['industry'].apply(lambda x: 1 if x in industry_1_list else 0)
    df['is_industry_2'] = df['industry'].apply(lambda x: 1 if x in industry_2_list else 0)
    df['is_industry_3'] = df['industry'].apply(lambda x: 1 if x in industry_3_list else 0)
    df = df.drop('industry', axis=1)
    
    df['is_low_A'] = df['class'].apply(lambda x: 0 if x in ['A'] else 1)
    df['is_low_B'] = df['class'].apply(lambda x: 0 if x in ['A', 'B'] else 1)
    df['is_low_C'] = df['class'].apply(lambda x: 0 if x in ['A', 'B', 'C'] else 1)
    df['is_low_D'] = df['class'].apply(lambda x: 0 if x in ['A', 'B', 'C', 'D'] else 1)
    df  = df.drop('class', axis=1)
    
    df['is_employ_1'] = df['employer_type'].apply(lambda x: 1 if x in ['政府机构', '高等教育机构'] else 0)
    df['is_employ_2'] = df['employer_type'].apply(lambda x: 1 if x in ['上市企业', '世界五百强'] else 0)
    df['is_employ_3'] = df['employer_type'].apply(lambda x: 1 if x in ['幼教与中小学校', '普通企业'] else 0)
    df = df.drop('employer_type', axis=1)
    #"""
    
    return df
train_bank = cate_preprocess(train_bank)
train_internet = cate_preprocess(train_internet)

In [12]:
def feature_preprocess(df):
    # process the time feature
    df['issue_date'] = pd.to_datetime(df['issue_date'])
    #df['issue_date_y'] = df['issue_date'].dt.year
    #df['issue_date_m'] = df['issue_date'].dt.month
    
    base_time = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    df['issue_date_diff'] = df['issue_date'].apply(lambda x: x-base_time).dt.days
    df['issue_month_diff'] = df['issue_date'].apply(lambda x: x-base_time).dt.days//30
    df['issue_year_diff'] = df['issue_date'].apply(lambda x: x-base_time).dt.days//365
    #df[['issue_date', 'issue_date_y', 'issue_date_m', 'issue_date_diff']]
    df.drop('issue_date', axis = 1, inplace = True)
    
    return df
train_bank = feature_preprocess(train_bank)
train_internet = feature_preprocess(train_internet)

In [13]:
train_bank.head(3)

Unnamed: 0,work_year,monthly_payment,year_of_loan,initial_list_status,house_exist,user_id,f4,interest,use,recircle_b,...,is_low_A,is_low_B,is_low_C,is_low_D,is_employ_1,is_employ_2,is_employ_3,issue_date_diff,issue_month_diff,issue_year_diff
0,3.0,1174.910034,3,0,0,240418,4.0,11.466,2,7734.230957,...,1,1,0,0,1,0,0,3410,113,9
1,10.0,670.690002,5,1,0,225197,22.0,16.841,0,31329.0,...,1,1,0,0,1,0,0,2192,73,6
2,10.0,603.320007,3,1,1,209360,19.0,8.9,4,18514.0,...,0,0,0,0,1,0,0,2406,80,6


### 自动分箱、特征筛选


In [65]:
train_bank.columns = [col.replace('is_default', 'target') for col in train_bank ]
train_internet.columns = [col.replace('is_default', 'target') for col in train_internet ]

In [66]:
X_train = pd.concat([train_bank, train_internet])
y_train = X_train['target']
X_train.drop(['target'],axis=1,inplace=True)

X_test = test_bank

In [67]:
from sklearn.tree import DecisionTreeClassifier
def optimal_binning_boundary(x: pd.Series, y: pd.Series, nan: float = -999.) -> list:
    '''
        obtain optimal boundary based on decision tree
    '''
    boundary = []  # 
    
    x = x.fillna(nan).values  
    y = y.values
    
    clf = DecisionTreeClassifier(criterion='entropy',    
                                 max_leaf_nodes=6,       
                                 min_samples_leaf=0.05)  

    clf.fit(x.reshape(-1, 1), y)  
    
    n_nodes = clf.tree_.node_count
    children_left = clf.tree_.children_left
    children_right = clf.tree_.children_right
    threshold = clf.tree_.threshold
    
    for i in range(n_nodes):
        if children_left[i] != children_right[i]:  
            boundary.append(threshold[i])

    boundary.sort()

    min_x = x.min()
    max_x = x.max() + 0.1  
    boundary = [min_x] + boundary + [max_x]

    return boundary
def feature_woe_iv(x: pd.Series, y: pd.Series, nan: float = -999.) -> pd.DataFrame:
    '''
        compute woe & iv 
    '''
    x = x.fillna(nan)
    boundary = optimal_binning_boundary(x, y, nan)        
    df = pd.concat([x, y], axis=1)                        
    df.columns = ['x', 'y']                               
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)  
    
    grouped = df.groupby('bins')['y']                     
    result_df = grouped.agg([('good',  lambda y: (y == 0).sum()), 
                             ('bad',   lambda y: (y == 1).sum()),
                             ('total', 'count')])

    result_df['good_pct'] = result_df['good'] / result_df['good'].sum()       
    result_df['bad_pct'] = result_df['bad'] / result_df['bad'].sum()          
    result_df['total_pct'] = result_df['total'] / result_df['total'].sum()    

    result_df['bad_rate'] = result_df['bad'] / result_df['total']             
    
    result_df['woe'] = np.log(result_df['good_pct'] / result_df['bad_pct'])              
    result_df['iv'] = (result_df['good_pct'] - result_df['bad_pct']) * result_df['woe']  
    
    return result_df


In [68]:
iv_record=[]
woe_df_record = []
for feat in X_train.columns:
    woe_df_record.append(feature_woe_iv(X_train[feat], y_train, -999))
    iv_record.append(woe_df_record[-1]['iv'].sum())
    
iv_thre = 0.02
# keep features that iv>iv_thre
iv_record_np = np.array(iv_record)
iv_record_index = iv_record_np>=iv_thre
X_train = X_train[X_train.columns[iv_record_index]]
woe_df_record = np.array(woe_df_record)[iv_record_index]
woe_df_record = woe_df_record.tolist()
list_record = X_train.columns.tolist()
assert len(woe_df_record)==len(X_train.columns)


In [69]:
def fill_woe_data(X_train, woe_df_record):
    """
        substitute ori value with woe 
    """
    X_train = X_train.fillna(-999)
    for i, col in enumerate(X_train.columns):
        bins_tmp = [a.left for a in woe_df_record[i].index.tolist()]+[woe_df_record[i].index.tolist()[-1].right]
        X_train[col] = pd.cut(X_train[col], bins=bins_tmp, labels=woe_df_record[i].index.tolist(), right=False, include_lowest=True)
        X_train[col] = X_train[col].replace(woe_df_record[i][['woe']].to_dict('dict')['woe'])
    return X_train
X_train = fill_woe_data(X_train, woe_df_record)
X_test = fill_woe_data(X_test[X_train.columns], woe_df_record)


In [71]:
# Remove features with high pearson value 
from scipy.stats import pearsonr

p_thre = 0.75
def my_pearsonr(x_data):
    sorce_p_value = []
    index_i=0
    while index_i<len(x_data.columns)-1:
        index_j=index_i+1
        while index_j<len(x_data.columns):
            p_coeff = pearsonr(x_data[x_data.columns[index_i]], x_data[x_data.columns[index_j]])[0]
            if p_coeff>p_thre:
                x_data.drop([x_data.columns[index_j]], axis=1, inplace=True)
                continue
            else:
                index_j+=1
        index_i+=1
    return x_data

X_train = my_pearsonr(X_train)
woe_df_record = [woe_df_record[i] for i in [list_record.index(a) for a in X_train.columns]]

In [72]:
# Choose 20 features with the highest iv
feat_num = 20
iv_record = {}
for i in range(len(woe_df_record)):
    iv_record[X_train.columns[i]]=woe_df_record[i]['iv'].sum()

iv_list = sorted(iv_record.items(), key=lambda d:d[1], reverse = True)
iv_list = iv_list[:feat_num]
X_train = X_train[[a[0] for a in iv_list]]

In [73]:
# Separate data into dev and val sets
from sklearn.model_selection import train_test_split
X_dev, X_val, y_dev, y_val = train_test_split(X_train, y_train, test_size=0.2)


### Building Model

In [74]:
import lightgbm
from sklearn import metrics

def ks_score(y_true, y_score):
    """ Calculating the Kolmogorov-Smirnov score"""
    fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
    return max(tpr - fpr)

In [78]:
clf_ex=lightgbm.LGBMRegressor(n_estimators = 200)
clf_ex.fit(X = X_dev, y = y_dev, verbose=1)
#clf_ex.booster_.save_model('LGBMmode.txt')
val_pred = clf_ex.predict(X_val)
val_pred_list = [int(i) for i in val_pred]
print('acc: ', metrics.accuracy_score(y_val, val_pred_list))
print('ks: ', ks_score(y_val, val_pred))

X_test = X_test[X_train.columns]
pred = clf_ex.predict(X_test)
pred = [int(i) for i in pred]

acc:  0.8006776315789473
ks:  0.43066442289261236


In [79]:
# submission
submission = pd.DataFrame({'id':test_bank['loan_id'], 'is_default':pred})
submission.to_csv('submission.csv', index = None)

#### NN

In [85]:
import matplotlib.pyplot as plt
%matplotlib inline

from keras.models import Sequential
from keras.layers import Dense , Dropout , Lambda, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from sklearn.model_selection import train_test_split
from keras import  backend as K
from keras import models
from keras.preprocessing.image import ImageDataGenerator

In [90]:
# 
import numpy as np
#mean_px = X_train.mean().astype(np.float32)
#std_px = X_train.std().astype(np.float32)

#X_train_NN =(X_train - mean_px) / std_px
#X_test_NN  = (X_test - mean_px) / std_px

X_train_NN = (X_train.values).astype('float32')
y_train_NN = y_train.astype('int32')

X_test_NN = (X_test.values).astype('float32')

In [87]:
# 修改初始化、加归一层、加dropout、改用不同的metrics
seed = 43
np.random.seed(seed)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import AUC

def auroc(y_true, y_pred):
    return tf.compat.v1.py_func(roc_auc_score, (y_true, y_pred), tf.double)

input_shape = X_train_NN.shape[1]
b_size = 1024
max_epochs = 10

import tensorflow.keras as K
init = K.initializers.glorot_uniform(seed=1)
simple_adam = K.optimizers.Adam(lr=0.01)

model = K.models.Sequential()
model.add(K.layers.Dense(units=256, input_dim=input_shape, kernel_initializer='he_normal', activation='relu',kernel_regularizer=l2(0.0001)))
model.add(K.layers.LayerNormalization())
model.add(K.layers.Dropout(0.3))
model.add(K.layers.Dense(units= 64, kernel_initializer='he_normal', activation='relu'))
model.add(K.layers.LayerNormalization())
model.add(K.layers.Dropout(0.3))
model.add(K.layers.Dense(units=1, kernel_initializer='he_normal', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=simple_adam, metrics=['accuracy',AUC(name='auc')])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 256)               4864      
                                                                 
 layer_normalization_2 (Laye  (None, 256)              512       
 rNormalization)                                                 
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 dense_4 (Dense)             (None, 64)                16448     
                                                                 
 layer_normalization_3 (Laye  (None, 64)               128       
 rNormalization)                                                 
                                                                 
 dropout_3 (Dropout)         (None, 64)               

In [88]:
print("Starting NN training")
h = model.fit(X_train_NN, y_train_NN, batch_size=b_size, epochs=max_epochs, shuffle=True, verbose=1)
print("NN training finished")

Starting NN training
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
NN training finished


In [None]:
pred_NN = model.predict(X_test_NN)
pred_NN = [item[0] for item in pred_NN]
pred_NN = [int(i) for i in pred_NN]

In [None]:
model.save('NN_model.h5')
submission = pd.DataFrame({'id':test_bank['loan_id'], 'is_default':pred_NN})
submission.to_csv('submission.csv', index = None)

### Other tries...

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_dev, y_dev)

print(ks_score(y_val, clf.predict_proba(X_val)[:,1]))
print(metrics.accuracy_score(y_val, clf.predict(X_val)))

In [None]:
y_train.value_counts()

In [None]:
print(ks_score(y_val, clf.predict_proba(X_val)[:,1]))

In [None]:
submission = pd.DataFrame({'id':test_bank['loan_id'], 'is_default':clf.predict(X_test)})
submission.to_csv('submission.csv', index = None)

In [None]:
submission['is_default'].value_counts()