In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import learning_curve,validation_curve
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.metrics import roc_auc_score,auc,f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor,plot_importance
import lightgbm as lgb
from lightgbm import LGBMClassifier,callback
from catboost import Pool,metrics,cv,MetricVisualizer
from catboost.utils import get_roc_curve,select_threshold,get_fpr_curve,get_fnr_curve
from catboost import CatBoostClassifier,CatBoostRegressor
import shap
import Meancoder

In [3]:
def reduce_mem_usage(data, verbose=False):
    """Downcast the columns of ``data`` in place to smaller dtypes and return it.

    * ``object`` / pandas string columns are converted to ``category``.
    * Signed NumPy integer columns are cast to the narrowest of
      int8/int16/int32/int64 whose range contains the column's min and max
      (bounds are inclusive).
    * NumPy float columns are cast to float16, then float32, when the
      column's min and max fit; otherwise they are cast to float64.
      NOTE: float16/float32 keep fewer significant digits, so values may be
      rounded.
    * Every other dtype (bool, unsigned int, datetime, category, nullable
      extension types, ...) is left untouched, which also makes the function
      safe to call twice on the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default False
        When True, print the memory footprint before and after.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    start_mem = data.memory_usage().sum()

    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    for col in data.columns:
        col_type = data[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            data[col] = data[col].astype('category')
            continue

        # Only plain NumPy signed-int / float columns have a meaningful
        # numeric range to test; min()/max() on e.g. an unordered category
        # would raise, and bool/datetime must not be turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if col_type.kind == 'i':
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    data[col] = data[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    data[col] = data[col].astype(target)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN column.
                data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum()
    if verbose:
        print('Memory usage: {:.2f} MB -> {:.2f} MB'.format(
            start_mem / 1024 ** 2, end_mem / 1024 ** 2))
    return data

## 一、导入数据

In [4]:
# Read the training set, the round-A test set and the sample submission,
# all stored as UTF-8 CSV files under ./data.
train_data, test_data, submission = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('./data/train.csv', './data/testA.csv', './data/sample_submit.csv')
)

## 二、数据预处理
- 缺失值处理：（1）类别特征：空值用众数填充 （2）数值特征：空值用中位数填充
- 异常值处理：（1）异常检测方法：均方差；（2）处理：不删除样本，对 `annualIncome`、`revolBal` 中超过阈值的取值做封顶替换，并将为 0 的 `annualIncome` 用 `loanAmnt / dti * 100` 估算

In [5]:
def data_preprocessing(train_data,test_data):
    """Concatenate train and test, impute missing values and cap outliers.

    Steps performed on the combined frame (train rows first, fresh 0..n-1 index):

    1. Columns are split into numeric and categorical groups based on the
       dtypes of ``train_data``; the integer-coded columns listed in
       ``class_cols`` are treated as categorical, and the two date-like text
       columns plus the target ``isDefault`` are excluded from both groups.
    2. Columns with missing values are imputed: categorical ones with the
       most frequent value, numeric ones with the median.
    3. ``annualIncome`` values that are not below 283000 are replaced by
       300000 and ``revolBal`` values that are not below 83000 by 100000.
       A zero ``annualIncome`` is replaced by ``loanAmnt / dti * 100``
       (NOTE: this is infinite when ``dti`` is 0).

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Raw frames; ``train_data`` must contain ``isDefault``.

    Returns
    -------
    pandas.DataFrame
        The combined, cleaned frame.
    """
    # Process train and test together so both get identical treatment.
    data = pd.concat([train_data,test_data],ignore_index=True)

    # 1. Separate numeric and categorical columns.
    num_cols = list(train_data.select_dtypes(exclude=['object']).columns)
    cat_cols = list(train_data.select_dtypes(include=['object']).columns)

    # The date-like text columns are handled later, not as categories.
    cat_cols.remove('issueDate')
    cat_cols.remove('earliesCreditLine')
    # Numerically coded columns that are really categories.
    class_cols = ['term','employmentTitle','homeOwnership','verificationStatus', 'purpose',
                  'postCode','regionCode','initialListStatus','applicationType','title',]
    for col in class_cols:
        num_cols.remove(col)
        cat_cols.append(col)
    num_cols.remove('isDefault')

    # 2. Missing values (the target is ignored: test rows have no label).
    missing = data.drop('isDefault',axis=1).isnull().sum()
    missing_feature = list(missing[missing>0].index)
    null_num = [feat for feat in missing_feature if feat in num_cols]
    null_cat = [feat for feat in missing_feature if feat in cat_cols]

    # Categorical features: fill with the mode. Skipped when nothing is
    # missing, because the imputer cannot be fitted on zero columns.
    if null_cat:
        si_cat = SimpleImputer(strategy='most_frequent').fit(data[null_cat])
        data[null_cat] = si_cat.transform(data[null_cat])

    # Numeric features: fill with the median.
    if null_num:
        si_num = SimpleImputer(strategy='median').fit(data[null_num])
        data[null_num] = si_num.transform(data[null_num])

    # 3. Outliers: replace everything that is not below the threshold
    #    (NaN included) by a fixed cap value.
    data['annualIncome'] = data['annualIncome'].where(data['annualIncome'] < 283000, 300000)
    data['revolBal'] = data['revolBal'].where(data['revolBal'] < 83000, 100000)

    # Estimate a zero income from the loan amount and the debt-to-income ratio.
    # Done as one vectorised replacement: chained assignment such as
    # data['annualIncome'][i] = ... does not reliably write back to the frame.
    income_is_zero = data['annualIncome'] == 0
    estimated_income = (data['loanAmnt'] / data['dti']) * 100
    data['annualIncome'] = data['annualIncome'].mask(income_is_zero, estimated_income)

    return data

In [6]:
# Build the combined, cleaned train+test frame used by every later cell.
data = data_preprocessing(train_data=train_data, test_data=test_data)

## 三、特征工程
- 特征工程
    - 数据分桶：
    - 特征编码：
    - 特征衍生：
- 特征融合
    - 模型选取：
    - 特征筛选：

In [7]:
def feature_engineering(data):
    """Add derived features to ``data``, drop the raw columns they replace, and return the result.

    The input frame is modified in place up to the cross-feature step; the
    returned frame is the merged result of ``cross_cat_num`` and must be used
    instead of the argument.

    Features built here:

    * year / month / day-of-week of ``issueDate``;
    * year and month parsed from ``earliesCreditLine`` (split at its first
      '-') and the number of years between that date and ``issueDate``;
    * log10 buckets of ``loanAmnt`` and ``annualIncome``;
    * arithmetic combinations (debt rate, debt, account ratios, sums of the
      anonymous ``n0``..``n14`` columns);
    * min-max scaled versions of a few numeric columns;
    * per-category statistics of numeric columns (see ``cross_cat_num``).

    Parameters
    ----------
    data : pandas.DataFrame
        Output of ``data_preprocessing``.

    Returns
    -------
    pandas.DataFrame
    """
    # 1. Interpretable features
    # Calendar parts of the issue date.
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d', errors='coerce')
    data['issueDate_year'] = data['issueDate'].dt.year
    data['issueDate_month'] = data['issueDate'].dt.month.astype('int8')
    data['issueDate_dayofweek']= data['issueDate'].dt.dayofweek.astype('int8')

    # 'earliesCreditLine': text before the first '-' is mapped through
    # month_dict, text after it is kept as the year.
    data['earliesCreditLine_year'] = data['earliesCreditLine'].apply(lambda x:str(x)[str(x).find('-')+1:])
    data['earliesCreditLine_month'] = data['earliesCreditLine'].apply(lambda x:str(x)[:str(x).find('-')])
    month_dict = {'Jan':1,'Feb':2,'Mar':3,'Apr':4,'May':5,'Jun':6,'Jul':7, 'Aug':8, 'Sep':9,'Oct':10,'Nov':11,'Dec':12}
    data['earliesCreditLine_month'] =data['earliesCreditLine_month'].map(month_dict)
    data['earliesCreditLine_date'] = data['earliesCreditLine_year'].astype('str') + '-' + data['earliesCreditLine_month'].apply(date_concat)+ '-' + '01'
    data['earliesCreditLine_date'] =  pd.to_datetime(data['earliesCreditLine_date'],format='%Y-%m-%d', errors='coerce')
    # Both columns already hold datetimes after pd.to_datetime, so they can be
    # subtracted directly (a unit-less .astype('datetime64') is rejected by
    # recent pandas versions).
    data['Credit_issue_days'] = (data['issueDate'] - data['earliesCreditLine_date']).dt.days
    data['Credit_issue_years'] = (data['Credit_issue_days']/365).round(2)
    del data['issueDate'],data['earliesCreditLine'],data['earliesCreditLine_date'],data['Credit_issue_days']

    # Numeric features
    # 1) Bucketing: order of magnitude (floor of log10) as the bin id.
    data['loanAmnt_bin'] = np.floor(np.log10(data['loanAmnt']))
    data['annualIncome_bin'] = np.floor(np.log10(data['annualIncome']))

    # 2) Arithmetic combinations
    data['annual_installment'] = np.ceil(data['installment'] * 12)
    data['annual_loanAmnt'] = data['loanAmnt']/data['term']
    data['debt_rate'] = ((data['annual_installment']/data['annualIncome'])*100).round(2)
    data['debt'] = np.ceil(data['annualIncome'] * data['dti']/100)
    del data['annual_installment']

    data['pubRec_keep'] = data['pubRec'] - data['pubRecBankruptcies']
    data['Acc_keep'] = data['totalAcc'] - data['openAcc']
    data['Acc_rate'] = (data['openAcc']/data['totalAcc']).round(3)
    data['revolBal/totalAcc'] = (data['revolBal']/data['totalAcc']).round(2)

    n_list = ['n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8','n9', 'n10', 'n11', 'n12', 'n13', 'n14']
    # Row-wise sum of the anonymous features (no transpose of the frame needed).
    data['n_sum_1'] = data[n_list].sum(axis=1).astype('float16')
    data['n_sum_2'] = data['n0']+data['n1']+data['n2']+data['n3']+data['n4']-data['n5']-data['n6']+data['n7']-data['n8']+data['n9']+data['n10']-data['n11']+data['n12']+data['n13']+data['n14']

    # Min-max scale some numeric columns. The scaled array is assigned
    # positionally so the result does not depend on the frame's index labels.
    min_max_cols = ['loanAmnt','installment','annualIncome','revolBal','debt']
    scaler = MinMaxScaler().fit(data[min_max_cols])
    data[min_max_cols] = scaler.transform(data[min_max_cols])

    del data['id'],data['policyCode']
    del data['loanAmnt'],data['ficoRangeHigh'],data['annualIncome']
    del data['n3'],data['n9'],data['n10'],data['n11'],data['n12'],data['n13']

    # 2. Cross features
    # First-order crosses: statistics of a numeric column per category value.
    employment_cat = ['homeOwnership','employmentTitle','employmentLength',]
    employment_num = ['debt_rate','revolBal',]
    data = cross_cat_num(data,employment_cat,employment_num)

    debt_cat = ['subGrade','purpose','regionCode',]
    debt_num = ['interestRate',]
    data = cross_cat_num(data,debt_cat,debt_num)

    return data
 
    
def date_concat(x):
    """Return ``str(x)``, prefixed with "0" when that text is exactly one character long."""
    text = str(x)
    return "0" + text if len(text) == 1 else text
    
    
def cross_cat_num(data,cat_col,num_col):
    """Append per-category statistics of numeric columns to ``data``.

    For every pair (``f1`` in ``cat_col``, ``f2`` in ``num_col``) the max,
    min, mean, std and skew of ``f2`` within each value of ``f1`` are
    computed and left-merged back onto the rows, as columns named
    ``{f1}_{f2}_max``, ``{f1}_{f2}_min``, ``{f1}_{f2}_mean``,
    ``{f1}_{f2}_std`` and ``{f1}_{f2}_skew``.

    Parameters
    ----------
    data : pandas.DataFrame
    cat_col : list of str
        Grouping (categorical) column names.
    num_col : list of str
        Numeric column names to aggregate.

    Returns
    -------
    pandas.DataFrame
        A new frame with the statistic columns added; the row order of
        ``data`` is preserved.
    """
    stats = ('max','min','mean','std','skew')
    for f1 in cat_col:
        # Grouping once per category column is enough: the merges below only
        # add columns and never change the values being aggregated.
        group = data.groupby(f1)
        for f2 in num_col:
            # Named aggregation (output_name=function); passing a renaming
            # dict to SeriesGroupBy.agg is rejected by current pandas.
            named = {'{}_{}_{}'.format(f1,f2,stat): stat for stat in stats}
            feat = group[f2].agg(**named).reset_index()
            data = data.merge(feat,how='left',on=f1)
    return data

In [8]:
# Derive the features first, then downcast the resulting columns to save memory.
data = reduce_mem_usage(feature_engineering(data))

### 特征选择

In [9]:
# Spearman rank correlation of every numeric feature with the target.
# Non-numeric columns (e.g. the 'category' columns produced by
# reduce_mem_usage) are excluded explicitly, because recent pandas versions
# no longer drop them silently inside DataFrame.corr.
correlation = data.select_dtypes(include=['number','bool']).corr('spearman')
correlation['isDefault'].sort_values(ascending=False)

isDefault                        1.000000
subGrade_interestRate_max        0.263101
subGrade_interestRate_mean       0.263101
interestRate                     0.254540
subGrade_interestRate_std        0.226274
                                   ...   
homeOwnership_revolBal_mean     -0.071456
ficoRangeLow                    -0.130551
subGrade_interestRate_skew      -0.135939
employmentLength_revolBal_max         NaN
employmentLength_revolBal_min         NaN
Name: isDefault, Length: 92, dtype: float64

In [10]:
def getHighRelatedFeature(corr_matrix,corr_threshold):
    """List the feature pairs whose correlation is above ``corr_threshold``.

    Each unordered pair is reported once (the first occurrence in row-major
    order of ``corr_matrix``) and self-pairs are excluded. Only values
    strictly greater than the threshold are kept, so strongly *negative*
    correlations are not reported.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix with feature names (strings) on both axes.
    corr_threshold : float

    Returns
    -------
    pandas.DataFrame
        Columns ``feature1``, ``feature2``, ``corr``; empty when no pair
        qualifies.
    """
    above = corr_matrix.where(corr_matrix > corr_threshold)
    # Drop the masked cells explicitly instead of relying on stack() doing it.
    pairs = above.stack().dropna().reset_index()
    # Positional rename, so named index/column axes do not matter.
    pairs.columns = ['feature1','feature2','corr']
    pairs = pairs[pairs['feature1'] != pairs['feature2']]
    # (a, b) and (b, a) share one order-independent key; keep the first of each.
    pair_keys = pd.Series(
        ["#".join(sorted((first, second))) for first, second in zip(pairs['feature1'], pairs['feature2'])],
        index=pairs.index,
        dtype=object,
    )
    return pairs[~pair_keys.duplicated()]

In [11]:
# Pearson correlation between numeric features; pairs above 0.9 are
# candidates for removal. Non-numeric columns are excluded explicitly because
# recent pandas versions no longer drop them silently inside DataFrame.corr.
HighRelated_df = getHighRelatedFeature(data.select_dtypes(include=['number','bool']).corr(),0.9).sort_values(by='corr',ascending=False)
HighRelated_df.head(50)

Unnamed: 0,feature1,feature2,corr
60,homeOwnership_debt_rate_mean,homeOwnership_debt_rate_std,0.998967
56,homeOwnership_debt_rate_max,homeOwnership_debt_rate_std,0.99793
90,employmentLength_debt_rate_max,employmentLength_debt_rate_mean,0.997429
107,subGrade_interestRate_max,subGrade_interestRate_mean,0.994384
55,homeOwnership_debt_rate_max,homeOwnership_debt_rate_mean,0.993987
91,employmentLength_debt_rate_max,employmentLength_debt_rate_std,0.989271
95,employmentLength_debt_rate_mean,employmentLength_debt_rate_std,0.983711
72,homeOwnership_revolBal_mean,homeOwnership_revolBal_std,0.981746
101,employmentLength_revolBal_mean,employmentLength_revolBal_std,0.981369
3,interestRate,subGrade_interestRate_mean,0.977184


In [13]:
# Remove, in place, one feature out of each highly correlated group found
# above, plus a few other derived columns.
redundant_cols = [
    'homeOwnership_debt_rate_std', 'employmentLength_debt_rate_max',
    'subGrade_interestRate_max', 'homeOwnership_debt_rate_max',
    'employmentLength_debt_rate_std', 'homeOwnership_revolBal_std', 'employmentLength_revolBal_std',
    'subGrade_interestRate_mean', 'employmentTitle_debt_rate_max',
    'annual_loanAmnt', 'homeOwnership_revolBal_skew',
]
data.drop(columns=redundant_cols, inplace=True)

### 特征编码
- 自定义编码
- count encoder 频数编码
- one_hot Encoder 独热编码
- label Encoder 标签编码
- target Encoder 目标编码
- mean Encoder 平均编码

In [17]:
# 1. Hand-written ordinal encodings
grade_dict = {'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7}
data['grade'] = data['grade'].map(grade_dict).astype('int8')


def _encode_sub_grade(code):
    # First character is the grade letter, second the digit within the grade.
    return grade_dict[code[0]]*10 + int(code[1])


data['subGrade'] = data['subGrade'].astype('str').apply(_encode_sub_grade)

employmentLength_dict = {
    '< 1 year':0, '1 year':1, '2 years':2, '3 years':3, '4 years':4, '5 years':5,
    '6 years':6, '7 years':7, '8 years':8, '9 years':9, '10+ years':10,
}
data['employmentLength'] = data['employmentLength'].map(employmentLength_dict)

# Re-code the numeric feature 'ficoRangeLow' as the rank of each distinct value.
ficoRangeLow_list = list(data['ficoRangeLow'].sort_values().unique())
ficoRangeLow_dict = {val: ind for ind, val in enumerate(ficoRangeLow_list)}
data['ficoRangeLow'] = data['ficoRangeLow'].map(ficoRangeLow_dict).astype('int8')

# 2. Label-encode the remaining categorical columns listed in high_cat.
high_cat = ['postCode','employmentTitle','title']
for col in high_cat:
    lbl = LabelEncoder()
    data[col] = lbl.fit_transform(data[col])
      

In [18]:
# 3. Mean encoding

cat_features = ['term','grade','subGrade','employmentLength','homeOwnership','verificationStatus','purpose',
                'postCode','regionCode','initialListStatus','applicationType','employmentTitle','title','ficoRangeLow',
                'issueDate_year','issueDate_month','issueDate_dayofweek','earliesCreditLine_year','earliesCreditLine_month',
                'annualIncome_bin','loanAmnt_bin']

data[cat_features] = data[cat_features].astype('str')

# `data` holds the training rows first, followed by the test rows, so the
# split point is the number of training rows rather than a hard-coded size.
n_train = len(train_data)
X = data.drop('isDefault',axis=1)[:n_train]
y = data['isDefault'][:n_train]
test = data.drop('isDefault',axis=1)[n_train:]


# Fit the mean encoder on the training rows only, then apply it to the test rows.
MeanEncoderFeature = ['employmentLength','homeOwnership','annualIncome_bin']
ME = Meancoder.MeanEncoder(MeanEncoderFeature,target_type='classification')
X = ME.fit_transform(X,y)
test = ME.transform(test)

# The raw versions of the mean-encoded columns are dropped from the features
# and from the list of categorical columns used later.
for feat in MeanEncoderFeature:
    cat_features.remove(feat)

X = X.drop(columns=MeanEncoderFeature)
test = test.drop(columns=MeanEncoderFeature)

### 模型融合
- 模型差异：（1）lightgbm （2）catboost （3）xgboost
- 特征差异：（1）特征组合1 （2）特征组合2
- 参数差异：（1）行采样比 bagging fraction （2）列采样比 feature fraction

## 四、模型预测评估

In [None]:
# Make sure every remaining categorical column is text in both feature frames.
for frame in (X, test):
    frame[cat_features] = frame[cat_features].astype('str')

In [None]:
# n_folds = 15, learning_rate=0.1, recorded prediction score 0.7440

# Stratified 15-fold cross-validation: each fold trains a CatBoost model with
# early stopping on its validation split; the test prediction and the
# validation AUC are averaged over the folds.
sk=StratifiedKFold(n_splits=15,shuffle=True,random_state=42)

pred_test = np.zeros(len(test))
auc_val = 0
for fold,(train_idx,val_idx) in enumerate(sk.split(X,y)):
    x_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    x_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    model = CatBoostClassifier(iterations=10000,
                             learning_rate=0.1,
                             depth=5,
                             subsample=1,
                             reg_lambda=5,
                             loss_function='Logloss',
                             custom_loss='AUC',
                             eval_metric='AUC',
                             random_seed=42,
                             early_stopping_rounds=100)

    model.fit(x_train,y_train,
        eval_set=(x_val,y_val),
        cat_features=cat_features,
        verbose=1000)

    # AUC must be computed from scores, not hard class labels, and
    # roc_auc_score expects (y_true, y_score) in that order.
    pred_val = model.predict_proba(x_val)[:,1]
    auc_val += roc_auc_score(y_val,pred_val)/sk.n_splits
    pred_test +=  model.predict_proba(test)[:,1]/sk.n_splits

print('catb auc:{:<8.8f}'.format(auc_val))

In [None]:
# Submission frame: test ids next to the fold-averaged default probability.
result_cat = pd.DataFrame({'id': test_data['id'], 'isDefault': pred_test})

In [None]:
# Write the submission without the index column.
result_cat.to_csv(path_or_buf='./result/result_cat.csv', index=False)