In [1]:
import pandas as pd
import gc
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from scipy.stats import pearsonr
import xgboost
import lightgbm

# Single random seed, reused by the XGBoost model and the StratifiedKFold splitter below.
seed_state = 2022


In [64]:
# Load the training and test tables from the parent directory.
# NOTE(review): the "_bin" suffix suggests the features were binned upstream —
# confirm against the preprocessing step that writes these files.
train_X = pd.read_csv("../train_bin.csv")
test_X = pd.read_csv("../test_bin.csv")

In [65]:
# Detach the target: pop() returns the 'isDefault' column and removes it from
# train_X in a single step, leaving only predictors in the frame.
train_y = train_X.pop('isDefault')

In [66]:
# Columns removed from both tables so train and test keep the same schema.
drop_list = ['ficoRangeHigh', 'n3', 'policyCode']
for frame in (train_X, test_X):
    frame.drop(columns=drop_list, inplace=True)

In [67]:
# Print column names, non-null counts and dtypes of the training frame after the drops.
train_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 45 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       800000 non-null  int64  
 1   loanAmnt                 800000 non-null  float64
 2   term                     800000 non-null  int64  
 3   interestRate             800000 non-null  float64
 4   installment              800000 non-null  float64
 5   grade                    800000 non-null  int64  
 6   subGrade                 800000 non-null  int64  
 7   employmentTitle          800000 non-null  float64
 8   employmentLength         800000 non-null  int64  
 9   homeOwnership            800000 non-null  int64  
 10  annualIncome             800000 non-null  int64  
 11  verificationStatus       800000 non-null  int64  
 12  purpose                  800000 non-null  int64  
 13  postCode                 800000 non-null  float64
 14  regi

## 1. 重要特征——暴力组合

In [68]:
# Brute-force crosses of the important features.
#
# Every feature in `importance_feature_list` is multiplied with every column of
# train_X (itself included). A product is kept as a new feature when its Pearson
# correlation with the label is significant (p <= P_VALUE_MAX) and larger than
# CORR_MIN.
#
# NOTE(review): only positive correlations pass the `> CORR_MIN` test; strongly
# negative crosses are discarded — confirm this is intended.
# NOTE(review): the selection uses the labels of the full training set, i.e. it
# happens before the cross-validation split, so CV scores may be optimistic.
# NOTE(review): the selected crosses are added to train_X only; test_X needs the
# same columns (names are recorded in `result`) before it can be scored.
importance_feature_list = [
    'grade', 'subGrade', 'term', 'homeOwnership',
    'issueDate_year', 'n11', 'n2', 'employmentLength', 'ficoRangeLow', 'n14',
    'loanAmnt', 'dti', 'n9', 'issueDate_month', 'annualIncome', 'verificationStatus',
    'revolBal',
]
P_VALUE_MAX = 0.05
CORR_MIN = 0.19

temp_list = []      # names of every cross that was evaluated
result = []         # [name, pearson r, p-value] for every cross that was kept
index = 0
seen_pairs = set()  # (left, right) pairs already evaluated, for O(1) duplicate checks
selected = {}       # name -> product Series for the crosses that were kept

# Snapshot the columns once: train_X is not modified until after the loop.
base_columns = list(train_X.columns)

for i in importance_feature_list:
    for j in base_columns:
        # a*b == b*a, so skip a pair whose mirror image was already evaluated.
        if (j, i) in seen_pairs:
            continue
        seen_pairs.add((i, j))

        col = i + '_' + j
        temp_list.append(col)

        # Compute the product once and reuse it for both the test and the feature.
        product = train_X[i] * train_X[j]
        r, p_value = pearsonr(product, train_y)
        if p_value <= P_VALUE_MAX and r > CORR_MIN:
            result.append([col, r, p_value])
            selected[col] = product

# Attach all kept crosses with a single concat instead of column-by-column inserts.
temp_df = pd.DataFrame(selected, index=train_X.index)
train_X = pd.concat([train_X, temp_df], axis=1)
del temp_df, selected
gc.collect()




0

In [69]:
# Hand-crafted arithmetic features: four guarded ratios, one difference, one product.
# The ratios are computed column-wise; the previous row-wise apply(axis=1) built
# a Series per row and was very slow on the full training frame.


def _safe_ratio(numerator, denominator, fill=-1):
    """Divide two Series element-wise, returning `fill` where the denominator is 0.

    Rows whose denominator is NaN are left as the plain division result (NaN),
    because only an exact zero denominator is replaced.
    """
    return (numerator / denominator).where(denominator != 0, fill)


train_X['loanAmnt_annualIncome'] = _safe_ratio(train_X['loanAmnt'], train_X['annualIncome'])
train_X['installment_loanAmnt'] = _safe_ratio(train_X['installment'], train_X['loanAmnt'])
train_X['issueDate_year_earliesCreditLine_year'] = train_X['issueDate_year'] - train_X['earliesCreditLine_year']
train_X['loanAmnt_totalAcc'] = _safe_ratio(train_X['loanAmnt'], train_X['totalAcc'])
train_X['openAcc_totalAcc'] = _safe_ratio(train_X['openAcc'], train_X['totalAcc'])

# NOTE(review): a product of finite values cannot be +inf, so this guard only
# matters if the inputs already contain inf — confirm whether a ratio was meant.
purpose_totalAcc = train_X['purpose'] * train_X['totalAcc']
train_X['purpose_totalAcc'] = purpose_totalAcc.mask(purpose_totalAcc == np.inf, -1)
del purpose_totalAcc
# NOTE(review): these features are created on train_X only; test_X needs the
# same columns before it can be scored with a model trained on this frame.
       

In [70]:
# Print the schema again to check the columns added by the feature-construction cells.
train_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 87 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     800000 non-null  int64  
 1   loanAmnt                               800000 non-null  float64
 2   term                                   800000 non-null  int64  
 3   interestRate                           800000 non-null  float64
 4   installment                            800000 non-null  float64
 5   grade                                  800000 non-null  int64  
 6   subGrade                               800000 non-null  int64  
 7   employmentTitle                        800000 non-null  float64
 8   employmentLength                       800000 non-null  int64  
 9   homeOwnership                          800000 non-null  int64  
 10  annualIncome                           800000 non-null  

In [71]:
# 5-fold stratified cross-validation of an XGBoost classifier.
# Collects out-of-fold (OOF) probabilities and per-fold feature importances.

# NOTE(review): every column of train_X is used as a feature, including 'id' —
# confirm that the row identifier is meant to be a model input.
feature_names = list(train_X.columns)

# Out-of-fold predictions, one row per training sample.
df_oof = train_X[['id']].copy()
df_oof['label'] = train_y
# Float column, so fold probabilities can be written without an int -> float upcast.
df_oof['prob'] = 0.0
prob_col = df_oof.columns.get_loc('prob')

# Container for test-set predictions (disabled)
# prediction = test_X[['id']]
# prediction['prob'] = 0

# Per-fold feature-importance frames
df_importance_list = []

# NOTE(review): tree_method='gpu_hist' needs a GPU-enabled XGBoost build.
model = xgboost.XGBClassifier(
                            n_estimators = 10000,
                            n_jobs = -1,
                            max_depth = 6,
                            learning_rate = 0.07,
                            subsample = 0.8,
                            tree_method = 'gpu_hist',
                            gamma = 0.3,
                            reg_alpha = 3,
                            reg_lambda = 1,
                            random_state = seed_state)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed_state)

for fold_id,(train_idx, val_idx) in enumerate(kfold.split(train_X, train_y)):
    # kfold.split yields positional indices, hence iloc everywhere below.
    train_X_kfold = train_X.iloc[train_idx]
    train_y_kfold = train_y.iloc[train_idx]
    val_X_kfold = train_X.iloc[val_idx]
    val_y_kfold = train_y.iloc[val_idx]
    
    print('\nFold_{} Training ========================\n'.format(
        fold_id + 1))
    
    # The validation fold is the last eval_set entry, so it drives early stopping.
    # NOTE(review): newer XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than fit() — confirm
    # against the installed version before upgrading.
    xgb_model = model.fit(train_X_kfold,
                          train_y_kfold,
                          eval_set=[(train_X_kfold, train_y_kfold), (val_X_kfold, val_y_kfold)],
                          verbose=50,
                          eval_metric='auc',
                          early_stopping_rounds=50)
    
    # Probability of the positive class for the held-out fold.
    pred_val = xgb_model.predict_proba(val_X_kfold)[:, 1]
    # Positional write: does not rely on df_oof having a 0..n-1 index.
    df_oof.iloc[val_idx, prob_col] = pred_val
    
    
#     pred_test = xgb_model.predict_proba(test_X)[:, 1]
#     prediction['prob'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': xgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Release the per-fold slices before the next fold is materialised.
    del xgb_model, pred_val, train_X_kfold, train_y_kfold, val_X_kfold, val_y_kfold
    gc.collect()

# Overall out-of-fold AUC across all folds.
print('OOF AUC: {:.5f}'.format(roc_auc_score(df_oof['label'], df_oof['prob'])))







[0]	validation_0-auc:0.70594	validation_1-auc:0.70492
[50]	validation_0-auc:0.72311	validation_1-auc:0.71855
[100]	validation_0-auc:0.73249	validation_1-auc:0.72367
[150]	validation_0-auc:0.73838	validation_1-auc:0.72556
[200]	validation_0-auc:0.74325	validation_1-auc:0.72657
[250]	validation_0-auc:0.74755	validation_1-auc:0.72721
[300]	validation_0-auc:0.75118	validation_1-auc:0.72762
[350]	validation_0-auc:0.75487	validation_1-auc:0.72778
[400]	validation_0-auc:0.75825	validation_1-auc:0.72784
[450]	validation_0-auc:0.76139	validation_1-auc:0.72796
[500]	validation_0-auc:0.76435	validation_1-auc:0.72784
[514]	validation_0-auc:0.76534	validation_1-auc:0.72789






[0]	validation_0-auc:0.70653	validation_1-auc:0.70470
[50]	validation_0-auc:0.72293	validation_1-auc:0.71845
[100]	validation_0-auc:0.73260	validation_1-auc:0.72438
[150]	validation_0-auc:0.73866	validation_1-auc:0.72665
[200]	validation_0-auc:0.74312	validation_1-auc:0.72770
[250]	validation_0-auc:0.74716	validation_1-auc:0.72832
[300]	validation_0-auc:0.75102	validation_1-auc:0.72875
[350]	validation_0-auc:0.75450	validation_1-auc:0.72894
[400]	validation_0-auc:0.75773	validation_1-auc:0.72890
[439]	validation_0-auc:0.76025	validation_1-auc:0.72887






[0]	validation_0-auc:0.70544	validation_1-auc:0.70664
[50]	validation_0-auc:0.72253	validation_1-auc:0.72076
[100]	validation_0-auc:0.73177	validation_1-auc:0.72604
[150]	validation_0-auc:0.73784	validation_1-auc:0.72839
[200]	validation_0-auc:0.74268	validation_1-auc:0.72950
[250]	validation_0-auc:0.74697	validation_1-auc:0.73014
[300]	validation_0-auc:0.75061	validation_1-auc:0.73064
[350]	validation_0-auc:0.75418	validation_1-auc:0.73084
[400]	validation_0-auc:0.75751	validation_1-auc:0.73092
[450]	validation_0-auc:0.76065	validation_1-auc:0.73091
[500]	validation_0-auc:0.76392	validation_1-auc:0.73106
[550]	validation_0-auc:0.76696	validation_1-auc:0.73108
[584]	validation_0-auc:0.76910	validation_1-auc:0.73101






[0]	validation_0-auc:0.70663	validation_1-auc:0.70561
[50]	validation_0-auc:0.72314	validation_1-auc:0.71818


KeyboardInterrupt: 

In [73]:
# Transposed preview of the first five rows, so each feature is shown as one row.
train_X.head().T

Unnamed: 0,0,1,2,3,4
id,0.000000,1.000000,2.000000,3.000000,4.000000
loanAmnt,6.000000,3.000000,2.000000,2.000000,0.000000
term,5.000000,5.000000,5.000000,3.000000,3.000000
interestRate,2.000000,2.000000,2.000000,0.000000,1.000000
installment,3.000000,1.000000,0.000000,1.000000,0.000000
...,...,...,...,...,...
installment_loanAmnt,0.500000,0.333333,0.000000,0.500000,-1.000000
issueDate_year_earliesCreditLine_year,13.000000,10.000000,9.000000,16.000000,39.000000
loanAmnt_totalAcc,0.222222,0.166667,0.074074,0.071429,0.000000
openAcc_totalAcc,0.259259,0.722222,0.407407,0.321429,0.444444
