In [1]:
import pandas as pd
import gc
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import xgboost
import lightgbm
seed_state = 2022

In [51]:
train_df = pd.read_pickle("./train_df.pkl")
test_df = pd.read_pickle("./test_df.pkl")

In [22]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 50 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       800000 non-null  int64  
 1   loanAmnt                 800000 non-null  float64
 2   term                     800000 non-null  int64  
 3   interestRate             800000 non-null  float64
 4   installment              800000 non-null  float64
 5   grade                    800000 non-null  int32  
 6   subGrade                 800000 non-null  int32  
 7   employmentTitle          800000 non-null  float64
 8   employmentLength         800000 non-null  int32  
 9   homeOwnership            800000 non-null  int64  
 10  annualIncome             800000 non-null  float64
 11  verificationStatus       800000 non-null  int64  
 12  isDefault                800000 non-null  int64  
 13  purpose                  800000 non-null  int64  
 14  post

In [52]:
train_X = train_df.copy()
train_y = train_X['isDefault']

train_X.drop(labels = ['isDefault'], axis = 1, inplace = True)

In [53]:
del train_df
gc.collect()

0

In [54]:
train_X.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'purpose', 'postCode',
       'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
       'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDate_year',
       'issueDate_month', 'issueDate_day', 'earliesCreditLine_month',
       'earliesCreditLine_year'],
      dtype='object')

In [55]:
test_df.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'purpose', 'postCode',
       'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
       'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDate_year',
       'issueDate_month', 'issueDate_day', 'earliesCreditLine_month',
       'earliesCreditLine_year'],
      dtype='object')

In [56]:
"""将最重要的特征grade和剩余的特征做交叉，看是否会有提升"""
for col in train_X.columns:
    if col != 'issueDate_year':
        train_X['issueDate_year_' + col] = train_X['issueDate_year'] * train_X[col]
        test_df['issueDate_year_' + col] = test_df['issueDate_year'] * test_df[col]

In [57]:
train_X.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'purpose', 'postCode',
       'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
       'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDate_year',
       'issueDate_month', 'issueDate_day', 'earliesCreditLine_month',
       'earliesCreditLine_year', 'issueDate_year_id',
       'issueDate_year_loanAmnt', 'issueDate_year_term',
       'issueDate_year_interestRate', 'issueDate_year_installment',
       'issueDate_year_grade', 'issueDate_year_subGrade',
       'issueDate_year_employmentTitle', 'issueDate_year_employmentLength',
       'issueDate_year_homeOwnersh

In [58]:
feature_names = list(train_X.columns)

#包外验证
df_oof = train_X[['id']].copy()
df_oof['label'] = train_y
df_oof['prob'] = 0

#存放测试结果
prediction = test_df[['id']]
prediction['prob'] = 0

#重要特征列表
df_importance_list = []

model = xgboost.XGBClassifier(
                            n_estimators = 500,
                            n_jobs = -1,
                            max_depth = 6,
                            learning_rate = 0.05,
                            subsample = 0.8,
                            gamma = 0.5,
                            reg_alpha = 3,
                            reg_lambda = 1,
                            random_state = seed_state)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed_state)

for fold_id,(train_idx, val_idx) in enumerate(kfold.split(train_X, train_y)):
    train_X_kfold = train_X.iloc[train_idx]
    train_y_kfold = train_y.iloc[train_idx]
    val_X_kfold = train_X.iloc[val_idx]
    val_y_kfold = train_y.iloc[val_idx]
    
    print('\nFold_{} Training ========================\n'.format(
        fold_id + 1))
    
    xgb_model = model.fit(train_X_kfold,
                          train_y_kfold,
                          eval_set=[(train_X_kfold, train_y_kfold), (val_X_kfold, val_y_kfold)],
                          verbose=100,
                          eval_metric='auc',
                          early_stopping_rounds=50)
    
    pred_val = xgb_model.predict_proba(val_X_kfold)[:, 1]
    df_oof.loc[val_idx, 'prob'] = pred_val
    
    
    pred_test = xgb_model.predict_proba(test_df)[:, 1]
    prediction['prob'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': xgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del xgb_model, pred_val, pred_test, train_X_kfold, train_y_kfold, val_X_kfold, val_y_kfold
    gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] = 0








[0]	validation_0-auc:0.70785	validation_1-auc:0.70498
[100]	validation_0-auc:0.73397	validation_1-auc:0.72569
[200]	validation_0-auc:0.74489	validation_1-auc:0.73055
[300]	validation_0-auc:0.75224	validation_1-auc:0.73234
[400]	validation_0-auc:0.75832	validation_1-auc:0.73339
[499]	validation_0-auc:0.76415	validation_1-auc:0.73405


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits








[0]	validation_0-auc:0.70783	validation_1-auc:0.70548
[100]	validation_0-auc:0.73346	validation_1-auc:0.72710
[200]	validation_0-auc:0.74437	validation_1-auc:0.73214
[300]	validation_0-auc:0.75184	validation_1-auc:0.73419
[400]	validation_0-auc:0.75810	validation_1-auc:0.73522
[499]	validation_0-auc:0.76356	validation_1-auc:0.73583


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits








[0]	validation_0-auc:0.70683	validation_1-auc:0.70700
[100]	validation_0-auc:0.73332	validation_1-auc:0.72803
[200]	validation_0-auc:0.74389	validation_1-auc:0.73287
[300]	validation_0-auc:0.75153	validation_1-auc:0.73500
[400]	validation_0-auc:0.75767	validation_1-auc:0.73627
[499]	validation_0-auc:0.76349	validation_1-auc:0.73715


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits








[0]	validation_0-auc:0.70769	validation_1-auc:0.70525
[100]	validation_0-auc:0.73373	validation_1-auc:0.72600
[200]	validation_0-auc:0.74458	validation_1-auc:0.73079
[300]	validation_0-auc:0.75225	validation_1-auc:0.73286
[400]	validation_0-auc:0.75845	validation_1-auc:0.73392
[499]	validation_0-auc:0.76401	validation_1-auc:0.73449


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits








[0]	validation_0-auc:0.70752	validation_1-auc:0.70491
[100]	validation_0-auc:0.73384	validation_1-auc:0.72546
[200]	validation_0-auc:0.74453	validation_1-auc:0.73046
[300]	validation_0-auc:0.75193	validation_1-auc:0.73261
[400]	validation_0-auc:0.75809	validation_1-auc:0.73381
[499]	validation_0-auc:0.76384	validation_1-auc:0.73456


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits


In [59]:
df_importance_list

[                                    column  importance
 0                                       id    0.003147
 1                                 loanAmnt    0.005039
 2                                     term    0.026692
 3                             interestRate    0.004421
 4                              installment    0.006069
 ..                                     ...         ...
 92                      issueDate_year_n14    0.007922
 93          issueDate_year_issueDate_month    0.005823
 94            issueDate_year_issueDate_day    0.000000
 95  issueDate_year_earliesCreditLine_month    0.003463
 96   issueDate_year_earliesCreditLine_year    0.005083
 
 [97 rows x 2 columns],
                                     column  importance
 0                                       id    0.003113
 1                                 loanAmnt    0.005172
 2                                     term    0.024438
 3                             interestRate    0.004154
 4                    

In [60]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].mean().sort_values(ascending = False).reset_index()

In [61]:
df_importance.to_csv("./feature_importance_issueDate_year_others.csv")

In [32]:
pd.read_csv("./untitled.txt").to_csv("./ss.csv",encoding = 'gbk')

In [34]:
prediction.to_csv("./submition.csv")