In [1]:
import pandas as pd
import gc
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import xgboost
import lightgbm
seed_state = 2022

In [2]:
train_df = pd.read_pickle("./train_df.pkl")
test_df = pd.read_pickle("./test_df.pkl")

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 50 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       800000 non-null  int64  
 1   loanAmnt                 800000 non-null  float64
 2   term                     800000 non-null  int64  
 3   interestRate             800000 non-null  float64
 4   installment              800000 non-null  float64
 5   grade                    800000 non-null  int32  
 6   subGrade                 800000 non-null  int32  
 7   employmentTitle          800000 non-null  float64
 8   employmentLength         800000 non-null  int32  
 9   homeOwnership            800000 non-null  int64  
 10  annualIncome             800000 non-null  float64
 11  verificationStatus       800000 non-null  int64  
 12  isDefault                800000 non-null  int64  
 13  purpose                  800000 non-null  int64  
 14  post

In [4]:
train_X = train_df.copy()
train_y = train_X['isDefault']

train_X.drop(labels = ['isDefault'], axis = 1, inplace = True)

In [5]:
train_X.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'purpose', 'postCode',
       'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
       'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDate_year',
       'issueDate_month', 'issueDate_day', 'earliesCreditLine_month',
       'earliesCreditLine_year'],
      dtype='object')

In [6]:
test_df.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'purpose', 'postCode',
       'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
       'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDate_year',
       'issueDate_month', 'issueDate_day', 'earliesCreditLine_month',
       'earliesCreditLine_year'],
      dtype='object')

In [7]:
feature_names = list(train_X.columns)

#包外验证
df_oof = train_X[['id']].copy()
df_oof['label'] = train_y
df_oof['prob'] = 0

#存放测试结果
prediction = test_df[['id']]
prediction['prob'] = 0

#重要特征列表
df_importance_list = []

model = xgboost.XGBClassifier(
                            n_estimators = 500,
                            n_jobs = -1,
                            max_depth = 6,
                            learning_rate = 0.05,
                            subsample = 0.8,
                            gamma = 0.5,
                            reg_alpha = 3,
                            reg_lambda = 1,
                            random_state = seed_state)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed_state)

for fold_id,(train_idx, val_idx) in enumerate(kfold.split(train_X, train_y)):
    train_X_kfold = train_X.iloc[train_idx]
    train_y_kfold = train_y.iloc[train_idx]
    val_X_kfold = train_X.iloc[val_idx]
    val_y_kfold = train_y.iloc[val_idx]
    
    print('\nFold_{} Training ========================\n'.format(
        fold_id + 1))
    
    xgb_model = model.fit(train_X_kfold,
                          train_y_kfold,
                          eval_set=[(train_X_kfold, train_y_kfold), (val_X_kfold, val_y_kfold)],
                          verbose=100,
                          eval_metric='auc',
                          early_stopping_rounds=50)
    
    pred_val = xgb_model.predict_proba(val_X_kfold)[:, 1]
    df_oof.loc[val_idx, 'prob'] = pred_val
    
    
    pred_test = xgb_model.predict_proba(test_df)[:, 1]
    prediction['prob'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': xgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del xgb_model, pred_val, pred_test, train_X_kfold, train_y_kfold, val_X_kfold, val_y_kfold
    gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] = 0








[0]	validation_0-auc:0.70479	validation_1-auc:0.70244
[100]	validation_0-auc:0.73289	validation_1-auc:0.72533
[200]	validation_0-auc:0.74383	validation_1-auc:0.73026
[300]	validation_0-auc:0.75104	validation_1-auc:0.73218
[400]	validation_0-auc:0.75678	validation_1-auc:0.73322
[499]	validation_0-auc:0.76206	validation_1-auc:0.73395


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits








[0]	validation_0-auc:0.70529	validation_1-auc:0.70377
[100]	validation_0-auc:0.73243	validation_1-auc:0.72665
[200]	validation_0-auc:0.74346	validation_1-auc:0.73225
[300]	validation_0-auc:0.75063	validation_1-auc:0.73430
[400]	validation_0-auc:0.75640	validation_1-auc:0.73526
[499]	validation_0-auc:0.76140	validation_1-auc:0.73599


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits








[0]	validation_0-auc:0.70510	validation_1-auc:0.70513
[100]	validation_0-auc:0.73231	validation_1-auc:0.72761
[200]	validation_0-auc:0.74320	validation_1-auc:0.73276
[300]	validation_0-auc:0.75037	validation_1-auc:0.73488
[400]	validation_0-auc:0.75611	validation_1-auc:0.73596
[499]	validation_0-auc:0.76122	validation_1-auc:0.73653


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits








[0]	validation_0-auc:0.70520	validation_1-auc:0.70393
[100]	validation_0-auc:0.73300	validation_1-auc:0.72560
[200]	validation_0-auc:0.74370	validation_1-auc:0.73065
[300]	validation_0-auc:0.75101	validation_1-auc:0.73270
[400]	validation_0-auc:0.75682	validation_1-auc:0.73360
[499]	validation_0-auc:0.76207	validation_1-auc:0.73429


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits








[0]	validation_0-auc:0.70525	validation_1-auc:0.70229
[100]	validation_0-auc:0.73277	validation_1-auc:0.72518
[200]	validation_0-auc:0.74332	validation_1-auc:0.73035
[300]	validation_0-auc:0.75053	validation_1-auc:0.73259
[400]	validation_0-auc:0.75635	validation_1-auc:0.73365
[499]	validation_0-auc:0.76164	validation_1-auc:0.73438


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction['prob'] += pred_test / kfold.n_splits


In [8]:
df_importance_list

[                     column  importance
 0                        id    0.003391
 1                  loanAmnt    0.008201
 2                      term    0.036843
 3              interestRate    0.006147
 4               installment    0.007078
 5                     grade    0.376365
 6                  subGrade    0.294429
 7           employmentTitle    0.005521
 8          employmentLength    0.009643
 9             homeOwnership    0.024297
 10             annualIncome    0.007996
 11       verificationStatus    0.008404
 12                  purpose    0.005063
 13                 postCode    0.003910
 14               regionCode    0.006809
 15                      dti    0.008548
 16       delinquency_2years    0.005357
 17             ficoRangeLow    0.009779
 18            ficoRangeHigh    0.000000
 19                  openAcc    0.003466
 20                   pubRec    0.005117
 21       pubRecBankruptcies    0.005646
 22                 revolBal    0.006899
 23             

In [26]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].mean().sort_values(ascending = False).reset_index()

In [28]:
df_importance.to_csv("./feature_importance.csv")

In [32]:
pd.read_csv("./untitled.txt").to_csv("./ss.csv",encoding = 'gbk')

In [34]:
prediction.to_csv("./submition.csv")