In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import xgboost as xgb
warnings.filterwarnings("ignore")
%matplotlib inline

#导入经过预处理后的训练数据
train=pd.read_csv('C:/Users/HP/Desktop/train_clean.csv')

from sklearn.model_selection import KFold

#分离数据集，并剔除无关变量
x_train=train.drop(['isDefault','id','issueDate'],axis=1)
x_train=x_train.loc[:, :'issueDateDT']
y_train=train['isDefault']

#使用astype函数对grade,subgrade,employmentlength进行数据类型转换
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()
a=['grade','subGrade','employmentLength','earliesCreditLine']
for name in a:
    for col1 in list(x_train):
        if col1 == name:
            x_train[col1]=lbl.fit_transform(x_train[col1].astype(str))

print(x_train.shape,y_train.shape)

#进行五折交叉检验
folds=5
seed=2020
kf=KFold(n_splits=folds,shuffle=True,random_state=seed)

from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost as xgb

#对训练集数据进行划分，分成训练集和验证集
x_train_split,x_val,y_train_split,y_val=train_test_split(x_train,y_train,test_size=0.2)
train_matrix=xgb.DMatrix(x_train_split,label=y_train_split)
valid_matrix=xgb.DMatrix(x_val,label=y_val)

#设置xgb参数
params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': 0,
        'min_child_weight': 5,
        'max_depth': 7,
        'lambda': 1,
        'subsample': 1,
        'colsample_bytree': 1,
        'colsample_bylevel': 1,
        'eta': 0.3,
        'tree_method': 'exact',
        'seed': 2020,
        'n_jobs': -1,
        "silent": True,
    }
watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
model=xgb.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pred)
roc_auc = metrics.auc(fpr, tpr)
print('调参前xgboost单模型在验证集上的AUC：{}'.format(roc_auc))

#栅格搜索调整参数
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

#根据调优结果将n_estimatores调整为1494,将max_depth调整为7,将min_child_weight调整为5
#进行gamma参数调优
xgb2 = XGBClassifier(max_depth=7,
                     learning_rate=0.1,
                     n_estimatores=1494,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0,
                     min_child_weight=5,
                     subsample=0.8,
                     colsample_bytree=0.8)

param_grid={
    'gamma':[i/10.0 for i in range(5,20)]
}
grid_search=GridSearchCV(xgb2,param_grid,scoring='roc_auc',iid=False,cv=5)

grid_search.fit(x_train_split,y_train_split)
print('best_params:',grid_search.best_params_)
print('best_score:',grid_search.best_score_)

(612742, 42) (612742,)
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.70961	eval-auc:0.70534
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.81928	eval-auc:0.72182
Stopping. Best iteration:
[52]	train-auc:0.76533	eval-auc:0.72687

调参前xgboost单模型在验证集上的AUC：0.7268708964538249
Parameters: { n_estimatores } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimatores } might not be used.

  This may not be accurat

In [4]:
#将参数gamma调整为1.7，并进行subsample和colsample_bytree的参数调优
xgb3 = XGBClassifier(max_depth=7,
                     learning_rate=0.1,
                     n_estimatores=1494,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=1.7,
                     min_child_weight=5,
                     subsample=0.8,
                     colsample_bytree=0.8)

param_grid={'subsample':[i/10.0 for i in range(5,11)],
           'colsample_bytree':[i/10.0 for i in range(5,11)]}
grid_search=GridSearchCV(xgb3,param_grid,scoring='roc_auc',iid=False,cv=5)

grid_search.fit(x_train_split,y_train_split)
print('best_params:',grid_search.best_params_)
print('best_score:',grid_search.best_score_)

Parameters: { n_estimatores } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimatores } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimatores } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimatores } might not be used.

  This may not be accurate due to some parameters are only used in language binding

In [5]:
#将colsample_bytree调整为0.7，将subsample调整为1.0，并进行正则化参数的调优
xgb4 = XGBClassifier(max_depth=7,
                     learning_rate=0.1,
                     n_estimatores=1494,
                     objective='binary:logistic',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=1.7,
                     min_child_weight=5,
                     subsample=1.0,
                     colsample_bytree=0.7)

param_grid={'reg_lambda':[i/5.0 for i in range(10,50)]}
grid_search=GridSearchCV(xgb4,param_grid,scoring='roc_auc',iid=False,cv=5)

grid_search.fit(x_train_split,y_train_split)
print('best_params:',grid_search.best_params_)
print('best_score:',grid_search.best_score_)

Parameters: { n_estimatores } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimatores } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimatores } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimatores } might not be used.

  This may not be accurate due to some parameters are only used in language binding