自定义目标函数和评估函数




In [25]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn import  datasets
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import  train_test_split

cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=1/5,random_state=8)


X_train


xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)


params = {
    "booster": "gbtree",
    "eta": "0.1",
    "min_child_weight": 1,
    "max_depth": 5,
}


num_round = 50
watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]


# 自定义目标函数
def logregobj(preds,dtrain):
    labels  = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad,hess

# 自定义评估函数
def evalerror(preds,dtrain):
    labels = dtrain.get_label()
    return 'error',float(sum(labels != (preds > 0.0))) / len(labels)

bst = xgb.train(params, xgb_train, num_round, watchlist,obj=logregobj,feval=evalerror)

preds = bst.predict(xgb_test)

print(preds)


[0]	train-rmse:0.33664	train-error:0.36484	test-rmse:0.35246	test-error:0.40351
[1]	train-rmse:0.21671	train-error:0.36484	test-rmse:0.25314	test-error:0.40351
[2]	train-rmse:0.17132	train-error:0.04835	test-rmse:0.23050	test-error:0.07895
[3]	train-rmse:0.23711	train-error:0.04835	test-rmse:0.29001	test-error:0.08772
[4]	train-rmse:0.33790	train-error:0.03517	test-rmse:0.38392	test-error:0.07895
[5]	train-rmse:0.44899	train-error:0.01978	test-rmse:0.48731	test-error:0.07018
[6]	train-rmse:0.56289	train-error:0.01319	test-rmse:0.59604	test-error:0.05263
[7]	train-rmse:0.67488	train-error:0.00879	test-rmse:0.69874	test-error:0.05263
[8]	train-rmse:0.78577	train-error:0.00879	test-rmse:0.80545	test-error:0.05263
[9]	train-rmse:0.89346	train-error:0.00879	test-rmse:0.90685	test-error:0.05263
[10]	train-rmse:0.99708	train-error:0.00879	test-rmse:1.00828	test-error:0.05263
[11]	train-rmse:1.10112	train-error:0.00879	test-rmse:1.10701	test-error:0.06140
[12]	train-rmse:1.20159	train-error:0.

# 交叉验证


In [9]:
params = {
    "objective":"binary:logistic",
    "booster": "gbtree",
    "eta": "0.1",
    "min_child_weight": 1,
    "max_depth": 5,
}

res = xgb.cv(params,xgb_train,num_round,nfold=5,metrics={'auc'},seed=0,
             callbacks=[xgb.callback.print_evaluation(show_stdv=True)])

[0]	train-auc:0.99168+0.00342	test-auc:0.96369+0.01686
[1]	train-auc:0.99284+0.00336	test-auc:0.96484+0.01782
[2]	train-auc:0.99321+0.00357	test-auc:0.96753+0.02109
[3]	train-auc:0.99454+0.00291	test-auc:0.96765+0.02243
[4]	train-auc:0.99539+0.00213	test-auc:0.96844+0.02275
[5]	train-auc:0.99574+0.00197	test-auc:0.97498+0.02041
[6]	train-auc:0.99579+0.00194	test-auc:0.98059+0.01920
[7]	train-auc:0.99586+0.00195	test-auc:0.98148+0.01973
[8]	train-auc:0.99656+0.00221	test-auc:0.98133+0.01952
[9]	train-auc:0.99724+0.00208	test-auc:0.98191+0.02016




[10]	train-auc:0.99788+0.00184	test-auc:0.98234+0.02049
[11]	train-auc:0.99805+0.00191	test-auc:0.98258+0.02036
[12]	train-auc:0.99825+0.00190	test-auc:0.98371+0.02066
[13]	train-auc:0.99895+0.00078	test-auc:0.98372+0.02071
[14]	train-auc:0.99903+0.00073	test-auc:0.98390+0.02101
[15]	train-auc:0.99921+0.00055	test-auc:0.98473+0.02086
[16]	train-auc:0.99936+0.00050	test-auc:0.98493+0.02116
[17]	train-auc:0.99940+0.00051	test-auc:0.98502+0.02122
[18]	train-auc:0.99945+0.00044	test-auc:0.98503+0.02120
[19]	train-auc:0.99962+0.00029	test-auc:0.98470+0.02139
[20]	train-auc:0.99970+0.00021	test-auc:0.98524+0.02125
[21]	train-auc:0.99976+0.00015	test-auc:0.98523+0.02150
[22]	train-auc:0.99983+0.00008	test-auc:0.98522+0.02205
[23]	train-auc:0.99989+0.00003	test-auc:0.98522+0.02235
[24]	train-auc:0.99991+0.00003	test-auc:0.98490+0.02251
[25]	train-auc:0.99994+0.00004	test-auc:0.98501+0.02224
[26]	train-auc:0.99994+0.00004	test-auc:0.98492+0.02195
[27]	train-auc:0.99997+0.00002	test-auc:0.98523+

In [10]:

res = xgb.cv(params,xgb_train,num_round,nfold=5,metrics={'auc'},seed=0,
             callbacks=[xgb.callback.print_evaluation(show_stdv=False),
                        xgb.callback.early_stop(5)])

[0]	train-auc:0.99168	test-auc:0.96369
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.
Will train until test-auc hasn't improved in 5 rounds.
[1]	train-auc:0.99284	test-auc:0.96484
[2]	train-auc:0.99321	test-auc:0.96753
[3]	train-auc:0.99454	test-auc:0.96765
[4]	train-auc:0.99539	test-auc:0.96844
[5]	train-auc:0.99574	test-auc:0.97498
[6]	train-auc:0.99579	test-auc:0.98059
[7]	train-auc:0.99586	test-auc:0.98148
[8]	train-auc:0.99656	test-auc:0.98133
[9]	train-auc:0.99724	test-auc:0.98191
[10]	train-auc:0.99788	test-auc:0.98234
[11]	train-auc:0.99805	test-auc:0.98258
[12]	train-auc:0.99825	test-auc:0.98371
[13]	train-auc:0.99895	test-auc:0.98372
[14]	train-auc:0.99903	test-auc:0.98390
[15]	train-auc:0.99921	test-auc:0.98473




[16]	train-auc:0.99936	test-auc:0.98493
[17]	train-auc:0.99940	test-auc:0.98502
[18]	train-auc:0.99945	test-auc:0.98503
[19]	train-auc:0.99962	test-auc:0.98470
[20]	train-auc:0.99970	test-auc:0.98524
[21]	train-auc:0.99976	test-auc:0.98523
[22]	train-auc:0.99983	test-auc:0.98522
[23]	train-auc:0.99989	test-auc:0.98522
[24]	train-auc:0.99991	test-auc:0.98490
[25]	train-auc:0.99994	test-auc:0.98501
Stopping. Best iteration:
[20]	train-auc:0.99970+0.00021	test-auc:0.98524+0.02125


TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [16]:
# 自定义预处理函数
def fpreproc(xgb_train,xgb_test,params):
    label = xgb_train.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    params['scale_pos_weight'] = ratio
    return (xgb_train,xgb_test,params)


res = xgb.cv(params,xgb_train,num_round,nfold=5,metrics={'auc'},seed=0,
             fpreproc=fpreproc,callbacks=[xgb.callback.print_evaluation(show_stdv=False),
                        xgb.callback.early_stop(5)])



[0]	train-auc:0.99293	test-auc:0.95827
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.
Will train until test-auc hasn't improved in 5 rounds.
[1]	train-auc:0.99519	test-auc:0.96721
[2]	train-auc:0.99603	test-auc:0.97396
[3]	train-auc:0.99666	test-auc:0.97573
[4]	train-auc:0.99675	test-auc:0.97745
[5]	train-auc:0.99697	test-auc:0.97996
[6]	train-auc:0.99763	test-auc:0.98106
[7]	train-auc:0.99826	test-auc:0.98129
[8]	train-auc:0.99849	test-auc:0.98215
[9]	train-auc:0.99882	test-auc:0.98250
[10]	train-auc:0.99908	test-auc:0.98295
[11]	train-auc:0.99916	test-auc:0.98303




[12]	train-auc:0.99930	test-auc:0.98337
[13]	train-auc:0.99935	test-auc:0.98348
[14]	train-auc:0.99940	test-auc:0.98381
[15]	train-auc:0.99946	test-auc:0.98392
[16]	train-auc:0.99950	test-auc:0.98402
[17]	train-auc:0.99963	test-auc:0.98411
[18]	train-auc:0.99967	test-auc:0.98430
[19]	train-auc:0.99971	test-auc:0.98474
[20]	train-auc:0.99974	test-auc:0.98462
[21]	train-auc:0.99980	test-auc:0.98647
[22]	train-auc:0.99981	test-auc:0.98637
[23]	train-auc:0.99985	test-auc:0.98657
[24]	train-auc:0.99990	test-auc:0.98626
[25]	train-auc:0.99990	test-auc:0.98626
[26]	train-auc:0.99992	test-auc:0.98645
[27]	train-auc:0.99992	test-auc:0.98667
[28]	train-auc:0.99993	test-auc:0.98676
[29]	train-auc:0.99994	test-auc:0.98663
[30]	train-auc:0.99994	test-auc:0.98674
[31]	train-auc:0.99995	test-auc:0.98664
[32]	train-auc:0.99995	test-auc:0.98652
[33]	train-auc:0.99996	test-auc:0.98631
Stopping. Best iteration:
[28]	train-auc:0.99993+0.00003	test-auc:0.98676+0.01973


TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [18]:

res = xgb.cv(params,xgb_train,num_round,nfold=5,metrics={'auc'},seed=0,
             fpreproc=fpreproc,callbacks=[xgb.callback.print_evaluation(show_stdv=False)],
             obj=logregobj,feval=evalerror)


[0]	train-auc:0.99168	test-auc:0.96369	train-error:0.02692	test-error:0.06154
[1]	train-auc:0.99284	test-auc:0.96484	train-error:0.02033	test-error:0.06593
[2]	train-auc:0.99321	test-auc:0.96753	train-error:0.01868	test-error:0.07253
[3]	train-auc:0.99454	test-auc:0.96765	train-error:0.01593	test-error:0.07033
[4]	train-auc:0.99539	test-auc:0.96844	train-error:0.01374	test-error:0.06813
[5]	train-auc:0.99574	test-auc:0.97498	train-error:0.01044	test-error:0.06154




[6]	train-auc:0.99579	test-auc:0.98059	train-error:0.00824	test-error:0.06374
[7]	train-auc:0.99586	test-auc:0.98148	train-error:0.00824	test-error:0.06374
[8]	train-auc:0.99656	test-auc:0.98133	train-error:0.00659	test-error:0.05714
[9]	train-auc:0.99724	test-auc:0.98191	train-error:0.00495	test-error:0.05934
[10]	train-auc:0.99788	test-auc:0.98234	train-error:0.00495	test-error:0.06154
[11]	train-auc:0.99805	test-auc:0.98258	train-error:0.00549	test-error:0.06374
[12]	train-auc:0.99825	test-auc:0.98371	train-error:0.00549	test-error:0.06154
[13]	train-auc:0.99895	test-auc:0.98372	train-error:0.00495	test-error:0.06154
[14]	train-auc:0.99903	test-auc:0.98390	train-error:0.00495	test-error:0.05934
[15]	train-auc:0.99921	test-auc:0.98473	train-error:0.00495	test-error:0.05934
[16]	train-auc:0.99936	test-auc:0.98493	train-error:0.00440	test-error:0.05934
[17]	train-auc:0.99940	test-auc:0.98502	train-error:0.00440	test-error:0.05495
[18]	train-auc:0.99945	test-auc:0.98503	train-error:0.00

In [23]:


evals_result = {}

bst = xgb.train(params, xgb_train, num_round, watchlist,evals_result=evals_result)

evals_result

print("直接通过evals_result访问error指标")
print(evals_result)
print(evals_result['test']['logloss'])



[0]	train-logloss:0.60907	test-logloss:0.61263
[1]	train-logloss:0.53994	test-logloss:0.54883
[2]	train-logloss:0.48091	test-logloss:0.49563
[3]	train-logloss:0.43008	test-logloss:0.44823
[4]	train-logloss:0.38704	test-logloss:0.41053
[5]	train-logloss:0.35003	test-logloss:0.37543
[6]	train-logloss:0.31740	test-logloss:0.34200
[7]	train-logloss:0.28815	test-logloss:0.31464
[8]	train-logloss:0.26270	test-logloss:0.28995
[9]	train-logloss:0.24026	test-logloss:0.26890
[10]	train-logloss:0.21968	test-logloss:0.24842
[11]	train-logloss:0.20116	test-logloss:0.23137
[12]	train-logloss:0.18456	test-logloss:0.21769
[13]	train-logloss:0.16956	test-logloss:0.20432
[14]	train-logloss:0.15608	test-logloss:0.19283
[15]	train-logloss:0.14405	test-logloss:0.18216
[16]	train-logloss:0.13316	test-logloss:0.17173
[17]	train-logloss:0.12343	test-logloss:0.16304
[18]	train-logloss:0.11446	test-logloss:0.15523
[19]	train-logloss:0.10642	test-logloss:0.14745
[20]	train-logloss:0.09928	test-logloss:0.14159
[2

In [26]:
bst = xgb.train(params, xgb_train, num_round, watchlist)

label = xgb_test.get_label()
pred1 = bst.predict(xgb_test,ntree_limit=10)


pred2 = bst.predict(xgb_test)

print("前10颗树预测AUC：%f" % roc_auc_score(y_test,pred1))
print("所有树预测AUC：%f" % roc_auc_score(y_test,pred2))



[0]	train-rmse:0.45269	test-rmse:0.45815
[1]	train-rmse:0.41014	test-rmse:0.42210
[2]	train-rmse:0.37156	test-rmse:0.38886
[3]	train-rmse:0.33676	test-rmse:0.36104
[4]	train-rmse:0.30553	test-rmse:0.33805
[5]	train-rmse:0.27785	test-rmse:0.31417
[6]	train-rmse:0.25309	test-rmse:0.29338
[7]	train-rmse:0.23096	test-rmse:0.27406
[8]	train-rmse:0.21088	test-rmse:0.25753
[9]	train-rmse:0.19271	test-rmse:0.24410
[10]	train-rmse:0.17666	test-rmse:0.23204
[11]	train-rmse:0.16230	test-rmse:0.22171
[12]	train-rmse:0.14920	test-rmse:0.21284
[13]	train-rmse:0.13689	test-rmse:0.20546
[14]	train-rmse:0.12546	test-rmse:0.19925
[15]	train-rmse:0.11567	test-rmse:0.19410
[16]	train-rmse:0.10728	test-rmse:0.19015
[17]	train-rmse:0.09904	test-rmse:0.18676
[18]	train-rmse:0.09158	test-rmse:0.18374
[19]	train-rmse:0.08569	test-rmse:0.18114
[20]	train-rmse:0.08015	test-rmse:0.17886
[21]	train-rmse:0.07477	test-rmse:0.17719
[22]	train-rmse:0.06999	test-rmse:0.17588
[23]	train-rmse:0.06580	test-rmse:0.17474
[2

