# scale_pos_weight.py
# Author:马肖
# E-mail:maxiaoscut@aliyun.com
# Github:https://github.com/Albertsr
import numpy as np
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, recall_score, confusion_matrix
# Synthetic binary classification data with class weights 0.8 / 0.2,
# split 80/20 into training and test sets with a fixed seed.
X, y = make_classification(
    n_samples=15000,
    n_classes=2,
    n_features=10,
    n_informative=8,
    weights=[0.8, 0.2],
    random_state=2018,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2018
)

# ratio is the number of negative samples divided by the number of positive
# samples in the training set. np.sum counts the boolean masks in native code
# instead of iterating the arrays element by element with the builtin sum.
ratio = np.sum(y_train == 0) / np.sum(y_train == 1)
# G-mean metric: the geometric mean of recall (sensitivity) and specificity.
def gmean_score(y_true, y_pred):
    """Return the geometric mean of recall and specificity.

    Label ``1`` is the positive class; every other label counts as negative.
    All counts come from a single pass over the two label arrays, so the
    function also works when only one class is present in the data.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, same length as ``y_true``.

    Returns:
        ``sqrt(recall * specificity)``. A rate whose denominator is zero
        (no positive samples for recall, no negative samples for
        specificity) is taken to be 0.0, which makes the result 0.0.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same number of samples")

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1

    n_pos = np.count_nonzero(actual_pos)
    n_neg = actual_pos.size - n_pos
    tp = np.count_nonzero(actual_pos & predicted_pos)
    tn = np.count_nonzero(~actual_pos & ~predicted_pos)

    # Guard the denominators: an undefined rate is reported as 0.0 rather
    # than producing NaN or a division warning.
    recall = tp / n_pos if n_pos else 0.0
    specificity = tn / n_neg if n_neg else 0.0
    return np.sqrt(recall * specificity)
# Sample-weight builder; fn_cost and fp_cost are the costs of a false negative
# and a false positive respectively.
# In risk control, a false negative is a fraudulent transaction that went
# undetected and a false positive is a normal transaction flagged as fraud, so
# the cost of a false negative should exceed the cost of a false positive.
# In practice the positive class is the minority and should therefore get the
# larger weight: the weights are in proportion to their corresponding
# misclassification costs.
def get_weight(fn_cost, fp_cost, y_train=y_train):
    """Build a per-sample weight array from misclassification costs.

    Args:
        fn_cost: Weight assigned to every sample whose label equals 1.
        fp_cost: Weight assigned to every other sample.
        y_train: Array-like of labels. Defaults to the module-level training
            labels, bound once when this function is defined.

    Returns:
        A NumPy array shaped like ``y_train`` holding ``fn_cost`` where the
        label is 1 and ``fp_cost`` elsewhere.
    """
    labels = np.asarray(y_train)
    # Vectorized selection instead of a Python-level loop over the labels.
    return np.where(labels == 1, fn_cost, fp_cost)
# Model evaluation helper; AUC, F1, recall and G-mean are the metrics used for now.
def model_perfomance(model, train_weight):
    """Fit ``model`` with sample weights and score it on the training data.

    The model is fitted on the module-level ``X_train`` / ``y_train`` and the
    metrics are computed on that same training set.

    Args:
        model: Estimator exposing ``fit`` (accepting ``sample_weight``),
            ``predict`` and ``predict_proba``.
        train_weight: Per-sample weights passed to ``fit``.

    Returns:
        A NumPy array ``[auc, f1, recall, gmeans]``.
    """
    model.fit(X_train, y_train, sample_weight=train_weight)

    predicted_labels = model.predict(X_train)
    # The last column of predict_proba is used as the score for ROC AUC.
    last_column_scores = model.predict_proba(X_train)[:, -1]

    metrics = [
        roc_auc_score(y_train, last_column_scores),
        f1_score(y_train, predicted_labels),
        recall_score(y_train, predicted_labels),
        gmean_score(y_train, predicted_labels),
    ]
    return np.array(metrics)
# Classifier clf_1 sets the scale_pos_weight parameter to ratio.
# Its train_weight is get_weight(1, 1): positive and negative training
# samples all get weight 1, i.e. no distinction is made between them.
clf_1 = XGBClassifier(n_estimators=50, scale_pos_weight=ratio)
perfomance_1 = model_perfomance(
    model=clf_1,
    train_weight=get_weight(1, 1),
)

# Classifier clf_2 leaves scale_pos_weight unset.
# Its train_weight is get_weight(ratio, 1): the positive-to-negative weight
# ratio of the training samples equals ratio.
clf_2 = XGBClassifier(n_estimators=50)
perfomance_2 = model_perfomance(
    clf_2,
    get_weight(ratio, 1),
)

# Both configurations are expected to yield numerically matching metrics.
contrast = np.allclose(perfomance_1, perfomance_2)
decription = 'The parameter scale_pos_weight can be equivalent to the sample_weight parameter in the fit method.'
assert contrast, decription
if contrast:
    print(decription)