In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import torch
import random

from sklearn.datasets import load_breast_cancer, load_svmlight_file
from sklearn.model_selection import train_test_split
from sklearn import metrics

def setup_seed(seed):
    """Seed every random number generator this notebook may touch.

    Seeds, in order: torch (CPU), torch (all CUDA devices), NumPy's global
    state and Python's ``random`` module, then switches cuDNN to its
    deterministic mode.

    Parameters
    ----------
    seed : int
        Value handed unchanged to each library's seeding function.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


setup_seed(seed=20)

In [2]:
def get_auc(y, pred):
    """Return the area under the ROC curve for scores ``pred`` against labels ``y``.

    The curve is built with ``metrics.roc_curve`` and integrated with
    ``metrics.auc``; the thresholds returned by ``roc_curve`` are not used.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y, pred)
    roc_area = metrics.auc(false_pos_rate, true_pos_rate)
    return roc_area

def eval_robust(bst, data, y, mask_attack, n=10):
    """Summarise a model's AUC over ``n`` independently perturbed copies of ``data``.

    Parameters
    ----------
    bst : object
        Model exposing ``predict(data)``.
    data : array-like
        Features passed through ``mask_attack`` before each prediction.
    y : array-like
        Ground-truth labels scored against every prediction.
    mask_attack : callable
        Perturbation applied to ``data``; it is called once per repetition.
    n : int, default 10
        Number of repetitions.

    Returns
    -------
    dict
        Keys ``"auc mean"``, ``"auc min"``, ``"auc max"`` and ``"auc std"``,
        each computed with the matching NumPy reduction over the ``n`` AUCs.
    """
    scores = []
    for _ in range(n):
        perturbed = mask_attack(data)
        scores.append(get_auc(y, bst.predict(perturbed)))

    # Label -> reduction, kept in the order the results are reported.
    reducers = (
        ("auc mean", np.mean),
        ("auc min", np.min),
        ("auc max", np.max),
        ("auc std", np.std),
    )
    return {label: reduce(scores) for label, reduce in reducers}

class MaskAttack(object):
    """Random feature-masking perturbation.

    Calling an instance on an array-like returns a copy in which every entry
    has independently been replaced by 0 with probability ``mask_rate``.
    The input object itself is never modified.

    Entries are zeroed by assignment rather than by multiplying with a 0/1
    mask, so NaN and infinite values are masked too (``nan * 0`` would stay
    NaN) and the dtype of the input array is preserved.

    Parameters
    ----------
    mask_rate : float
        Probability that any single entry is zeroed. Values <= 0 mask
        nothing; values >= 1 mask everything.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used, so results follow ``np.random.seed``.
    """

    def __init__(self, mask_rate, rng=None):
        self.mask_rate = mask_rate
        self.rng = rng

    def __call__(self, data):
        """Return a masked copy of ``data`` drawn with a fresh random mask.

        pandas ``DataFrame``/``Series`` inputs come back as the same pandas
        type; any other array-like comes back as a NumPy array.
        """
        shape = np.shape(data)
        if self.rng is None:
            # Legacy global stream: identical draws to the original code path.
            draws = np.random.rand(*shape)
        else:
            draws = self.rng.random(shape)
        mask_arr = draws < self.mask_rate

        if isinstance(data, (pd.DataFrame, pd.Series)):
            # Keep index/columns; replace the selected cells with 0.
            return data.mask(mask_arr, 0)

        mask_data = np.array(data, copy=True)
        mask_data[mask_arr] = 0
        return mask_data

In [3]:
# Read the svmlight-format file and densify the sparse feature matrix.
features_sparse, raw_labels = load_svmlight_file("./lgb.data")
X = features_sparse.toarray()
# Binarise the labels: strictly positive -> 1, everything else -> 0.
y = (raw_labels > 0).astype(int)

# First split: 60% train / 40% hold-out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=1
)
# Second split: halve the hold-out into validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)


In [4]:
# Sanity check: shapes of the test and training feature matrices after the split.
X_test.shape, X_train.shape

((4540, 123), (13617, 123))

In [5]:
# LightGBM settings shared by the baseline and the robust model.
params = {
    # Task: binary classification, evaluated by AUC, plain gradient-boosted trees.
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    # Tree shape: 32 leaves is the most a depth-5 binary tree can hold (2**5).
    "max_depth": 5,
    "num_leaves": 32,
    # Optimisation schedule.
    "learning_rate": 0.03,
    "num_boost_round": 200,
    # NOTE(review): per LightGBM's parameter docs, row subsampling only takes
    # effect when bagging_freq (alias subsample_freq) is > 0. None is set here,
    # so this is presumably a no-op — confirm the intent before relying on it.
    "subsample": 0.8,
    # Negative verbosity; per LightGBM's docs this limits its logs to fatal errors.
    "verbose": -1,
}

In [6]:
# Perturbation used throughout: each feature value is zeroed with 10% probability.
mask_attack = MaskAttack(mask_rate=0.1)

### 正常训练

In [7]:
# Baseline model: fit on the clean training split, early-stop on the clean
# validation split.
train_ds = lgb.Dataset(data=X_train, label=y_train)
valid_ds = lgb.Dataset(data=X_valid, label=y_valid)
# The `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
# from lgb.train in LightGBM 4.0; the callback form below also works on older
# releases. Training stops once the validation metric has not improved for
# 10 consecutive rounds.
bst = lgb.train(
    params=params,
    train_set=train_ds,
    valid_sets=[valid_ds],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)




Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[197]	valid_0's auc: 0.905348


### 鲁棒训练

In [8]:
# Robust model: same settings as the baseline, but both the training and the
# validation features are passed through `mask_attack` first. Each call draws
# its own random mask, and that single draw is fixed for the whole training run.
train_ds = lgb.Dataset(data=mask_attack(X_train), label=y_train)
valid_ds = lgb.Dataset(data=mask_attack(X_valid), label=y_valid)
# The `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed
# from lgb.train in LightGBM 4.0; the callback form below also works on older
# releases. Training stops once the validation metric has not improved for
# 10 consecutive rounds.
mask_bst = lgb.train(
    params=params,
    train_set=train_ds,
    valid_sets=[valid_ds],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)


Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[199]	valid_0's auc: 0.896785


### 结果比较

In [9]:
# On the original (unmasked) test set, the normally trained and the robustly
# trained models reach nearly the same AUC.
get_auc(y_test, bst.predict(X_test)), get_auc(y_test, mask_bst.predict(X_test))

(0.8956432529661212, 0.8932529008930782)

In [10]:
# On the masked test set, the normally trained model's AUC drops noticeably,
# and its AUC standard deviation across repetitions is comparatively large.
eval_robust(bst, X_test, y_test, mask_attack=mask_attack, n=100)

{'auc mean': 0.8703457302543195,
 'auc min': 0.8634086579718365,
 'auc max': 0.8783485214988186,
 'auc std': 0.0029531635172742646}

In [11]:
# On the masked test set, the robustly trained model's AUC drops only slightly.
eval_robust(mask_bst, X_test, y_test, mask_attack=mask_attack, n=100)

{'auc mean': 0.88663892207554,
 'auc min': 0.882800606048114,
 'auc max': 0.8904063428930892,
 'auc std': 0.00143452980789563}