In [1]:
import torch
import preprocess
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

In [2]:
# Select the task and the model.
# GLUE task to fine-tune on; options listed by the author:
#   "CoLA", "SST-2", "MRPC", "STS-B", "QQP", "MNLI-m", "MNLI-mm", "QNLI", "RTE", "WNLI"
task_name = "CoLA"
# Checkpoint to load; options listed by the author:
#   bert-base-cased, bert-base-uncased, roberta-base
# (looks like a local "models/" directory -- confirm against preprocess.preprocess)
model_name = "models/bert-base-uncased"
# Run on the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
# Hyperparameters.

# --- Training ---
SEED = 0               # forwarded to preprocess.preprocess
BATCH_SIZE = 8         # forwarded to preprocess.preprocess
EPOCH = 10             # number of passes over train_dataloader
LEARNING_RATE = 1e-5   # AdamW learning rate

# --- Adversarial training ---
Adv_step = 5           # perturbation update steps per batch; the loss is divided by this value
Adv_epsilon = 0.1      # step size applied to the normalized gradient of the perturbation
Adv_max_norm = 0.03    # per-sample L2 bound on the perturbation; 0 means "do not bound it"

In [4]:
# Build everything the training cell needs in one call. Judging by the log it prints,
# preprocess.preprocess loads the dataset, the tokenizer and the model, tokenizes the
# dataset and builds the dataloaders.
model, dataloader, metric = preprocess.preprocess(
    task_name,
    model_name,
    BATCH_SIZE,
    SEED,
)
# `dataloader` is unpacked as three loaders, in this order: train, eval, test.
train_dataloader, eval_dataloader, test_dataloader = dataloader

Reusing dataset glue (/home/ailab/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


------------------ load dataset ------------------
[Notice]: loading dataset...


  0%|          | 0/3 [00:00<?, ?it/s]

[Notice]: dataset cola loaded.
--------------------------------------------------
------------ load tokenizer and model ------------
[Notice]: loading tokenizer and model...


Some weights of the model checkpoint at models/bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkp

[Notice]: tokenizer and model loaded.
--------------------------------------------------
------------- tokenize the dataset -------------
[Notice]: tokenizing the dataset...
[Notice]: the dataset is tokenized.
--------------------------------------------------
----------------- make dataloader ------------------
[Notice]: making dataloader...
[Notice]: make dataloader is done.
--------------------------------------------------


In [5]:
# Optimizer: AdamW over every model parameter.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# Learning-rate schedule. Scheduler names listed by the author:
#   "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"
# The schedule spans one scheduler step per training batch over all epochs, with no warmup.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=EPOCH * len(train_dataloader),
)

In [6]:
# Training: fine-tune `model` with FreeLB-style adversarial training on the input
# embeddings, evaluate on `eval_dataloader` after every epoch, then report the best epoch.
print("*"*20, "Training", "*"*20)  # training parameters
print("TASK:", task_name)
print("MODEL:", model_name)
print("DEVICE:", device)
print("-"*50)
print("EPOCH_NUM:", EPOCH)
print("BATCH_SIZE:", BATCH_SIZE)
print("LEARNING_RATE:", LEARNING_RATE)
print("="*14, "Adversarial Training", "="*14)  # adversarial-training parameters
print("Adversarial_Training_type:","FreeLB")
print("Adversarial_step:", Adv_step)
print("Adversarial_epsilon:", Adv_epsilon)
print("Adversarial_max_norm:", Adv_max_norm)
print("*"*50)
model.to(device)
# Resolve the token-embedding layer through the generic transformers accessor instead of
# branching on substrings of `model_name`; any other model name previously left the
# embedding undefined and crashed with a NameError.
embedding_layer = model.get_input_embeddings()
progress_bar = tqdm(range(EPOCH * len(train_dataloader)))
eval_metric_list = []
for epoch in range(EPOCH):
    print("-"*20, "EPOCH:", epoch, "-"*20)
    print("Training...", end='')
    model.train()
    for batch in train_dataloader:
        for t in batch:
            batch[t] = batch[t].to(device)

        # [begin] Adversarial Training
        # 1. init delta
        # The embeddings are used as a constant base for the perturbation, so look them up
        # once without building an autograd graph.
        # NOTE(review): because this base is detached, the word-embedding matrix receives no
        # gradient from the loss. The original notebook behaved the same way; confirm that
        # this is intended (recomputing the lookup inside the step loop would train it).
        with torch.no_grad():
            word_embedding = embedding_layer(batch["input_ids"])
        delta = torch.zeros_like(word_embedding)
        # Forward every batch field except `input_ids`: the model takes either `input_ids`
        # or `inputs_embeds`, and the perturbed `inputs_embeds` is supplied below. Building
        # the dict from the batch avoids a KeyError when a batch has no "token_type_ids".
        inputs = {key: value for key, value in batch.items() if key != "input_ids"}

        # 2. update delta
        for _ in range(Adv_step):
            delta.requires_grad_(True)
            inputs["inputs_embeds"] = delta + word_embedding
            outputs = model(**inputs)
            loss = outputs.loss / Adv_step
            # Parameter gradients are accumulated across the steps on purpose, so there is
            # no optimizer.zero_grad() inside this loop.
            loss.backward()

            delta_grad = delta.grad.detach()  # gradient of the loss w.r.t. delta
            # Per-sample L2 norm: flatten each sample, take the norm, reshape to broadcast.
            delta_grad_norm = torch.norm(delta_grad.view(delta_grad.size(0), -1), dim=1).view(-1, 1, 1)
            delta_grad_norm = torch.clamp(delta_grad_norm, min=1e-8)  # avoid dividing by zero
            # New perturbation: delta = delta + epsilon * g / ||g||
            delta = (delta + Adv_epsilon * delta_grad / delta_grad_norm).detach()

            if Adv_max_norm > 0:  # bound the perturbation; Adv_max_norm == 0 disables the bound
                delta_norm = torch.norm(delta.view(delta.size(0), -1).float(), p=2, dim=1)
                # Shrink only the samples whose norm exceeds the bound. torch.where selects
                # per sample, so a sample with zero norm keeps scale 1 instead of producing
                # inf * 0 = NaN as the previous mask arithmetic did.
                scale = torch.where(
                    delta_norm > Adv_max_norm,
                    Adv_max_norm / delta_norm,
                    torch.ones_like(delta_norm),
                ).view(-1, 1, 1)
                delta = (delta * scale).detach()  # project back into the norm ball

        # 3. optimize model parameters
        optimizer.step()  # apply the gradients accumulated over the adversarial steps
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        # [end] Adversarial Training

    print("\r", "Eval...", end='')
    model.eval()
    for batch in eval_dataloader:
        for t in batch:
            batch[t] = batch[t].to(device)
        with torch.no_grad():
            outputs = model(**batch)
        # STS-B uses the raw logit as the prediction; squeeze only the last dimension so a
        # batch holding a single example stays 1-D instead of collapsing to a 0-d tensor.
        predictions = outputs.logits.argmax(dim=-1) if task_name != "STS-B" else outputs.logits.squeeze(-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    score = metric.compute()
    eval_metric_list.append(score)
    print("\r","Metric:", score)
    print("-"*50)
progress_bar.close()

# Best score in eval: rank epochs by the first value of each metric dict.
score_list = [list(m.values())[0] for m in eval_metric_list]
best_epoch = score_list.index(max(score_list))
print("*"*19, "Best Score", "*"*19)
print("EPOCH:", best_epoch)
print("Metric:", eval_metric_list[best_epoch])
print("*"*50)

******************** Training ********************
TASK: CoLA
MODEL: models/bert-base-uncased
DEVICE: cuda:0
--------------------------------------------------
EPOCH_NUM: 10
BATCH_SIZE: 8
LEARNING_RATE: 1e-05
Adversarial_Training_type: FreeLB
Adversarial_step: 5
Adversarial_epsilon: 0.1
Adversarial_max_norm: 0.03
**************************************************


  0%|          | 0/10690 [00:00<?, ?it/s]

-------------------- EPOCH: 0 --------------------
 Metric: {'matthews_correlation': 0.5676052483913692}
--------------------------------------------------
-------------------- EPOCH: 1 --------------------
 Metric: {'matthews_correlation': 0.5548623104225698}
--------------------------------------------------
-------------------- EPOCH: 2 --------------------
 Metric: {'matthews_correlation': 0.5676609066599885}
--------------------------------------------------
-------------------- EPOCH: 3 --------------------
 Metric: {'matthews_correlation': 0.5726999708077573}
--------------------------------------------------
-------------------- EPOCH: 4 --------------------
 Metric: {'matthews_correlation': 0.5879831868448624}
--------------------------------------------------
-------------------- EPOCH: 5 --------------------
 Metric: {'matthews_correlation': 0.5905123335559759}
--------------------------------------------------
-------------------- EPOCH: 6 --------------------
 Metric: {'ma