#### 构建数据集
只抽取训练集中的前 22 万条数据，并从中划分出 2 万条数据作为验证集，然后将 translation2019zh 中的验证集作为测试集：

In [7]:
from torch.utils.data import Dataset, random_split
import json

max_dataset_size = 220000
train_set_size = 200000
valid_set_size = 20000

class TRANS(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)
    
    def load_data(self, data_file):
        Data = {}
        with open(data_file, 'rt', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                if idx >= max_dataset_size:
                    break
                sample = json.loads(line.strip()) #Sample: dict()
                Data[idx] = sample
        return Data #A dict(idx) -> a dict(Chinese/English)
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

data = TRANS('data/translation2019zh_train.json')
train_data, valid_data = random_split(data, [train_set_size, valid_set_size])
test_data = TRANS('data/translation2019zh_valid.json')

In [8]:
print(f'train set size: {len(train_data)}')
print(f'valid set size: {len(valid_data)}')
print(f'test set size: {len(test_data)}')
print(next(iter(train_data)))

train set size: 200000
valid set size: 20000
test set size: 39323
{'english': '13 emotional drama, I could not act.', 'chinese': '感情的戏，我没演技。'}


#### 数据预处理
**注意**: 默认情况下分词器会采用源语言(zh)的设定来编码文本，要编码目标语言(en)则需要通过上下文管理器 `tokenizer.as_target_tokenizer()`：
模型的输入包括一个字典，关键字含
+ `attention_mask`(attention机制0/1)
+ `input_ids`(incoder输入的字符编号)
+ `labels`(decoder输出字符的编号)
+ `decoder_input_ids`(decoder输入的编号，即上一个cell的输出)，直接调用函数`model.prepare_decoder_input_ids_from_labels`

In [12]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
torch.cuda.empty_cache() #清空缓存

max_input_length = 128
max_target_length = 128
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model = model.to(device)

def collote_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples: #Sample: dict() 套 dict()
        batch_inputs.append(sample['chinese']) # [Set_size]
        batch_targets.append(sample['english']) # [Set_size]
    batch_data = tokenizer( #处理中文
        batch_inputs, 
        padding=True, 
        max_length=max_input_length,
        truncation=True, 
        return_tensors="pt"
    ) #返回["input_ids"] ["Attention_mask"]
    with tokenizer.as_target_tokenizer(): #处理英文
        labels = tokenizer(
            batch_targets, 
            padding=True, 
            max_length=max_target_length,
            truncation=True, 
            return_tensors="pt"
        )["input_ids"] #只取["input_ids"]
        batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels) #错位读上一个的输出
        end_token_index = torch.where(labels == tokenizer.eos_token_id)[1] #将输出最后一位改为"<\s>"，即-100
        for idx, end_idx in enumerate(end_token_index):
            labels[idx][end_idx+1:] = -100
        batch_data['labels'] = labels
    return batch_data

train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collote_fn)
valid_dataloader = DataLoader(valid_data, batch_size=32, shuffle=False, collate_fn=collote_fn)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collote_fn)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]



Using cpu device


Downloading pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [20]:
x =torch.randn(2,2,2)
print(x)
print(torch.where(x >0))

tensor([[[ 1.1239,  0.8234],
         [-0.4331,  1.0536]],

        [[-1.7165,  0.2575],
         [ 0.5476,  0.2206]]])
(tensor([0, 0, 0, 1, 1, 1]), tensor([0, 0, 1, 0, 1, 1]), tensor([0, 1, 1, 1, 0, 1]))


#### 训练&验证

使用 `AutoModelForSeq2SeqLM` 构造的模型已经封装好了对应的损失函数，并且计算出的损失会直接包含在模型的输出 outputs 中，可以直接通过 outputs.loss 获得

评测指标：BLEU，对于中文需手动指定`tokenize='zh'`，英文不需要

训练：直接将含四个参数的字典喂进model
验证：使用`.generate()`方法喂前两个参数，获得输出idx后再用`.batch_decode()`批量处理两维Tensor，最终得到一个 list\[ str() \]

In [11]:
from tqdm.auto import tqdm
from transformers import get_scheduler
from torch.optim import AdamW
from sacrebleu.metrics import BLEU
import numpy as np
bleu = BLEU()


def train_loop(trainloader, model, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(trainloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_batch_num = (epoch-1) * len(trainloader) #上一epoch结束后，最终的batch编号
    
    model.train()
    for batch, batch_data in enumerate(trainloader, start=1):
        batch_data = batch_data.to(device)
        outputs = model(**batch_data) # ** 相当于是将字典内每个部分当成一个参数，输入进model内
        loss = outputs.loss
        print(loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f'[TRAIN] loss: {total_loss/(finish_batch_num + batch):>7f}')
        progress_bar.update(1)
    return total_loss


def test_loop(valloader, model):
    preds, labels = [], []

    model.eval()
    for batch_data in tqdm(valloader, desc='[VAL]'):
        batch_data = batch_data.to(device)
        with torch.no_grad():
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
            ).cpu().numpy() #验证/测试中，只需要"input_ids"及"attention_mask"
        label_tokens = batch_data["labels"].cpu().numpy()
        
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) #模型输出（idx->str）
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True) #数据集输出（idx->str）

        preds += [pred.strip() for pred in decoded_preds]
        labels += [[label.strip()] for label in decoded_labels] #注意:BLEU计算的labels需要以list套list的形式储存
    bleu_score = bleu.corpus_score(preds, labels).score
    print(f"BLEU: {bleu_score:>0.2f}\n")
    return bleu_score



learning_rate = 1e-5
epoch_num = 8

optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler( 
    "linear",
    optimizer=optimizer,
    num_warmup_steps=len(train_dataloader)//10,
    num_training_steps=epoch_num*len(train_dataloader),
)
train_loss_list, val_loss_list = [], []
total_loss = 0.
best_bleu = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
    train_loss_list.append(total_loss)
    valid_bleu = test_loop(valid_dataloader, model)
    if valid_bleu > best_bleu:
        best_bleu = valid_bleu
        print('saving new weights...\n')
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_bleu_{valid_bleu:0.2f}_model_weights.bin')
print("Done!")

Epoch 1/8
-------------------------------


[TRAIN] loss: 2.440131:  14%|█▍        | 1/7 [00:00<00:02,  2.72it/s]

tensor(2.4401, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3544, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.321745:  43%|████▎     | 3/7 [00:00<00:00,  4.59it/s]

tensor(2.1707, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.1890, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.240935:  71%|███████▏  | 5/7 [00:01<00:00,  5.36it/s]

tensor(2.0504, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(2.3202, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.221717: 100%|██████████| 7/7 [00:01<00:00,  5.33it/s]


tensor(2.0272, device='cuda:0', grad_fn=<NllLossBackward0>)


[VAL]: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]


BLEU: 12.37

saving new weights...

Epoch 2/8
-------------------------------


[TRAIN] loss: 2.207874:   0%|          | 0/7 [00:00<?, ?it/s]

tensor(2.1110, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.207874:  14%|█▍        | 1/7 [00:00<00:01,  4.85it/s]

tensor(2.4743, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.237481:  29%|██▊       | 2/7 [00:00<00:00,  5.35it/s]

tensor(2.3060, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.244335:  43%|████▎     | 3/7 [00:00<00:00,  5.76it/s]

tensor(2.1836, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.238816:  57%|█████▋    | 4/7 [00:00<00:00,  5.76it/s]

tensor(2.1543, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.231774:  71%|███████▏  | 5/7 [00:00<00:00,  5.85it/s]

tensor(1.8052, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.183195:  86%|████████▌ | 6/7 [00:01<00:00,  5.79it/s]

tensor(1.9783, device='cuda:0', grad_fn=<NllLossBackward0>)


[TRAIN] loss: 2.183195: 100%|██████████| 7/7 [00:01<00:00,  6.16it/s]
[VAL]: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]


BLEU: 12.37

Epoch 3/8
-------------------------------


loss: 0.000000:   0%|          | 0/7 [00:00<?, ?it/s]

tensor(1.9941, device='cuda:0', grad_fn=<NllLossBackward0>)


KeyboardInterrupt: 

In [7]:
from tqdm.auto import tqdm
#from sacrebleu.metrics import BLEU
import evaluate
import numpy as np
bleu = evaluate.load("bleu")


def test(test_dataloader, model, model_file):    
    model.load_state_dict(torch.load(model_file)) #根据具体输出值，读入相应文件

    model.eval()
    with torch.no_grad():
        print('evaluating on test set...')
        sources, preds, labels = [], [], []
        for batch_data in tqdm(test_dataloader):
            batch_data = batch_data.to(device)
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
            ).cpu().numpy() #喂进模型，获得idx
            label_tokens = batch_data["labels"].cpu().numpy() #test数据集上的label

            decoded_sources = tokenizer.batch_decode(
                batch_data["input_ids"].cpu().numpy(), 
                skip_special_tokens=True, 
                use_source_tokenizer=True
            )# 将原输入的idx解码为label
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) # 将预测的idx解码为label
            label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id) # 将label的替换编号重新改为pad_token_id
            decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True) # 将原标签的idx解码为label

            ### 储存:
            sources += [source.strip() for source in decoded_sources]
            preds += [pred.strip() for pred in decoded_preds]
            labels += [[label.strip()] for label in decoded_labels]
        bleu_score = bleu.compute(predictions=preds, references=labels)["bleu"] #计算得分
        print(f"Test BLEU: {bleu_score:>0.2f}\n")
        results = []
        print('saving predicted results...')
        for source, pred, label in zip(sources, preds, labels):
            results.append({
                "sentence": source, 
                "prediction": pred, 
                "translation": label[0]
            })
        with open('test_data_pred.json', 'wt', encoding='utf-8') as f:
            for exapmle_result in results:
                f.write(json.dumps(exapmle_result, ensure_ascii=False) + '\n') #将结果输出为json

model_file = "epoch_1_valid_bleu_58.44_model_weights.bin"
test(test_dataloader, model, model_file)

evaluating on test set...


100%|██████████| 1229/1229 [46:02<00:00,  2.25s/it]


Test BLEU: 0.16

saving predicted results...
