微调T5模型，使其具备中文问答能力
e.g.
{"context": "年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。", "answer": "年基准利率4.35%", "question": "2017年银行贷款基准利率", "id": 0}

# import外部库
主要是torch和transformers相关，包括：
1. from torch.utils.data import Dataset, DataLoader -- 用于读取数据
2. from torch.optim import AdamW -- 优化器
3. from transformers import T5Tokenizer, AutoConfig -- 用于分词和读取参数
4. from transformers import T5ForConditionalGeneration -- 模型本身


In [None]:
import random
import os
import numpy as np
import json
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import T5Tokenizer, AutoConfig
from transformers import T5ForConditionalGeneration
from transformers import get_scheduler
from tqdm.auto import tqdm
import evaluate

# 设置随机数
确保结果可以复现

In [None]:
def seed_everything(seed=1029):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and CUDA) and
    puts cuDNN into deterministic mode so that repeated runs with the same
    seed are as reproducible as possible.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running process is fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select a different algorithm from run to run;
    # keep it off so the kernel choice is stable as well.
    torch.backends.cudnn.benchmark = False

# 构建Dataloader
## 构建Dataset
继承于torch.utils.data.Dataset，读取train.json和dev.json分别用作训练集和验证集。需要重写__init__、__len__和__getitem__方法
## 构建collate_fn
输入是字典格式的数据
把一个batch内的context和question读入列表。
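- padding: 填充到batch内最长序列的长度。在attention机制里会通过mask表示哪些是真实token、哪些是填充的
- truncation：截断到最大长度（max_length=512）

编码后的inputs形如（batch_size为20、batch内最长序列达到512时）：

    inputs = {
        'input_ids':      torch.Size([20, 512]),
        'attention_mask': torch.Size([20, 512])
    }

`with tokenizer.as_target_tokenizer():` 的作用是让tokenizer以目标序列（标签）模式对answer进行编码（新版transformers中等价的写法是 `tokenizer(text_target=...)`）。编码时会在序列末尾添加结束符 `</s>`，解码器的起始符由模型在右移labels时自动补上。

    labels.input_ids[labels.input_ids == tokenizer.pad_token_id] = -100

将padding部分的值设置为-100，这样这些位置就不会参与交叉熵计算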


In [None]:
class T5Dataset(Dataset):
    """In-memory question-answering dataset read from a JSON-lines file.

    Every non-empty line of the file is parsed as one JSON object and stored
    under a consecutive integer index starting at 0.
    """

    def __init__(self, data_file):
        """Load all samples from ``data_file`` into ``self.data``."""
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Parse a JSON-lines file into an index -> sample mapping.

        Args:
            data_file: Path of a UTF-8 text file with one JSON object per line.

        Returns:
            A dict mapping 0..n-1 to the parsed samples, in file order.

        Raises:
            json.JSONDecodeError: If a non-empty line is not valid JSON.
        """
        Data = {}
        # The corpus is Chinese text: decode as UTF-8 explicitly instead of
        # relying on the platform's locale encoding.
        with open(data_file, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                # Index by the number of samples kept so far so that keys
                # stay contiguous even when blank lines are skipped.
                Data[len(Data)] = json.loads(line)
        return Data

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        return self.data[idx]


def collate_fn(batch_samples):
    """Tokenize a list of raw samples into one padded model batch.

    Uses the module-level ``tokenizer``, which must be defined before a
    DataLoader that uses this function is iterated.

    Args:
        batch_samples: Sequence of dicts that each provide the keys
            'context', 'question' and 'answer'.

    Returns:
        The tokenizer output for the prompts (PyTorch tensors) with an
        additional 'labels' entry holding the tokenized answers, where padding
        positions are replaced by -100.
    """
    prompts = [
        f"question: {sample['question']} context: {sample['context']}"
        for sample in batch_samples
    ]
    answers = [sample['answer'] for sample in batch_samples]

    # `text_target` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager: the answers are encoded as
    # targets and returned under the 'labels' key. Prompts and answers are
    # padded independently, each to the longest sequence of its own group.
    inputs = tokenizer(
        prompts,
        text_target=answers,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )

    # T5ForConditionalGeneration handles the right-shifting of decoder_input_ids
    # internally when labels are provided. Padding positions in the labels are
    # replaced by -100 so that they are ignored in the loss calculation.
    labels = inputs['labels']
    labels[labels == tokenizer.pad_token_id] = -100
    inputs['labels'] = labels
    return inputs

# 设置train
- 先把batch转移到device上（有GPU时为cuda），再把batch送入model。
- outputs = model(**batch)表示把字典按照key=value的形式作为关键字参数传入
- 损失函数用交叉熵（由模型内部计算，通过outputs.loss取得），labels中值为-100的padding位置会被自动忽略

In [None]:
def train_loop(dataloader, model, optimizer, lr_scheduler, epoch):
    """Run one training pass over ``dataloader`` and return the mean loss.

    Each batch is moved to the module-level ``device``, passed through the
    model, and followed by one optimizer step and one scheduler step. A tqdm
    bar shows the running mean loss.

    Args:
        dataloader: Iterable of dict-like batches whose values support ``.to``.
        model: Model whose forward output exposes a ``loss`` attribute.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Learning-rate scheduler stepped once per batch.
        epoch: Accepted for the caller's bookkeeping; not used in the body.

    Returns:
        The summed batch loss divided by the number of batches.
    """
    total_batches = len(dataloader)
    bar = tqdm(range(total_batches))
    bar.set_description(f'loss: {0:>7f}')

    model.train()
    running_loss = 0.
    batches_done = 0
    for batch in dataloader:
        batches_done += 1

        # Move every tensor of the batch onto the training device.
        on_device = {}
        for key, tensor in batch.items():
            on_device[key] = tensor.to(device)
        loss = model(**on_device).loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        running_loss += loss.item()
        mean_so_far = running_loss / batches_done
        bar.set_description(f'loss: {mean_so_far:>7f}')
        bar.update(1)

    return running_loss / total_batches

# Test loop
1. 从验证集/测试集的dataloader中读取数据，放到GPU上
2. 执行推理，拿到generated_tokens，形状为[batch_size, length]
3. with torch.no_grad() 不计算梯度；batch.pop('labels') 把label从batch中取出单独保存，不传给generate
4. model.eval()  # 设置为评估模式（关闭dropout等）
5. model.generate 自回归生成，一直生成到结束符或达到max_length
6. 执行decode，拿到decoded_preds（长度为batch_size的字符串列表）
7. 把label拿回CPU上，并且把-100还原为pad_token_id后再decode
8. 计算BLEU-4和Rouge-L；前者侧重精确率（precision），后者侧重召回率（recall）


In [None]:
def _char_tokenize(text):
    """Split ``text`` into its individual non-whitespace characters."""
    return [ch for ch in text if not ch.isspace()]


def test_loop(dataloader, model, tokenizer, mode='Valid'):
    """Generate answers for every batch and score them with BLEU and Rouge-L.

    Both metrics are computed on character-level tokens. The default
    tokenizers of the ``evaluate`` metrics are designed for space-delimited
    text: unsegmented Chinese would be treated as a single token by BLEU and
    dropped entirely by ROUGE, which drives both scores towards zero.

    Args:
        dataloader: Iterable of batches providing 'input_ids',
            'attention_mask' and 'labels' tensors.
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to decode predictions and labels.
        mode: Either 'Valid' or 'Test'; only used in progress/log messages.

    Returns:
        A dict with the keys "bleu" and "rougeL".
    """
    assert mode in ['Valid', 'Test']
    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")
    all_preds = []
    all_labels = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Evaluating {mode}"):
            # Labels are only decoded into reference text, so they are taken
            # out of the batch and never moved to the device.
            labels = batch.pop('labels').cpu().numpy()
            batch = {k: v.to(device) for k, v in batch.items()}

            generated_tokens = model.generate(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=512
            ).cpu().numpy()

            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

            # Restore the pad id where the collate function wrote -100,
            # because -100 is not a valid token id for decoding.
            labels[labels == -100] = tokenizer.pad_token_id
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            all_preds.extend(decoded_preds)
            all_labels.extend(decoded_labels)

    bleu_results = bleu_metric.compute(
        predictions=all_preds,
        references=[[l] for l in all_labels],
        tokenizer=_char_tokenize,
    )
    rouge_results = rouge_metric.compute(
        predictions=all_preds,
        references=all_labels,
        tokenizer=_char_tokenize,
    )

    results = {
        "bleu": bleu_results['bleu'],
        "rougeL": rouge_results['rougeL']
    }

    print(f"{mode} BLEU-4: {results['bleu']:.4f}")
    print(f"{mode} Rouge-L: {results['rougeL']:.4f}\n")

    return results

# 训练
1. 20个epoch
2. 保留验证集Rouge-L最高的权重，同时每个epoch结束后也保存一份当前权重
3. 保存每个epoch的loss和验证集指标（BLEU-4、Rouge-L）

In [None]:
# Use the GPU when one is available. `device` is read as a module-level name
# by train_loop and test_loop.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')
# Seed all RNGs before the model and the shuffling dataloader are created.
seed_everything(42)

# Training hyperparameters.
learning_rate = 1e-5
batch_size = 20
epoch_num = 20

# Directory the pretrained tokenizer, config and weights are loaded from.
# collate_fn looks up `tokenizer` as a module-level name, so it has to exist
# before either dataloader is iterated.
checkpoint = "./model"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

train_data = T5Dataset('train.json')
valid_data = T5Dataset('dev.json')

# Only the training data is shuffled; validation keeps file order.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader= DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

config = AutoConfig.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint, config=config).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
# "linear" schedule without warmup, sized to the total number of optimizer
# steps (train_loop steps the scheduler once per batch).
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

# Per-epoch history; both lists are rewritten to disk after every epoch.
all_losses = []
all_valid_metrics = []
# Highest validation Rouge-L seen so far; decides which weights count as "best".
best_rouge = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    epoch_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1)
    all_losses.append(epoch_loss)
    
    # Save losses after each epoch
    with open('train_losses.json', 'w') as f:
        json.dump(all_losses, f, indent=4)
    
    # Evaluate on the validation set and persist the metric history.
    valid_metrics = test_loop(valid_dataloader, model, tokenizer, mode='Valid')
    all_valid_metrics.append(valid_metrics)
    with open('valid_metrics.json', 'w') as f:
        json.dump(all_valid_metrics, f, indent=4)

    # A new best checkpoint is written whenever Rouge-L improves; earlier
    # best files are left in place.
    if valid_metrics['rougeL'] > best_rouge:
        best_rouge = valid_metrics['rougeL']
        print('saving new best weights...\n')
        torch.save(model.state_dict(), f'epoch_{t+1}_rougeL_{best_rouge:.4f}_model_weights.bin')
    
    # The weights of every epoch are saved as well, regardless of the score.
    # NOTE(review): this writes one full state_dict per epoch — confirm the
    # disk budget allows it.
    print('saving current epoch weights...\n')
    torch.save(model.state_dict(), f'epoch_{t+1}_model_weights.bin')

print("Done!")