In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_scheduler

from torch import nn
from transformers import T5Model

from tqdm.auto import tqdm
# from sacrebleu.metrics import BLEU
from nltk.translate.bleu_score import corpus_bleu
import jieba

import random
import numpy as np
import os
import json
from collections import defaultdict

from accelerate import Accelerator

import wandb

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
max_dataset_size = 200000
max_input_length = 512
max_target_length = 32
stride = 128
train_batch_size = 8
valid_batch_size = 8
learning_rate = 1e-5
epoch_num = 10
beam_size = 4
no_repeat_ngram_size = 2

seed = 5
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

save_dir = "/home/med/selflearning/stage1-t5/model/bin"
output_dir = '/home/med/selflearning/stage1-t5/data/DuReaderQG/output'
os.makedirs(output_dir, exist_ok=True)


# 1. 准备数据
## 1.1. 构建数据集
先编写继承自 Dataset 类的自定义数据集类用于组织样本和标签

### 合并验证集
训练集只有一个答案，验证集一个问题可能对应有多个参考答案，将相同question和context的所有answer合并进一个answers列表。

验证集数据格式：

`{"context": "还没有最后确定，暂定2017年", "question": "余罪第三季开播时间", "answers": ["暂定2017年", "2017年"], "id": 9}`

In [3]:
class T5(Dataset):
    def __init__(self, data_file, merge=False):
        """
        data_file: 输入文件
        merge: 是否合并相同 (context, question) 的答案
        """
        self.data = self.load_data(data_file, merge)

    def load_data(self, data_file, merge):
        if not merge:
            # 不合并，直接一行一个样本
            Data = []
            with open(data_file, 'rt', encoding='utf-8') as f:
                for idx, line in enumerate(f, start=1):
                    sample = json.loads(line.strip())
                    sample["id"] = idx
                    Data.append(sample)
            return Data
        else:
            # 合并，相同 (context, question) 聚合答案
            merged = defaultdict(list)
            with open(data_file, 'rt', encoding='utf-8') as f:
                for line in f:
                    sample = json.loads(line.strip())
                    key = (sample["context"], sample["question"])
                    merged[key].append(sample["answer"])
            Data = []
            for new_id, ((context, question), answers) in enumerate(merged.items(), start=1):
                Data.append({
                    "context": context,
                    "question": question,
                    "answers": answers,  # list of answers
                    "id": new_id
                })
            return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

### 分割测试集
train 的 10% 作为测试集
- Train data size: 13068
- Valid data size: 700
- Test data size: 1452

In [4]:
train_dataset = T5('/home/med/selflearning/stage1-t5/data/DuReaderQG/train.json', merge=False)
valid_data = T5('/home/med/selflearning/stage1-t5/data/DuReaderQG/dev.json', merge=True)

indices = list(range(len(train_dataset)))
n_test = int(len(indices) * 0.1)
test_data  = Subset(train_dataset, indices[:n_test])
train_data = Subset(train_dataset, indices[n_test:])

In [5]:
# Check data size and sample
print(f'Train data size: {len(train_data)}')
print(f'Valid data size: {len(valid_data)}')
print(f'Test data size: {len(test_data)}')

print(next(iter(train_data)))
print(next(iter(valid_data)))
print(next(iter(test_data)))

Train data size: 13068
Valid data size: 700
Test data size: 1452
{'context': '晚上十点。以下是身体器官工作时间表： 一、晚上9-11点为免疫系统（淋巴）排毒时间，此段时间应安静或听音乐。二、晚间11-凌晨1点，肝的排毒，需在熟睡中进行。三、凌晨1-3点，胆的排毒，亦同。四、凌晨3-5点，肺的排毒。此即为何咳嗽的人在这段时间咳得最剧烈，因排毒动作已走到肺。五、凌晨5-7点，大肠的排毒，应上厕所排便。六、凌晨7-9点，小肠大量吸收营养的时段，应吃早餐。疗病者最好早吃，在6点半前，养生者在7点半前，不吃早餐者应改变习惯，即使拖到9、10点吃都比不吃好。七、半夜至凌晨4点为脊椎造血时段，必须熟睡，不宜熬夜', 'answer': '晚上十点', 'question': '几点开始算熬夜', 'id': 1453}
{'context': '年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。', 'question': '2017年银行贷款基准利率', 'answers': ['年基准利率4.35%', '4.35%'], 'id': 1}
{'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。', 'answer': '第35集', 'question': '仙剑奇侠传3第几集上天界', 'i

## 1.2. 数据预处理
### T5Tokenizer
AutoTokenizer -> T5Tokenizer
> 第五章：模型与分词器
> 
> 调用 Tokenizer.save_pretrained() 函数会在保存路径下创建三个文件：
> 
> - special_tokens_map.json：映射文件，里面包含 unknown token 等特殊字符的映射关系；
> - tokenizer_config.json：分词器配置文件，存储构建分词器需要的参数；
> - vocab.txt：词表，一行一个 token，行号就是对应的 token ID（从 0 开始）。
>
- ❌ mengzi使用modelscope pipeline，没有tokenizer.json，Autotokenizer加载失败
- ✅ 明确使用 T5Tokenizer，避免 AutoTokenizer 转换失败

In [6]:
model_checkpoint = "/home/med/selflearning/stage1-t5/model/mengzi-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# Check the tokenization result
sample = train_data[0]
context = sample['context']
question = sample['question']
answer = sample['answer']

inputs = tokenizer(
    question,
    context,
    max_length=300,
    truncation="only_second",
    return_overflowing_tokens=True,
    stride=50,
)

print(inputs)
print(tokenizer.convert_ids_to_tokens(inputs.input_ids))

{'overflowing_tokens': [], 'num_truncated_tokens': -110, 'input_ids': [9595, 172, 1044, 7419, 1, 1263, 750, 101, 4, 10687, 802, 7146, 19063, 715, 13, 7, 39, 6, 1263, 280, 18425, 101, 18, 19444, 22, 17152, 25, 11445, 133, 3, 439, 976, 133, 268, 5544, 74, 24432, 4, 195, 6, 16956, 506, 114, 7650, 94, 101, 3, 2425, 5, 11445, 3, 1617, 8, 2003, 1533, 16, 103, 4, 105, 6, 7650, 18214, 101, 3, 4208, 5, 11445, 3, 1295, 365, 4, 244, 6, 7650, 14539, 101, 3, 2967, 5, 11445, 4, 439, 397, 3089, 7466, 123, 8, 8484, 9898, 116, 126, 10700, 3, 293, 11445, 1824, 231, 4338, 2967, 4, 270, 6, 7650, 108, 13357, 101, 3, 15454, 5, 11445, 3, 268, 16511, 18685, 4, 544, 6, 7650, 219, 14660, 101, 3, 23982, 1444, 2534, 1986, 5, 10416, 3, 268, 25920, 4, 9670, 389, 117, 1191, 927, 208, 3, 8, 173, 10160, 111, 3, 3772, 117, 8, 219, 10160, 111, 3, 20, 25920, 117, 268, 829, 1403, 3, 1335, 3044, 38, 280, 6, 165, 101, 208, 47, 151, 7258, 67, 4, 744, 6, 8572, 229, 7650, 138, 101, 18, 22461, 22901, 10416, 3, 557, 2003, 1533, 

### T5训练模板
> 第四章：开箱即用的 pipelines / 这些 pipeline 背后做了什么？
> 1. 预处理 (preprocessing)，将原始文本转换为模型可以接受的输入格式；
> 2. 将处理好的输入送入模型；
> 3. 对模型的输出进行后处理 (postprocessing)，将其转换为人类方便阅读的格式
>

- inputs: `question <extra_id_0> context`
- targets: `<extra_id_0> answer <extra_id_1>`

### collate_fn
> 直接使用 Transformers 库自带的 AutoModelForSeq2SeqLM 函数来构建模型，因此我们需要将每一个 batch 中的数据处理为该模型可接受的格式：一个包含 'input_ids'、'attention_mask'、'labels' 和 'decoder_input_ids' 键的字典。

> 与我们之前任务中使用的纯 Encoder 模型不同，Seq2Seq 任务对应的模型采用的是 Encoder-Decoder 框架：Encoder 负责编码输入序列，Decoder 负责循环地逐个生成输出 token。因此，对于每一个样本，我们还需要额外准备 decoder input IDs 作为 Decoder 的输入。decoder input IDs 是标签序列的移位，在序列的开始位置增加了一个特殊的“序列起始符”。
>

这里用了 `text_target=`，它会在返回的 `batch_data` 里 自动生成：
- `input_ids`（来自 `batch_inputs`）
- `labels`（来自 `text_target`，已经右移处理）
训练时，`T5ForConditionalGeneration` 的 `forward()` 会根据 `labels` 自动构造 `decoder_input_ids`（内部调用 `shift_tokens_right`）。不用手动传 `decoder_input_ids`。

In [8]:
def collate_fn(batch_samples):
    batch_inputs, batch_targets = [], []

    for sample in batch_samples:
        batch_inputs.append(sample['question'].strip() + ' <extra_id_0> ' + sample['context'].strip())
        batch_targets.append('<extra_id_0> ' + sample['answer'].strip() + ' <extra_id_1>')

    # print(batch_inputs)
    # print(batch_targets)
    
    batch_data = tokenizer(
        batch_inputs,               # 作为 encoder 的输入文本列表
        text_target=batch_targets,  # 作为 decoder 的目标文本（transformers 的 T5 支持 text_target 直接生成 labels，已经右移处理）
        padding=True,               # 对 batch 内样本进行 padding 到相同长度（默认 pad 到最长样本长度）
        max_length=max_input_length,    # 超过该长度则截断
        truncation=True,            # 允许截断
        return_tensors="pt"         # 返回 PyTorch 张量（dict 中包含 input_ids, attention_mask, labels 等）
    )

    batch_data['labels'][batch_data['labels'] == tokenizer.pad_token_id] = -100 # 将 labels 中等于 pad_token_id 的位置替换为 -100
    
    return batch_data               # 返回一个 dict（PyTorch 张量），可以直接被模型的 forward(**batch_data) 或 Trainer 使用。

train_dataloader = DataLoader(
    train_data,                     # 训练数据集（实现了 __len__ 和 __getitem__）
    batch_size=train_batch_size,    # 每个 batch 的样本数
    shuffle=True,                   # 每个 epoch 前打乱数据（训练时常用）
    collate_fn=collate_fn           # 自定义的 collate 函数（用于将多个样本合并为一个 batch）
    )
    
test_dataloader = DataLoader(
    test_data,                     # 测试数据集
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=collate_fn
    )


### collate_fn_valid
- tensor_data：tensor 类型的数据（模型输入、labels）

- list_data：list 类型的数据（多参考答案 all_answers）

In [9]:
def collate_fn_valid(batch_samples):
    """
    专门用于验证集的 collate_fn，处理多参考答案
    """
    batch_inputs, batch_targets, batch_all_answers = [], [], []

    for sample in batch_samples:
        batch_inputs.append(sample['question'].strip() + ' <extra_id_0> ' + sample['context'].strip())
        batch_targets.append('<extra_id_0> ' + sample['answers'][0].strip() + ' <extra_id_1>')  # 用第一个答案作为 decoder target 占位

        batch_all_answers.append([ans.strip() for ans in sample['answers']])  # 保存该样本的所有参考答案列表

    tensor_data = tokenizer(
        batch_inputs,
        text_target=batch_targets,
        padding=True,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    )

    tensor_data['labels'][tensor_data['labels'] == tokenizer.pad_token_id] = -100

    # 返回字典，区分 tensor 和 list 数据
    return {
        'tensor_data': tensor_data,       # 包含 input_ids, attention_mask, labels
        'list_data': {
            'all_answers': batch_all_answers
        }
    }

valid_dataloader = DataLoader(
    valid_data,                     # 验证数据集
    batch_size=valid_batch_size,    
    shuffle=False, 
    collate_fn=collate_fn_valid
)

In [10]:
# Check the batch data
batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch)

dict_keys(['input_ids', 'attention_mask', 'labels'])
batch shape: {'input_ids': torch.Size([8, 487]), 'attention_mask': torch.Size([8, 487]), 'labels': torch.Size([8, 7])}
{'input_ids': tensor([[ 1616, 15223,   523,  ...,     0,     0,     0],
        [12203,    24,  6374,  ...,     0,     0,     0],
        [ 1944,   164,  7424,  ...,     0,     0,     0],
        ...,
        [ 2747,  1263,  2886,  ...,     0,     0,     0],
        [10361,   789,   164,  ...,     0,     0,     0],
        [  623,   500,   956,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[32127,   173,  1752, 32126,     1,  -100,  -100],
        [32127,  6736,   670, 32126,     1,  -100,  -100],
        [32127,  1026,  1236,   547, 32126,     1,  -100],
        [32127,  2747, 2030

# 2. 训练模型
## 2.1. 构建模型
直接使用 Transformers 库自带的 AutoModelForSeq2SeqLM 函数来构建模型
- checkpoint 已经包含 encoder + decoder + lm_head
- 可以直接调用 forward 训练，或者 generate 生成文本
- 不需要自己再加 lm_head 或写自定义 forward

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

Using cuda device


In [12]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

  return torch.load(checkpoint_file, map_location=map_location)


## 2.2. Train Loop
使用 AutoModelForSeq2SeqLM 构造的模型已经封装好了对应的损失函数，并且计算出的损失会直接包含在模型的输出 outputs 中，可以直接通过 outputs.loss 获得，因此训练循环为：

In [13]:
def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(dataloader, desc=f'Epoch {epoch}', ncols=100)

    finish_batch_num = (epoch-1) * len(dataloader)
    
    model.train()
    
    for batch, batch_data in enumerate(dataloader, start=1):
        batch_data = {k: v.to(device) for k, v in batch_data.items() if isinstance(v, torch.Tensor)}
                
        outputs = model(**batch_data)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        avg_loss = total_loss / (finish_batch_num + batch)
        progress_bar.set_description(f'loss: {avg_loss:>7f}')
        progress_bar.update(1)

        wandb.log({"train/loss": avg_loss}, step=finish_batch_num + batch)

    return total_loss


## 2.3. Test Loop

### 解码
> 在本文中，我们使用 AutoModelForSeq2SeqLM 模型自带的 generate() 函数，通过柱搜索 (Beam search) 解码出翻译结果（使用模型默认解码参数）。
> 
- `generate()` 使用 AutoModelForSeq2SeqLM 构造的模型同样对 Decoder 的解码过程进行了封装，我们只需要调用模型的 generate() 函数就可以自动地逐个生成预测 token

- `tokenizer.batch_decode()` 在 generate() 生成 token ID 之后，我们通过分词器自带的 tokenizer.batch_decode() 函数将 batch 中所有的 token ID 序列都转换为文本

### jieba
- ❌ 如果用`bleu = BLUE(tokenize = 'zh')`，英文分词按照字母切
- ✅ jieba.cut(text.strip()) 用 jieba 对中文进行分词，英文和数字默认按字符切

### BLEU
- 计算 BLEU-1~4
  - ❌ `sacrebleu` 默认标准 BLEU-4
  - ✅ `nltk.translate.bleu_score` 可自定义权重

- weights 决定 n-gram 权重：
  - BLEU-1：只计算 unigram
  - BLEU-2：计算 unigram + bigram
  - BLEU-3：计算 1~3 gram
  - BLEU-4：标准 BLEU-4

- `corpus_bleu` 会对整个 corpus 做平均，使用 list of list 自动选择最匹配的label

> 问题：BLEU对短文本效果不好
> BLEU 依赖于词序的精确 N-gram 重叠。但在简短回答或摘要任务中：
> 1. 答案多样性高： 一个问题的正确答案可能有多种表达方式，即使语义完全一致，词序和词汇也可能不同。
> 2. 长度短： 简短回答通常很短，导致 4-gram 匹配难度极大，BLEU 分数容易不合理地降为 0。
> 3. 强调召回率： 简短回答更看重关键信息的覆盖率（即信息是否都答到了），而 BLEU 本身是一个精度指标，对漏掉关键信息惩罚不足。

In [14]:
def zh_tokenize(text):
    """中文 + 混合英文分词"""
    return list(jieba.cut(text.strip())) if text.strip() else ['empty']

def test_loop(dataloader, model, mode='valid', debug_sample_num=10):
    preds, labels = [], []
    
    model.eval()

    for batch, batch_data in enumerate(tqdm(dataloader, desc=f"Evaluating {mode}")):
        # tensor 类型的数据（input_ids、labels等） 和 list 类型的数据（all_answers）分开处理
        batch_data_tensor = {k: v.to(device) for k, v in batch_data['tensor_data'].items() if isinstance(v, torch.Tensor)}
        batch_data_list = batch_data['list_data']
        
        # 生成预测
        with torch.no_grad():
            generated_tokens = model.generate(
                input_ids=batch_data_tensor["input_ids"],
                attention_mask=batch_data_tensor["attention_mask"],
                max_length=max_target_length,
                num_beams=beam_size,
                no_repeat_ngram_size=no_repeat_ngram_size,
            )
        
        if isinstance(generated_tokens, tuple):
            generated_tokens = generated_tokens[0]
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)  # batch_decode 把 token id 转成文本字符串

        # 遍历每条样本
        for i, pred_text in enumerate(decoded_preds):
            pred_tokens = zh_tokenize(pred_text)
            label_tokens = [zh_tokenize(r) for r in batch_data_list['all_answers'][i]]  # list of list
            preds.append(pred_tokens)   # preds：存放模型预测的 token 列表（每条样本一个 list）
            labels.append(label_tokens) # labels：存放该预测对应的 所有参考答案 token 列表（list of list）

    # 随机打印debug_sample_num个样本
    if debug_sample_num > 0:
        sample_indices = random.sample(range(len(preds)), min(debug_sample_num, len(preds)))
        print("\nRandom debug samples:")
        for idx in sample_indices:
            print(f"PRED: {preds[idx]} REF:  {labels[idx]}")

    # 计算 BLEU-1 ~ BLEU-4
    P1 = corpus_bleu(labels, preds, weights=(1,0,0,0))
    P2 = corpus_bleu(labels, preds, weights=(0.5,0.5,0,0))
    P3 = corpus_bleu(labels, preds, weights=(0.33,0.33,0.33,0))
    P4 = corpus_bleu(labels, preds, weights=(0.25,0.25,0.25,0.25))

    class Result:
        pass

    result = Result()
    result.score = P4 * 100
    result.precisions = (P1 * 100, P2 * 100, P3 * 100, P4 * 100)

    return result

## 2.4. Save Model


In [15]:
wandb.init(
    project="text2text_generation",
    name="t5_train",
)
wandb.config.update({
    "learning_rate": learning_rate,
    "epochs": epoch_num,
    "batch_size": train_batch_size,
})

[34m[1mwandb[0m: Currently logged in as: [33mcocora14[0m ([33mcocora14-the-university-of-sydney[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### 优化器
使用 AdamW 优化器，适合 Transformer 类模型。
- model.parameters()：告诉优化器哪些参数需要更新。
- lr=learning_rate：学习率。
- 作用：控制参数更新策略（带权重衰减的 Adam）。

### Accelerator 封装
```bash
(t5) med@server:~$ nvidia-smi
Sun Sep 28 01:46:54 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:18:00.0 Off |                  N/A |
| 49%   84C    P2             239W / 350W |  15720MiB / 24576MiB |     96%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        Off | 00000000:3B:00.0 Off |                  N/A |
| 30%   33C    P8              11W / 350W |      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   2  NVIDIA GeForce RTX 3090        Off | 00000000:5E:00.0 Off |                  N/A |
|  0%   27C    P8               9W / 370W |      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   3  NVIDIA GeForce RTX 3090        Off | 00000000:86:00.0 Off |                  N/A |
| 30%   28C    P8              10W / 350W |      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A    179783      C   ...ed/anaconda3/envs/t5/bin/python3.11    15714MiB |
+---------------------------------------------------------------------------------------+
```

### 学习率调度器
使用 线性衰减学习率：从初始 lr 线性下降到 0。

- 参数：
  - num_warmup_steps=0 → 没有预热阶段
  - num_training_steps=epoch_num * len(train_dataloader) → 总训练步数

In [16]:
# 使用 AdamW 优化器
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Accelerator 封装
accelerator = Accelerator()
model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, valid_dataloader
)

# 学习率调度器
num_training_steps = epoch_num * len(train_dataloader)
num_warmup_steps = int(0.05 * num_training_steps)  # 前 5% steps 做 warmup

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# 初始化指标
total_loss = 0.
best_bleu = 0.
patience = 3
patience_counter = 0

# 训练 + 验证循环
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
    valid_bleu = test_loop(valid_dataloader, model, mode='valid', debug_sample_num=10)

    print(f"Validation BLEU scores: {valid_bleu.score:.2f}")
    P1, P2, P3, P4 = valid_bleu.precisions
    print(f"BLEU-1: {P1:.2f}, BLEU-2: {P2:.2f}, BLEU-3: {P3:.2f}, BLEU-4: {P4:.2f}")

    # 判断是否保存最优模型
    if valid_bleu.score > best_bleu:
        best_bleu = valid_bleu.score
        patience_counter = 0  # 重置耐心计数器
        print('saving new weights...\n')
        model_path = os.path.join(save_dir, f"epoch_{t+1}_valid_BLEU_{valid_bleu.score:0.2f}_model_weights.bin")
        torch.save(model.state_dict(), model_path)
        print(f"Model weights saved to: {model_path}\n")
    else:
        patience_counter += 1
        print(f"No improvement, patience_counter: {patience_counter}/{patience}\n")
        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {t+1}.")
            break
    
print("Done!")


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch 1/10
-------------------------------


loss: 0.945684: 100%|███████████████████████████████████████████| 1634/1634 [07:26<00:00,  3.66it/s]
Evaluating valid:   0%|                                  | 0/88 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.598 seconds.
Prefix dict has been built successfully.
Evaluating valid: 100%|█████████████████████████| 88/88 [00:32<00:00,  2.69it/s]



Random debug samples:
PRED: ['3788', '-', '5588', '元'] REF:  [['破', '亿'], ['首日', '销量', '过', '亿']]
PRED: ['50', '个'] REF:  [['50'], ['50', '个']]
PRED: ['阜阳', '友好', '医院'] REF:  [['阜阳', '友好', '医院']]
PRED: ['90', '年代'] REF:  [['20', '世纪', '90', '年代']]
PRED: ['95543'] REF:  [['95543']]
PRED: ['神经科'] REF:  [['神经科']]
PRED: ['3', '个', '月'] REF:  [['3', '个', '月']]
PRED: ['一周', '多'] REF:  [['一周', '多', '时间']]
PRED: ['二本'] REF:  [['二本']]
PRED: ['138.3', 'x67.1', 'mm'] REF:  [['138.3', 'x67.1', 'x7.1', 'mm']]
Validation BLEU scores: 49.90
BLEU-1: 76.00, BLEU-2: 69.36, BLEU-3: 60.28, BLEU-4: 49.90
saving new weights...

Model weights saved to: /home/med/selflearning/stage1-t5/model/bin/epoch_1_valid_BLEU_49.90_model_weights.bin

Epoch 2/10
-------------------------------


loss: 0.656441: 100%|███████████████████████████████████████████| 1634/1634 [07:51<00:00,  3.46it/s]
Evaluating valid: 100%|█████████████████████████| 88/88 [00:34<00:00,  2.56it/s]



Random debug samples:
PRED: ['我', '想'] REF:  [['我', '想']]
PRED: ['飞利浦'] REF:  [['飞利浦']]
PRED: ['放风', '草'] REF:  [['放风', '草'], ['防风', '草']]
PRED: ['三四根'] REF:  [['两根']]
PRED: ['几千', '到', '上万'] REF:  [['几千', '到', '上万', '不', '等'], ['几千', '到', '上万']]
PRED: ['倍', '浓', '植物', '奶牛', '系列'] REF:  [['倍', '浓', '植物', '奶牛', '系列']]
PRED: ['五一', '至', '十一', '之间'] REF:  [['五一', '至', '十一', '之间'], ['五一', '至', '十一']]
PRED: ['两到', '三年'] REF:  [['两到', '三年']]
PRED: ['白起'] REF:  [['白起']]
PRED: ['一周', '多'] REF:  [['一周', '多', '时间']]
Validation BLEU scores: 52.75
BLEU-1: 79.15, BLEU-2: 72.33, BLEU-3: 63.15, BLEU-4: 52.75
saving new weights...

Model weights saved to: /home/med/selflearning/stage1-t5/model/bin/epoch_2_valid_BLEU_52.75_model_weights.bin

Epoch 3/10
-------------------------------


loss: 0.520224: 100%|███████████████████████████████████████████| 1634/1634 [07:53<00:00,  3.45it/s]
Evaluating valid: 100%|█████████████████████████| 88/88 [00:33<00:00,  2.64it/s]



Random debug samples:
PRED: ['90', '-', '140', '/', '60', '-', '90mmhg'] REF:  [['90', '-', '140', '/', '60', '-', '90mmhg', '之间'], ['90', '-', '140', '/', '60', '-', '90mmhg']]
PRED: ['半年', '后'] REF:  [['半年', '后']]
PRED: ['二十五', '到', '三十分钟'] REF:  [['差不多', '二十五', '到', '三十分钟'], ['二十五', '到', '三十分钟']]
PRED: ['安化', '金花', '黑茶'] REF:  [['安化', '金花', '黑茶']]
PRED: ['新', '工匠', '镇'] REF:  [['新', '工匠', '镇']]
PRED: ['MIUI', '+', '原生', 'Android'] REF:  [['MIUI', '+', '原生', 'Android', '(', '基于', 'Android', ' ', 'OS', ' ', 'v2.3', '.', '5', ')', '双系统'], ['MIUI', '操作系统']]
PRED: ['不能', '超过', '10%'] REF:  [['地面', '的', '不能', '超过', '10%']]
PRED: ['90', '/', '60', '-', '139', '/', '89'] REF:  [['90', '/', '60', '-', '139', '/', '89']]
PRED: ['12', '月', '31', '日', '24', ':', '00', '点'] REF:  [['12', '月', '31', '日', '24', ':', '00']]
PRED: ['5000', '元', '左右'] REF:  [['平均工资', '约', '5000', '元', '左右'], ['平均工资', '约', '5000', '元']]
Validation BLEU scores: 54.42
BLEU-1: 79.79, BLEU-2: 73.50, BLEU-3: 64.73, BLEU-4

loss: 0.435098: 100%|███████████████████████████████████████████| 1634/1634 [07:46<00:00,  3.51it/s]
Evaluating valid: 100%|█████████████████████████| 88/88 [00:34<00:00,  2.58it/s]



Random debug samples:
PRED: ['约', '5000', '元', '左右'] REF:  [['平均工资', '约', '5000', '元', '左右'], ['平均工资', '约', '5000', '元']]
PRED: ['170cm'] REF:  [['170cm']]
PRED: ['1000', '-', '1500', '元', '左右'] REF:  [['1000', '-', '1500', '元'], ['1000', '-', '1500', '元', '左右']]
PRED: ['二本'] REF:  [['2', '本'], ['二本']]
PRED: ['U', '系列'] REF:  [['U', '系列']]
PRED: ['密歇根州'] REF:  [['密歇根州'], ['密歇根州', '（', 'Michigan', '）'], ['Michigan']]
PRED: ['2.7', '亿美元'] REF:  [['2.7', '亿美元']]
PRED: ['每周六', '晚上', '8', '点', '-', '9', '点', '9', '点'] REF:  [['每周六', ' ', '周日', '晚上', '8', '点', '-', '9', '点']]
PRED: ['罗琦'] REF:  [['黄小琥'], ['应该', '是', '黄小琥']]
PRED: ['2017', '-', '05', '-', '02'] REF:  [['下周']]
Validation BLEU scores: 53.87
BLEU-1: 79.56, BLEU-2: 72.91, BLEU-3: 64.02, BLEU-4: 53.87
No improvement, patience_counter: 1/3

Epoch 5/10
-------------------------------


loss: 0.375054: 100%|███████████████████████████████████████████| 1634/1634 [07:56<00:00,  3.43it/s]
Evaluating valid: 100%|█████████████████████████| 88/88 [00:36<00:00,  2.44it/s]



Random debug samples:
PRED: ['3%'] REF:  [['3%']]
PRED: ['4', '万元', '左右'] REF:  [['4', ' ', '万元'], ['4', ' ', '万元', '左右']]
PRED: ['85.5', '*', '54'] REF:  [['85.5', '*', '54']]
PRED: ['2', '时', '30', '分至', '5', '时', '50', '分'] REF:  [['2', '时', '30', '分']]
PRED: ['属鸡'] REF:  [['鸡']]
PRED: ['15000', '-', '200000IU', '/', 'L'] REF:  [['15000', '-', '200000IU', '/', 'L']]
PRED: ['14pt'] REF:  [['12pt']]
PRED: ['新', '工匠', '镇'] REF:  [['新', '工匠', '镇']]
PRED: ['巾'] REF:  [['巾']]
PRED: ['农历', '六月', '二十四日'] REF:  [['农历', '六月', '二十四日']]
Validation BLEU scores: 55.91
BLEU-1: 81.07, BLEU-2: 74.57, BLEU-3: 65.81, BLEU-4: 55.91
saving new weights...

Model weights saved to: /home/med/selflearning/stage1-t5/model/bin/epoch_5_valid_BLEU_55.91_model_weights.bin

Epoch 6/10
-------------------------------


loss: 0.329643: 100%|███████████████████████████████████████████| 1634/1634 [07:52<00:00,  3.46it/s]
Evaluating valid: 100%|█████████████████████████| 88/88 [00:32<00:00,  2.75it/s]



Random debug samples:
PRED: ['天津', '阳光', '男科', '医院'] REF:  [['天津', '阳光', '男科', '医院']]
PRED: ['4', '～', '6', '周'] REF:  [['早期', '梅毒', '4', '～', '6', '周']]
PRED: ['16', '岁'] REF:  [['16', '岁']]
PRED: ['美利达', '公爵', '600'] REF:  [['美利达', '公爵', '600']]
PRED: ['黄小琥'] REF:  [['黄小琥'], ['应该', '是', '黄小琥']]
PRED: ['两三天', '之后'] REF:  [['两三天', '之后']]
PRED: ['你们'] REF:  [['你', '的', '朋友', '们'], ['你们', '的', '这些', '家伙']]
PRED: ['1921', '年', '7', '月'] REF:  [['1921', '年', '7', '月']]
PRED: ['80', '厘米'] REF:  [['80', '厘米'], ['80', '（', '厘米', '）'], ['80']]
PRED: ['40', '-', '60', '度', '左右', '洁净', '的', '温水'] REF:  [['40', '-', '60', '度', '左右'], ['40', '-', '60', '度']]
Validation BLEU scores: 53.90
BLEU-1: 79.92, BLEU-2: 73.21, BLEU-3: 64.14, BLEU-4: 53.90
No improvement, patience_counter: 1/3

Epoch 7/10
-------------------------------


loss: 0.294509: 100%|███████████████████████████████████████████| 1634/1634 [07:52<00:00,  3.46it/s]
Evaluating valid: 100%|█████████████████████████| 88/88 [00:32<00:00,  2.69it/s]



Random debug samples:
PRED: ['200', '元到', '1000', '元', '每平米'] REF:  [['大概', '在', '200', '元到', '1000', '元'], ['200', '元到', '1000', '元']]
PRED: ['PE', '材料', '加', '保鲜膜', '复合', '而成'] REF:  [['XPE']]
PRED: ['广州', '东方', '英文', '书院'] REF:  [['广州', '东方', '英文', '书院']]
PRED: ['无锡', '嘉仕', '恒信', '医院'] REF:  [['无锡', '嘉仕', '恒信', '医院']]
PRED: ['属鸡'] REF:  [['鸡']]
PRED: ['55', '公斤'] REF:  [['55', '公斤']]
PRED: ['2016', '年', '9', '月', '10', '日至', '12', '日'] REF:  [['2016', '年', '9', '月', '10', '日至', '12', '日']]
PRED: ['750', '分'] REF:  [['750', '分']]
PRED: ['140MB', '/', 's'] REF:  [['闪迪酷', '豆', 'USB3.0'], ['闪迪', 'CZ43', '酷豆', '系列'], ['闪迪', 'CZ43', '酷豆', 'USB3.0'], ['闪迪']]
PRED: ['MIUI', '/', 'notes'] REF:  [['MIUI', '/', 'notes', '文件夹']]
Validation BLEU scores: 53.77
BLEU-1: 79.96, BLEU-2: 73.33, BLEU-3: 64.16, BLEU-4: 53.77
No improvement, patience_counter: 2/3

Epoch 8/10
-------------------------------


loss: 0.266385: 100%|███████████████████████████████████████████| 1634/1634 [07:55<00:00,  3.43it/s]
Evaluating valid: 100%|█████████████████████████| 88/88 [00:33<00:00,  2.67it/s]


Random debug samples:
PRED: ['长', '64cm', '×', '宽', '41cm'] REF:  [['长', '64cm', '×', '宽', '41cm', '×', '厚', '26cm']]
PRED: ['十天'] REF:  [['一般', '十天'], ['十天']]
PRED: ['1.48', '吨'] REF:  [['约', '为', '1.35', '-', '1.45', '吨'], ['1.35', '-', '1.45', '吨'], ['1.35', '-', '1.45']]
PRED: ['华康', '利爽'] REF:  [['华康', '利爽'], ['左', '氧氟沙星']]
PRED: ['宋欣佳怡'] REF:  [['宋欣佳怡']]
PRED: ['666.67', '平方米'] REF:  [['30000.15', '平方米']]
PRED: ['紫背', '菜'] REF:  [['紫背', '菜']]
PRED: ['122'] REF:  [['122']]
PRED: ['平安', '智慧', '星', '少儿', '万能', '险'] REF:  [['平安', '智慧', '星', '少儿', '万能', '险']]
PRED: ['今年', '中旬'] REF:  [['今年', '中旬']]
Validation BLEU scores: 53.56
BLEU-1: 79.65, BLEU-2: 73.11, BLEU-3: 63.85, BLEU-4: 53.56
No improvement, patience_counter: 3/3

Early stopping triggered at epoch 8.
Done!





### 训练数据
[wandb](https://wandb.ai/cocora14-the-university-of-sydney/text2text_generation/runs/v9s5tgxa?nw=nwusercocora14)

| Epoch | Loss     | BLEU-1 | BLEU-2 | BLEU-3 | BLEU-4 | Notes                    |
| ----- | -------- | ------ | ------ | ------ | ------ | ------------------------ |
| 1     | 0.945684 | 76.00  | 69.36  | 60.28  | 49.90  | save                     |
| 2     | 0.656441 | 79.15  | 72.33  | 63.15  | 52.75  | save                     |
| 3     | 0.520224 | 79.79  | 73.50  | 64.73  | 54.42  | save                     |
| 4     | 0.435098 | 79.56  | 72.91  | 64.02  | 53.87  | -                        |
| 5     | 0.375054 | 81.07  | 74.57  | 65.81  | 55.91  | save                     |
| 6     | 0.329643 | 79.92  | 73.21  | 64.14  | 53.90  | -                        |
| 7     | 0.294509 | 79.96  | 73.33  | 64.16  | 53.77  | -                        |
| 8     | 0.266385 | 79.65  | 73.11  | 63.85  | 53.56  | Early stop |


![loss](t5-base/train/Snipaste_2025-09-28_03-18-03.png)


# 3. Test Model
训练完成后，我们加载在验证集上性能最优的模型权重，汇报其在测试集上的性能，并且将模型的预测结果保存到文件中。

In [17]:
def test_model(model, output_path):

    model.eval()

    sources, preds, labels = [], [], []

    with torch.no_grad():
        print('evaluating on test set...')
        for batch_data in tqdm(test_dataloader):
            batch_data = {k: v.to(device) for k, v in batch_data.items() if isinstance(v, torch.Tensor)}

            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
                num_beams=beam_size,
                no_repeat_ngram_size=no_repeat_ngram_size,
            )

            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]

            # 转 numpy
            generated_tokens = generated_tokens.cpu().numpy()
            label_tokens = batch_data["labels"].cpu().numpy()
            label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)

            # 解码
            decoded_sources = tokenizer.batch_decode(
                batch_data["input_ids"].cpu().numpy(),
                skip_special_tokens=True,
                use_source_tokenizer=True
            )
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

            # 分词 & 保存
            sources += [s.strip() for s in decoded_sources]
            preds += [zh_tokenize(p.strip()) if p.strip() else ['empty'] for p in decoded_preds]
            labels += [[zh_tokenize(l.strip())] if l.strip() else [['empty']] for l in decoded_labels]

    P1 = corpus_bleu(labels, preds, weights=(1,0,0,0))
    P2 = corpus_bleu(labels, preds, weights=(0.5,0.5,0,0))
    P3 = corpus_bleu(labels, preds, weights=(0.33,0.33,0.33,0))
    P4 = corpus_bleu(labels, preds, weights=(0.25,0.25,0.25,0.25))

    print(f"Test BLEU-1: {P1*100:.2f}, BLEU-2: {P2*100:.2f}, BLEU-3: {P3*100:.2f}, BLEU-4: {P4*100:.2f}")

    results = []
    print('saving predicted results...')
    for source, pred, label in zip(sources, preds, labels):
        results.append({
            "sentence": source, 
            "prediction": pred, 
            "translation": label 
        })

    with open(output_path, 'wt', encoding='utf-8') as f:
        for example_result in results:
            f.write(json.dumps(example_result, ensure_ascii=False) + '\n')


In [18]:
# test before training
output_path = f'{output_dir}/test_before_training.json'
model.load_state_dict(torch.load('/home/med/selflearning/stage1-t5/model/mengzi-t5-base/pytorch_model.bin'))
test_model(model, output_path)

  model.load_state_dict(torch.load('/home/med/selflearning/stage1-t5/model/mengzi-t5-base/pytorch_model.bin'))


evaluating on test set...


100%|█████████████████████████████████████████| 182/182 [03:09<00:00,  1.04s/it]

Test BLEU-1: 6.76, BLEU-2: 4.51, BLEU-3: 3.07, BLEU-4: 1.94
saving predicted results...





In [20]:
# test after training
output_path = f'{output_dir}/test_after_training.json'
model.load_state_dict(torch.load(model_path))
test_model(model, output_path)

  model.load_state_dict(torch.load(model_path))


evaluating on test set...


100%|█████████████████████████████████████████| 182/182 [02:53<00:00,  1.05it/s]

Test BLEU-1: 76.61, BLEU-2: 67.71, BLEU-3: 57.07, BLEU-4: 45.76
saving predicted results...





### 测试结果
经过微调，模型在测试集上的 BLEU 值从 1.69 上升到 49.49，证明了我们对模型的微调是成功的。

训练前：Test BLEU-1: 6.76, BLEU-2: 4.51, BLEU-3: 3.07, BLEU-4: 1.94

训练后：Test BLEU-1: 76.61, BLEU-2: 67.71, BLEU-3: 57.07, BLEU-4: 45.76
