In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm
# from sacrebleu.metrics import BLEU
from nltk.translate.bleu_score import corpus_bleu
import jieba

import random
import numpy as np
import os
import json
from collections import defaultdict

from accelerate import Accelerator

import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
max_dataset_size = 200000
max_input_length = 512
max_target_length = 32
stride = 128
train_batch_size = 8
valid_batch_size = 8
learning_rate = 2e-5
epoch_num = 10
beam_size = 4
no_repeat_ngram_size = 2

seed = 5
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

save_dir = "/home/med/selflearning/stage1-t5/model/bin"
output_dir = '/home/med/selflearning/stage1-t5/data/DuReaderQG/output'
os.makedirs(output_dir, exist_ok=True)


# 1. 准备数据
## 1.1. 构建数据集
### 数据集
先编写继承自 Dataset 类的自定义数据集类用于组织样本和标签

DuReaderQG: 
- Train data size: 14520
- valid data size: 700

### 合并answers
训练集只有一个答案，验证集一个问题可能对应有多个参考答案，将相同question和context的所有answer合并进一个answers列表。

验证集数据格式：

`{"context": "还没有最后确定，暂定2017年", "question": "余罪第三季开播时间", "answers": ["暂定2017年", "2017年"], "id": 9}`

In [3]:
class T5(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        Data = {}

        with open(data_file, 'rt', encoding = 'utf-8') as f:
            for idx, line in enumerate(f):
                sample = json.loads(line.strip())
                Data[idx] = sample
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

train_data = T5('/home/med/selflearning/stage1-t5/data/DuReaderQG/train.json')
valid_data = T5('/home/med/selflearning/stage1-t5/data/DuReaderQG/dev_merged.json')

In [4]:
# Check data size and sample
print(f'Train data size: {len(train_data)}')
print(f'valid data size: {len(valid_data)}')

print(next(iter(train_data)))
print(next(iter(valid_data)))

Train data size: 14520
valid data size: 700
{'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。', 'answer': '第35集', 'question': '仙剑奇侠传3第几集上天界', 'id': 0}
{'context': '年基准利率4.35%。 从实际看,贷款的基本条件是: 一是中国大陆居民,年龄在60岁以下; 二是有稳定的住址和工作或经营地点; 三是有稳定的收入来源; 四是无不良信用记录,贷款用途不能作为炒股,赌博等行为; 五是具有完全民事行为能力。', 'question': '2017年银行贷款基准利率', 'answers': ['年基准利率4.35%', '4.35%'], 'id': 1}


## 1.2. 数据预处理
### T5Tokenizer
AutoTokenizer -> T5Tokenizer
> 第五章：模型与分词器
> 
> 调用 Tokenizer.save_pretrained() 函数会在保存路径下创建三个文件：
> 
> - special_tokens_map.json：映射文件，里面包含 unknown token 等特殊字符的映射关系；
> - tokenizer_config.json：分词器配置文件，存储构建分词器需要的参数；
> - vocab.txt：词表，一行一个 token，行号就是对应的 token ID（从 0 开始）。
>
- ❌ mengzi使用modelscope pipeline，没有tokenizer.json，Autotokenizer加载失败
- ✅ 明确使用 T5Tokenizer，避免 AutoTokenizer 转换失败

In [5]:
model_checkpoint = "/home/med/selflearning/stage1-t5/model/mengzi-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# Check the tokenization result
sample = train_data[0]
context = sample['context']
question = sample['question']
answer = sample['answer']

inputs = tokenizer(
    question,
    context,
    max_length=300,
    truncation="only_second",
    return_overflowing_tokens=True,
    stride=50,
)

print(inputs)
print(tokenizer.convert_ids_to_tokens(inputs.input_ids))

{'overflowing_tokens': [], 'num_truncated_tokens': -34, 'input_ids': [1707, 1467, 992, 3979, 707, 100, 379, 645, 647, 9724, 1252, 1, 379, 2838, 647, 843, 408, 10694, 17973, 1763, 3, 1276, 87, 84, 2744, 84, 1419, 6141, 3, 122, 6409, 9, 2177, 17534, 5, 1707, 1468, 11725, 229, 3, 408, 5542, 119, 28060, 3, 18440, 3844, 4, 5542, 2190, 1468, 3, 54, 1069, 12914, 83, 5665, 335, 215, 9, 4514, 17339, 69, 4, 843, 408, 1200, 3771, 1902, 10, 3, 6625, 8435, 603, 1100, 3, 266, 119, 1954, 4, 5542, 145, 711, 27530, 11755, 3, 13737, 21, 304, 3779, 68, 843, 408, 5, 21769, 7074, 4, 122, 6409, 5184, 299, 854, 5, 21769, 3, 299, 854, 720, 267, 448, 756, 58, 807, 2037, 87, 1252, 481, 15, 2125, 4, 122, 6409, 3412, 1707, 1468, 3, 5542, 559, 6861, 532, 478, 3, 978, 87, 1252, 9948, 4, 5542, 1904, 39, 4380, 144, 3, 122, 6409, 2078, 3, 1990, 1252, 9, 87, 1252, 9694, 4, 153, 1990, 1252, 446, 137, 260, 1990, 73, 2088, 3, 12287, 2190, 87, 4, 5542, 229, 1990, 1252, 5903, 3, 4848, 578, 39, 13026, 20402, 1877, 3, 57, 227

### T5训练模板
> 第四章：开箱即用的 pipelines / 这些 pipeline 背后做了什么？
> 1. 预处理 (preprocessing)，将原始文本转换为模型可以接受的输入格式；
> 2. 将处理好的输入送入模型；
> 3. 对模型的输出进行后处理 (postprocessing)，将其转换为人类方便阅读的格式
>

- inputs: `question <extra_id_0> context`
- targets: `<extra_id_0> answer <extra_id_1>`

### collate_fn
> 直接使用 Transformers 库自带的 AutoModelForSeq2SeqLM 函数来构建模型，因此我们需要将每一个 batch 中的数据处理为该模型可接受的格式：一个包含 'input_ids'、'attention_mask'、'labels' 和 'decoder_input_ids' 键的字典。

> 与我们之前任务中使用的纯 Encoder 模型不同，Seq2Seq 任务对应的模型采用的是 Encoder-Decoder 框架：Encoder 负责编码输入序列，Decoder 负责循环地逐个生成输出 token。因此，对于每一个样本，我们还需要额外准备 decoder input IDs 作为 Decoder 的输入。decoder input IDs 是标签序列的移位，在序列的开始位置增加了一个特殊的“序列起始符”。
>

这里用了 `text_target=`，它会在返回的 `batch_data` 里 自动生成：
- `input_ids`（来自 `batch_inputs`）
- `labels`（来自 `text_target`，已经右移处理）
训练时，`T5ForConditionalGeneration` 的 `forward()` 会根据 `labels` 自动构造 `decoder_input_ids`（内部调用 `shift_tokens_right`）。不用手动传 `decoder_input_ids`。

In [7]:
def collate_fn(batch_samples):
    batch_inputs, batch_targets = [], []

    for sample in batch_samples:
        batch_inputs.append(sample['question'].strip() + ' <extra_id_0> ' + sample['context'].strip())
        batch_targets.append('<extra_id_0> ' + sample['answer'].strip() + ' <extra_id_1>')

    # print(batch_inputs)
    # print(batch_targets)
    
    batch_data = tokenizer(
        batch_inputs,               # 作为 encoder 的输入文本列表
        text_target=batch_targets,  # 作为 decoder 的目标文本（transformers 的 T5 支持 text_target 直接生成 labels，已经右移处理）
        padding=True,               # 对 batch 内样本进行 padding 到相同长度（默认 pad 到最长样本长度）
        max_length=max_input_length,    # 超过该长度则截断
        truncation=True,            # 允许截断
        return_tensors="pt"         # 返回 PyTorch 张量（dict 中包含 input_ids, attention_mask, labels 等）
    )

    batch_data['labels'][batch_data['labels'] == tokenizer.pad_token_id] = -100 # 将 labels 中等于 pad_token_id 的位置替换为 -100
    
    return batch_data               # 返回一个 dict（PyTorch 张量），可以直接被模型的 forward(**batch_data) 或 Trainer 使用。

train_dataloader = DataLoader(
    train_data,                     # 训练数据集（实现了 __len__ 和 __getitem__）
    batch_size=train_batch_size,    # 每个 batch 的样本数
    shuffle=True,                   # 每个 epoch 前打乱数据（训练时常用）
    collate_fn=collate_fn           # 自定义的 collate 函数（用于将多个样本合并为一个 batch）
    )


### collate_fn_valid
- tensor_data：tensor 类型的数据（模型输入、labels）

- list_data：list 类型的数据（多参考答案 all_answers）

In [8]:
def collate_fn_valid(batch_samples):
    """
    专门用于验证集的 collate_fn，处理多参考答案
    """
    batch_inputs, batch_targets, batch_all_answers = [], [], []

    for sample in batch_samples:
        batch_inputs.append(sample['question'].strip() + ' <extra_id_0> ' + sample['context'].strip())
        batch_targets.append('<extra_id_0> ' + sample['answers'][0].strip() + ' <extra_id_1>')  # 用第一个答案作为 decoder target 占位

        batch_all_answers.append([ans.strip() for ans in sample['answers']])  # 保存该样本的所有参考答案列表

    tensor_data = tokenizer(
        batch_inputs,
        text_target=batch_targets,
        padding=True,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    )

    tensor_data['labels'][tensor_data['labels'] == tokenizer.pad_token_id] = -100

    # 返回字典，区分 tensor 和 list 数据
    return {
        'tensor_data': tensor_data,       # 包含 input_ids, attention_mask, labels
        'list_data': {
            'all_answers': batch_all_answers
        }
    }

valid_dataloader = DataLoader(
    valid_data,                     # 验证数据集
    batch_size=valid_batch_size,    
    shuffle=False, 
    collate_fn=collate_fn_valid
)

In [9]:
# Check the batch data
batch = next(iter(train_dataloader))
print(batch.keys())
print('batch shape:', {k: v.shape for k, v in batch.items()})
print(batch)

dict_keys(['input_ids', 'attention_mask', 'labels'])
batch shape: {'input_ids': torch.Size([8, 366]), 'attention_mask': torch.Size([8, 366]), 'labels': torch.Size([8, 12])}
{'input_ids': tensor([[  789,  4671,  2119,  ...,     0,     0,     0],
        [16500,   586,    50,  ...,     0,     0,     0],
        [ 9856,   813, 15130,  ...,     0,     0,     0],
        ...,
        [  875,  5603,  9807,  ...,  1152, 10646,     1],
        [   39,  7085, 27922,  ...,     0,     0,     0],
        [  253,    39,  2421,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[32127,  5332, 21781, 32126,     1,  -100,  -100,  -100,  -100,  -100,
          -100,  -100],
        [32127,  9373,  3680,   840, 32126,     1,  -100,  -100,  -100,  -100,
          -100,  -100

# 2. 训练模型
## 2.1. 构建模型
直接使用 Transformers 库自带的 AutoModelForSeq2SeqLM 函数来构建模型

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

Using cuda device


  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


## 2.2. Train Loop
使用 AutoModelForSeq2SeqLM 构造的模型已经封装好了对应的损失函数，并且计算出的损失会直接包含在模型的输出 outputs 中，可以直接通过 outputs.loss 获得，因此训练循环为：

In [11]:
def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(dataloader, desc=f'Epoch {epoch}', ncols=100)

    finish_batch_num = (epoch-1) * len(dataloader)
    
    model.train()
    
    for batch, batch_data in enumerate(dataloader, start=1):
        batch_data = {k: v.to(device) for k, v in batch_data.items() if isinstance(v, torch.Tensor)}
                
        outputs = model(**batch_data)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        avg_loss = total_loss / (finish_batch_num + batch)
        progress_bar.set_description(f'loss: {avg_loss:>7f}')
        progress_bar.update(1)

        wandb.log({"train/loss": avg_loss}, step=finish_batch_num + batch)

    return total_loss


## 2.3. Test Loop

### 生成预测
> 使用 AutoModelForSeq2SeqLM 构造的模型同样对 Decoder 的解码过程进行了封装，我们只需要调用模型的 generate() 函数就可以自动地逐个生成预测 token。
> 
- `generate()` 使用 AutoModelForSeq2SeqLM 构造的模型同样对 Decoder 的解码过程进行了封装，我们只需要调用模型的 generate() 函数就可以自动地逐个生成预测 token

- `tokenizer.batch_decode()` 在 generate() 生成 token ID 之后，我们通过分词器自带的 tokenizer.batch_decode() 函数将 batch 中所有的 token ID 序列都转换为文本

### jieba
- ❌ 如果用`bleu = BLUE(tokenize = 'zh')`，英文分词按照字母切
- ✅ jieba.cut(text.strip()) 用 jieba 对中文进行分词，英文和数字默认按字符切

### BLEU
- 计算 BLEU-1~4
  - ❌ `sacrebleu` 默认标准 BLEU-4
  - ✅ `nltk.translate.bleu_score` 可自定义权重

- weights 决定 n-gram 权重：
  - BLEU-1：只计算 unigram
  - BLEU-2：计算 unigram + bigram
  - BLEU-3：计算 1~3 gram
  - BLEU-4：标准 BLEU-4

- `corpus_bleu` 会对整个 corpus 做平均，使用 list of list 自动选择最匹配的label

> 问题：BLEU对短文本效果不好
> BLEU 依赖于词序的精确 N-gram 重叠。但在简短回答或摘要任务中：
> 1. 答案多样性高： 一个问题的正确答案可能有多种表达方式，即使语义完全一致，词序和词汇也可能不同。
> 2. 长度短： 简短回答通常很短，导致 4-gram 匹配难度极大，BLEU 分数容易不合理地降为 0。
> 3. 强调召回率： 简短回答更看重关键信息的覆盖率（即信息是否都答到了），而 BLEU 本身是一个精度指标，对漏掉关键信息惩罚不足。

In [None]:
def zh_tokenize(text):
    """中文 + 混合英文分词"""
    return list(jieba.cut(text.strip())) if text.strip() else ['empty']

def test_loop(dataloader, model, mode='valid', debug_sample_num=10):
    preds, labels = [], []
    
    model.eval()

    for batch_idx, batch_data in enumerate(tqdm(dataloader, desc=f"Evaluating {mode}")):
        # tensor 类型的数据（input_ids、labels等） 和 list 类型的数据（all_answers）分开处理
        batch_data_tensor = {k: v.to(device) for k, v in batch_data['tensor_data'].items() if isinstance(v, torch.Tensor)}
        batch_data_list = batch_data['list_data']
        
        # 生成预测
        with torch.no_grad():
            generated_tokens = model.generate(
                input_ids=batch_data_tensor["input_ids"],
                attention_mask=batch_data_tensor["attention_mask"],
                max_length=max_target_length,
                num_beams=beam_size,
                no_repeat_ngram_size=no_repeat_ngram_size,
            )
        
        if isinstance(generated_tokens, tuple):
            generated_tokens = generated_tokens[0]
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)  # batch_decode 把 token id 转成文本字符串

        # 遍历每条样本
        for i, pred_text in enumerate(decoded_preds):
            pred_tokens = zh_tokenize(pred_text)
            label_tokens = [zh_tokenize(r) for r in batch_data_list['all_answers'][i]]  # list of list
            preds.append(pred_tokens)   # preds：存放模型预测的 token 列表（每条样本一个 list）
            labels.append(label_tokens) # labels：存放该预测对应的 所有参考答案 token 列表（list of list）

    # 随机打印debug_sample_num个样本
    if debug_sample_num > 0:
        sample_indices = random.sample(range(len(preds)), min(debug_sample_num, len(preds)))
        print("\nRandom debug samples:")
        for idx in sample_indices:
            print(f"PRED: {preds[idx]}\nREF:  {labels[idx]}\n")

    # 计算 BLEU-1 ~ BLEU-4
    P1 = corpus_bleu(labels, preds, weights=(1,0,0,0))
    P2 = corpus_bleu(labels, preds, weights=(0.5,0.5,0,0))
    P3 = corpus_bleu(labels, preds, weights=(0.33,0.33,0.33,0))
    P4 = corpus_bleu(labels, preds, weights=(0.25,0.25,0.25,0.25))

    class Result:
        pass

    result = Result()
    result.score = P4 * 100
    result.precisions = (P1 * 100, P2 * 100, P3 * 100, P4 * 100)

    return result

## 2.4. Save Model


In [14]:
wandb.init(
    project="text2text_generation",
    name="t5_train",
)
wandb.config.update({
    "learning_rate": learning_rate,
    "epochs": epoch_num,
    "batch_size": train_batch_size,
})

[34m[1mwandb[0m: Currently logged in as: [33mcocora14[0m ([33mcocora14-the-university-of-sydney[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### 优化器
使用 AdamW 优化器，适合 Transformer 类模型。
- model.parameters()：告诉优化器哪些参数需要更新。
- lr=learning_rate：学习率。
- 作用：控制参数更新策略（带权重衰减的 Adam）。

### Accelerator 封装
```bash
(t5) med@server:~$ nvidia-smi
Sun Sep 28 01:46:54 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:18:00.0 Off |                  N/A |
| 49%   84C    P2             239W / 350W |  15720MiB / 24576MiB |     96%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        Off | 00000000:3B:00.0 Off |                  N/A |
| 30%   33C    P8              11W / 350W |      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   2  NVIDIA GeForce RTX 3090        Off | 00000000:5E:00.0 Off |                  N/A |
|  0%   27C    P8               9W / 370W |      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   3  NVIDIA GeForce RTX 3090        Off | 00000000:86:00.0 Off |                  N/A |
| 30%   28C    P8              10W / 350W |      3MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A    179783      C   ...ed/anaconda3/envs/t5/bin/python3.11    15714MiB |
+---------------------------------------------------------------------------------------+
```

### 学习率调度器
使用 线性衰减学习率：从初始 lr 线性下降到 0。

- 参数：
  - num_warmup_steps=0 → 没有预热阶段
  - num_training_steps=epoch_num * len(train_dataloader) → 总训练步数
- 作用：帮助训练更稳定，尤其在 Transformer 模型中常用。

In [None]:
# 使用 AdamW 优化器
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Accelerator 封装
accelerator = Accelerator()
model, optimizer, train_dataloader, valid_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, valid_dataloader
)

# 学习率调度器
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num*len(train_dataloader),
)

# 初始化指标
total_loss = 0.
best_bleu = 0.

# 训练 + 验证循环
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)
    valid_bleu = test_loop(valid_dataloader, model, mode = 'valid', debug_sample_num=10)

    print(f"Validation BLEU scores: {valid_bleu.score:.2f}")
    P1, P2, P3, P4 = valid_bleu.precisions
    print(f"BLEU-1: {P1:.2f}, BLEU-2: {P2:.2f}, BLEU-3: {P3:.2f}, BLEU-4: {P4:.2f}")

    if valid_bleu.score >= best_bleu:
        best_bleu = valid_bleu.score
        print('saving new weights...\n')
        model_path = os.path.join(save_dir, f"epoch_{t+1}_valid_BLEU_{valid_bleu.score:0.2f}_model_weights.bin")
        torch.save(model.state_dict(), model_path)
        print(f"Model weights saved to: {model_path}\n")
    
print("Done!")


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch 1/10
-------------------------------


loss: 0.523445: 100%|███████████████████████████████████████████| 1815/1815 [08:10<00:00,  3.70it/s]
Evaluating valid:   0%|                                  | 0/88 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.590 seconds.
Prefix dict has been built successfully.
Evaluating valid:   1%|▎                         | 1/88 [00:01<01:43,  1.19s/it]

PRED: ['B', '.', 'C', '.'] REF: [['B', '.', 'C', '.', 'E', '.'], ['B', '.', 'C', '.']]


Evaluating valid:   2%|▌                         | 2/88 [00:01<01:01,  1.39it/s]

PRED: ['密歇根州'] REF: [['密歇根州'], ['密歇根州', '（', 'Michigan', '）'], ['Michigan']]


Evaluating valid:   3%|▉                         | 3/88 [00:01<00:44,  1.93it/s]

PRED: ['4.35%'] REF: [['年', '基准利率', '4.35%'], ['4.35%']]


Evaluating valid: 100%|█████████████████████████| 88/88 [00:35<00:00,  2.47it/s]


Validation BLEU scores: 53.62
BLEU-1: 78.40, BLEU-2: 72.08, BLEU-3: 63.51, BLEU-4: 53.62
saving new weights...

Model weights saved to: /home/med/selflearning/stage1-t5/model/bin/epoch_1_valid_BLEU_53.62_model_weights.bin

Epoch 2/10
-------------------------------


loss: 0.392152: 100%|███████████████████████████████████████████| 1815/1815 [08:42<00:00,  3.48it/s]
Evaluating valid:   1%|▎                         | 1/88 [00:00<00:27,  3.12it/s]

PRED: ['10.5'] REF: [['9']]


Evaluating valid:   2%|▌                         | 2/88 [00:00<00:30,  2.82it/s]

PRED: ['广州', '东方', '英文', '书院'] REF: [['广州', '东方', '英文', '书院']]


Evaluating valid:   3%|▉                         | 3/88 [00:00<00:27,  3.08it/s]

PRED: ['4.35%'] REF: [['年', '基准利率', '4.35%'], ['4.35%']]


Evaluating valid: 100%|█████████████████████████| 88/88 [00:31<00:00,  2.76it/s]


Validation BLEU scores: 52.66
BLEU-1: 78.37, BLEU-2: 71.59, BLEU-3: 62.69, BLEU-4: 52.66
Epoch 3/10
-------------------------------


loss: 0.313256: 100%|███████████████████████████████████████████| 1815/1815 [08:37<00:00,  3.51it/s]
Evaluating valid:   1%|▎                         | 1/88 [00:00<00:32,  2.65it/s]

PRED: ['550', '--', '600MM'] REF: [['以', '40', '至', '60', '厘米', '为宜'], ['40', '至', '60', '厘米']]


Evaluating valid:   2%|▌                         | 2/88 [00:00<00:31,  2.74it/s]

PRED: ['U', '系列'] REF: [['U', '系列']]


Evaluating valid:   3%|▉                         | 3/88 [00:01<00:27,  3.04it/s]

PRED: ['密歇根州'] REF: [['密歇根州'], ['密歇根州', '（', 'Michigan', '）'], ['Michigan']]


Evaluating valid: 100%|█████████████████████████| 88/88 [00:32<00:00,  2.71it/s]


Validation BLEU scores: 52.74
BLEU-1: 78.69, BLEU-2: 71.89, BLEU-3: 62.88, BLEU-4: 52.74
Epoch 4/10
-------------------------------


loss: 0.259682: 100%|███████████████████████████████████████████| 1815/1815 [08:41<00:00,  3.48it/s]
Evaluating valid:   1%|▎                         | 1/88 [00:00<00:29,  2.99it/s]

PRED: ['10.5'] REF: [['9']]


Evaluating valid:   2%|▌                         | 2/88 [00:00<00:30,  2.84it/s]

PRED: ['广州', '东方', '英文', '书院'] REF: [['广州', '东方', '英文', '书院']]


Evaluating valid:   3%|▉                         | 3/88 [00:01<00:28,  3.04it/s]

PRED: ['5000', '以上'] REF: [['5000', '以上']]


Evaluating valid: 100%|█████████████████████████| 88/88 [00:35<00:00,  2.50it/s]


Validation BLEU scores: 55.64
BLEU-1: 80.75, BLEU-2: 74.13, BLEU-3: 65.58, BLEU-4: 55.64
saving new weights...

Model weights saved to: /home/med/selflearning/stage1-t5/model/bin/epoch_4_valid_BLEU_55.64_model_weights.bin

Epoch 5/10
-------------------------------


loss: 0.221540: 100%|███████████████████████████████████████████| 1815/1815 [08:39<00:00,  3.50it/s]
Evaluating valid:   1%|▎                         | 1/88 [00:00<00:31,  2.81it/s]

PRED: ['U', '系列'] REF: [['U', '系列']]


Evaluating valid:   2%|▌                         | 2/88 [00:00<00:30,  2.81it/s]

PRED: ['广州', '东方', '英文', '书院'] REF: [['广州', '东方', '英文', '书院']]


Evaluating valid:   3%|▉                         | 3/88 [00:01<00:28,  3.02it/s]

PRED: ['4.35%'] REF: [['年', '基准利率', '4.35%'], ['4.35%']]


Evaluating valid: 100%|█████████████████████████| 88/88 [00:31<00:00,  2.77it/s]


Validation BLEU scores: 53.37
BLEU-1: 79.09, BLEU-2: 72.48, BLEU-3: 63.67, BLEU-4: 53.37
Epoch 6/10
-------------------------------


loss: 0.211150:  33%|██████████████▍                             | 596/1815 [02:51<04:25,  4.59it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

loss: 0.170801: 100%|███████████████████████████████████████████| 1815/1815 [08:43<00:00,  3.47it/s]
Evaluating valid:   1%|▎                         | 1/88 [00:00<00:27,  3.17it/s]

PRED: ['550', '--', '600MM'] REF: [['以', '40', '至', '60', '厘米', '为宜'], ['40', '至', '60', '厘米']]


Evaluating valid:   2%|▌                         | 2/88 [00:00<00:30,  2.78it/s]

PRED: ['5000', '以上'] REF: [['5000', '以上']]


Evaluating valid:   3%|▉                         | 3/88 [00:01<00:28,  2.95it/s]

PRED: ['550', '--', '600MM'] REF: [['以', '40', '至', '60', '厘米', '为宜'], ['40', '至', '60', '厘米']]


Evaluating valid: 100%|█████████████████████████| 88/88 [00:31<00:00,  2.80it/s]


Validation BLEU scores: 52.36
BLEU-1: 79.18, BLEU-2: 72.28, BLEU-3: 62.86, BLEU-4: 52.36
Epoch 8/10
-------------------------------


loss: 0.152993: 100%|███████████████████████████████████████████| 1815/1815 [08:44<00:00,  3.46it/s]
Evaluating valid:   1%|▎                         | 1/88 [00:00<00:28,  3.01it/s]

PRED: ['U', '系列'] REF: [['U', '系列']]


Evaluating valid:   2%|▌                         | 2/88 [00:00<00:29,  2.89it/s]

PRED: ['550', '--', '600MM'] REF: [['以', '40', '至', '60', '厘米', '为宜'], ['40', '至', '60', '厘米']]


Evaluating valid:   3%|▉                         | 3/88 [00:00<00:26,  3.17it/s]

PRED: ['10.5'] REF: [['9']]


Evaluating valid: 100%|█████████████████████████| 88/88 [00:30<00:00,  2.85it/s]


Validation BLEU scores: 53.05
BLEU-1: 80.36, BLEU-2: 73.36, BLEU-3: 63.86, BLEU-4: 53.05
Epoch 9/10
-------------------------------


loss: 0.138688: 100%|███████████████████████████████████████████| 1815/1815 [08:42<00:00,  3.47it/s]
Evaluating valid:   1%|▎                         | 1/88 [00:00<00:29,  2.98it/s]

PRED: ['550', '--', '600MM'] REF: [['以', '40', '至', '60', '厘米', '为宜'], ['40', '至', '60', '厘米']]


Evaluating valid:   2%|▌                         | 2/88 [00:00<00:30,  2.83it/s]

PRED: ['550', '--', '600MM'] REF: [['以', '40', '至', '60', '厘米', '为宜'], ['40', '至', '60', '厘米']]


Evaluating valid:   3%|▉                         | 3/88 [00:00<00:27,  3.06it/s]

PRED: ['4.35%'] REF: [['年', '基准利率', '4.35%'], ['4.35%']]


Evaluating valid: 100%|█████████████████████████| 88/88 [00:31<00:00,  2.82it/s]


Validation BLEU scores: 53.68
BLEU-1: 80.60, BLEU-2: 73.80, BLEU-3: 64.31, BLEU-4: 53.68
Epoch 10/10
-------------------------------


loss: 0.133046:  45%|███████████████████▋                        | 810/1815 [03:50<04:05,  4.10it/s]

# 3. Test Model

In [None]:
model.load_state_dict(torch.load('/home/med/selflearning/stage1-t5/model/bin/epoch_10_valid_BLEU_100.0000_model_weights.bin'))

model.eval()

sources, preds, labels = [], [], []

with torch.no_grad():
    print('evaluating on test set...')
    for batch_data in tqdm(valid_dataloader):
        batch_data = batch_data.to(device)
        generated_tokens = model.generate(
            batch_data["input_ids"],
            attention_mask=batch_data["attention_mask"],
            max_length=max_target_length,
            num_beams=beam_size,
            no_repeat_ngram_size=no_repeat_ngram_size,
        ).cpu().numpy()

        if isinstance(generated_tokens, tuple):
            generated_tokens = generated_tokens[0]

        label_tokens = batch_data["labels"].cpu().numpy()
        label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)

        decoded_sources = tokenizer.batch_decode(
            batch_data["input_ids"].cpu().numpy(), 
            skip_special_tokens=True, 
            use_source_tokenizer=True
        )
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        sources += [s.strip() for s in decoded_sources]
        preds += [p.strip() if p.strip() else 'empty' for p in decoded_preds]
        labels += [l.strip() if l.strip() else 'empty' for l in decoded_labels]

refs = [[l] for l in labels]
bleu_score = corpus_bleu(preds, refs).score
print(f"Test BLEU: {bleu_score:>0.2f}\n")

results = []
print('saving predicted results...')
for source, pred, label in zip(sources, preds, labels):
    results.append({
        "sentence": source, 
        "prediction": pred, 
        "translation": label 
    })

with open(f'{output_dir}/test_data_pred.json', 'wt', encoding='utf-8') as f:
    for example_result in results:
        f.write(json.dumps(example_result, ensure_ascii=False) + '\n')


  model.load_state_dict(torch.load('/home/med/selflearning/stage1-t5/model/bin/epoch_10_valid_BLEU_100.0000_model_weights.bin'))


FileNotFoundError: [Errno 2] No such file or directory: '/home/med/selflearning/stage1-t5/model/bin/epoch_10_valid_BLEU_100.0000_model_weights.bin'

[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mt5_train[0m at: [34m[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20250928_012245-qhxw4nri/logs[0m
[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mt5_train[0m at: [34m[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20250928_012255-rjqifjt5/logs[0m
