In [6]:
import json
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer


# 1. 加载 预训练模型 和 分词器
model_name="langboat/mengzi-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


# 2. 读取数据集
def load_json(file_path):
    data=[]
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_data = load_json('data/train.json')
valid_data = load_json('data/dev.json')

print(train_data[:5])   

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


[{'context': '第35集雪见缓缓张开眼睛，景天又惊又喜之际，长卿和紫萱的仙船驶至，见众人无恙，也十分高兴。众人登船，用尽合力把自身的真气和水分输给她。雪见终于醒过来了，但却一脸木然，全无反应。众人向常胤求助，却发现人世界竟没有雪见的身世纪录。长卿询问清微的身世，清微语带双关说一切上了天界便有答案。长卿驾驶仙船，众人决定立马动身，往天界而去。众人来到一荒山，长卿指出，魔界和天界相连。由魔界进入通过神魔之井，便可登天。众人至魔界入口，仿若一黑色的蝙蝠洞，但始终无法进入。后来花楹发现只要有翅膀便能飞入。于是景天等人打下许多乌鸦，模仿重楼的翅膀，制作数对翅膀状巨物。刚佩戴在身，便被吸入洞口。众人摔落在地，抬头发现魔界守卫。景天和众魔套交情，自称和魔尊重楼相熟，众魔不理，打了起来。', 'answer': '第35集', 'question': '仙剑奇侠传3第几集上天界', 'id': 0}, {'context': '选择燃气热水器时，一定要关注这几个问题：1、出水稳定性要好，不能出现忽热忽冷的现象2、快速到达设定的需求水温3、操作要智能、方便4、安全性要好，要装有安全报警装置 市场上燃气热水器品牌众多，购买时还需多加对比和仔细鉴别。方太今年主打的磁化恒温热水器在使用体验方面做了全面升级：9秒速热，可快速进入洗浴模式；水温持久稳定，不会出现忽热忽冷的现象，并通过水量伺服技术将出水温度精确控制在±0.5℃，可满足家里宝贝敏感肌肤洗护需求；配备CO和CH4双气体报警装置更安全（市场上一般多为CO单气体报警）。另外，这款热水器还有智能WIFI互联功能，只需下载个手机APP即可用手机远程操作热水器，实现精准调节水温，满足家人多样化的洗浴需求。当然方太的磁化恒温系列主要的是增加磁化功能，可以有效吸附水中的铁锈、铁屑等微小杂质，防止细菌滋生，使沐浴水质更洁净，长期使用磁化水沐浴更利于身体健康。', 'answer': '方太', 'question': '燃气热水器哪个牌子好', 'id': 1}, {'context': '迈克尔.乔丹在NBA打了15个赛季。他在84年进入nba，期间在1993年10月6日第一次退役改打棒球，95年3月18日重新回归，在99年1月13日第二次退役，后于2001年10月31日复出，在03年最终退役。迈克尔·乔丹（Michael Jorda

In [7]:
print(len(train_data))
print(len(valid_data))

14520
984


In [8]:
# 3. 转换为HUGGINGFACE格式
from datasets import Dataset


train_dataset=Dataset.from_list(train_data)
valid_dataset=Dataset.from_list(valid_data)
print(train_dataset.shape)
print(train_dataset)

(14520, 4)
Dataset({
    features: ['context', 'answer', 'question', 'id'],
    num_rows: 14520
})


In [9]:
# 4. 预处理
# 拼接question和context作为输入，answer作为输出，训练模型

max_input_length = 512
max_output_length = 64

def preprocess_function(data):
    # 拼接question和context作为输入
    inputs = ["question: " + q + " context: " + c for q, c in zip(data["question"], data["context"])]
    # 转换为token_id
    model_inputs = tokenizer(
        inputs, 
        max_length=max_input_length, 
        padding="max_length", 
        truncation=True
        )
    # 标签
    labels = tokenizer(
        data["answer"], 
        max_length=max_output_length, 
        padding="max_length", 
        truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 调用预处理函数，将原始文本数据转换为模型输入，input_ids和attention_mask，以及labels
train_datasets = train_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=train_dataset.column_names
    )
valid_datasets = valid_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=valid_dataset.column_names
    )
print(train_datasets.column_names)


Map: 100%|██████████| 14520/14520 [00:09<00:00, 1552.31 examples/s]
Map: 100%|██████████| 984/984 [00:00<00:00, 1588.93 examples/s]

['input_ids', 'attention_mask', 'labels']





In [10]:
import jieba
from transformers import DataCollatorForSeq2Seq
import evaluate
# 5. 数据收集器 (DataCollatorForSeq2Seq)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 6. 评价指标 (BLEU)
bleu_metric = evaluate.load("bleu")

def compute_metrics(eval_pred):
    # trainer会把每一轮的结果传给compute_metrics
    # eval_pred包含两个元素，第一个元素是模型预测的结果，第二个元素是标签
    predictions, labels = eval_pred
    # 把token id转换为文本
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 分词 (BLEU 需要以词为单位)
    decoded_preds = [list(jieba.cut(pred)) for pred in decoded_preds]
    decoded_labels = [[list(jieba.cut(label))] for label in decoded_labels]

    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}


In [14]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir="./t5-qa-output",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
)

# 🔹 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_datasets,
    eval_dataset=valid_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [15]:
trainer.train()

# 🔹 10. 训练完成后，保存模型
trainer.save_model("./t5-qa-finetuned")
tokenizer.save_pretrained("./t5-qa-finetuned")

Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 7.11 GiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 11.29 GiB is allocated by PyTorch, and 2.24 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)