# 训练翻译模型

## Step1 导入相关包

In [None]:
!pip install datasets rouge_score evaluate peft sacrebleu

In [None]:
# Mount Google Drive so the dataset and output paths under /content/drive are reachable.
from google.colab import drive

drive.mount('/content/drive')

## Step2 加载数据集

In [None]:
import json

from datasets import Dataset

# Location of the JSONL training data; the commented-out line is a local-path alternative.
#input_file = "./translate_trans_data.jsonl"
input_file = "/content/drive/MyDrive/ai-learning/build_xinyi/translate_trans_data.jsonl"


def load_jsonl_data(file_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            # Blank or whitespace-only lines (e.g. a stray empty line at the
            # end of the file) carry no record; json.loads("") would raise.
            if not stripped:
                continue
            records.append(json.loads(stripped))
    return records


# Load the JSONL records and wrap them in a `Dataset`.
data = load_jsonl_data(input_file)
dataset = Dataset.from_list(data)
# Bare expression: shown as the cell output.
dataset

In [None]:
# Hold out 10% of the rows for evaluation. The fixed seed makes the split
# identical on every run, so evaluation scores stay comparable between runs.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

## Step3 数据预处理

- 在输入前加上提示符，以便 T5 知道这是一项翻译任务。一些能够执行多个 NLP 任务的模型需要提示特定任务。

- 通过 text_target 参数传入目标文本（信宜话），使分词器将其作为目标端处理并生成 labels。如果不通过 text_target 传入，目标文本会被当作源端输入来处理。

- 截断序列，使其长度不超过 max_length 参数设置的最大长度。

In [None]:
from transformers import AutoTokenizer

checkpoint = "Langboat/mengzi-t5-base"
# Load the tokenizer for the checkpoint (only the tokenizer is created in this cell).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Language codes for source and target; both are "zh", and neither name is
# referenced elsewhere in the visible part of this notebook.
source_lang = "zh"
target_lang = "zh"
# Instruction text prepended to every source sentence before tokenisation.
prefix = "翻译 中文 为 信宜话: "


def preprocess_function(examples):
    """Tokenise a batch of sentence pairs for seq2seq training.

    Args:
        examples: Mapping with "input_text" (an iterable of source strings)
            and "target_text" (the matching target texts).

    Returns:
        The tokenizer output for the prefixed sources, with the targets
        supplied through `text_target`; called with max_length=128 and
        truncation enabled.
    """
    prompted_sources = []
    for source in examples["input_text"]:
        # Every source sentence carries the task instruction in front.
        prompted_sources.append(prefix + source)
    return tokenizer(
        prompted_sources,
        text_target=examples["target_text"],
        max_length=128,
        truncation=True,
    )

In [None]:
# Preview on a one-row batch: slicing yields a dict of lists, the batched
# layout preprocess_function iterates over. Integer indexing would yield a dict
# of plain strings, and the function would then walk the sentence character by
# character.
preprocess_function(datasets["train"][:1])

In [None]:
# Tokenise every split; batched=True hands preprocess_function a dict of lists.
tokenized_datasets = datasets.map(preprocess_function, batched=True)
# Bare expression: shown as the cell output.
tokenized_datasets

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles the tokenised examples into batches for the trainer.
# NOTE(review): `model` is given the checkpoint name (a string), not a loaded
# model instance — confirm this is what the collator is meant to receive.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

## Step4 创建模型

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

## Step5 配置 LoRA（可选，下方代码已注释停用）

In [None]:
# from peft import LoraConfig, TaskType, get_peft_model
#
# lora_config = LoraConfig(
#     task_type=TaskType.SEQ_2_SEQ_LM,
#     r=8,
#     lora_alpha=16,
#     lora_dropout=0.1,
#     target_modules=["q", "v"]  # T5 names its attention projections "q"/"v", not "q_proj"/"v_proj"
# )

In [None]:
# # 应用 LoRA 配置到模型
# model = get_peft_model(model, lora_config)

## Step6 创建模型评估函数

In [None]:
import evaluate

# sacreBLEU scorer; compute_metrics calls metric.compute(...) on the decoded texts.
metric = evaluate.load("sacrebleu")

import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the BLEU metric.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A `(predictions, references)` tuple. `predictions` is a list of
        stripped strings; `references` is a list of one-element lists, each
        holding the stripped reference for the prediction at the same index.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # Each prediction gets its own list of references (here exactly one).
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: `(predictions, labels)` pair of token-id arrays; the
            predictions may arrive wrapped in a tuple, in which case the
            first element is used.

    Returns:
        Dict with "bleu" (the sacreBLEU score) and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, and cannot be
    # decoded. It can show up in the predictions as well as in the labels, so
    # both arrays are mapped back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Chinese text has no spaces between words, so the default sacreBLEU
    # tokenizer would see each sentence as a handful of long tokens. The "zh"
    # tokenizer segments Chinese characters before n-gram matching.
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenize="zh",
    )
    result = {"bleu": result["score"]}

    # Generated length = number of non-padding token ids per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Step7 配置训练参数

In [None]:
# Configure the training run.
import os

# Turn off Weights & Biases reporting for this run.
os.environ["WANDB_DISABLED"] = "true"
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # checkpoints are written here
    eval_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept
    num_train_epochs=2,
    predict_with_generate=True,  # evaluation uses generated sequences, which compute_metrics decodes
    fp16=True,  # half-precision training; change to bf16=True for XPU
)


In [None]:
## Step7 创建Trainer

In [None]:
# Build the trainer from the model, training arguments, tokenised splits,
# collator and metric function defined in the earlier cells.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    # NOTE(review): newer transformers releases name this argument
    # `processing_class` — confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

## Step8 训练模型


In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

## Step9 评估模型

In [None]:
# Mandarin sentence to convert. It is kept apart from the prompt so that the
# printout shows the sentence itself rather than the instruction prefix.
source_sentence = "你好，你喜欢吃什么小吃？"
input_sentence = prefix + source_sentence

# Convert Mandarin to the dialect with the fine-tuned model.
from transformers import pipeline

translator = pipeline("translation_zh_to_zh", model=model, tokenizer=tokenizer)

print("普通话:", source_sentence)
print("方言:", translator(input_sentence)[0]["translation_text"])

## Step10 保存模型

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory.
save_dir = "/content/drive/MyDrive/ai-learning/dialect_model/model/t5-02"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)