# 模型测试

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the Drive-based checkpoint and dataset
# paths used later in this notebook live under that mount point.
drive.mount('/content/drive')

## 翻译模型测试

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Local checkpoint directory
checkpoint = "./model/t5-02"

# Google Drive alternative (uncomment to load the checkpoint from the mounted Drive)
#checkpoint = "/content/drive/MyDrive/ai-learning/dialect_model/model/t5-02"

# Initialise the tokenizer from the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the seq2seq model from the same checkpoint directory
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Mandarin input text, preceded by the task prefix given to the model
prefix = "翻译 中文 为 信宜话: "
text = "你好，你喜欢吃什么小吃？"
input_sentence = prefix + text

# Run the conversion (Mandarin -> dialect)
from transformers import pipeline

translator = pipeline("translation_zh_to_zh", model=model, tokenizer=tokenizer)

# Print the source sentence and the first translation returned by the pipeline
print("普通话:", text)
print("方言:", translator(input_sentence)[0]["translation_text"])

## 翻译语料

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Local alternative (uncomment to load the checkpoint from the working directory)
#checkpoint = "./model/t5-02"

# Checkpoint directory on the mounted Google Drive
checkpoint = "/content/drive/MyDrive/ai-learning/dialect_model/model/t5-02"

# Initialise the tokenizer from the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the seq2seq model from the same checkpoint directory
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
import json

import torch
import tqdm
from transformers import pipeline

# Run on the first GPU when one is available; otherwise fall back to CPU (-1)
# so the cell still works on a CPU-only runtime instead of failing here.
translator = pipeline(
    "translation_zh_to_zh",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# JSONL corpus to translate and the JSONL file that receives the translations
input_file = "/content/drive/MyDrive/ai-learning/dialect_model/dataset/train.jsonl"
output_file = "/content/drive/MyDrive/ai-learning/dialect_model/dataset/chat_trans2.jsonl"
# Task prefix prepended to each question before translation.
# NOTE(review): the single-sentence test cell in this notebook uses the same
# prefix with a trailing space ("...信宜话: "); confirm which form the model
# was fine-tuned with.
prefix = "翻译 中文 为 信宜话:"


# Load records from a JSONL file
def load_jsonl_data(file_path):
    """Read a JSON Lines file and return its records as a list.

    Blank and whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being handed to ``json.loads``,
    which would raise on them.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]


# Load the corpus as a plain Python list of records (no Hugging Face Dataset
# is built here) and keep only the records at indices 10000..22999.
data = load_jsonl_data(input_file)
data = data[10000:23000]


# Translate one batch and write the results to the output file
def translate_and_write_batch(batch, f):
    """Translate the questions and answers of ``batch`` and write them to ``f``.

    Each record's "question" and "answer" values are overwritten in place with
    the translated text, and one JSON line per record is written to ``f``.

    Args:
        batch: Sequence of dicts that each have "question" and "answer" keys.
        f: Writable text file object that receives the JSONL output.

    NOTE(review): only the questions get the task ``prefix``; the answers are
    passed to the translator unchanged. Confirm that this is intended.
    """
    prefixed_questions = [prefix + record["question"] for record in batch]
    raw_answers = [record["answer"] for record in batch]

    # Inference only, so gradient tracking is switched off around the model calls.
    with torch.no_grad():
        question_results = translator(prefixed_questions, max_length=300)
        answer_results = translator(raw_answers, max_length=300)

    for position, record in enumerate(batch):
        # Replace the source text with its translation, then emit the record.
        record["question"] = question_results[position]["translation_text"]
        record["answer"] = answer_results[position]["translation_text"]
        payload = {"question": record["question"], "answer": record["answer"]}
        f.write(json.dumps(payload, ensure_ascii=False) + "\n")


# Open the output file and translate the corpus batch by batch
batch_size = 24
with open(output_file, 'w', encoding='utf-8') as f:
    # tqdm takes the number of steps from len(range(...)). An explicit total of
    # len(data) // batch_size would under-count by one whenever the last batch
    # is only partially filled.
    for i in tqdm.tqdm(range(0, len(data), batch_size)):
        # Slice out the records of the current batch
        batch = data[i:i + batch_size]
        translate_and_write_batch(batch, f)
