In [3]:
from datasets import DatasetDict, Dataset

def load_local_data(data_dir="../data"):
    """Load the NEJM English/Chinese parallel corpus from local text files.

    For each split (``train``, ``dev``, ``test``) the files
    ``nejm.<split>.en`` and ``nejm.<split>.zh`` under ``data_dir`` are read
    line by line and paired up positionally.

    Args:
        data_dir: Directory holding the six ``nejm.*`` files. Defaults to
            ``"../data"``, the location the notebook originally hard-coded.

    Returns:
        A ``DatasetDict`` keyed by split name. Every split has a single
        ``translation`` column whose rows have the shape
        ``{"translation": {"en": ..., "zh": ...}}``.

    Raises:
        ValueError: If the two files of a split do not have the same number
            of lines (the sentence pairs would otherwise be misaligned).
        OSError: If a file cannot be opened.
    """
    splits = ("train", "dev", "test")

    data = {}
    for split in splits:
        en_path = f"{data_dir}/nejm.{split}.en"
        zh_path = f"{data_dir}/nejm.{split}.zh"

        with open(en_path, encoding='utf-8') as f_en, open(zh_path, encoding='utf-8') as f_zh:
            en_lines = f_en.readlines()
            zh_lines = f_zh.readlines()

        # zip() would silently drop the surplus lines of the longer file and
        # hide a corrupted corpus, so mismatched files are rejected up front.
        if len(en_lines) != len(zh_lines):
            raise ValueError(
                f"Line count mismatch in split '{split}': "
                f"{en_path} has {len(en_lines)} lines, {zh_path} has {len(zh_lines)}"
            )

        # Each row is deliberately wrapped in its own "translation" key:
        # token_f reads the sentences as ex["translation"][lang] for each
        # element of the "translation" column.
        examples = [
            {"translation": {"en": en.strip(), "zh": zh.strip()}}
            for en, zh in zip(en_lines, zh_lines)
        ]
        data[split] = Dataset.from_dict({"translation": examples})

    return DatasetDict(data)

In [4]:
from datasets import load_metric

# Build the train/dev/test DatasetDict from the local parallel text files.
raw_datasets = load_local_data()
# Load the sacreBLEU metric object that compute_metrics calls during evaluation.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("sacrebleu")

In [5]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 62127
    })
    dev: Dataset({
        features: ['translation'],
        num_rows: 2036
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2102
    })
})

In [7]:
metric

Metric(name: "sacrebleu", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are:
        - `'none'`: no smoothing
        - `'floor'`: increment zero counts
        - `'add-k'`: increment num/deno

In [8]:
from transformers import AutoTokenizer

# Pretrained checkpoint to fine-tune, and the tokenizer that belongs to it.
model_checkpoint = "Helsinki-NLP/opus-mt-en-zh"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Passed to the tokenizer as `max_length` in token_f, so despite the name it
# bounds the tokenized sequence length, not a word count.
max_word_length = 128
# Keys of the source and target sentences inside each translation pair.
input_lang = "en"
output_lang = "zh"

In [9]:
def token_f(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of rows of
            the form ``{"translation": {<lang>: <sentence>, ...}}``.

    Returns:
        The tokenizer output for the source sentences, extended with a
        ``"labels"`` entry holding the target-side token ids.

    Both sides are truncated to ``max_word_length`` tokens. Reads the
    notebook-level ``tokenizer``, ``input_lang``, ``output_lang`` and
    ``max_word_length``.
    """
    pairs = [row["translation"] for row in examples["translation"]]
    source_texts = [pair[input_lang] for pair in pairs]
    target_texts = [pair[output_lang] for pair in pairs]

    # Source side: encode with the tokenizer in its default (source) mode.
    model_inputs = tokenizer(source_texts, max_length=max_word_length, truncation=True)

    # Target side: encode while the tokenizer is switched to target mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(target_texts, max_length=max_word_length, truncation=True)

    # Token id 8 is the bare "▁" piece in this checkpoint's vocabulary. It
    # shows up as filler between the Chinese pieces (a property of this data),
    # so every occurrence is dropped from the labels.
    cleaned_labels = []
    for ids in target_encoding["input_ids"]:
        cleaned_labels.append([token_id for token_id in ids if token_id != 8])

    model_inputs["labels"] = cleaned_labels
    return model_inputs

In [10]:
# Sanity check: run the preprocessing on the first two training rows and display the result.
token_f(raw_datasets["train"][:2])

{'input_ids': [[5961, 56, 8, 37, 1550, 11931, 22602, 1766, 4, 1541, 1582, 11, 5119, 44083, 1669, 3196, 8, 6, 0], [1557, 8, 3376, 16, 3376, 465, 2686, 4, 42731, 3196, 22, 98, 2725, 149, 38435, 5097, 8, 17, 32, 59, 435, 30222, 695, 4, 37399, 9608, 8, 6, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[7057, 10373, 37, 1390, 12363, 33392, 16334, 9019, 24383, 106, 2039, 4734, 33164, 26456, 199, 1402, 21767, 6, 0], [67, 22249, 2554, 1189, 12, 42544, 42508, 13859, 1098, 295, 848, 3940, 12, 1729, 5160, 2068, 6, 0]]}

In [11]:
# Label ids of the first training example (copied from the token_f output),
# mapped back to vocabulary pieces to inspect the Chinese segmentation.
token_id_to_find = [7057, 10373, 37, 1390, 12363, 33392, 16334, 9019, 24383, 106, 2039, 4734, 33164, 26456, 199, 1402, 21767, 6, 0]
token_str = tokenizer.convert_ids_to_tokens(token_id_to_find)
print(token_str,end='')

['▁也许', '▁不能', ':', '分析', '▁结果', '提示', '激', '素', '疗法', '▁在', '维持', '▁去', '脂', '体重', '方面', '作用', '很小', '.', '</s>']

In [12]:
# Input ids of the first training example (copied from the token_f output),
# mapped back to vocabulary pieces; id 8 shows up as the bare "▁" piece.
token_id_to_find = [5961, 56, 8, 37, 1550, 11931, 22602, 1766, 4, 1541, 1582, 11, 5119, 44083, 1669, 3196, 8, 6, 0]
token_str = tokenizer.convert_ids_to_tokens(token_id_to_find)
print(token_str,end='')

['▁probably', '▁not', '▁', ':', '▁analysis', '▁suggests', '▁minimal', '▁effect', '▁of', '▁H', 'T', '▁in', '▁maintaining', '▁lean', '▁body', '▁mass', '▁', '.', '</s>']

In [13]:
# Apply token_f to every split; batched=True hands token_f a batch of rows per call.
tokenized_datasets = raw_datasets.map(token_f, batched=True)

Map:   0%|          | 0/62127 [00:00<?, ? examples/s]

Map:   0%|          | 0/2036 [00:00<?, ? examples/s]

Map:   0%|          | 0/2102 [00:00<?, ? examples/s]

In [14]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [15]:
from transformers import Seq2SeqTrainingArguments

# Per-device batch size shared by training and evaluation.
batch_size = 16
# Training configuration for the fine-tuning run.
args = Seq2SeqTrainingArguments(
    "translation_dir",  # checkpoints are later loaded from this directory
    evaluation_strategy = "epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    num_train_epochs=5,
    predict_with_generate=True,  # evaluation predictions are generated sequences, decoded in compute_metrics
    fp16=False,  # mixed precision left off; per the author, for numerical stability and precision
)

In [16]:
from transformers import DataCollatorForSeq2Seq

# Collator that pads and aligns the examples of each batch before they reach the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
def process_text(preds, labels):
    """Normalise decoded predictions and references for BLEU scoring.

    Each item may be either a plain string or an already-split sequence of
    tokens. In both cases the result is one string per item with tokens
    separated by single spaces and no leading/trailing whitespace.

    The previous version split every sentence into a list and then called
    ``.strip()`` on those lists, which raised ``AttributeError`` for any
    non-empty input.

    Args:
        preds: Decoded predictions (strings or token sequences).
        labels: Decoded references (strings or token sequences).

    Returns:
        A ``(preds, labels)`` tuple of two lists of cleaned strings.
    """

    def _clean(item):
        # Strings are split on whitespace; token sequences are used as given.
        tokens = item.split() if isinstance(item, str) else item
        stripped = (token.strip() for token in tokens)
        # Drop tokens that are empty after stripping so no double spaces appear.
        return " ".join(token for token in stripped if token)

    pred = [_clean(item) for item in preds]
    label = [_clean(item) for item in labels]

    return pred, label

In [29]:
import numpy as np

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        ``{"bleu": <sacreBLEU score>, "gen_len": <mean number of non-pad
        prediction tokens>}``.

    Reads the notebook-level ``tokenizer`` and ``metric`` and calls
    ``process_text`` on the whitespace-split sentences.
    """
    preds, labels = eval_preds

    # Some models hand back a tuple; the generated ids are its first element.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad labels, and the trainer may also pad
    # gathered predictions with it. It is not a valid vocabulary id, so both
    # arrays get it replaced by the pad id before decoding.
    preds = np.asarray(preds)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Turn the id sequences back into text, dropping special tokens.
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Split each decoded sentence on whitespace; process_text receives these
    # token lists and returns the predictions/references given to the metric.
    decoded_preds = [pred.split() for pred in decoded_preds]
    decoded_labels = [label.split() for label in decoded_labels]

    decoded_preds, decoded_labels = process_text(decoded_preds, decoded_labels)

    # One reference per prediction, each wrapped in its own list.
    result = metric.compute(predictions=decoded_preds, references=[[dl] for dl in decoded_labels])
    result = {"bleu": result["score"]}

    # Mean generated length, counting only non-pad tokens. Computed after the
    # -100 replacement so padding is never counted as generated content.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return result

In [30]:
from transformers import Seq2SeqTrainer

# Assemble the training loop: model, training arguments, tokenized splits,
# batch collator, tokenizer and the metric callback. The train split is used
# for optimisation and the dev split for evaluation.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [31]:
# Run the fine-tuning; the per-epoch loss/BLEU table and the TrainOutput are shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.1862,1.115826,31.696818,38.078585
2,1.0198,1.033558,32.705277,37.960707
3,0.9157,0.998998,33.534264,38.028978
4,0.8552,0.983651,33.690372,38.041257
5,0.8235,0.979454,33.624423,37.871316


TrainOutput(global_step=19415, training_loss=1.0095811803102064, metrics={'train_runtime': 11656.0171, 'train_samples_per_second': 26.65, 'train_steps_per_second': 1.666, 'total_flos': 8115711890817024.0, 'train_loss': 1.0095811803102064, 'epoch': 5.0})

In [36]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint directory written during training (under the "translation_dir" output directory).
model_path = "translation_dir/checkpoint-19000"
# Rebinds the notebook-level `tokenizer` and `model` names to the fine-tuned
# checkpoint; cells run after this one see these objects instead of the originals.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

def translate_to_chinese(text):
    """Translate English ``text`` with the notebook-level model and tokenizer.

    Args:
        text: The English input handed to the tokenizer.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    # Encode to PyTorch tensors, padding and truncating as the tokenizer sees fit.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Unpack the whole encoding so every field the tokenizer produced is
    # forwarded to generate() without naming each one.
    generated = model.generate(**encoded)

    # Only the first returned sequence is decoded.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Translate one sample sentence with the fine-tuned model.
english_text = "the maximum tolerated dose of asciminib was not reached ."
chinese_translation = translate_to_chinese(english_text)
# Remove the spaces left in the decoded translation.
chinese_sentence = chinese_translation.replace(" ", "")

print(chinese_sentence)

未达到阿司匹尼的最大耐受剂量.
