# 数据准备


## 数据加载

In [1]:
import pandas as pd

data_path = 'D:/model/web/nlp01'


def _read_corpus_lines(path):
    """Return the sentences stored one per line in the UTF-8 text file at ``path``.

    Only the empty string produced by a terminating newline is removed, so the
    final sentence is kept even when the file does not end with a newline
    (an unconditional ``[:-1]`` would silently drop it).
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    if lines and lines[-1] == '':
        lines.pop()
    return lines


# English and Chinese sides of the parallel corpus; line i of one file is the
# translation of line i of the other.
txt = _read_corpus_lines(data_path + '/corpus/tico-19.en-zh.en')
txt2 = _read_corpus_lines(data_path + '/corpus/tico-19.en-zh.zh')

# Fail early with a clear message instead of building misaligned pairs.
if len(txt) != len(txt2):
    raise ValueError(
        f'Parallel corpus is misaligned: {len(txt)} English lines vs {len(txt2)} Chinese lines'
    )

data = pd.DataFrame({'en': txt, 'zh': txt2})
data.head(10)

Unnamed: 0,en,zh
0,about how long have these symptoms been going on?,这些症状已持续多长时间？
1,and all chest pain should be treated this way ...,各种胸痛均应采取这种方法进行治疗，尤其要考虑年龄因素
2,and along with a fever,并伴有发热症状
3,and also needs to be checked your cholesterol ...,还需要检查一下胆固醇和血压
4,and are you having a fever now?,您现在有发热吗？
5,and are you having any of the following sympto...,您的胸痛伴有以下任何症状吗
6,and are you having a runny nose?,您有流鼻涕吗？
7,and are you having this chest pain now?,现在您有这种胸痛症状吗？
8,and besides do you have difficulty breathing,另外您有呼吸困难吗
9,and can you tell me what other symptoms are yo...,您能描述一下除此之外还有什么其他症状吗？


## 数据划分

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sentence pairs for validation; the fixed seed keeps the
# split identical across runs.
train, val = train_test_split(data, random_state=0, test_size=0.2)

# Show the shapes of the full, training and validation frames on one line.
print(*(frame.shape for frame in (data, train, val)))

(3071, 2) (2456, 2) (615, 2)


## 加载tokenizer

In [3]:
from transformers import AutoTokenizer

In [4]:
# Local directory holding the pretrained Helsinki-NLP opus-mt-zh-en checkpoint.
model_ckpt = data_path + '/Helsinki-NLP--opus-mt-zh-en/'
# This tokenizer requires `pip install sentencepiece`.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)



In [5]:
tokenizer

MarianTokenizer(name_or_path='D:/model/web/nlp01/Helsinki-NLP--opus-mt-zh-en/', vocab_size=65001, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	65000: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
print(tokenizer('hello, this is a sentence.'))

{'input_ids': [5566, 26607, 2, 56, 30, 12, 95, 4509, 8233, 5, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [7]:
print(tokenizer('您好，这是一个句子。'))

{'input_ids': [42473, 2, 12654, 10054, 863, 9, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


## 把数据封装成datasets类

In [8]:
from datasets import Dataset

In [9]:
def get_ds(train, tokenizer, max_input_length=128, max_target_length=128):
    """Tokenize a zh->en parallel DataFrame and wrap it in a ``datasets.Dataset``.

    Args:
        train: DataFrame with a 'zh' column (source sentences) and an 'en'
            column (target sentences).
        tokenizer: Hugging Face tokenizer that supports the ``text_target``
            argument.
        max_input_length: source sequences are truncated to this many tokens.
        max_target_length: target sequences are truncated to this many tokens.

    Returns:
        A ``Dataset`` containing the tokenizer's source encodings plus
        'labels' (target token ids) and 'translation' (the raw
        ``{'en': ..., 'zh': ...}`` sentence pair of every row).
    """
    inputs = train['zh'].tolist()
    targets = train['en'].tolist()

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # `text_target=` encodes the sentences as targets; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    model_inputs['translation'] = [{'en': en, 'zh': zh} for zh, en in zip(inputs, targets)]
    return Dataset.from_dict(model_inputs)

In [10]:
train_dataset = get_ds(train, tokenizer)



In [11]:
val_dataset = get_ds(val, tokenizer)

In [12]:
print(train_dataset, 
      '\n',val_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'translation'],
    num_rows: 2456
}) 
 Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'translation'],
    num_rows: 615
})


# 预训练模型加载

In [13]:
from transformers import AutoModelForSeq2SeqLM
# Load the pretrained seq2seq model from the same local directory as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

In [14]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

## 使用预训练模型直接进行翻译
不微调先看效果

In [15]:
# 1. Text to translate: row 0 of the training split, column 1 (the 'zh' column).
train.iloc[0,1]

'企业对员工实施出行限制，取消会议，并鼓励员工居家办公。'

In [16]:
# 2. Convert the text to token ids, returned as a PyTorch tensor.
inputs = tokenizer.encode(train.iloc[0,1], return_tensors='pt')
print(inputs)

tensor([[    7,  1667,    63, 11747,   492,   854,  1699,  1323,     2,  4135,
           336,     2,  5826, 11747,  6694,  1208,  8763,     9,     0]])


In [17]:
# 3. Model prediction. Despite its name, `decoder_inputs` holds the token ids
#    returned by `generate`, i.e. the generated translation.
decoder_inputs = model.generate(inputs)
print(decoder_inputs)

tensor([[65000, 39173, 10729,  2322,  4801,    18,   353,     2, 31137,   563,
             6,  2441,   353,     8,   118,    46,  1430,     5,     0]])


In [18]:
# 4. Convert the predicted token ids back to text. Special tokens are kept,
#    which is why the printed string contains <pad> and </s>.
print(''.join(tokenizer.convert_ids_to_tokens(decoder_inputs[0])).replace('▁', ' ')) # note: the '▁' marker (U+2581, not an ASCII underscore) must be replaced with a space

<pad> Enterprises impose travel restrictions on staff, cancel meetings and encourage staff to work at home.</s>


In [19]:
print(train.iloc[0,0])

Corporations imposed employee travel restrictions, cancelled conferences, and encouraged employees to work from home.


# 微调训练

## 定义模型训练参数

In [20]:
from transformers import Seq2SeqTrainingArguments

In [21]:
# Batch size shared by training and evaluation.
batch_size = 8

args = Seq2SeqTrainingArguments(
    'D:/model/web/nlp01/ckpt',  # directory where model checkpoints are saved
    eval_strategy='epoch',  # evaluate on the validation set once per epoch
    learning_rate=2e-5,  # learning rate
    per_device_train_batch_size=batch_size,   # number of samples per batch during training
    per_device_eval_batch_size=batch_size,   # number of samples per batch during evaluation
    weight_decay=0.01,  # if non-zero, weight decay applied by AdamW to all layers except bias and LayerNorm weights
    save_total_limit=1,  # maximum number of checkpoints to keep; here at most 1
    num_train_epochs=10,  # number of training epochs
    predict_with_generate=True,  # use generated sequences to compute metrics (e.g. BLEU)
)

## 定义数据收集器

In [22]:
from transformers import DataCollatorForSeq2Seq
# Collator that dynamically pads the inputs and the labels of each batch.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,  # tokenizer used for encoding the data
    model,  # the model being trained
    label_pad_token_id=-100,   # id used to pad the labels; -100 is also the default
)

In [23]:
help(data_collator)

Help on DataCollatorForSeq2Seq in module transformers.data.data_collator object:

class DataCollatorForSeq2Seq(builtins.object)
 |  DataCollatorForSeq2Seq(tokenizer: transformers.tokenization_utils_base.PreTrainedTokenizerBase, model: Optional[Any] = None, padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = True, max_length: Optional[int] = None, pad_to_multiple_of: Optional[int] = None, label_pad_token_id: int = -100, return_tensors: str = 'pt') -> None
 |  
 |  Data collator that will dynamically pad the inputs received, as well as the labels.
 |  
 |  Args:
 |      tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
 |          The tokenizer used for encoding the data.
 |      model ([`PreTrainedModel`], *optional*):
 |          The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
 |          prepare the *decoder_input_ids*
 |  
 |          This is useful when using *label_smoothing* to avoid calcu

## 定义评估指标

In [27]:
from evaluate import load
# Requirements:
# pip install evaluate
# pip install sacrebleu
# Load the sacrebleu metric from a local copy of its loading script.
sacrebleu_path = r'D:\model\web\nlp01\huggingface\modules\evaluate_modules\metrics\evaluate-metric--sacrebleu\28676bf65b4f88b276df566e48e603732d0b4afd237603ebdf92acaacf5be99b\sacrebleu.py'
metric = load(sacrebleu_path)

In [28]:
metric

EvaluationModule(name: "sacrebleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Produces BLEU scores along with its sufficient statistics
from a source against one or more references.

Args:
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    smooth_method (`str`): The smoothing method to use, defaults to `'e

In [29]:
import numpy as np

In [30]:
# tokenizer.pad_token_id
def process_text(preds, labels):
    """Strip leading/trailing whitespace from every decoded prediction and reference.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        Tuple ``(preds, labels)`` of the stripped string lists.
    """
    preds = [i.strip() for i in preds]
    labels = [i.strip() for i in labels]
    return preds, labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also be a tuple whose first element holds the generated ids.

    Returns:
        Dict with 'bleu' (metric score rounded to 4 decimals) and 'gen_len'
        (mean number of non-padding tokens per prediction, rounded to 4 decimals).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a valid token id, so map it to
    # the pad token in BOTH arrays before decoding. Doing it for `preds` too
    # keeps batch_decode from failing on -100 and keeps those positions out of gen_len.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Normalise surrounding whitespace before scoring.
    decoded_preds, decoded_labels = process_text(decoded_preds, decoded_labels)

    res = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {'bleu': round(res['score'], 4)}

    # Extra metric: average length of the predictions, excluding padding.
    predict_len = [np.count_nonzero(pred != pad_id) for pred in preds]
    result['gen_len'] = round(np.mean(predict_len), 4)

    return result

## 创建trainer对象，进行模型微调

In [32]:
from transformers import Seq2SeqTrainer
# Bundle the model, training arguments, datasets, collator, tokenizer and
# metric function into one trainer object.
trainer = Seq2SeqTrainer(
    model,
    args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [33]:
trainer.train()  # run fine-tuning

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mautumnnn[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.344449,30.4804,31.8959
2,1.397000,1.29111,31.9509,31.7154
3,1.397000,1.276032,32.4859,31.4715
4,0.996400,1.275565,32.9718,31.7902
5,0.786400,1.277581,32.8453,31.9935
6,0.786400,1.296788,33.1111,31.935
7,0.637800,1.302871,33.4281,31.787
8,0.637800,1.312782,33.4168,31.9236
9,0.555100,1.316131,33.4519,31.987
10,0.502700,1.320113,33.5451,31.8976


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


TrainOutput(global_step=3070, training_loss=0.8052966586929967, metrics={'train_runtime': 857.2399, 'train_samples_per_second': 28.65, 'train_steps_per_second': 3.581, 'total_flos': 402481642733568.0, 'train_loss': 0.8052966586929967, 'epoch': 10.0})

## 模型推理

In [34]:
# Reload the fine-tuned model and tokenizer from the checkpoint-3070 directory
# (3070 is the global_step reported by trainer.train()).
model_test = AutoModelForSeq2SeqLM.from_pretrained('D:/model/web/nlp01/ckpt/checkpoint-3070/')
tokenizer_test = AutoTokenizer.from_pretrained('D:/model/web/nlp01/ckpt/checkpoint-3070/')



In [36]:
# Try the pipeline API
from transformers import pipeline

In [43]:
# help(pipeline)

In [39]:
import torch

# Build a zh->en translation pipeline from the fine-tuned model and tokenizer.
# Use the GPU when one is available and fall back to the CPU otherwise;
# hard-coding 'cuda' fails on machines without a usable GPU.
zh2en = pipeline('translation_zh_to_en',
                 model=model_test,
                 tokenizer=tokenizer_test,
                 device='cuda' if torch.cuda.is_available() else 'cpu')
# If `device` is not specified, transformers prints the following notice:
# Hardware accelerator e.g. GPU is available in the environment,
# but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.

In [40]:
result = zh2en('您的胸痛伴有以下任何症状吗?')

In [41]:
result

[{'translation_text': 'do you have any of the following symptoms with your chest pain?'}]

In [42]:
result[0]['translation_text']

'do you have any of the following symptoms with your chest pain?'

In [45]:
# Small convenience wrapper around a translation pipeline.
def pipeline_output(text, model):
    """Call ``model`` on ``text`` and return the 'translation_text' of its first result."""
    return model(text)[0]['translation_text']

In [46]:
pipeline_output('我觉得我的肺部很痛，无法入睡。', zh2en)

"i feel a pain in my lungs and can't sleep."