基于T5的文本摘要

Step1 导入相关包

In [1]:
import torch
from datasets import Dataset
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

Step2 加载数据集

In [2]:
# Raw string literal: the Windows path contains backslashes (e.g. "\T") that a
# normal string would parse as invalid escape sequences (SyntaxWarning on
# recent Python versions). The resulting path value is unchanged.
ds = load_from_disk(r"D:\Transformer实战\文本摘要\文本数据集")
ds

Dataset({
    features: ['title', 'content'],
    num_rows: 5000
})

In [3]:
# Hold out 100 examples as the evaluation split; the fixed seed keeps the
# split reproducible across runs.
ds = ds.train_test_split(test_size=100, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['title', 'content'],
        num_rows: 4900
    })
    test: Dataset({
        features: ['title', 'content'],
        num_rows: 100
    })
})

In [4]:
# Inspect the first training example: a dict with 'title' (the reference
# summary) and 'content' (the article text).
ds["train"][0]

{'title': '组图:黑河边防军人零下30℃户外训练,冰霜沾满眉毛和睫毛,防寒服上满是冰霜。',
 'content': '中国军网2014-12-1709:08:0412月16日,黑龙江省军区驻黑河某边防团机动步兵连官兵,冒着-30℃严寒气温进行体能训练,挑战极寒,锻造钢筋铁骨。该连素有“世界冠军的摇篮”之称,曾有5人24人次登上世界军事五项冠军的领奖台。(魏建顺摄)黑龙江省军区驻黑河某边防团机动步兵连官兵冒着-30℃严寒气温进行体能训练驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜官兵睫毛上都被冻上了冰霜官兵们睫毛上都被冻上了冰霜驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练'}

Step3 数据处理

In [5]:
# Load the tokenizer published with the Langboat/mengzi-t5-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Langboat/mengzi-t5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
def process_func(exmaples):
    """Tokenize a batch of articles and their titles for seq2seq training.

    Args:
        exmaples: a batch with a "content" list (article texts) and a "title"
            list (target summaries).

    Returns:
        The tokenizer output for the prefixed articles (truncated to 384
        tokens) with an extra "labels" entry holding the token ids of the
        titles (truncated to 64 tokens).
    """
    # Prepend the task prefix to every article.
    prompts = []
    for article in exmaples["content"]:
        prompts.append("摘要生成: " + article)
    model_inputs = tokenizer(prompts, max_length=384, truncation=True)
    # text_target= tokenizes the titles as decoder targets.
    target_encoding = tokenizer(text_target=exmaples["title"], max_length=64, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [7]:
# Tokenize both splits. With batched=True, process_func receives a batch of
# examples per call (it iterates over exmaples["content"]).
tokenized_ds = ds.map(process_func, batched=True)
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['title', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4900
    })
    test: Dataset({
        features: ['title', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [8]:
# Sanity check: decode the first training input back to text to confirm the
# task prefix was prepended.
tokenizer.decode(tokenized_ds["train"][0]["input_ids"])

'摘要生成: 中国军网2014-12-1709:08:0412月16日,黑龙江省军区驻黑河某边防团机动步兵连官兵,冒着-30°C严寒气温进行体能训练,挑战极寒,锻造钢筋铁骨。该连素有“世界冠军的摇篮”之称,曾有5人24人次登上世界军事五项冠军的领奖台。(魏建顺摄)黑龙江省军区驻黑河某边防团机动步兵连官兵冒着-30°C严寒气温进行体能训练驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜官兵睫毛上都被冻上了冰霜官兵们睫毛上都被冻上了冰霜驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练</s>'

In [9]:
# Sanity check: decode the first training label back to text; it should be
# the example's title.
tokenizer.decode(tokenized_ds["train"][0]["labels"])

'组图:黑河边防军人零下30°C户外训练,冰霜沾满眉毛和睫毛,防寒服上满是冰霜。</s>'

Step4 创建模型

In [10]:

# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained("Langboat/mengzi-t5-base")

Step5 创建评估函数

In [11]:
import numpy as np
from rouge_chinese import Rouge

# Shared scorer instance; compute_metric calls rouge.get_scores on it.
rouge = Rouge()

def compute_metric(evalPred):
    """Compute character-level ROUGE F-scores for generated summaries.

    Args:
        evalPred: a ``(predictions, labels)`` pair of token-id arrays.
            Entries equal to ``-100`` in either array are treated as padding.

    Returns:
        dict mapping "rouge-1", "rouge-2" and "rouge-l" to the F-measure
        averaged over all prediction/reference pairs. A pair whose decoded
        prediction or reference is blank contributes a score of 0.
    """
    predictions, labels = evalPred
    # Some models hand back a tuple of outputs; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid vocabulary id, so map it to the pad token before
    # decoding -- for predictions as well as labels.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decode_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # The scorer works on whitespace-separated tokens, so split the text into
    # individual characters.
    decode_preds = [" ".join(p) for p in decode_preds]
    decode_labels = [" ".join(l) for l in decode_labels]

    metric_names = ("rouge-1", "rouge-2", "rouge-l")
    # Blank texts have no n-grams to compare. Keep them away from the scorer
    # and count them as 0 instead.
    # NOTE(review): the scorer is expected to raise on an empty hypothesis --
    # confirm against the installed rouge_chinese version.
    scorable = [
        (p, l) for p, l in zip(decode_preds, decode_labels) if p.strip() and l.strip()
    ]
    if not scorable:
        return {name: 0.0 for name in metric_names}

    hyps, refs = (list(side) for side in zip(*scorable))
    scores = rouge.get_scores(hyps, refs, avg=True)
    # Rescale so that skipped pairs count as zero in the overall mean; the
    # factor is exactly 1.0 when nothing was skipped.
    weight = len(scorable) / len(decode_preds)
    return {name: scores[name]["f"] * weight for name in metric_names}

Step6 配置训练参数

In [12]:
# Training configuration for the Seq2SeqTrainer.
args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # 8 samples x 8 accumulation steps = 64 samples per optimizer step per device.
    gradient_accumulation_steps=8,
    logging_steps=50,
    # Evaluate and checkpoint on the same schedule so that the best checkpoint
    # can be identified and reloaded.
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    metric_for_best_model="rouge-l",
    # Without this flag metric_for_best_model never restores the best
    # checkpoint; with it, the best checkpoint by rouge-l is loaded into the
    # model when training finishes.
    load_best_model_at_end=True,
    # Run generation during evaluation so compute_metric receives generated ids.
    predict_with_generate=True
)

Step7 创建训练器

In [13]:
# Assemble the trainer from the model, the training arguments, the tokenized
# splits and the metric function. DataCollatorForSeq2Seq builds the batches
# from the tokenized examples (process_func truncates but does not pad them).
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
)

Step8 模型训练

In [14]:

# Run fine-tuning; the call returns a TrainOutput carrying the global step
# count, the average training loss and runtime metrics.
trainer.train()

  0%|          | 0/152 [00:00<?, ?it/s]

{'loss': 3.4483, 'grad_norm': 2.3756158351898193, 'learning_rate': 3.355263157894737e-05, 'epoch': 0.65}




  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.2606351375579834, 'eval_rouge-1': 0.47809926073530457, 'eval_rouge-2': 0.30836012668846213, 'eval_rouge-l': 0.4043526391018948, 'eval_runtime': 6.3677, 'eval_samples_per_second': 15.704, 'eval_steps_per_second': 2.042, 'epoch': 0.99}
{'loss': 2.5543, 'grad_norm': 2.2226336002349854, 'learning_rate': 1.7105263157894737e-05, 'epoch': 1.31}
{'loss': 2.3722, 'grad_norm': 2.459825038909912, 'learning_rate': 6.578947368421053e-07, 'epoch': 1.96}




  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.1557536125183105, 'eval_rouge-1': 0.4861410404137299, 'eval_rouge-2': 0.31394431366336817, 'eval_rouge-l': 0.4071789141775612, 'eval_runtime': 7.1857, 'eval_samples_per_second': 13.917, 'eval_steps_per_second': 1.809, 'epoch': 1.98}
{'train_runtime': 3514.0259, 'train_samples_per_second': 2.789, 'train_steps_per_second': 0.043, 'train_loss': 2.7836627677867285, 'epoch': 1.98}


TrainOutput(global_step=152, training_loss=2.7836627677867285, metrics={'train_runtime': 3514.0259, 'train_samples_per_second': 2.789, 'train_steps_per_second': 0.043, 'total_flos': 4155712127004672.0, 'train_loss': 2.7836627677867285, 'epoch': 1.9836867862969005})

Step9 模型推理

In [15]:
from transformers import pipeline

# Run the pipeline on whichever device the fine-tuned model currently lives
# on, rather than assuming that CUDA device 0 exists.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=model.device)
# The prompt prefix must match the one used by process_func during training,
# including the trailing space after the colon.
# do_sample=True makes the generated summary vary between runs.
pipe("摘要生成: " + ds["test"][-1]["content"], max_length=64, do_sample=True)


[{'generated_text': '中国船舶重工拟将其所持动力相关资产对外投资,参与的,其控股股东中船重工等。'}]

In [16]:
# Reference title of the same test example, for comparison with the
# generated summary.
ds["test"][-1]["title"]

'中国重工拟以持有的动力相关资产进行对外投资,参与中船重工拟打造的动力业务平台公司,将继续停牌。'