In [None]:
%pip install transformers datasets evaluate peft accelerate gradio optimum sentencepiece 
%pip scikit-learn tensorboard nltk rouge rouge-chinese

### Step 1: 导入前置依赖

In [5]:
# Import Libraries
import pandas as pd
import torch
import numpy as np
from rouge_chinese import Rouge
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

### Step 2: 构建数据集

In [6]:
# Load the CSV; pd.read_csv already returns a DataFrame, so it is used directly
# without re-wrapping it in pd.DataFrame(...).
df = pd.read_csv('../data/keysum_data_A.csv')
# Convert the DataFrame into a Hugging Face Dataset.
raw_dataset = Dataset.from_pandas(df)
# Hold out 2000 rows as the test split; the fixed seed keeps the split reproducible.
dataset = raw_dataset.train_test_split(test_size=2000, seed=42)
# Display the resulting train/test DatasetDict.
dataset

DatasetDict({
    train: Dataset({
        features: ['Content', 'Keywords'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['Content', 'Keywords'],
        num_rows: 2000
    })
})

In [7]:
# Peek at the first training example to check its 'Content' / 'Keywords' fields.
dataset['train'][0]

{'Content': '基于游程递归的连通区域标记算法。在研究已有算法的基础上,提出一种基于游程递归的标记算法,该算法可以对二值图像实现快速标记.顺序扫描图像,寻找未标记的游程,并递归搜索与之连通的游程,直到一个连通区域生成.在游程搜索过程中,在当前游程的相邻两行上,以其左端点为起始点分别向前向后进行连通游程的搜索;同时根据游程之间的位置关系对搜索策略进行优化,减少了重复搜索,提高了处理速度.该算法只需经过一次扫描图像,就能快速、准确地标记连通区域.在与已有算法的实验结果比较中,该算法具有较快的执行速度和较高的准确率,并且占用较少的内存,可以满足在施工现场中运动目标实时检测的需要. 基于游程递归的连通区域标记算法',
 'Keywords': '运动目标; 连通区域标记; 二值图像'}

### Step 3: 数据预处理

In [None]:
# Load the tokenizer published with the Langboat/mengzi-t5-base checkpoint
tokenizer = AutoTokenizer.from_pretrained("Langboat/mengzi-t5-base")
# Echo the tokenizer so it is shown as the cell output
tokenizer

In [None]:
# Preprocessing function applied to each batch of examples
# TODO: accept the tokenizer as a parameter instead of reading the global
def process_func(examples):
    """Tokenize a batch of examples into model inputs with labels.

    Reads the "Content" and "Keywords" columns of ``examples``. Every content
    string is prefixed with the task prompt and tokenized with truncation at
    ``max_length=512``; every keyword string is tokenized as the target with
    truncation at ``max_length=64`` and its ids are stored under "labels".

    Returns:
        The tokenized inputs with an added "labels" entry.
    """
    prompt = "摘要生成: \n"
    prompted_contents = []
    for content in examples["Content"]:
        prompted_contents.append(prompt + content)
    # Source side: tokenize the prompted Content strings
    model_inputs = tokenizer(prompted_contents, max_length=512, truncation=True)
    # Target side: tokenize the Keywords strings
    target_encoding = tokenizer(text_target=examples["Keywords"], max_length=64, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
# Tokenize the dataset by applying process_func in batched mode
tokenized_dataset = dataset.map(process_func, batched=True)
# Show the tokenized dataset as the cell output
tokenized_dataset

In [None]:
# Example: decode the first tokenized training sample back to text as a sanity check
first_sample = tokenized_dataset["train"][0]
for field in ("input_ids", "labels"):
    print(tokenizer.decode(first_sample[field]))

### Step 4: 设置预训练模型

In [None]:
# Load the pretrained seq2seq model from the Langboat/mengzi-t5-base checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained("Langboat/mengzi-t5-base")

### Step 5: 设置评估函数

In [None]:
# ROUGE scorer instance (rouge_chinese) used by the metric function below
rouge = Rouge()

def compute_metric(evalPred):
    """Compute ROUGE-1 / ROUGE-2 / ROUGE-L F-scores for generated predictions.

    Args:
        evalPred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds
            the generated ids.

    Returns:
        dict with the keys "rouge-1", "rouge-2" and "rouge-l", each mapped
        to the averaged F-score reported by the module-level ``rouge`` scorer.
    """
    predictions, labels = evalPred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # -100 marks ignored/padded positions and is not a valid vocabulary id, so it
    # is swapped for the pad token in BOTH arrays before decoding (the original
    # only cleaned the labels).
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Separate characters with spaces so the scorer treats each character as a token.
    # An empty decoded string would make the scorer reject the whole batch, so it is
    # replaced by a placeholder; the two placeholders differ so they never match.
    decoded_preds = [" ".join(p) if p.strip() else "[EMPTY_PRED]" for p in decoded_preds]
    decoded_labels = [" ".join(t) if t.strip() else "[EMPTY_REF]" for t in decoded_labels]

    scores = rouge.get_scores(decoded_preds, decoded_labels, avg=True)
    return {
        "rouge-1": scores["rouge-1"]["f"],
        "rouge-2": scores["rouge-2"]["f"],
        "rouge-l": scores["rouge-l"]["f"],
    }

### Step 6: 设置训练参数

In [None]:
# 配置训练参数
args = Seq2SeqTrainingArguments(
    output_dir="./summary",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    logging_steps=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="rouge-l",
    predict_with_generate=True,  
)

### Step 7: 设置训练器

In [None]:
# Collator that batches the tokenized seq2seq examples
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Wire the model, training arguments, data splits, collator and metric into the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=seq2seq_collator,
    # NOTE(review): recent transformers releases prefer `processing_class=` over
    # `tokenizer=` — confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metric,
)

### Step 8: 模型训练

In [None]:
# Run fine-tuning with the trainer configured above
trainer.train()

### Step 9: 模型推理

In [None]:
from transformers import pipeline

In [None]:
# Build a text2text-generation pipeline around the fine-tuned model.
# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise,
# so this cell also runs on machines without a GPU.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Generate keywords for the last test sample. The prompt prefix must be identical to the
# one used when tokenizing the training data ("摘要生成: \n", with a space after the colon).
pipe("摘要生成: \n" + dataset["test"][-1]["Content"], max_length=64, do_sample=True)

In [None]:
# Reference keywords for the same sample. The dataset only has the columns
# 'Content' and 'Keywords'; indexing a "title" column would raise KeyError.
dataset["test"][-1]["Keywords"]