### 步骤零、安装依赖并挂载谷歌云盘

In [None]:
%pip install transformers datasets evaluate peft accelerate gradio optimum sentencepiece scikit-learn tensorboard nltk rouge rouge-chinese

In [None]:
# Mount Google Drive at /content/drive (Colab only); later cells read and
# write paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

### 步骤一、导入前置依赖

In [1]:
import pandas as pd
import torch
import numpy as np
from rouge_chinese import Rouge
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

### 步骤二、构建数据集

In [None]:
# Two dataset locations: a local file and one on Google Drive.
dataset_local_path = '../data/keysum_data_A.csv'
dataset_gdrive_path = '/content/drive/MyDrive/keysum_data_B.csv'
# Switch this to dataset_gdrive_path to read the Google Drive file instead.
dataset_path = dataset_local_path
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap
# is needed.
df = pd.read_csv(dataset_path)
# Convert the DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)
# Hold out 5000 rows as the test split; the fixed seed keeps the split
# reproducible across runs.
dataset = dataset.train_test_split(test_size=5000, seed=42)
# Display the resulting splits (notebook cell output).
dataset

In [None]:
# Inspect the first training example (notebook cell output).
dataset['train'][0]

### 步骤三、数据预处理

In [None]:
# Load the tokenizer published with the "Langboat/mengzi-t5-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Langboat/mengzi-t5-base")
# Uncomment to inspect the tokenizer.
# tokenizer

In [None]:
def process_func(examples):
    """Tokenize one batch of examples for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with a "Content" column (source texts) and a
            "Keywords" column (target keyword strings).

    Returns:
        The tokenized inputs, with the tokenized targets stored under
        "labels".
    """
    # Task prefix prepended to every source text.
    prefix = "摘要生成: \n"
    prompts = [prefix + text for text in examples["Content"]]

    # Source side: truncated to at most 512 tokens.
    model_inputs = tokenizer(prompts, max_length=512, truncation=True)

    # Target side: truncated to at most 64 tokens.
    target_encoding = tokenizer(
        text_target=examples["Keywords"],
        max_length=64,
        truncation=True,
    )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
# Tokenize the dataset by applying process_func batch by batch.
tokenized_dataset = dataset.map(process_func, batched=True)
# Uncomment to inspect the tokenized dataset.
# tokenized_dataset

In [None]:
# Sanity check: decode the first training example back to text,
# first the model input ids, then the label ids.
print(tokenizer.decode(tokenized_dataset["train"][0]["input_ids"]))
print(tokenizer.decode(tokenized_dataset["train"][0]["labels"]))

### 步骤四、加载预训练模型

In [None]:
# Load the pretrained seq2seq model from the same checkpoint name the
# tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained("Langboat/mengzi-t5-base")

### 步骤五、设置评估函数

In [None]:
rouge = Rouge()


def compute_metric(evalPred):
    """Compute character-level ROUGE F1 scores for generated keywords.

    Args:
        evalPred: Pair ``(predictions, labels)`` of token-id arrays handed
            over by the trainer during evaluation.

    Returns:
        Dict with the mean F1 for "rouge-1", "rouge-2" and "rouge-l" over
        all evaluated pairs. Pairs whose prediction or label decodes to
        nothing count as 0 for every metric.
    """
    predictions, labels = evalPred
    # Defensive: unwrap if the predictions arrive wrapped in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a vocabulary id, so it must be
    # replaced before decoding. It can appear in predictions as well as in
    # labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put a space between every character so ROUGE n-grams are built from
    # single characters.
    spaced_preds = [" ".join(p) for p in decoded_preds]
    spaced_labels = [" ".join(p) for p in decoded_labels]

    # The ROUGE scorer raises on an empty hypothesis or reference, which
    # would abort the whole evaluation. Score only the non-empty pairs and
    # let the empty ones contribute 0.
    scorable = [
        (pred, label)
        for pred, label in zip(spaced_preds, spaced_labels)
        if pred.strip() and label.strip()
    ]
    if not scorable:
        return {"rouge-1": 0.0, "rouge-2": 0.0, "rouge-l": 0.0}

    hyps, refs = zip(*scorable)
    scores = rouge.get_scores(list(hyps), list(refs), avg=True)
    # Rescale the mean over scorable pairs to a mean over all pairs; this is
    # exactly 1.0 when nothing was skipped.
    weight = len(scorable) / len(spaced_preds)
    return {
        "rouge-1": scores["rouge-1"]["f"] * weight,
        "rouge-2": scores["rouge-2"]["f"] * weight,
        "rouge-l": scores["rouge-l"]["f"] * weight,
    }

### 步骤六、设置训练参数

In [None]:
# Training configuration.
args = Seq2SeqTrainingArguments(
    # Checkpoints are written to Google Drive.
    output_dir="/content/drive/MyDrive/t5/checkpoints",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # 4 samples x 8 accumulation steps = 32 samples per optimizer step
    # on each device.
    gradient_accumulation_steps=8,
    logging_steps=8,
    # Evaluate and save once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    # "rouge-l" is the key returned by compute_metric.
    metric_for_best_model="rouge-l",
    load_best_model_at_end=True,
    # Run generate() during evaluation so compute_metric receives token ids.
    predict_with_generate=True,
    # Let evaluation generate up to the same length the labels are
    # tokenized to (64); otherwise the model config's default max_length
    # applies and longer keyword lists get cut off before scoring.
    generation_max_length=64,
)

### 步骤七、设置训练器

In [None]:
# Wire the model, training arguments, tokenized splits and metric function
# into a Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    # Collates tokenized examples into batches using the tokenizer.
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

### 步骤八、训练模型

In [None]:
# Start fine-tuning with the configuration defined above.
trainer.train()

### 步骤九、保存模型和分词器

In [None]:
# Save the model held by the trainer to Google Drive.
trainer.save_model('/content/drive/MyDrive/t5/model')
# Save the tokenizer to its own directory next to the model.
tokenizer.save_pretrained('/content/drive/MyDrive/t5/tokenizer')

### 步骤十、模型推理与测试

In [6]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Local directories holding the fine-tuned model and its tokenizer.
model_path = '../model/t5-model'
tokenizer_path = '../model/t5-tokenizer/'
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# Alternative kept for reference: the same pipeline with device=0.
# pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)

def generate_keywords(content, max_gen_len=128):
    """Generate a keyword string for one document with the fine-tuned model.

    Args:
        content: Source text to generate keywords for.
        max_gen_len: Value passed to the pipeline as ``max_length``.

    Returns:
        The "generated_text" field of the first pipeline result.
    """
    # Use exactly the prefix the training data was built with
    # ("摘要生成: \n", with a space after the colon) so inference sees the
    # same prompt format as fine-tuning.
    prompt = "摘要生成: \n" + content
    # do_sample=True makes generation stochastic: repeated calls on the same
    # text may return different keywords.
    keywords = pipe(prompt, max_length=max_gen_len, do_sample=True)
    return keywords[0]['generated_text']

In [10]:
# Smoke test: generate keywords for one row of the local CSV and print them
# next to that row's reference keywords.
df = pd.read_csv('../data/keysum_data_A.csv')
# Row label of the sample to try.
idx = 5
test_content = df['Content'][idx]
test_keywords = df['Keywords'][idx]
test_results = generate_keywords(test_content)
print("Real Keywords: ", test_keywords)
print("Results: ", test_results)

Real Keywords:  城镇职工重大疾病保障水平; 方差分析; Logistic回归; 对比分析
Results:  城镇职工; 重大疾病保障; 资料统计
