# 基于GLM的文本摘要

## Step1 导入相关包

In [1]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

## Step2 加载数据集

In [2]:
# Load the dataset that was previously saved to disk under ./nlpcc_2017/.
ds = Dataset.load_from_disk("./nlpcc_2017/")
ds

Dataset({
    features: ['title', 'content'],
    num_rows: 5000
})

In [3]:
# Hold out 100 examples as the test split; every remaining example goes to
# the train split. An integer `test_size` is an absolute number of examples
# (a float would be interpreted as a fraction of the dataset).
# A fixed `seed` makes the shuffle before splitting, and therefore the split
# itself, reproducible across runs; 42 has no special meaning, it is just a
# conventional choice.
ds = ds.train_test_split(test_size=100, seed=42)

ds

DatasetDict({
    train: Dataset({
        features: ['title', 'content'],
        num_rows: 4900
    })
    test: Dataset({
        features: ['title', 'content'],
        num_rows: 100
    })
})

In [4]:
# Inspect the first training example (a dict with 'title' and 'content').
ds["train"][0]

{'title': '组图:黑河边防军人零下30℃户外训练,冰霜沾满眉毛和睫毛,防寒服上满是冰霜。',
 'content': '中国军网2014-12-1709:08:0412月16日,黑龙江省军区驻黑河某边防团机动步兵连官兵,冒着-30℃严寒气温进行体能训练,挑战极寒,锻造钢筋铁骨。该连素有“世界冠军的摇篮”之称,曾有5人24人次登上世界军事五项冠军的领奖台。(魏建顺摄)黑龙江省军区驻黑河某边防团机动步兵连官兵冒着-30℃严寒气温进行体能训练驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜官兵睫毛上都被冻上了冰霜官兵们睫毛上都被冻上了冰霜驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练'}

## Step3 数据处理

In [7]:
# Newer Transformers releases raise an error when loading this tokenizer; the cached remote code has to be patched.
# File: ~/.cache\huggingface\modules\transformers_modules\THUDM\glm-large-chinese\230f54e413fab4bc8f29bd3508aab301d757ef3e\tokenization_glm.py
# Move `super().__init__(**kwargs)` from line 231 to line 235, i.e. to the line right after `self.sp_model.Load(vocab_file)`.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-large-chinese", trust_remote_code=True)
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


GLMChineseTokenizer(name_or_path='THUDM/glm-large-chinese', vocab_size=50000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='left', special_tokens={'eos_token': '<|endoftext|>', 'unk_token': '[UNK]', 'pad_token': '<|endoftext|>', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['<|startofpiece|>', '<|endofpiece|>', '[gMASK]', '[sMASK]']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50001: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50002: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50003: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50004: AddedToken("[UNUSED1]", rstrip=False, lstrip=False, single_word=Fa

In [8]:
def process_func(exmaples):
    """Convert a batch of raw examples into GLM training features.

    Every article in ``exmaples["content"]`` is wrapped in the summarization
    prompt and terminated with the tokenizer's mask token; the matching
    entries of ``exmaples["title"]`` are handed over as generation targets.

    Returns the result of ``tokenizer.build_inputs_for_generation`` for the
    whole batch.
    """
    prompts = []
    for article in exmaples["content"]:
        # Prompt prefix + article + mask token.
        prompts.append("摘要生成: \n" + article + tokenizer.mask_token)

    # padding="max_length" pads every sequence to exactly `max_length` tokens,
    # whereas padding=True would only pad to the longest sequence in the batch.
    encoded = tokenizer(
        prompts,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # GLM-specific input construction: the titles are passed as the training
    # targets and the generated part is limited to 64 tokens.
    return tokenizer.build_inputs_for_generation(
        encoded,
        targets=exmaples["title"],
        padding=True,
        max_gen_length=64,
    )

In [9]:
# Run process_func over both splits in batches and drop the original columns
# so that only the features returned by process_func remain.
tokenized_ds = ds.map(process_func, batched=True, remove_columns=ds["train"].column_names)
tokenized_ds

Map:   0%|          | 0/4900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'position_ids', 'attention_mask', 'labels'],
        num_rows: 4900
    })
    test: Dataset({
        features: ['input_ids', 'position_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [10]:
# Decode the first training example's input_ids back to text as a sanity check.
tokenizer.decode(tokenized_ds["train"][0]["input_ids"])

'[CLS] 摘要生成: 中国军网2014-12-1709:08:0412月16日,黑龙江省军区驻黑河某边防团机动步兵连官兵,冒着-30°C严寒气温进行体能训练,挑战极寒,锻造钢筋铁骨。该连素有“世界冠军的摇篮”之称,曾有5人24人次登上世界军事五项冠军的领奖台。(魏建顺摄)黑龙江省军区驻黑河某边防团机动步兵连官兵冒着-30°C严寒气温进行体能训练驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜驻黑河某边防团机动步兵连官兵严寒中户外训练,防寒服上满是冰霜官兵睫毛上都被冻上了冰霜官兵们睫毛上都被冻上了冰霜驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练驻黑河某边防团机动步兵连官兵严寒中进行户外体能训练[MASK]<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endofte

In [11]:
# Inspect the labels of the first training example.
tokenized_ds["train"][0]["labels"]

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,

In [12]:
# Print the position_ids of the first training example.
print(tokenized_ds["train"][0]["position_ids"])

'''

input_ids是这个字在词汇表里面对应的token索引
position_ids是这个字在该example里面对应的位置，但是position_ids是二维的，第一维代表在这个example里的绝对位置，第二维代表块位置，其中：
        0：源文本部分（条件上下文）
        ≥1：生成文本部分（自回归生成）
        从[sop]开始递增
第一维的211代表生成文本
实例：
input_ids =    [101, 2345, 6789, ..., 150001, 150004, 150005, 3456, 7890, 150005]
# 词汇表索引: [CLS]  摘要   生成   ...  [MASK] [gMASK]  [sop]   黑河   边防   [eop]

position_ids = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8],    # 在example中的绝对位置
    [0, 0, 0, 0, 0, 1, 2, 3, 4]     # 块位置：0=源文本，≥1=生成文本
]


'''

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221

## Step4 创建模型

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load GLM through the seq2seq auto class: training below relies on the model
# returning a loss for the `labels` feature, and inference calls
# `model.generate(...)`. The checkpoint's model card loads it this way; the
# plain `AutoModel` mapping is expected to give only the bare backbone
# (NOTE(review): confirm against the checkpoint's auto_map).
model = AutoModelForSeq2SeqLM.from_pretrained("zai-org/glm-large-chinese", trust_remote_code=True)

## Step6 配置训练参数

In [None]:
# Training configuration passed to the Seq2SeqTrainer below.
args = Seq2SeqTrainingArguments(
    output_dir="./summary_glm",  # directory for training outputs
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 examples per optimizer step on each device
    logging_steps=8,
    num_train_epochs=1
)

## Step7 创建训练器

In [None]:
# Build the trainer. Only a training set is supplied (no eval_dataset) and no
# data_collator is passed; process_func already pads every example to a fixed
# length (max_length=384 plus max_gen_length=64).
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    train_dataset=tokenized_ds["train"],
    tokenizer=tokenizer,
)  

## Step8 模型训练

In [None]:
# Fine-tune the model with the trainer configured above.
trainer.train()

## Step9 模型推理

In [None]:
input_text = ds["test"][-1]["content"]
# Inference runs on a single example, so no padding is requested.
inputs = tokenizer("摘要生成: \n" + input_text + tokenizer.mask_token, return_tensors="pt")
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=64)
# Move the inputs to whatever device the model currently lives on instead of
# hard-coding "cuda", so the cell also works on CPU-only machines.
inputs = inputs.to(model.device)
# `**inputs` unpacks the encoding into keyword arguments, i.e. it is
# equivalent to passing each of its entries (input_ids, attention_mask,
# position_ids, ...) by name.
# `tokenizer.eop_token_id` is the id of GLM's end-of-piece token; using it as
# `eos_token_id` makes generation stop as soon as that token is produced.
output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id, do_sample=True)
tokenizer.decode(output[0].tolist())

In [None]:
import torch

# Switch the model to evaluation mode before generating.
model = model.eval()

def predict_test():
    """Generate a summary for every example in ``ds["test"]``.

    Each article is wrapped in the same prompt used for training, passed
    through ``model.generate`` (sampling enabled, at most 64 new tokens,
    stopping at the tokenizer's end-of-piece token), and the text between
    ``<|startofpiece|>`` and ``<|endofpiece|>`` is extracted.

    Returns:
        list[str]: one generated summary per test example, in dataset order.
    """
    predict = []
    # Use the device the model actually lives on instead of hard-coding
    # "cuda"; looked up once because it does not change inside the loop.
    device = model.device
    with torch.inference_mode():
        for d in ds["test"]:
            inputs = tokenizer("摘要生成: \n" + d["content"] + tokenizer.mask_token, return_tensors="pt")
            inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=64)
            inputs = inputs.to(device)
            output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id, do_sample=True)
            decoded = tokenizer.decode(output[0].tolist())
            # Keep only the generated piece: the text after the start marker,
            # with the end marker and surrounding whitespace removed.
            summary = decoded.split("<|startofpiece|>")[1].replace("<|endofpiece|>", "").strip()
            predict.append(summary)
            # Progress indicator: number of examples processed so far.
            print("curID:", len(predict))
    return predict

'''

标记	功能	类比
[MASK]	任务触发 - 告诉模型"这里要生成"
[gMASK]	模式切换 - 切换到生成模式
[sop]	开始信号 - 生成正式开始
[eop]	结束信号 - 生成结束
'''

'''逐步处理过程
步骤1：解码token序列
full_text = tokenizer.decode(output[0].tolist())
# full_text = "[CLS]摘要生成:\\n中国军网报道...严寒训练[sop]<|startofpiece|>黑河边防军人零下30℃训练<|endofpiece|>"

步骤2：分割并定位生成内容
parts = full_text.split("<|startofpiece|>")
# parts = [
#     "[CLS]摘要生成:\\n中国军网报道...严寒训练[sop]",
#     "黑河边防军人零下30℃训练<|endofpiece|>"
#           ]
generated_part = parts[1]  # 取第二个元素
# generated_part = "黑河边防军人零下30℃训练<|endofpiece|>"

步骤3：移除结束标记
clean_text = generated_part.replace("<|endofpiece|>", "")
# clean_text = "黑河边防军人零下30℃训练"

步骤4：清理空白字符
final_text = clean_text.strip()
# final_text = "黑河边防军人零下30℃训练"

步骤5：添加到预测列表并打印进度
predict.append(final_text)
# predict现在包含: ["黑河边防军人零下30℃训练"]

print("curID:", len(predict))
# 输出: "curID: 1" (表示已经处理了1个样本)
'''

In [None]:
# Generate summaries for the whole test split.
result = predict_test()

In [None]:
# Display the generated summaries.
result

In [None]:
from rouge_chinese import Rouge

rouge = Rouge()

# Insert a space between every character of each prediction and reference
# title before scoring.
docode_preds = list(map(" ".join, result))
decode_labels = list(map(" ".join, ds["test"]["title"]))
scores = rouge.get_scores(docode_preds, decode_labels, avg=True)
# Report only the "f" entry of each averaged ROUGE metric.
{metric: scores[metric]["f"] for metric in ("rouge-1", "rouge-2", "rouge-l")}