In [1]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [2]:
import json
import random
# 读取train.json文件
# train_data = []

# 逐行读取train.json文件
# with open('./LCSTS/train.json', 'r', encoding='utf-8') as f:
#     for line in f:
#         data = json.loads(line)
#         train_data.append(data)
        
# 读取train.json文件并随机选择5000条数据
# Read the JSON-lines training file and keep a reproducible random subset.
train_data = []
num_samples = 5000
sample_seed = 42  # fixed seed so the sampled subset is identical on every run

with open('./LCSTS/train.json', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        data = json.loads(line)
        train_data.append(data)

# A dedicated Random instance leaves the global `random` state untouched.
# The sample size is capped because random.sample raises ValueError when asked
# for more items than the population holds.
train_data = random.Random(sample_seed).sample(train_data, min(num_samples, len(train_data)))


# # 读取dev.json文件
# with open('./LCSTS/dev.json', 'r', encoding='utf-8') as f:
#     dev_data = json.load(f)

# 处理数据集，提取id、summary和content
def process_data(data):
    """Project every raw record onto the three fields used downstream.

    Args:
        data: iterable of mappings, each providing the keys "id", "summary"
            and "content".

    Returns:
        A new list of dicts, in input order, each holding exactly those three
        keys (in that order).

    Raises:
        KeyError: if a record lacks one of the three keys.
    """
    wanted_keys = ("id", "summary", "content")
    return [{key: record[key] for key in wanted_keys} for record in data]

# Keep only the id / summary / content fields of the sampled training records.
processed_train_data = process_data(train_data)

# Dev-set processing is currently disabled:
# processed_dev_data = process_data(dev_data)

# Build a column-oriented Dataset; the "summary" field is exposed as "title".
train_dataset = Dataset.from_dict({
    column: [item[source_key] for item in processed_train_data]
    for column, source_key in (("id", "id"), ("title", "summary"), ("content", "content"))
})

# Hold out 100 examples as the test split, with a fixed seed.
ds = train_dataset.train_test_split(test_size=100, seed=42)
ds

# train_size = int(0.7 * len(train_dataset))
# val_size = int(0.1 * len(train_dataset))
# test_size = len(train_dataset) - train_size - val_size

# ds = train_dataset.train_test_split(train_size=train_size, test_size=test_size, seed=42)
# ds

# 现在你可以像之前处理nlpcc_2017数据集一样使用train_dataset和test_dataset了

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'content'],
        num_rows: 4900
    })
    test: Dataset({
        features: ['id', 'title', 'content'],
        num_rows: 100
    })
})

In [3]:
ds["train"][0]

{'id': 455698,
 'title': '宝龙许华芳：未来十年是商业地产黄金十年',
 'content': '当前电商来势汹汹，对传统商业的冲击已经看得见。许华芳也同意电商带来的变化，称电商分流掉一部分传统商业是毋庸置疑的。不过他也明确表示，“但这并不意味着它(电商)可以取代传统商业。'}

In [4]:
ds["train"][1]

{'id': 757222,
 'title': '路虎发现运动版配置曝光19日国内首发',
 'content': '近日我们从官方获悉，路虎发现运动版(DiscoverySport)将于9月19日在国内正式亮相并公布中文新名称及官方预售价。与此同时的是我们才相关渠道获取到了一份相关配置表。根据我们的了解，路虎发现运动版将于2015年初上市发售。'}

In [8]:
# Download the pretrained GLM checkpoint through ModelScope, using `model2` as
# the cache directory, and print the returned local directory.
from modelscope import snapshot_download
# Alternative model, currently disabled:
# model_dir = snapshot_download('langboat/mengzi-t5-base',revision='v1.0.0')
model_dir = snapshot_download('ZhipuAI/glm-large-chinese',cache_dir='model2')
print(model_dir)

# #模型下载
# from modelscope import snapshot_download
# model_dir = snapshot_download('ZhipuAI/glm-large-chinese')

model2/ZhipuAI/glm-large-chinese


In [11]:
# Loading this tokenizer fails on newer Transformers releases unless the remote
# tokenizer source is patched by hand:
#   file:   ~/.cache\huggingface\modules\transformers_modules\THUDM\glm-large-chinese\230f54e413fab4bc8f29bd3508aab301d757ef3e\tokenization_glm.py
#   change: move `super().__init__(**kwargs)` from line 231 to line 235, i.e. to
#           the line right after `self.sp_model.Load(vocab_file)`.
# NOTE(review): the path above points at the THUDM hub cache, while the tokenizer
# below is loaded from the local model2/ZhipuAI directory — confirm which copy of
# tokenization_glm.py is actually executed.
tokenizer = AutoTokenizer.from_pretrained("model2/ZhipuAI/glm-large-chinese", trust_remote_code=True)
# tokenizer


In [15]:
def process_func(exmaples):
    """Convert a batch of raw examples into GLM generation-style training features.

    Each article is wrapped as ``"摘要生成: \\n" + content + mask_token``, encoded
    to a fixed length of 256 tokens, and then handed to the tokenizer's
    ``build_inputs_for_generation`` together with the reference titles.

    Args:
        exmaples: batch mapping with a "content" list (articles) and a "title"
            list (reference summaries), as supplied by ``Dataset.map`` with
            ``batched=True``.

    Returns:
        Whatever ``tokenizer.build_inputs_for_generation`` returns for the batch.

    NOTE(review): the mask token is appended at the end of the prompt and the
    prompt is truncated to 256 tokens; presumably a very long article could lose
    its mask token to truncation — confirm against the tokenizer's truncation side.
    """
    prompts = []
    for article in exmaples["content"]:
        prompts.append("摘要生成: \n" + article + tokenizer.mask_token)

    encoded = tokenizer(
        prompts,
        max_length=256,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    return tokenizer.build_inputs_for_generation(
        encoded,
        targets=exmaples['title'],
        padding=True,
        max_gen_length=64,
    )

In [16]:
# Tokenize both splits in batches with process_func and drop the raw columns.
# NOTE(review): the recorded output of this cell reports 4500/500 rows, whereas
# the recorded output of the split cell reports 4900/100 — the saved outputs
# appear to come from different runs; re-run top to bottom to refresh them.
tokenized_ds = ds.map(process_func, batched=True, remove_columns=ds["train"].column_names)
tokenized_ds

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'position_ids', 'attention_mask', 'labels'],
        num_rows: 4500
    })
    test: Dataset({
        features: ['input_ids', 'position_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
})

In [17]:
tokenizer.decode(tokenized_ds["train"][0]["input_ids"])

'[CLS] 摘要生成: 吴敬琏警示道,目前宏观与微观经济两方面正开始显现危机。吴敬琏指出,随着资产负债率的日益攀升,一些地区出现的危机很快传到其他地方,比如温州、苏南等地相继出现的“跑路”事件,包括6月的钱荒,其实都是给人们发预警信号。[MASK]<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|en

In [18]:
tokenized_ds["train"][0]["labels"]

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,

In [19]:
print(tokenized_ds["train"][0]["position_ids"])

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221

In [20]:
# Load the GLM checkpoint from the local model2 directory. trust_remote_code=True
# allows Transformers to execute the model code shipped with that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained("model2/ZhipuAI/glm-large-chinese", trust_remote_code=True)

In [21]:
# import torch
# from datasets import Dataset
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# # 加载数据集
# ds = Dataset.load_from_disk("./nlpcc_2017/")
# ds = ds.train_test_split(100, seed=42)

# # 加载分词器
# tokenizer = AutoTokenizer.from_pretrained("model2/ZhipuAI/glm-large-chinese", trust_remote_code=True)

# # 数据预处理函数
# def process_func(examples):
#     contents = ["摘要生成: \n" + e + tokenizer.mask_token for e in examples["content"]]
#     inputs = tokenizer(contents, max_length=384, truncation=True, padding="max_length", return_tensors="pt")
#     inputs = tokenizer.build_inputs_for_generation(inputs, targets=examples['title'], padding=True, max_gen_length=64)
#     return inputs

# # 处理数据集
# tokenized_ds = ds.map(process_func, batched=True, remove_columns=ds["train"].column_names)

# # 加载模型
# model = AutoModelForSeq2SeqLM.from_pretrained("model2/ZhipuAI/glm-large-chinese", trust_remote_code=True)

# Training configuration.
# NOTE(review): the recorded run ended at global_step=140, which is below
# save_steps=512, so step-based checkpointing would not trigger mid-run —
# confirm that load_best_model_at_end behaves as intended with these values.
args = Seq2SeqTrainingArguments(
    output_dir="./summary_glm",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,
    logging_steps=8,
    num_train_epochs=1,
    save_total_limit=1,  # limit the number of checkpoints kept on disk to one
    save_steps=512,       # save a checkpoint every 512 steps (the old comment said 500)
    # eval_strategy="steps",  # evaluation strategy
    load_best_model_at_end=True,  # load the best model when training finishes
    eval_strategy="steps",
)

# Create the trainer, including an evaluation dataset.
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    train_dataset=tokenized_ds["train"],  # training split
    eval_dataset=tokenized_ds['test'],  # evaluation split
    tokenizer=tokenizer,
)
# Start training.
trainer.train()

Detected kernel version 4.19.91, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-10-01 23:05:36,427] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


df: /root/.triton/autotune: 没有那个文件或目录




Step,Training Loss,Validation Loss
8,3.2005,2.899231
16,3.0754,2.816801
24,2.9051,2.798641
32,2.9223,2.805028
40,3.0185,2.776683
48,2.9373,2.763617
56,2.8229,2.764317
64,2.8317,2.754272
72,2.8438,2.747238
80,2.9022,2.735222


TrainOutput(global_step=140, training_loss=2.9211345400129045, metrics={'train_runtime': 1152.5461, 'train_samples_per_second': 3.904, 'train_steps_per_second': 0.121, 'total_flos': 3041186650521600.0, 'train_loss': 2.9211345400129045, 'epoch': 0.9955555555555555})

In [134]:
# # 保存模型和分词器
# trainer.save_model(args.output_dir)  # 保存模型
# tokenizer.save_pretrained(args.output_dir)  # 保存分词器

# Write the model weights and the tokenizer files into the same directory.
path = './summary_glm/checkpoint-140'
model.save_pretrained(path)
tokenizer.save_pretrained(path)



('./summary_glm/checkpoint-140/tokenizer_config.json',
 './summary_glm/checkpoint-140/special_tokens_map.json',
 './summary_glm/checkpoint-140/cog-pretrain.model',
 './summary_glm/checkpoint-140/added_tokens.json')

In [18]:
# Reload the fine-tuned model and tokenizer for inference.
import os

# The save cell writes to ./summary_glm/checkpoint-140, but this cell only looked
# at ./checkpoint-140, so a top-to-bottom run could not find the checkpoint.
# Prefer the directory the notebook itself produces and fall back to the legacy
# location so setups that keep the checkpoint next to the notebook still work.
checkpoint_candidates = ('./summary_glm/checkpoint-140', './checkpoint-140')
path = next((p for p in checkpoint_candidates if os.path.isdir(p)), checkpoint_candidates[0])

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(path,trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(path,trust_remote_code=True)
# Move the model to the selected device.
model.to(device)

GLMForConditionalGeneration(
  (glm): GLMModel(
    (word_embeddings): VocabEmbedding()
    (transformer): GLMStack(
      (embedding_dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(1025, 1024)
      (block_position_embeddings): Embedding(1025, 1024)
      (layers): ModuleList(
        (0-23): 24 x GLMBlock(
          (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attention): SelfAttention(
            (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
            (attention_dropout): Dropout(p=0.1, inplace=False)
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (output_dropout): Dropout(p=0.1, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
            (dense_4h_to_h): Linear(in_fe

In [4]:
# with torch.no_grad():
#     output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id, do_sample=True)

# # 解码并输出摘要
# summary = tokenizer.decode(output[0].tolist(), skip_special_tokens=True)
# print("生成的摘要:", summary)


# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# # 将模型移动到 GPU
# model.to(device)

# Summarize the first test example: wrap the article in the same prompt used for
# training, end it with the mask token, and let the tokenizer build the
# generation inputs.
input_text = ds["test"][0]["content"]
inputs = tokenizer("摘要生成: \n" + input_text + tokenizer.mask_token, return_tensors="pt")
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=64)
# inputs = inputs.to("cpu")
# Move every input tensor to the same device as the model.
inputs = {key: value.to(device) for key, value in inputs.items()}


# do_sample=True makes the generated text vary between runs.
output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id, do_sample=True)
tokenizer.decode(output[0].tolist())

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'[CLS] 摘要生成: 中国10月工业、投资和消费同比增速齐齐下滑,其中工业增速再次回落至8%以下,投资增速受房地产市场疲弱影响更是录得近13年最低,反映经济下行压力仍大,经济调整态势不改,料四季度将进一步放缓。[MASK]<|endoftext|> <|startofpiece|> 中金:经济“二次探底”进程仍在进行中 <|endofpiece|>'

In [19]:
import torch

model = model.eval()

def predict_test():
    """Generate one summary for every example in ``ds["test"]``.

    Relies on the notebook globals ``ds``, ``tokenizer`` and ``model``. Each
    article is wrapped in the training prompt, generated with sampling, and the
    text following the first ``<|startofpiece|>`` marker (with
    ``<|endofpiece|>`` removed) is kept as the prediction. Progress is printed
    as ``curID: <count>`` after every example.

    Returns:
        list[str]: the predictions, in test-set order.

    Note:
        ``do_sample=True`` makes the predictions (and any metric computed from
        them) vary between runs.
    """
    predict = []
    with torch.inference_mode():
        for d in ds["test"]:
            inputs = tokenizer("摘要生成: \n" + d["content"] + tokenizer.mask_token, return_tensors="pt")
            inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=64)
            # Follow the device the model actually lives on instead of the
            # hard-coded "cuda", so the loop also runs on CPU-only machines.
            inputs = inputs.to(model.device)
            output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id, do_sample=True)
            decoded = tokenizer.decode(output[0].tolist())
            generated = decoded.split("<|startofpiece|>")[1]
            predict.append(generated.replace("<|endofpiece|>", "").strip())
            print("curID:", len(predict))
    return predict

In [9]:
result = predict_test()
result

---------开始----------------------------------------
迭代0次，收敛
最大的1个值： [1.]
对应的索引： [1]
结果：
专家表示，从2...。

curID: 1
---------开始----------------------------------------
迭代0次，收敛
最大的1个值： [1.]
对应的索引： [1]
结果：
相关负责人表示界旅游城市排名是国际上首个以城市综合旅游服务能力作为主要依据的排行榜。

curID: 2
---------开始----------------------------------------
迭代0次，收敛
最大的1个值： [1.]
对应的索引： [1]
结果：
在众多游戏发展中，中国移动游戏市场的发展最为耀眼，截至9月份移动游戏用户接近3亿人。

curID: 3
---------开始----------------------------------------
迭代1次，收敛
最大的1个值： [0.15]
对应的索引： [0]
结果：
周末是欣赏美术作品的好时机，昨日，两个书画、篆刻作品展开幕：“文以载道·湖南中国画学会诗书画印邀请展”在湖南省画院美术馆开幕，“山东·湖南篆刻联展”在长沙简牍博物馆开幕，两个展览均持续到12月15日，向市民免费开放。

curID: 4
---------开始----------------------------------------
迭代1次，收敛
最大的1个值： [0.15]
对应的索引： [0]
结果：
这次“海巡01轮”出海，携带了多套海测设备，包括1套黑匣子搜寻仪、2个潜水机器人、1套旁侧声纳和磁力仪，这些设备完全是国外引进的，水下最佳探测范围是2000米，最大探测范围是5000米。

curID: 5
---------开始----------------------------------------
迭代0次，收敛
最大的1个值： [1.]
对应的索引： [1]
结果：
”在约四十分钟的演说中，安倍首相连喊七次“世界第一”。

curID: 6
---------开始----------------------------------------
迭代0次，收敛
最大的1个值： [1.]
对应的索引：

['专家表示，从2...。\n',
 '相关负责人表示界旅游城市排名是国际上首个以城市综合旅游服务能力作为主要依据的排行榜。\n',
 '在众多游戏发展中，中国移动游戏市场的发展最为耀眼，截至9月份移动游戏用户接近3亿人。\n',
 '周末是欣赏美术作品的好时机，昨日，两个书画、篆刻作品展开幕：“文以载道·湖南中国画学会诗书画印邀请展”在湖南省画院美术馆开幕，“山东·湖南篆刻联展”在长沙简牍博物馆开幕，两个展览均持续到12月15日，向市民免费开放。\n',
 '这次“海巡01轮”出海，携带了多套海测设备，包括1套黑匣子搜寻仪、2个潜水机器人、1套旁侧声纳和磁力仪，这些设备完全是国外引进的，水下最佳探测范围是2000米，最大探测范围是5000米。\n',
 '”在约四十分钟的演说中，安倍首相连喊七次“世界第一”。\n',
 '由于可能产生的污染问题，正引起来自民间的强烈反对，包括140余名离退休老干部。\n',
 '纳智捷，这个来自于台湾的汽车品牌随着一部《痞子英雄》开始被国内消费者所认识，通过一部大空间、高配置的SUV让消费者开始了解这个品牌，之后其又慢慢地将MPV和轿车等车型引入国内汽车市场。\n',
 '二手车电商O2O，不仅面临着电商流量、产品、IT系统的线上问题，更考验其线下渠道、检测团队管理的能力。\n',
 '电力增速走势对经济波动反映准确，对工业增加值变动更为敏感。\n',
 '①英特尔会重新对智能手机感兴趣。\n',
 '这部法规在保护老年人权益方面作出了多项规定，包括禁止有独立生活能力的子女“啃老”，对于虐待老人的养老机构最高可罚款３万元。\n',
 '谷歌被控侵犯七项专利权。\n',
 '韩联社援引“朝鲜总联”机关报《朝鲜新报》4月25日报道称，朝鲜计划将位于朝鲜咸镜南道的端川港发展成为国际贸易港口，为实现经济建设和核武建设并进的奋斗目标打下经济基础。\n',
 '与此同时，辉瑞从本月初开始真正一分为三，其中包括一个仿制药部门。\n',
 '12月13日晚19点35分，中国男足将在湖南郴州与吉尔吉斯斯坦队进行热身赛，由于球队刚刚集训一周的时间，考虑到球员们的身体情况，佩兰也决定在这场比赛多安排一些球员出场，在战前进行的发布会上佩兰表示，会在上、下半时采取两套不同的阵容。\n',
 '凭借过去8年的完美答卷，默克尔帮助联盟党在9月22日的联邦议院选

In [10]:
from rouge_chinese import Rouge

# Score the test-set predictions against the reference titles. Each string is
# re-joined with a space between its characters before scoring.
rouge = Rouge()

docode_preds = [" ".join(prediction) for prediction in result]
decode_labels = [" ".join(reference) for reference in ds["test"]["title"]]
scores = rouge.get_scores(docode_preds, decode_labels, avg=True)

# Show only the F1 component of each ROUGE variant.
{metric: scores[metric]["f"] for metric in ("rouge-1", "rouge-2", "rouge-l")}

{'rouge-1': 0.2403395384550552,
 'rouge-2': 0.1218037292673829,
 'rouge-l': 0.18250341080199725}

In [11]:
from rouge_chinese import Rouge

# Same scoring as the F1-only cell, but reporting precision, recall and F1.
rouge = Rouge()

docode_preds = [" ".join(prediction) for prediction in result]
decode_labels = [" ".join(reference) for reference in ds["test"]["title"]]

# Averaged ROUGE scores over the whole test split.
scores = rouge.get_scores(docode_preds, decode_labels, avg=True)

# Re-key the library's "p" / "r" / "f" entries with descriptive names.
rouge_scores = {
    metric: {
        "precision": scores[metric]["p"],
        "recall": scores[metric]["r"],
        "f1": scores[metric]["f"],
    }
    for metric in ("rouge-1", "rouge-2", "rouge-l")
}
print(rouge_scores)

{'rouge-1': {'precision': 0.18880349664387153, 'recall': 0.3940944136805994, 'f1': 0.2403395384550552}, 'rouge-2': {'precision': 0.09491139369605771, 'recall': 0.20992371675788346, 'f1': 0.1218037292673829}, 'rouge-l': {'precision': 0.14293111552242682, 'recall': 0.3136475881923821, 'f1': 0.18250341080199725}}


In [12]:
from bert_score import BERTScorer
from rouge import Rouge
from transformers import BertTokenizer, BertForMaskedLM
from bert_score import BERTScorer
import jieba
# BERTScore of every test prediction against its reference title, averaged at
# the end. The masked-LM model is loaded here only to read its layer count.
# NOTE(review): tokenizer_A is not used in this cell, and the averages below
# divide by len(bert_scores), which fails if `result` is empty.
model_name = 'bert-base-chinese'
tokenizer_A = BertTokenizer.from_pretrained(model_name)
model_A = BertForMaskedLM.from_pretrained(model_name)

scorer = BERTScorer(model_type=model_name, num_layers=model_A.config.num_hidden_layers)
bert_scores = []
# One scorer call per (prediction, reference) pair.
for pred, ref in zip(result,ds["test"]["title"]):
    P, R, F1 = scorer.score([pred], [ref])

    # Store this pair's precision, recall and F1.
    bert_scores.append({
        "bert-precision": P.item(),
        "bert-recall": R.item(),
        "bert-f1": F1.item()
    })
# Average precision, recall and F1 over all pairs.
avg_precision = sum(score["bert-precision"] for score in bert_scores) / len(bert_scores)
avg_recall = sum(score["bert-recall"] for score in bert_scores) / len(bert_scores)
avg_f1 = sum(score["bert-f1"] for score in bert_scores) / len(bert_scores)

print("\nAverage BERTScore:")
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1:", avg_f1)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Average BERTScore:
Average Precision: 0.6132225662469863
Average Recall: 0.661447462439537
Average F1: 0.634861501455307


In [5]:
# input_text=''
import jieba
from rouge_chinese import Rouge

# Summarize the last test example and extract the generated span.
input_text = ds["test"][-1]["content"]
title = ds["test"][-1]["title"]

inputs = tokenizer("摘要生成: \n" + input_text + tokenizer.mask_token, return_tensors="pt")
inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=64)
# inputs = inputs.to("cpu")
# Move every input tensor to the same device as the model.
inputs = {key: value.to(device) for key, value in inputs.items()}

output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id,do_sample=True)

# Decode once and reuse the text for both display and extraction.
text = tokenizer.decode(output[0].tolist())
print(text)

# The summary sits between these two markers in the decoded text.
start_marker = '<|startofpiece|>'
end_marker = '<|endofpiece|>'
# str.find returns -1 for a missing marker. Used directly as a slice bound, a
# missing end marker (generation stopped at max_new_tokens) would cut off the
# last character, and a missing start marker would give a meaningless offset.
start_index = text.find(start_marker)
start_index = len(text) if start_index == -1 else start_index + len(start_marker)
end_index = text.find(end_marker, start_index)
if end_index == -1:
    end_index = len(text)
# Empty when the start marker is absent; otherwise the text up to the end marker
# or, failing that, up to the end of the decoded output.
extracted_data = text[start_index:end_index].strip()
# 打印结果

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[CLS] 摘要生成: 昨日,“湘约启航”李湘加盟深圳卫视新闻发布会在京举行,李湘正式与深圳卫视签约,成为主管品牌推广的副总监。李湘说跳槽最重要的原因不是钱,“对于我来说最重要的是发展机会,还有快乐的工作环境。“李湘透露,自己的人事关系也会从湖南迁往深圳。[MASK]<|endoftext|> <|startofpiece|> 李湘加盟深圳卫视:跳槽最主要原因为钱 <|endofpiece|>


In [7]:
from rouge import Rouge
from transformers import BertTokenizer, BertForMaskedLM
from bert_score import BERTScorer
import jieba

# Reference title and generated summary of the single example produced by the
# generation cell. Dedicated names are used so that `result` (the list of
# test-set predictions returned by predict_test) is not overwritten by a string,
# which would silently break a later re-run of the batch ROUGE / BERTScore cells.
reference_text = title
predicted_text = extracted_data

# Load the BERT model and tokenizer; the model is only used for its layer count.
model_name = './bert-base-chinese'
tokenizer_A = BertTokenizer.from_pretrained(model_name)
model_A = BertForMaskedLM.from_pretrained(model_name)
# Compute BERTScore for the single pair.
scorer = BERTScorer(model_type=model_name, num_layers=model_A.config.num_hidden_layers, batch_size=1)
P, R, F1 = scorer.score([predicted_text], [reference_text])

# ROUGE on jieba word segments. Note that the batch ROUGE cells join single
# characters instead, so these numbers are not directly comparable with theirs.
rouge = Rouge()
result_tokens = " ".join(jieba.cut(reference_text))  # segmented reference
predict_tokens = " ".join(jieba.cut(predicted_text))  # segmented prediction


print('\n原文title内容为:  '+title)
print("\n")
print('文本摘要内容为:  '+extracted_data)
print("\n分词情况\n")
print(result_tokens)
print(predict_tokens)

# Wrap the single pair in lists, as expected by get_scores.
decode_preds = [predict_tokens]  # prediction
decode_labels = [result_tokens]  # reference

# Compute the ROUGE scores.
scores = rouge.get_scores(decode_preds, decode_labels, avg=True)

# Keep only the F1 component of each ROUGE variant.
rouge_scores = {
    "rouge-1": scores['rouge-1']['f'],
    "rouge-2": scores['rouge-2']['f'],
    "rouge-l": scores['rouge-l']['f'],
}
# Print the ROUGE and BERTScore results.
print("\nROUGE 分数:")
print(rouge_scores)

print("\nBERTScore:")
bert_scores = {
    "bert-precision": P.item(),
    "bert-recall": R.item(),
    "bert-f1": F1.item()
}
# Print the BERTScore values converted to plain floats.
print(bert_scores)

Some weights of the model checkpoint at ./bert-base-chinese were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



原文title内容为:  李湘加盟深圳卫视任副总监


文本摘要内容为:  李湘加盟深圳卫视:跳槽最主要原因为钱

分词情况

李湘 加盟 深圳 卫视 任 副 总监
李湘 加盟 深圳 卫视 : 跳槽 最 主要 原因 为 钱

ROUGE 分数:
{'rouge-1': 0.44444443969135805, 'rouge-2': 0.3749999953125, 'rouge-l': 0.44444443969135805}

BERTScore:
{'bert-precision': 0.7679480314254761, 'bert-recall': 0.8269060850143433, 'bert-f1': 0.7963373064994812}


In [8]:
def generate_summary(input_text, tokenizer=tokenizer, model=model, device=device):
    """Generate a summary for ``input_text`` with the fine-tuned GLM model.

    Args:
        input_text: article text to summarize.
        tokenizer: GLM tokenizer; defaults to the notebook-level ``tokenizer``
            as it exists when this cell is executed.
        model: generation model; defaults to the notebook-level ``model``.
        device: device the input tensors are moved to; defaults to the
            notebook-level ``device``.

    Returns:
        str: the text between ``<|startofpiece|>`` and ``<|endofpiece|>`` in the
        decoded output, stripped. If the end marker is absent (generation
        stopped at ``max_new_tokens``), everything after the start marker is
        returned; if the start marker is absent, an empty string is returned.

    Note:
        Sampling is enabled, so repeated calls may return different summaries.
    """
    # Build the prompt exactly as during training and prepare generation inputs.
    inputs = tokenizer("摘要生成: \n" + input_text + tokenizer.mask_token, return_tensors="pt")
    inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=64)

    # Move every input tensor to the requested device.
    inputs = {key: value.to(device) for key, value in inputs.items()}

    output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id, do_sample=True)
    text = tokenizer.decode(output[0].tolist())

    start_marker = '<|startofpiece|>'
    end_marker = '<|endofpiece|>'

    # str.find returns -1 when a marker is missing; using that value directly as
    # a slice bound silently dropped the last character (missing end marker) or
    # started at a meaningless offset (missing start marker).
    marker_pos = text.find(start_marker)
    if marker_pos == -1:
        return ""
    start_index = marker_pos + len(start_marker)
    end_index = text.find(end_marker, start_index)
    if end_index == -1:
        end_index = len(text)

    return text[start_index:end_index].strip()

# input_text = ds["test"][5]["content"]
# generate_summary(input_text)


# Demo: summarize test example 5 and print the article, the generated summary
# and the reference title.
input_text = ds["test"][5]["content"]
print('原文内容： '+input_text)
summary = generate_summary(input_text, tokenizer, model, device)
print('\n文本摘要内容为: ' + summary)
print('文本标题内容: ' + ds["test"][5]["title"])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


原文内容： 近阶段苹果的诸多投资者无疑仿佛徜徉在九霄云外。根据苹果官方公布的总利润再次刷新历史，公司的股票也再创新高。此图表非常清晰的展示了苹果股票的“孤独”，其中Y轴代表这些公司目前的市值，X轴代表年增长率涨幅情况。

文本摘要内容为: 图解《寂寞乔布斯》:苹果的“黄金单身汉”
文本标题内容: “孤独”的苹果：一张图说明竞争对手和苹果的差距


In [11]:
from ipywidgets import Button, Textarea, Output, HBox, VBox

import string
# Lookup table from decimal numerals (as strings) to their Chinese spelling.
num_to_chinese = {
    "0": "零",
    "1": "一",
    "2": "二",
    "3": "三",
    "4": "四",
    "5": "五",
    "6": "六",
    "7": "七",
    "8": "八",
    "9": "九",
    "10": "十",
    "11": "十一",
    "12": "十二"
}
def remove_all_punctuation_and_convert_number_to_chinese(text):
    """Delete punctuation from ``text`` and spell ASCII digit runs in Chinese.

    Punctuation means ``string.punctuation`` plus the characters 。、“”：; these
    are deleted, not replaced by spaces. Other characters, including the
    full-width comma and full-width digits, are left untouched.

    Every maximal run of ASCII digits is then converted: a run that is itself a
    key of ``num_to_chinese`` (e.g. "10", "12") uses that entry, any other run
    is spelled digit by digit (e.g. "2015" -> "二零一五").

    Args:
        text: input string.

    Returns:
        str: the converted text.
    """
    import re  # function-scope import keeps this block self-contained

    translator = str.maketrans('', '', string.punctuation + "。、“”：")
    text_without_punctuation = text.translate(translator)

    def _spell_number(match):
        """Return the Chinese spelling for one matched run of ASCII digits."""
        number = match.group(0)
        # Whole-number lookup first: replacing single digits before the
        # multi-digit keys made the "10" / "11" / "12" entries unreachable.
        if number in num_to_chinese:
            return num_to_chinese[number]
        return "".join(num_to_chinese[digit] for digit in number)

    # [0-9] rather than \d: only ASCII digits have entries in the table.
    return re.sub(r'[0-9]+', _spell_number, text_without_punctuation)

import re
def contains_non_chinese_text(input_text):
    """Tell whether ``input_text`` has any character outside U+4E00..U+9FFF.

    Args:
        input_text: string to inspect.

    Returns:
        bool: True if at least one character falls outside that range (digits,
        Latin letters, whitespace and punctuation all count), False otherwise,
        including for the empty string.
    """
    return re.search(r'[^\u4e00-\u9fff]', input_text) is not None
def check_text_input(input_text):
    """Classify the input and return a numeric status code.

    Args:
        input_text: value to classify.

    Returns:
        int: 0 when ``input_text`` is not a ``str``; 2 for every string.

    NOTE(review): the non-Chinese branch and the fallback both yield 2, so the
    code 1 that ``greet`` checks for is never produced. Presumably the
    non-Chinese branch was meant to return 1 — confirm before changing, since
    that would make the widget reject any input containing non-Chinese characters.
    """
    if isinstance(input_text, str):
        return 2 if contains_non_chinese_text(input_text) else 2
    return 0
# # 示例用法
# check_text_input('这是一段文本内容')
# check_text_input('Hello World')
# check_text_input('1234')


# 定义一个简单的函数
def greet(b):
    """Button click handler: validate the text area content and show a summary.

    Reads ``input_widget.value``, runs the normalization and type check, then
    prints either a validation message or the generated summary into
    ``output_widget`` after clearing its previous content.

    Args:
        b: the argument supplied by ``Button.on_click``; not used.
    """
    raw_text = input_widget.value
    normalized_text = remove_all_punctuation_and_convert_number_to_chinese(raw_text)
    status = check_text_input(normalized_text)  # numeric code describing the input

    if raw_text == '':
        text = '内容为空，请输入文本内容。'
    elif status == 0:
        text = '输入不是文本，请输入文本内容。'
    elif status == 1:
        text = '输入包含非中文字符，请输入中文文本。'
    else:
        # The summary is generated from the raw text, not the normalized one.
        text = generate_summary(raw_text)

    output_widget.clear_output()
    with output_widget:
        print(f'{text}')

# Output area that greet() prints into.
output_widget = Output()

# Large multi-line text input.
input_widget = Textarea(
    placeholder='输入你的文本...',
    description='文本输入:',
    layout={'width': '600px', 'height': '300px'},  # width and height
    style={'overflow': 'auto'}  # intended to allow scrolling
)

# Submit button.
submit_button = Button(description="生成摘要")

# Run greet() whenever the button is clicked.
submit_button.on_click(greet)
# Optional spacing, currently disabled:
# input_widget.layout.margin = '30px 30px 30px 30px'  # top right bottom left
# submit_button.layout.margin = '0 0 30px 0'  # top right bottom left
# Stack input, button and output vertically.
ui = VBox([input_widget, submit_button, output_widget])

# Display the widget as the cell output.
ui

VBox(children=(Textarea(value='', description='文本输入:', layout=Layout(height='300px', width='600px'), placehold…