In [15]:
import re
import torch
import jieba.posseg as posseg
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

###################################################
# TextRank实现
###################################################
# Part-of-speech tags to exclude during tokenisation (an empty list excludes nothing).
stopPOS = []
# Location of the stop-word list; it is read line by line as UTF-8 text.
stopwords_path = 'stopwords.txt'

# Load the stop words, trimming surrounding whitespace from every line.
with open(stopwords_path, encoding='utf-8', mode='r') as stopword_file:
    stopwords = list(map(str.strip, stopword_file))

def segment_text_to_sentence(text):
    """Split ``text`` into sentences on Chinese/ASCII sentence terminators.

    The text is cut at every ``。``, ``！``, ``？``, ``!`` or ``?``; the
    terminators themselves are discarded.  Fragments that are empty or contain
    only whitespace are dropped.  Every remaining fragment is trimmed and has
    all spaces and line feeds removed.

    Returns:
        list[str]: The cleaned sentences in their original order.
    """
    # Translation table that deletes spaces and line feeds in one pass.
    drop_blanks = {ord(" "): None, ord("\n"): None}

    cleaned = []
    for fragment in re.split(r'[。！？!?]', text):
        trimmed = fragment.strip()
        if not trimmed:
            continue
        cleaned.append(trimmed.translate(drop_blanks))
    return cleaned

def segment_text_to_words(text, use_stopwords):
    """Tokenise ``text`` with ``posseg.cut`` and return its distinct words.

    Args:
        text: The sentence to tokenise.
        use_stopwords: When true, words present in the module-level
            ``stopwords`` collection are removed.

    Returns:
        set: The unique words that survive filtering.  A token is always
        removed when the lower-cased first character of its POS flag equals a
        (lower-cased) entry of the module-level ``stopPOS``; only that first
        character is compared, so multi-letter entries never match.
    """
    # Lower-case into a local set rather than rebinding the global ``stopPOS``
    # on every call, which was a hidden side effect of the previous version.
    excluded_pos = {tag.lower() for tag in stopPOS}

    if use_stopwords:
        # Set membership is O(1); scanning the stop-word list for every token is not.
        if isinstance(stopwords, (set, frozenset)):
            blocked_words = stopwords
        else:
            blocked_words = set(stopwords)
    else:
        blocked_words = frozenset()

    return {
        word
        for word, flag in posseg.cut(text)
        # ``flag[:1]`` tolerates an empty flag, where ``flag[0]`` would raise IndexError.
        if flag[:1].lower() not in excluded_pos and word not in blocked_words
    }

def original_similarity_matrix(sentences, use_stopwords):
    """Build the TextRank word-overlap similarity matrix for ``sentences``.

    For two sentences with word sets ``Si`` and ``Sj`` the similarity is
    ``|Si & Sj| / (log|Si| + log|Sj|)``.  A pair involving a sentence with no
    words gets similarity 0, and the diagonal is left at 0.

    Args:
        sentences: Sequence of sentence strings.
        use_stopwords: Forwarded to ``segment_text_to_words``.

    Returns:
        numpy.ndarray: Symmetric ``(n, n)`` float matrix, ``n = len(sentences)``.
    """
    sentence_words = [segment_text_to_words(item, use_stopwords) for item in sentences]
    size = len(sentences)
    similarity_matrix = np.zeros((size, size))
    for i in range(size):
        words_i = sentence_words[i]
        if not words_i:
            # Row and column stay at zero for a sentence without words.
            continue
        for j in range(i + 1, size):
            words_j = sentence_words[j]
            if not words_j:
                continue
            # The normaliser uses the sizes of BOTH sentences (the previous code
            # used sentence i twice).  The epsilon avoids division by zero when
            # both sentences contain exactly one word (log 1 + log 1 == 0).
            denominator = np.log(len(words_i)) + np.log(len(words_j)) + 1e-10
            similarity = len(words_i & words_j) / denominator
            similarity_matrix[i][j] = similarity_matrix[j][i] = similarity
    return similarity_matrix

def cosine_tfidf_similarity_matrix(sentences, use_stopwords):
    """Build a sentence similarity matrix from TF-IDF cosine similarity.

    Each sentence is tokenised with ``segment_text_to_words``, the tokens are
    joined with spaces and vectorised with ``TfidfVectorizer``; the pairwise
    cosine similarities form the matrix.  The diagonal is zeroed so that a
    sentence does not vote for itself.

    Args:
        sentences: Sequence of sentence strings.
        use_stopwords: Forwarded to ``segment_text_to_words``.

    Returns:
        numpy.ndarray: ``(n, n)`` float matrix, ``n = len(sentences)``.  An
        all-zero matrix is returned when there is nothing to vectorise.
    """
    size = len(sentences)
    if size == 0:
        return np.zeros((0, 0))

    documents = [' '.join(segment_text_to_words(item, use_stopwords)) for item in sentences]

    # scikit-learn's default token_pattern only keeps tokens of two or more
    # word characters, which silently discards single-character words.
    # Accept tokens of any length instead.
    tfidf_vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
    except ValueError:
        # Raised when the vocabulary is empty (e.g. every token was filtered
        # out); no sentence is similar to any other in that case.
        return np.zeros((size, size))

    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Zero the diagonal so self-similarity does not influence the ranking.
    np.fill_diagonal(similarity_matrix, 0)
    return similarity_matrix

def summarize_text_rank(text, d=0.85, iter_num=200, top=3, method='默认方式', use_stopwords=True):
    """Produce an extractive summary of ``text`` with TextRank.

    The text is split into sentences, a sentence-similarity matrix supplies the
    edge weights, and node scores are iterated until the squared change drops
    below ``1e-10`` or ``iter_num`` iterations have run.  Progress and the
    selected sentences are printed to stdout.

    Args:
        text: The document to summarise.
        d: Damping factor of the TextRank update.
        iter_num: Maximum number of update iterations.
        top: Number of sentences to keep; negative values are treated as 0.
        method: ``'默认方式'`` for the word-overlap similarity or ``'TF-IDF'``
            for TF-IDF cosine similarity.
        use_stopwords: Forwarded to the similarity function.

    Returns:
        str: The selected sentences in document order, each followed by ``。``
        and a newline.  An empty string when the text contains no sentence.

    Raises:
        ValueError: If ``method`` is not one of the two supported values.
    """
    # Fail early and explicitly instead of hitting an unbound ``edge_weight`` later.
    if method not in ('默认方式', 'TF-IDF'):
        raise ValueError(
            "unknown method {!r}; expected '默认方式' or 'TF-IDF'".format(method)
        )

    sentences = segment_text_to_sentence(text)

    print('---------开始----------------------------------------')
    if not sentences:
        # Nothing to rank (empty or punctuation-only input).
        return ''

    if method == '默认方式':
        edge_weight = original_similarity_matrix(sentences, use_stopwords)
    else:
        edge_weight = cosine_tfidf_similarity_matrix(sentences, use_stopwords)

    # Loop-invariant: each column is divided by that sentence's total edge weight
    # (epsilon guards isolated sentences), then transposed for the update below.
    transition = (edge_weight / (edge_weight.sum(axis=-1) + 1e-10)).T

    node_weight = np.ones(len(sentences))
    iterations = 0
    converged = False
    for _ in range(iter_num):
        # TextRank update rule.
        node_weight_new = (1 - d) + d * node_weight @ transition
        iterations += 1
        if ((node_weight_new - node_weight) ** 2).sum() < 1e-10:
            converged = True
            break
        node_weight = node_weight_new

    # Track convergence explicitly: the loop index alone can never reach
    # ``iter_num``, so the previous check always reported convergence.
    if converged:
        print('迭代{}次，收敛'.format(iterations))
    else:
        print('迭代{}次，未收敛'.format(iterations))

    # A negative ``top`` would slice from the wrong end; treat it as "none".
    top = max(top, 0)
    sorted_indices = np.argsort(node_weight)[::-1]

    # Keep the best ``top`` sentences but emit them in document order.
    top_indices = sorted(int(idx) for idx in sorted_indices[:top])
    top_values = node_weight[top_indices]

    print('最大的{}个值：'.format(top), top_values)
    print('对应的索引：', top_indices)
    print('结果：')
    result = ''.join(sentences[idx] + '。\n' for idx in top_indices)
    print(result)

    return result

# # # 示例
# text = '记者7日从江苏省苏州市有关部门证实,该市一名初中副校长涉嫌泄露联考考题被停职,纪委已介入调查。今年4月,苏州市吴江区举行初三联考,震泽初级中学是联考学校之一,联考成绩将直接影响考生能否进入四星级高中(即重点高中)。就在考试前一天,考卷题目已在网上疯传,引发社会关注。据知情人士介绍,疑似震泽初级中学副校长将题目提前泄露给自己亲戚,然后这位亲戚又将题目进行了散播,导致题目最终被传到了网上。据苏州市吴江区教育局副局长沈正元介绍,泄题事件发生后,教育局于5月紧急推出了学校自主招生考试政策,以弥补泄题造成的不良后果。5月11日,教育局对涉嫌泄题违纪的震泽初级中学副校长作出停职处理,区纪委也已介入调查,教育部门会根据纪委处理意见进行相应处理。另据家长反映,泄题事件中一些“得题考生”出现在了震泽中学的保送名单里。对此,沈正元回应称,目前对涉嫌的涉事成年人进行了初步处理,纪委尚未作出明确处理意见,所以推优、自主招生等还按正常程序进行,因此出现了市民反映的疑似“得题考生”出现在名单中的情况。具体处理意见将在调查结果公布后作出,教育局将尽快进行相关调查工作,保障学生正当利益。(记者刘巍巍'
# summarize_text_rank(text)

In [17]:
# Load the first summarisation model and its tokenizer.
# Use the GPU when one is available, otherwise fall back to the CPU.
path='../checkpoint-153'  # directory that contains the model checkpoint
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): trust_remote_code=True allows transformers to run Python code
# shipped with the checkpoint — only point `path` at checkpoints you trust.
model = AutoModelForSeq2SeqLM.from_pretrained(path,trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(path,trust_remote_code=True)
# Move the model to the selected device; as the cell's last expression, the
# returned module's repr is what the notebook displays below.
model.to(device)

GLMForConditionalGeneration(
  (glm): GLMModel(
    (word_embeddings): VocabEmbedding()
    (transformer): GLMStack(
      (embedding_dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(1025, 1024)
      (block_position_embeddings): Embedding(1025, 1024)
      (layers): ModuleList(
        (0-23): 24 x GLMBlock(
          (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attention): SelfAttention(
            (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
            (attention_dropout): Dropout(p=0.1, inplace=False)
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (output_dropout): Dropout(p=0.1, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
            (dense_4h_to_h): Linear(in_fe

In [18]:
# Load the second summarisation model and its tokenizer.
# NOTE(review): the recorded output of this cell shows GLMForConditionalGeneration,
# so despite the `_mt5` suffix this checkpoint appears to be a GLM model — confirm naming.
path_mt5='../checkpoint-140'
# Use the GPU when one is available, otherwise fall back to the CPU.
device_mt5 = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): trust_remote_code=True allows transformers to run Python code
# shipped with the checkpoint — only point `path_mt5` at checkpoints you trust.
model_mt5 = AutoModelForSeq2SeqLM.from_pretrained(path_mt5,trust_remote_code=True)
tokenizer_mt5 = AutoTokenizer.from_pretrained(path_mt5,trust_remote_code=True)
# Move the model to the selected device; as the cell's last expression, the
# returned module's repr is what the notebook displays below.
model_mt5.to(device_mt5)

GLMForConditionalGeneration(
  (glm): GLMModel(
    (word_embeddings): VocabEmbedding()
    (transformer): GLMStack(
      (embedding_dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(1025, 1024)
      (block_position_embeddings): Embedding(1025, 1024)
      (layers): ModuleList(
        (0-23): 24 x GLMBlock(
          (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attention): SelfAttention(
            (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
            (attention_dropout): Dropout(p=0.1, inplace=False)
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (output_dropout): Dropout(p=0.1, inplace=False)
          )
          (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
            (dense_4h_to_h): Linear(in_fe

In [19]:
def _extract_generated_piece(decoded_text, start_marker='<|startofpiece|>', end_marker='<|endofpiece|>'):
    """Return the text between the generation markers of a decoded sequence.

    ``str.find`` returns -1 for a missing marker.  A missing start marker means
    extraction begins at the start of the string, and a missing end marker
    (e.g. generation stopped at the token limit) means it runs to the end of
    the string, so no character is lost to a ``-1`` slice bound.
    """
    start_index = decoded_text.find(start_marker)
    start_index = 0 if start_index == -1 else start_index + len(start_marker)
    end_index = decoded_text.find(end_marker, start_index)
    if end_index == -1:
        end_index = len(decoded_text)
    return decoded_text[start_index:end_index].strip()


def _generate_summary(input_text, tokenizer, model, device):
    """Run one prompt -> generate -> decode round trip and return the summary text."""
    # Prompt: instruction prefix + document + the tokenizer's mask token.
    inputs = tokenizer("摘要生成: \n" + input_text + tokenizer.mask_token, return_tensors="pt")
    inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=64)

    # Move every input tensor to the device passed by the caller.
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # do_sample=True makes the output stochastic: repeated calls may differ.
    output = model.generate(**inputs, max_new_tokens=64, eos_token_id=tokenizer.eop_token_id, do_sample=True)

    # Decode the first (only) sequence and keep just the generated piece.
    decoded_text = tokenizer.decode(output[0].tolist())
    return _extract_generated_piece(decoded_text)


def summary_glm(input_text, tokenizer=tokenizer, model=model, device=device):
    """Generate an abstractive summary of ``input_text`` with the first model.

    Args:
        input_text: The document to summarise.
        tokenizer: Tokenizer providing ``mask_token``, ``eop_token_id`` and
            ``build_inputs_for_generation``.
        model: Model whose ``generate`` method produces the summary tokens.
        device: Device the input tensors are moved to; it should be the device
            the model lives on.

    Returns:
        str: The generated text found between ``<|startofpiece|>`` and
        ``<|endofpiece|>``, stripped of surrounding whitespace.
    """
    return _generate_summary(input_text, tokenizer, model, device)


def summary_mt5(input_text, tokenizer=tokenizer_mt5, model=model_mt5, device=device_mt5):
    """Generate an abstractive summary of ``input_text`` with the second model.

    Same pipeline as ``summary_glm`` but defaulting to the ``*_mt5`` tokenizer,
    model and device.

    Args:
        input_text: The document to summarise.
        tokenizer: Tokenizer providing ``mask_token``, ``eop_token_id`` and
            ``build_inputs_for_generation``.
        model: Model whose ``generate`` method produces the summary tokens.
        device: Device the input tensors are moved to; it should be the device
            the model lives on.

    Returns:
        str: The generated text found between ``<|startofpiece|>`` and
        ``<|endofpiece|>``, stripped of surrounding whitespace.
    """
    return _generate_summary(input_text, tokenizer, model, device)
# input_text = " 近阶段苹果的诸多投资者无疑仿佛徜徉在九霄云外。根据苹果官方公布的总利润再次刷新历史，公司的股票也再创新高。此图表非常清晰的展示了苹果股票的“孤独”，其中Y轴代表这些公司目前的市值，X轴代表年增长率涨幅情况。"
# print('原文内容： '+input_text)
# summary = summary_mt5(input_text, tokenizer_mt5, model_mt5, device_mt5)
# print('\n文本摘要内容为: ' + summary)
# summary = generate_summary(input_text, tokenizer, model, device)

In [20]:
import ipywidgets as widgets
from IPython.display import display
from IPython.display import clear_output
from ipywidgets import Layout
from IPython.display import Javascript

import jieba
###################################################
# UI界面实现
###################################################

def summarize_text(change):
    """Click handler for the TextRank button.

    Reads the input box and the TextRank parameter widgets, runs
    ``summarize_text_rank`` and writes the summary to the TextRank output box.
    A falsy damping factor or sentence count falls back to 0.85 / 3.
    ``change`` is the argument supplied by the widget callback; it is unused.
    """
    rank_options = {
        'd': float(d_entry.value or 0.85),
        'top': int(top_entry.value or 3),
        'method': processing_method_dropdown.value,
        'use_stopwords': use_stopwords_checkbox.value,
    }
    output_text_widget.value = summarize_text_rank(input_text_widget.value, **rank_options)

def summarize_text_glm(change):
    """Click handler for the GLM button: summarise the input box into the GLM output box.

    ``change`` is the argument supplied by the widget callback; it is unused.
    """
    output_text_widget_glm.value = summary_glm(input_text_widget.value, tokenizer, model, device)
    
def summarize_text_mt5(change):
    """Click handler for the MT5 button: summarise the input box into the MT5 output box.

    ``change`` is the argument supplied by the widget callback; it is unused.
    """
    output_text_widget_mt5.value = summary_mt5(
        input_text_widget.value, tokenizer_mt5, model_mt5, device_mt5
    )

# Input text box: the three click handlers read the document to summarise
# from this widget's `.value`.
input_text_widget = widgets.Textarea(
    layout=Layout(width='70%', height='150px'),
    placeholder='输入文本'
)
display(input_text_widget)

# TextRank parameter controls; `summarize_text` reads their values on click.
# Checkbox: whether stop words are filtered out during tokenisation.
use_stopwords_checkbox = widgets.Checkbox(
    value=True,
    layout=Layout(width='15%'),
    description='使用停用词'
)
# Damping factor passed to summarize_text_rank as `d`.
d_entry = widgets.FloatText(
    value=0.85,
    layout=Layout(width='25%'),
    description='阻尼系数:'
)
# Number of sentences to keep, passed to summarize_text_rank as `top`.
top_entry = widgets.IntText(
    value=3,
    layout=Layout(width='25%'),
    description='摘要句数:'
)
# Similarity measure, passed to summarize_text_rank as `method`; the option
# strings must match the values that function compares against.
processing_method_dropdown = widgets.Dropdown(
    options=['默认方式', 'TF-IDF'],
    value='默认方式',
    layout=Layout(width='35%'),
    description='相似度度量:'
)
# The checkbox is shown on its own row, the remaining controls side by side.
display(use_stopwords_checkbox)
# display(use_stopwords_checkbox, d_entry, top_entry, processing_method_dropdown)
display(widgets.HBox([d_entry,top_entry, processing_method_dropdown]))

# Buttons that trigger summarisation; each one is wired to its own click handler.
summarize_button = widgets.Button(
    description='TextRank生成摘要',
    layout=Layout(width='33%')
)
summarize_button.on_click(summarize_text)

summarize_button_glm = widgets.Button(
    description='GLM生成摘要',
    layout=Layout(width='33%')
)
summarize_button_glm.on_click(summarize_text_glm)

summarize_button_mt5 = widgets.Button(
    description='MT5生成摘要',
    layout=Layout(width='33%')
)
summarize_button_mt5.on_click(summarize_text_mt5)

# Show the three buttons side by side in one row.
display(widgets.HBox([summarize_button, summarize_button_glm, summarize_button_mt5]))

# Output text boxes, one per summariser; the click handlers assign their
# result to the matching widget's `.value`.  They are created after the
# handlers are registered, which works because the handlers look the names up
# only when a button is clicked.
output_text_widget = widgets.Textarea(
    layout=Layout(width='70%', height='150px'),
    placeholder='TextRank输出文本'
)
output_text_widget_glm = widgets.Textarea(
    layout=Layout(width='70%', height='150px'),
    placeholder='GLM输出文本/nlpcc数据集模型'
)
output_text_widget_mt5 = widgets.Textarea(
    layout=Layout(width='70%', height='150px'),
    placeholder='MT5输出文本/LCSTS数据集模型'
)
display(output_text_widget)
display(output_text_widget_glm)
display(output_text_widget_mt5)

Textarea(value='', layout=Layout(height='150px', width='70%'), placeholder='输入文本')

Checkbox(value=True, description='使用停用词', layout=Layout(width='15%'))

HBox(children=(FloatText(value=0.85, description='阻尼系数:', layout=Layout(width='25%')), IntText(value=3, descri…

HBox(children=(Button(description='TextRank生成摘要', layout=Layout(width='33%'), style=ButtonStyle()), Button(des…

Textarea(value='', layout=Layout(height='150px', width='70%'), placeholder='TextRank输出文本')

Textarea(value='', layout=Layout(height='150px', width='70%'), placeholder='GLM输出文本/nlpcc数据集模型')

Textarea(value='', layout=Layout(height='150px', width='70%'), placeholder='MT5输出文本/LCSTS数据集模型')