In [17]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from scipy.sparse import csr_matrix
from datasets import load_from_disk, Dataset, DatasetDict
import os
from multiprocessing import Pool

# 下载必要的 NLTK 数据
nltk.download("punkt")
nltk.download("stopwords")

def preprocess_text(text):
    """
    对文本进行预处理，包括分句、分词、去停词、小写化。
    """
    # 分句
    sentences = sent_tokenize(text)

    # 分词、去停词、小写化
    stop_words = set(stopwords.words("english"))
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())  # 小写化
        words = [word for word in words if word.isalnum() and word not in stop_words]  # 去停词和非字母数字
        preprocessed_sentences.append(" ".join(words))
    return sentences, preprocessed_sentences

def textrank_summary(text, num_sentences=6):
    """
    使用 TextRank 生成摘要。
    :param text: 输入文本
    :param num_sentences: 摘要中包含的句子数量
    """
    # 1. 文本预处理
    original_sentences, preprocessed_sentences = preprocess_text(text)

    if len(original_sentences) <= num_sentences:
        return " ".join(original_sentences)  # 如果句子数量不足，直接返回原句子

    # 2. 计算句子之间的稀疏相似性矩阵
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # 稀疏化相似性矩阵
    threshold = 0.05
    sparse_matrix = csr_matrix(similarity_matrix * (similarity_matrix > threshold))

    # 3. 构建图并计算句子重要性
    try:
        nx_graph = nx.from_scipy_sparse_array(sparse_matrix)
    except AttributeError:
        nx_graph = nx.from_scipy_sparse_matrix(sparse_matrix)

    scores = nx.pagerank(nx_graph)

    # 4. 根据重要性排序句子并生成摘要
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(original_sentences)), reverse=True)
    summary = " ".join([s for _, s in ranked_sentences[:num_sentences]])

    return summary

def process_sample(sample):
    """
    处理单个样本：生成摘要并保留原始数据。
    """
    text = sample["text"]
    abstract = sample["abstract"]
    summary = textrank_summary(text, num_sentences=4)
    return {"text": text, "abstract": abstract, "generated_summary": summary}




[nltk_data] Downloading package punkt to
[nltk_data]     /media/user/volume2/students/s124md209_01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /media/user/volume2/students/s124md209_01/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
text = "Nick Clegg was heckled over his broken tuition fees pledge today as he ramped up his attack on George Osborne   with a stark warning that the Tory Chancellor is 'a very dangerous man'. The Deputy Prime Minister was met by protesters in Surbiton, south-west London, who chanted: 'Nick Clegg lied to me, he said uni would be free'. Mr Clegg ignored the protesters as he stuck to his message that only the Lib Dems can be trusted to balance the budget without hitting the poor. Scroll down for video . Nick Clegg was heckled by tuition fees protesters while out campaigning in Surbiton, South West London, today - while Lib Dem supporters tried to intervene . Nick Clegg was in Kingston and Surbiton to campaign with fellow Cabinet member, Secretary of State for Energy and Climate Change, Ed Davey today . The Lib Dem leader's campaign stop came after he launched a furious attack on the 'ideological' cuts planned by the Tories. Speaking to Tony Blair's former spin doctor Alastair Campbell in May's edition of GQ magazine, Mr Clegg said the chancellor's plans would do 'so much damage'. He added: 'I don't know of a developed economy that wants to do something as rigidly ideological as he wants to do, to balance the books through public spending reductions alone, not tax, with one section, the working poor, taking the biggest hit. 'I find it socially and morally unacceptable, but also economically a disaster.' The 'dramatic lurch to the right' involves a 'harder approach than anything the arch-Thatcherites would do' and would 'destroy public services', he said. Mr Clegg added: 'George Osborne is a very dangerous man with a very dangerous plan, and I will do everything in my power to stop it.' In contrast, Mr Clegg said the Prime Minister was 'a classic traditional shire Tory', adding: 'I can live with that.' He said Mr Cameron was 'very much a Tory, and in that tradition he is not too much about grand vision'. The Deputy Prime Minister launched a furious attack on the 'ideological' cuts planned by the Tories . Mr Clegg met seven-month-old Elise Popperwell and her mother Melissa Popperwell in South West London this morning . The Deputy Prime Minister said David Cameron was 'a classic traditional shire Tory', adding: 'I can live with that.' Asked about Ed Miliband, Mr Clegg said he was a 'perfectly nice guy, personable, as is David Cameron'. The Lib Dem leader said he wanted his party back in government 'because Osborne's ideological assault on public services has to be stopped' and 'Ed Miliband and Ed Balls' head-in-the-sand approach to public spending has to be challenged, because it would be really bad for the economy'. Treasury Secretary Danny Alexander meanwhile claimed that a top Tory told him 'you take care of the workers and we'll take care of the bosses', in a private Whitehall meeting to discuss the Coalition Government's priorities. Mr Alexander said it showed that David Cameron and George Osborne were guilty of 'breathtaking hypocrisy' for trying to claim responsibility for raising the threshold at which people pay tax - while privately arguing for tax cuts for the better-off. He said: 'In March 2012, when the economy was still in the very early stages of recovery, we wanted to do a very big increase in the personal allowance to put a lot of money back into folks' pockets. The Tories' priority at the time was the top rate of tax. 'I remember one meeting with a group of senior Conservatives and one of them said, 'listen, you take care of the workers and we'll take care of the bosses'."


In [21]:
result=process_sample({"text": text, "abstract": "This is a test."})

print("示例摘要:", result["generated_summary"])

示例摘要: In contrast, Mr Clegg said the Prime Minister was 'a classic traditional shire Tory', adding: 'I can live with that.' The Deputy Prime Minister said David Cameron was 'a classic traditional shire Tory', adding: 'I can live with that.' The Deputy Prime Minister was met by protesters in Surbiton, south-west London, who chanted: 'Nick Clegg lied to me, he said uni would be free'. The Deputy Prime Minister launched a furious attack on the 'ideological' cuts planned by the Tories .


In [None]:

def remove_location_and_author(text):
    # Remove location/author patterns at the start of the text, e.g., "LONDON, England (Reuters) -- "
    pattern_location = r'^[A-Z\s,]+\([A-Za-z]+\)\s*--\s*'
    text = re.sub(pattern_location, '', text).strip()

    # Remove patterns like "(CNN)" or "(Reuters)"
    pattern_source = r'\([A-Za-z\s]+\)'
    text = re.sub(pattern_source, '', text).strip()

    # Remove patterns like "By . James Chapman for the Daily Mail ."
    pattern_byline = r'^By\s\.\s.*?\.'
    text = re.sub(pattern_byline, '', text).strip()

    # Remove newlines and replace with spaces
    text = re.sub(r'\n+', ' ', text).strip()

    return text