In [1]:
# Necessary imports

from A_DataCollectors.ScientificLiteratureCollector.scientific_literature_collector import ScientificLiteratureCollector
from A_DataCollectors.ForumCollector.forum_collector import ForumCollector
from A_DataCollectors.ForumCollector.forum_application import ForumApplication
from B_DataProcessors.text_preprocessor import TextPreprocessor
from C_Analyzers.Summarization.extractive_summarizer import ExtractiveSummarizer
from C_Analyzers.Summarization.functions import RelevanceScores
from C_Analyzers.Relation_Extraction.relation_extractor import RelationExtractor
from C_Analyzers.Sentiment_Analysis.sentiment_analyzer import SentimentAnalyzer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_d

In [2]:
# Collect data from scientific article
path = "https://arxiv.org/pdf/1903.10318.pdf"

collector = ScientificLiteratureCollector(path)
text = collector.collect('pdf', 'url' if path.startswith('http') else 'local', 'scipy')

print(text)

{'title': 'Fine-tune BERT for Extractive Summarization', 'authors': 'Yang Liu', 'pub_date': '', 'abstract': ', a pre-trained Transformer (Vaswani et al., 2017) model, has achieved ground-breaking performance on multiple NLP tasks. In this paper, we describe BERTSUM, a simple variant of BERT, for extractive summarization. Our system is the state of the art on the CNN/Dailymail dataset, outperforming the previous best-performed system by 1.65 on ROUGE-L. The codes to reproduce our results are available at https://github. com/nlpyang/BertSum', 'sections': [{'heading': 'Introduction', 'text': 'Single-document summarization is the task of automatically generating a shorter version of a document while retaining its most important information. The task has received much attention in the natural language processing community due to its potential for various information access applications. Examples include tools which digest textual content (e.g., news, social media, reviews), answer questions



In [3]:
# Preprocess text from the article

preprocessor = TextPreprocessor(steps=['clean_data', 'split_sentences'])
preprocessed_text = preprocessor.preprocess_grobid(text)

print(preprocessed_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{'title': ['Fine-tune BERT for Extractive Summarization'], 'Introduction': ['Single-document summarization is the task of automatically generating a shorter version of a document while retaining its most important information.', 'The task has received much attention in the natural language processing community due to its potential for various information access applications.', 'Examples include tools which digest textual content (e.g., news, social media, reviews), answer questions, or provide recommendations.', 'The task is often divided into two paradigms, abstractive summarization and extractive summarization.', 'In abstractive summarization, target summaries contains words or phrases that were not in the original text and usually require various text rewriting operations to generate, while extractive approaches form summaries by copying and concatenating the most important spans (usually sentences) in a document.', 'In this paper, we focus on extractive summarization.', 'Although m

In [20]:
# Create a dictionary to hold sentences with their headers
sentences_with_headers = {}
for header, sentences in preprocessed_text.items():
    for sentence in sentences:
        sentences_with_headers[sentence] = header

# Consolidate all sentences in one list
all_sentences = list(sentences_with_headers.keys())

# Summarize data
es = ExtractiveSummarizer(all_sentences)
summary = es.summarize('position_textrank', top_n=15, order_by_rank=False)
print(summary)
print(type(summary))

# filter out sentences less than four words long
summary = [sentence for sentence in summary.split('. ') if len(sentence.split()) >= 4]

# Retrieve headers for each sentence in the summary
summary_with_headers = [(sentences_with_headers[sentence], sentence) for sentence in summary]


Single-document summarization is the task of automatically generating a shorter version of a document while retaining its most important information. The task has received much attention in the natural language processing community due to its potential for various information access applications. Examples include tools which digest textual content (e.g., news, social media, reviews), answer questions, or provide recommendations. In abstractive summarization, target summaries contains words or phrases that were not in the original text and usually require various text rewriting operations to generate, while extractive approaches form summaries by copying and concatenating the most important spans (usually sentences) in a document. As illustrated in the table, all BERT-based models outperformed previous state-of-the-art models by a large margin. Ablation studies are conducted to show the contribution of different components of BERTSUM. The results are shown in in Table 2. Interval segmen

KeyError: 'Single-document summarization is the task of automatically generating a shorter version of a document while retaining its most important information'

In [12]:
# Collect data from forum discussion

discussion_link = "https://forums.space.com/threads/constellations-space-travel.29641/"
message_class = "message-content js-messageContent"
message_text_class = "bbWrapper"
message_author_class = "username"
pagination_class = "pageNav-main"

collector = ForumCollector(name='name', base_url='base', description='desc', category='cat')
app = ForumApplication(collector)
collected_messages = app.collect_messages_by_discussion_link(
            discussion_link=discussion_link,
            message_class=message_class,
            full_message_class=False,
            pagination_class=pagination_class,
            message_text_class=message_text_class,
            message_author_class=message_author_class,
            store_in_dict=False,  # Here's the second change
            return_messages=True
        )

print(collected_messages)

1
https://forums.space.com/threads/constellations-space-travel.29641/
2
https://forums.space.com/threads/constellations-space-travel.29641/page-2
3
https://forums.space.com/threads/constellations-space-travel.29641/page-3
4
{1: {'text': 'How far can we travel through space, until the constellations are no longer recognizable?', 'author': '', 'discussion_link': 'https://forums.space.com/threads/constellations-space-travel.29641/'}, 2: {'text': 'That is a tricky one.  I have often seen diagrams of how constellations change in time, especially The Plough (Big Dipper) and IIRC these suggest several thousand years at least.  I guess the closer the stars involved the smaller distance you would have to travel, whereas very distant objects would probably appear pretty fixed.\n\nIf I find anything further, I will post it.\n\nCat', 'author': '', 'discussion_link': 'https://forums.space.com/threads/constellations-space-travel.29641/'}, 3: {'text': "Hey, I just found the following. Hope it helps:\

In [13]:
# Preprocess text from the forum discussion

summarization_steps = ['clean_data', 'split_sentences']
preprocessor = TextPreprocessor(summarization_steps)
preprocessed_text = preprocessor.preprocess_forum_discussion(collected_messages)

print(preprocessed_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


{1: ['How far can we travel through space, until the constellations are no longer recognizable?'], 2: ['That is a tricky one.', 'I have often seen diagrams of how constellations change in time, especially The Plough (Big Dipper) and IIRC these suggest several thousand years at least.', 'I guess the closer the stars involved the smaller distance you would have to travel, whereas very distant objects would probably appear pretty fixed.If I find anything further, I will post it.Cat'], 3: ['Hey, I just found the following.', 'Hope it helps:Richard Adkins, Amateur astronomer for over half a century.Answered Aug 21, 2015The apparent location of stars within the field of view of an observer traveling through interstellar space would change with distance as the traveler got further and further from earth.', 'The constellations, as we have defined them are, are made up of objects in vastly different positions in three dimensional space.', 'Some are comparably close to us and others are extremel

In [10]:
# Analyze text from the forum discussion
from spacy.lang.en.stop_words import STOP_WORDS

# Summarize data
new_dict = {}
for header, messages in preprocessed_text.items():
    es = ExtractiveSummarizer(messages)
    summary = es.summarize('relevance_scores', top_n=5, order_by_rank=False)

    # filter out messages less than four words long
    summary = '. '.join(message for message in summary if len(message.split()) >= 3)

    new_dict[header] = summary

relevance_scores = RelevanceScores()
top_messages = relevance_scores.select_top_messages(new_dict, 3, STOP_WORDS)
result = top_messages  # Replace the result with the top messages

print(result)

[(1, 'How far can we travel through space, until the constellations are no longer recognizable?'), (5, 'At the velocities at which we would be traveling,any changes in the appearance of most of the constellations would be negligible.. Knowing the distances to the nearest stars, whether or not they were part of the constellations, would be much more useful in verifying the location of the traveler.'), (6, 'How big is our universe and then if I was outside it could I even see our universes.. if I was looking at our systems it would be a generate mapping system depending upon the species technology and or systems that engages propulsion.. maps are extremely important in space , being able to see everything can be the difference between turning up safe and or hitting something an hoping the force field gets it.')]
