In [None]:
# Necessary imports

from A_DataCollectors.ScientificLiteratureCollector.scientific_literature_collector import ScientificLiteratureCollector
from A_DataCollectors.ForumCollector.forum_collector import ForumCollector
from A_DataCollectors.ForumCollector.forum_application import ForumApplication
from B_DataProcessors.text_preprocessor import TextPreprocessor
from C_Analyzers.Summarization.extractive_summarizer import ExtractiveSummarizer
from C_Analyzers.Summarization.functions import RelevanceScores
from C_Analyzers.Relation_Extraction.relation_extractor import RelationExtractor
from C_Analyzers.Sentiment_Analysis.sentiment_analyzer import SentimentAnalyzer

In [None]:
# Fetch the raw text of the scientific article.
# `path` may be an http(s) URL or a location on the local filesystem.
path = "https://arxiv.org/pdf/1903.10318.pdf"

# Tell the collector where the document lives, judging by the path's prefix.
if path.startswith('http'):
    source_location = 'url'
else:
    source_location = 'local'

# NOTE(review): the meaning of the third argument ('scipy') is defined by
# ScientificLiteratureCollector.collect — confirm against that class.
collector = ScientificLiteratureCollector(path)
text = collector.collect('pdf', source_location, 'scipy')

print(text)

In [None]:
# Run the 'clean_data' and 'split_sentences' steps over the article text.

article_steps = ['clean_data', 'split_sentences']
preprocessor = TextPreprocessor(steps=article_steps)

# The result is consumed by the next cell as a mapping of header -> sentences.
preprocessed_text = preprocessor.preprocess_grobid(text)

print(preprocessed_text)

In [None]:
# Summarize the article

# Flatten the per-header sentence collections into a single list,
# following the mapping's iteration order.
all_sentences = [
    sentence
    for section_sentences in preprocessed_text.values()
    for sentence in section_sentences
]

# Extractive summary over the whole article
es = ExtractiveSummarizer(all_sentences)
summary = es.summarize('position_textrank', top_n=15, order_by_rank=False)

# Split the summary on '. ' and keep only the pieces with at least four words
summary_parts = summary.split('. ')
summary = [part for part in summary_parts if len(part.split()) >= 4]

print(summary)

In [None]:
# Collect the messages of one forum discussion thread

discussion_link = "https://forums.space.com/threads/constellations-space-travel.29641/"

# Class names handed to the collector (presumably HTML/CSS classes on the
# forum page — verify against ForumApplication).
message_class = "message-content js-messageContent"
message_text_class = "bbWrapper"
message_author_class = "username"
pagination_class = "pageNav-main"

# NOTE(review): name/base_url/description/category look like placeholder
# values — confirm whether the collector needs real metadata here.
collector = ForumCollector(
    name='name',
    base_url='base',
    description='desc',
    category='cat',
)
app = ForumApplication(collector)

collected_messages = app.collect_messages_by_discussion_link(
    discussion_link=discussion_link,
    # class names
    message_class=message_class,
    message_text_class=message_text_class,
    message_author_class=message_author_class,
    pagination_class=pagination_class,
    # flags
    full_message_class=False,
    store_in_dict=False,
    return_messages=True,
)

print(collected_messages)

In [None]:
# Preprocess text from the forum discussion
#
# NOTE: this cell rebinds `preprocessor` and `preprocessed_text`, which the
# article cells also use. After running it, re-run the article preprocessing
# cell before re-running the article analysis cell.

# Pass the step list by keyword, matching how the article cell constructs its
# TextPreprocessor, so the call does not depend on positional parameter order.
summarization_steps = ['clean_data', 'split_sentences']
preprocessor = TextPreprocessor(steps=summarization_steps)
preprocessed_text = preprocessor.preprocess_forum_discussion(collected_messages)

print(preprocessed_text)

In [None]:
# Analyze text from the forum discussion
from spacy.lang.en.stop_words import STOP_WORDS

# Tunable parameters for this cell
SUMMARY_TOP_N = 5        # `top_n` passed to summarize() for each header
MIN_MESSAGE_WORDS = 3    # messages with fewer whitespace-separated words are dropped
TOP_MESSAGES_COUNT = 3   # second argument passed to select_top_messages()

# Summarize data: build one '. '-joined summary string per header
new_dict = {}
for header, messages in preprocessed_text.items():
    es = ExtractiveSummarizer(messages)
    summary = es.summarize('relevance_scores', top_n=SUMMARY_TOP_N, order_by_rank=False)

    # The article cell gets a single string back from summarize() and splits it
    # on '. '. If that happens for this method too, split it the same way:
    # iterating over a string would yield single characters, and the word-count
    # filter below would then discard every one of them.
    if isinstance(summary, str):
        summary = summary.split('. ')

    # Filter out messages fewer than MIN_MESSAGE_WORDS words long
    summary = '. '.join(
        message for message in summary if len(message.split()) >= MIN_MESSAGE_WORDS
    )

    new_dict[header] = summary

relevance_scores = RelevanceScores()
top_messages = relevance_scores.select_top_messages(new_dict, TOP_MESSAGES_COUNT, STOP_WORDS)
result = top_messages  # alias kept so code that reads `result` keeps working

print(result)