In [4]:
import sys
sys.path.append('C:\\Users\\noudy\\PycharmProjects\\Cassidy\\application')

from A_DataCollectors.ScientificLiteratureCollector.scientific_literature_collector import ScientificLiteratureCollector
from C_DataProcessors.text_preprocessor import TextPreprocessor
from D_Analyzers.Summarization.extractive_summarizer import ExtractiveSummarizer
from D_Analyzers.Relation_Extraction.relation_extractor import RelationExtractor
from D_Analyzers.Sentiment_Analysis.sentiment_analyzer import SentimentAnalyzer


class ScientificLiteratureAnalyzer:
    """Run text analyses on a single scientific article.

    The article is collected through ``ScientificLiteratureCollector`` every
    time an analysis method is called; nothing is cached between calls.
    """

    # Method names that ``analyze`` is allowed to dispatch to.
    FUNCTIONALITIES = ('summarize', 'relation_extractor')

    def __init__(self, path):
        """Store the article location and the settings handed to the collector.

        Args:
            path: Location of the article. A value starting with ``'http'``
                is treated as a URL, anything else as a local file.
        """
        self.path = path
        self.format = 'pdf'
        self.source_type = 'url' if path.startswith('http') else 'local'
        self.method = 'scipy'

    def analyze(self, functionality, preprocessing_steps=None):
        """Run the analysis method called ``functionality``.

        Args:
            functionality: Name of the analysis; must be one of
                ``FUNCTIONALITIES``.
            preprocessing_steps: Optional list of preprocessing step names
                forwarded to the analysis method. ``None`` means no steps.

        Returns:
            Whatever the selected analysis method returns.

        Raises:
            AttributeError: If ``functionality`` is not a supported analysis.
        """
        # Dispatch only to known analyses so that arbitrary attributes
        # (``analyze`` itself, dunder methods, plain data fields) cannot be
        # invoked by name.
        if functionality not in self.FUNCTIONALITIES:
            raise AttributeError(
                f"{type(self).__name__} has no analysis named {functionality!r}; "
                f"expected one of {self.FUNCTIONALITIES}"
            )
        return getattr(self, functionality)(self._resolve_steps(preprocessing_steps))

    def summarize(self, preprocessing_steps=None):
        """Summarize every section of the article with TextRank.

        Args:
            preprocessing_steps: Optional list of preprocessing step names
                passed to ``TextPreprocessor``. ``None`` means no steps.

        Returns:
            dict mapping each key of the preprocessed text to a summary
            string built from the three selected sentences.
        """
        text = self._collect_text()

        # Preprocess data
        preprocessor = TextPreprocessor(self._resolve_steps(preprocessing_steps))
        sections = preprocessor.preprocess_grobid(text)

        # Summarize data
        summaries = {}
        for header, sentences in sections.items():
            summarizer = ExtractiveSummarizer(sentences)
            summary = summarizer.summarize('textrank', top_n=3, order_by_rank=False)

            # Drop fragments of fewer than three words.
            summaries[header] = '. '.join(
                fragment for fragment in summary.split('. ')
                if len(fragment.split()) >= 3
            )

        return summaries

    def relation_extractor(self, preprocessing_steps=None):
        """Extract co-occurrence relations from the whole article.

        The text is first cleaned and concatenated into one string, then
        preprocessed a second time with the caller-supplied steps.

        Args:
            preprocessing_steps: Optional list of preprocessing step names
                for the second preprocessing pass. ``None`` means no steps.

        Returns:
            The result of ``RelationExtractor.extract('co_occurrence')``.
        """
        text = self._collect_text()

        # First pass: clean every section and merge them into one text.
        cleaner = TextPreprocessor(['clean_data'])
        cleaned_sections = cleaner.preprocess_grobid(text)
        cleaned_text = cleaner.concatenate_sections_grobid(cleaned_sections)

        # Second pass: apply the caller-supplied steps to the merged text.
        relation_preprocessor = TextPreprocessor(self._resolve_steps(preprocessing_steps))
        prepared_text = relation_preprocessor.preprocess_string(cleaned_text)

        # Analyze the data
        extractor = RelationExtractor(prepared_text)
        return extractor.extract('co_occurrence')

    def _collect_text(self):
        """Collect the article text using the settings stored on the instance."""
        collector = ScientificLiteratureCollector(self.path)
        return collector.collect(self.format, self.source_type, self.method)

    @staticmethod
    def _resolve_steps(preprocessing_steps):
        """Return the given steps unchanged, or a fresh empty list for ``None``."""
        return [] if preprocessing_steps is None else preprocessing_steps


In [14]:
from A_DataCollectors.ForumCollector.forum_collector import ForumCollector
from A_DataCollectors.ForumCollector.forum_application import ForumApplication
from C_DataProcessors.text_preprocessor import TextPreprocessor
from D_Analyzers.Sentiment_Analysis.sentiment_analyzer import SentimentAnalyzer
from D_Analyzers.Relation_Extraction.relation_extractor import RelationExtractor


class ForumAnalyzer:
    """Collect the messages of one forum discussion and analyse them.

    The messages are collected once, in the constructor, and reused by every
    analysis method.
    """

    # Method names that ``analyze`` is allowed to dispatch to.
    FUNCTIONALITIES = ('relation_extractor', 'sentiment_analysis')

    def __init__(self, discussion_link, message_class, full_message_class, pagination_class, message_text_class, message_author_class):
        """Store the scraping settings and collect the discussion's messages.

        All arguments are stored on the instance and forwarded unchanged to
        ``ForumApplication.collect_messages_by_discussion_link``.

        Args:
            discussion_link: Link to the discussion to collect.
            message_class: Forwarded as ``message_class``.
            full_message_class: Forwarded as ``full_message_class``.
            pagination_class: Forwarded as ``pagination_class``.
            message_text_class: Forwarded as ``message_text_class``.
            message_author_class: Forwarded as ``message_author_class``.
        """
        self.discussion_link = discussion_link
        self.message_class = message_class
        self.full_message_class = full_message_class
        self.pagination_class = pagination_class
        self.message_text_class = message_text_class
        self.message_author_class = message_author_class
        # NOTE(review): the collector is built with placeholder metadata;
        # confirm these values are not used downstream.
        self.collector = ForumCollector(name='name', base_url='base', description='desc', category='cat')
        self.app = ForumApplication(self.collector)
        self.collected_messages = self.app.collect_messages_by_discussion_link(
            discussion_link=self.discussion_link,
            message_class=self.message_class,
            full_message_class=self.full_message_class,
            pagination_class=self.pagination_class,
            message_text_class=self.message_text_class,
            message_author_class=self.message_author_class,
            store_in_dict=False,
            return_messages=True
        )

    def analyze(self, functionality, preprocessing_steps=None):
        """Run the analysis method called ``functionality``.

        Args:
            functionality: Name of the analysis; must be one of
                ``FUNCTIONALITIES``.
            preprocessing_steps: Optional list of preprocessing step names
                forwarded to the analysis method. ``None`` means no steps.

        Returns:
            Whatever the selected analysis method returns.

        Raises:
            AttributeError: If ``functionality`` is not a supported analysis.
        """
        # Dispatch only to known analyses so that arbitrary attributes
        # (``analyze`` itself, dunder methods, plain data fields) cannot be
        # invoked by name.
        if functionality not in self.FUNCTIONALITIES:
            raise AttributeError(
                f"{type(self).__name__} has no analysis named {functionality!r}; "
                f"expected one of {self.FUNCTIONALITIES}"
            )
        return getattr(self, functionality)(self._resolve_steps(preprocessing_steps))

    def relation_extractor(self, preprocessing_steps=None):
        """Extract co-occurrence relations from the collected messages.

        The messages are first cleaned and concatenated into one text, then
        preprocessed a second time with the caller-supplied steps.

        Args:
            preprocessing_steps: Optional list of preprocessing step names
                for the second preprocessing pass. ``None`` means no steps.

        Returns:
            The result of ``RelationExtractor.extract('co_occurrence')``.
        """
        # First pass: clean the messages and merge them into one text.
        cleaner = TextPreprocessor(['clean_data'])
        cleaned_messages = cleaner.preprocess_forum_discussion(self.collected_messages)
        # NOTE(review): this helper is named for GROBID output but is reused
        # here for forum messages — confirm it is meant to be shared.
        cleaned_text = cleaner.concatenate_sections_grobid(cleaned_messages)

        # Second pass: apply the caller-supplied steps to the merged text.
        relation_preprocessor = TextPreprocessor(self._resolve_steps(preprocessing_steps))
        prepared_text = relation_preprocessor.preprocess_string(cleaned_text)

        # Analyze the data
        extractor = RelationExtractor(prepared_text)
        return extractor.extract('co_occurrence')

    def sentiment_analysis(self, preprocessing_steps=None):
        """Run TextBlob sentiment analysis on each preprocessed message entry.

        Args:
            preprocessing_steps: Optional list of preprocessing step names
                passed to ``TextPreprocessor``. ``None`` means no steps.

        Returns:
            dict mapping each key of the preprocessed discussion to the
            result of ``SentimentAnalyzer.analyze('textblob_analysis')``.
        """
        preprocessor = TextPreprocessor(self._resolve_steps(preprocessing_steps))
        preprocessed_messages = preprocessor.preprocess_forum_discussion(self.collected_messages)

        sentiments = {}
        for header, text in preprocessed_messages.items():
            sentiments[header] = SentimentAnalyzer(text).analyze('textblob_analysis')

        return sentiments

    @staticmethod
    def _resolve_steps(preprocessing_steps):
        """Return the given steps unchanged, or a fresh empty list for ``None``."""
        return [] if preprocessing_steps is None else preprocessing_steps


In [17]:
# Discussion to analyse, plus the class-name settings handed to ForumAnalyzer
# (presumably HTML class names used to locate each part of the page).
discussion_link = 'https://forums.space.com/threads/constellations-space-travel.29641/'
message_class = 'message-content js-messageContent'
full_message_class = False
pagination_class = 'pageNav-main'
message_text_class = 'bbWrapper'
message_author_class = 'username'

# Constructing the analyzer collects the discussion's messages.
forum_analyzer = ForumAnalyzer(
    discussion_link=discussion_link,
    message_class=message_class,
    full_message_class=full_message_class,
    pagination_class=pagination_class,
    message_text_class=message_text_class,
    message_author_class=message_author_class,
)

# Run relation extraction with the full preprocessing pipeline.
result = forum_analyzer.analyze(
    'relation_extractor',
    preprocessing_steps=[
        'clean_data',
        'case_folding',
        'split_sentences',
        'tokenize_sentences',
        'pos_tagging',
        'filter_pos_tagged',
    ],
)

print(result)

1
https://forums.space.com/threads/constellations-space-travel.29641/
2
https://forums.space.com/threads/constellations-space-travel.29641/page-2
3
https://forums.space.com/threads/constellations-space-travel.29641/page-3
4
[['space', 'constellations'], [], ['i', 'diagrams', 'constellations', 'time', 'plough', 'dipper', 'years'], ['i', 'stars', 'distance', 'objects', 'anything', 'i', 'it.cat'], ['hey', 'i', 'following'], ['hope', 'richard', 'adkins', 'astronomer', 'aug', 'twenty-one', 'thousand', 'location', 'stars', 'field', 'view', 'observer', 'space', 'distance', 'traveler', 'earth'], ['constellations', 'objects', 'positions', 'space'], ['others', 'constellation'], ['shape', 'constellation'], ['alpha', 'centauri', 'star', 'star', 'part', 'constellation', 'centaurus'], ['anything', 'light', 'years', 'constellation', 'appearance', 'centaurus', 'program', 'stars', "'any", 'position', 'space', 'constellations'], ['see', 'celestia', 'home'], ['link', 'celestia', 'homecelestia', 'space', 

In [4]:
# Article to analyse (local PDF).
path = 'C:/Users/noudy/Downloads/s10791-016-9286-2.pdf'

# Extract relations without any extra preprocessing steps and show them.
relation = ScientificLiteratureAnalyzer(path)
result = relation.relation_extractor()

print(result)



[('term', 'terms'), ('background', 'collection'), ('terms', 'terms'), ('collection', 'terms'), ('g', 't'), ('t', 't'), ('t', 'terms'), ('collection', 'collection'), ('collection', 'term'), ('g', 'g')]


In [2]:
# Location of the article to summarize: either a URL or a local file path.
path = "C:\\Users\\noudy\\Downloads\\1-s2.0-S0749596X09001247-main.pdf"

# Summarize every section without extra preprocessing steps.
summarizer = ScientificLiteratureAnalyzer(path)
result = summarizer.summarize()

# Print each section as an underlined heading followed by its summary.
for section, summary in result.items():
    heading = 'Section: ' + section
    underline = '=' * len(heading)
    print('\n'.join(['', heading, underline, f"Summary: {summary}", '']))




Section: title
Summary: How cross-language similarity and task demands affect cognate recognition.


Section: Introduction
Summary: Consider the following text (after Peter Verstegen): Drink gin in restaurant, whiskey in hotel, champagne in bed. Later effect: Oh God, migraine. Tablet in warm water!.


Section: The cognate facilitation effect
Summary: In many reaction time (RT) studies, involving a variety of experimental paradigms, cognates were responded to faster than control words that exist in only one language. Over the years, several theoretical accounts have been proposed. The task dependence of cognate effects and the related predictions of the four accounts will be considered before presenting Experiment two..


Section: Four positions on cognate representation and processing
Summary: one, for a proposal). one, panel d). The various model predictions are summarized in Table one..


Section: The cognate facilitation effect and cross-linguistic similarity
Summary: Dijkstra et a

In [5]:
# Path or link to a scientific article in PDF format.
path = "C:\\Users\\noudy\\Downloads\\1-s2.0-S0749596X09001247-main.pdf"

# Preprocessing steps, applied to the text in this order.
preprocessing = [
    'clean_data',
    'split_sentences',
]

# Name of the analysis to run.
functionality = 'summarize'

summarizer = ScientificLiteratureAnalyzer(path)
result = summarizer.analyze(functionality, preprocessing)

# Print each section as an underlined heading followed by its summary.
for section, summary in result.items():
    heading = 'Section: ' + section
    underline = '=' * len(heading)
    print('\n'.join(['', heading, underline, f"Summary: {summary}", '']))





Section: title
Summary: How cross-language similarity and task demands affect cognate recognition.


Section: Introduction
Summary: Consider the following text (after Peter Verstegen): Drink gin in restaurant, whiskey in hotel, champagne in bed. Later effect: Oh God, migraine. Tablet in warm water!.


Section: The cognate facilitation effect
Summary: In many reaction time (RT) studies, involving a variety of experimental paradigms, cognates were responded to faster than control words that exist in only one language. Over the years, several theoretical accounts have been proposed. The task dependence of cognate effects and the related predictions of the four accounts will be considered before presenting Experiment two..


Section: Four positions on cognate representation and processing
Summary: one, for a proposal). one, panel d). The various model predictions are summarized in Table one..


Section: The cognate facilitation effect and cross-linguistic similarity
Summary: Dijkstra et a