In [1]:
import sys
sys.path.append('C:\\Users\\noudy\\PycharmProjects\\Cassidy\\Cassidy\\application')

from A_DataCollectors.ScientificLiteratureCollector.scientific_literature_collector import ScientificLiteratureCollector
from C_DataProcessors.text_preprocessor import TextPreprocessor
from D_Analyzers.Summarization.extractive_summarizer import ExtractiveSummarizer
from D_Analyzers.Relation_Extraction.relation_extractor import RelationExtractor


class ScientificLiteratureSummarizer:
    """Run the summarization or relation-extraction pipeline on one PDF paper.

    Both pipelines start by collecting the document text through
    ``ScientificLiteratureCollector`` and then hand it to ``TextPreprocessor``
    with a pipeline-specific list of steps.

    Attributes set by ``__init__``:
        path: the document location exactly as given by the caller.
        format: always ``'pdf'``.
        source_type: ``'url'`` when ``path`` begins with an http(s) scheme,
            otherwise ``'local'``.
        method: always ``'scipy'``; passed to the collector unchanged.
    """

    # Prefixes that mark ``path`` as a remote document. Matching the full
    # ``scheme://`` (rather than the bare word "http") keeps local files such
    # as ``http_notes.pdf`` from being mistaken for URLs.
    _URL_PREFIXES = ('http://', 'https://')

    def __init__(self, path):
        """Store the document location and derive how it must be fetched.

        Args:
            path: URL or filesystem path of the PDF, as a string.
        """
        self.path = path
        self.format = 'pdf'
        # URL schemes are case-insensitive, so compare on a lowered copy.
        is_url = path.lower().startswith(self._URL_PREFIXES)
        self.source_type = 'url' if is_url else 'local'
        self.method = 'scipy'

    def _collect_text(self):
        """Fetch the raw document content using the instance's settings."""
        collector = ScientificLiteratureCollector(self.path)
        return collector.collect(self.format, self.source_type, self.method)

    @staticmethod
    def _drop_short_sentences(summary, min_words):
        """Return ``summary`` without the sentences shorter than ``min_words`` words.

        Sentences are delimited by ``'. '`` and re-joined with the same
        separator; words are whitespace-separated tokens.
        """
        return '. '.join(
            sentence
            for sentence in summary.split('. ')
            if len(sentence.split()) >= min_words
        )

    def summarize(self, *, top_n=3, min_words=3):
        """Build a per-section extractive summary of the document.

        Args:
            top_n: number of sentences requested from the summarizer for each
                section (default 3, the value previously hard-coded).
            min_words: sentences with fewer words than this are removed from
                each summary (default 3, the threshold previously hard-coded).

        Returns:
            dict mapping each section header to its filtered summary string.
        """
        # Collect data
        text = self._collect_text()

        # Preprocess data
        summarization_steps = ['clean_data', 'split_sentences']
        preprocessor = TextPreprocessor(summarization_steps)
        sections = preprocessor.preprocess_grobid(text)

        # Summarize data
        summaries = {}
        for header, sentences in sections.items():
            summarizer = ExtractiveSummarizer(sentences)
            summary = summarizer.summarize('textrank', top_n=top_n, order_by_rank=False)

            # Keep only sentences that are at least ``min_words`` words long.
            summaries[header] = self._drop_short_sentences(summary, min_words)

        return summaries

    def relation_extractor(self):
        """Extract co-occurrence relations from the whole document.

        Returns:
            Whatever ``RelationExtractor.extract('co_occurrence')`` returns.
        """
        # Collect data
        text = self._collect_text()

        # Preprocess data: clean each section, then merge the sections so the
        # second preprocessing pass can work on a single input.
        relation_steps = ['clean_data']
        preprocessor = TextPreprocessor(relation_steps)
        preprocessed_text = preprocessor.preprocess_grobid(text)
        preprocessed_text = preprocessor.concatenate_sections_grobid(preprocessed_text)

        relation_extraction_steps = [
            'clean_data',
            'case_folding',
            'split_sentences',
            'tokenize_sentences',
            'pos_tagging',
            'filter_pos_tagged',
        ]
        relation_preprocessor = TextPreprocessor(relation_extraction_steps)
        new_text = relation_preprocessor.preprocess_string(preprocessed_text)

        # Analyze the data
        extractor = RelationExtractor(new_text)
        return extractor.extract('co_occurrence')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\noudy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Document to analyze. The path does not start with 'http', so the class
# marks it as a local source.
path = "C:/Users/noudy/Downloads/1-s2.0-S0749596X09001247-main.pdf"

# NOTE: despite the variable name, `relation` is a
# ScientificLiteratureSummarizer instance; relation_extractor() is the method
# that runs the 'co_occurrence' relation extraction on the document.
relation = ScientificLiteratureSummarizer(path)
result = relation.relation_extractor()

# The captured output for this cell shows a list of (term, term) pairs.
print(result)



[('thousand', 'thousand'), ('decision', 'language'), ('cognates', 'effect'), ('effect', 'similarity'), ('cognates', 'similarity'), ('kroll', 'thousand'), ('frequency', 'similarity'), ('cognate', 'effect'), ('cognates', 'decision'), ('frequency', 'word')]


In [4]:
# Insert your document path here (either URL or direct path)
path = "C:/Users/noudy/Downloads/s10791-016-9286-2.pdf"

# summarize() returns a dict keyed by section header, with the section's
# summary text as the value.
summarizer = ScientificLiteratureSummarizer(path)
result = summarizer.summarize()

# Print every section as a "Section: <header>" heading, a row of '='
# characters as long as that heading, and then the section's summary.
for key, value in result.items():
    print(f"\nSection: {key}\n{'=' * len('Section: ' + key)}\nSummary: {value}\n")




Section: title
Summary: Evaluation and analysis of term scoring methods for term extraction.


Section: Introduction
Summary: 


Section: Our approach
Summary: We start by explaining our approach before discussing the term scoring literature and methodology, because understanding the general work flow of our experiments helps understanding the purpose of the term scoring methods we implemented.Our approach comprises four steps:one. We do not to apply filtering for partof-speech patterns because it cannot be known in advance which POS-patterns are relevant for the collection. For example, for some domains we might only be interested in noun phrases as terms, while for another domain verb phrases are important too..


Section: Scoring all candidate terms
Summary: We implemented the methods described in Sects three.two and three.three.


Section: 3.
Summary: Ranking the terms by their score.Depending on the context in which the terms are used, a top-k of the ranked list is returned.


Se