In [2]:
#0. Imports
import spacy
from spacy import displacy
import glob
import re
from texttable import Texttable

#import local .py-Files
from preprocessing import * 
from classRuleStat import * 
from helpFunctions import * 
#import rules
from rConditions import * 
from rFull import * 
from rMinus import * 

In [5]:
#1. Configuration ---------------------------------------------------------------------------------------------------------
# The display* / printText switches are compared with "== 1" further down: 1 enables the extra output, 0 disables it.
intPaper           = 20   # Index of the example sentence to pick from the list returned by loadTXTtoList()
specificSentence   = ""   # Manually insert a sentence. If non-empty it takes precedence over intPaper
printText          = 1    # Print out the actual content of the example sentence
displayNounPhrases = 1    # Show the internal noun chunks of the example sentence
displayParseTree   = 1    # Visualize the parse tree
displayAnnotations = 1    # Get additional PoS data about every word in the sentence
pathToSentences    = ""   # NOTE(review): never used for loading in this cell; only "" is supported (see check below)

#2. Load ------------------------------------------------------------------------------------------------------------------
#Load model (the "ner" pipeline component is disabled) ---------------------
nlp = spacy.load('en_core_web_sm', disable=["ner"])
#Load the example sentences; the file that is read is decided inside loadTXTtoList()
exampleList = loadTXTtoList()

#Select the sentence to analyse. Every path either binds `text` or fails right here with a clear
#message, instead of leaving `text` undefined and failing later with a NameError.
if specificSentence:
    # A manually entered sentence always wins.
    text = specificSentence
elif pathToSentences == "":
    # Same range Python indexing accepts (negative indices included), but with a readable message.
    if not -len(exampleList) <= intPaper < len(exampleList):
        raise IndexError("intPaper={} is out of range: only {} example sentences were loaded"
                         .format(intPaper, len(exampleList)))
    text = exampleList[intPaper]
else:
    raise ValueError("pathToSentences={!r} is set, but loading sentences from a custom path is not "
                     "implemented in this cell. Leave pathToSentences empty or set specificSentence."
                     .format(pathToSentences))

#3. Preprocess ------------------------------------------------------------------------------------------------------------
# Each helper receives the current text and returns the text to continue with.
# handleRPLUSMATCH additionally returns rPlusMatches, which is not used again in this cell.
rPlusMatches, text = handleRPLUSMATCH(text)
text = whitespaceReduction(text)
text = deleteSemicolon(text)
text = checkAmountOfSentences(text)

if printText == 1:
    print(text)
doc = nlp(text)
# Drop a trailing whitespace token. The length guard avoids an IndexError on doc[-1]
# when preprocessing left an empty text and the parse therefore has no tokens.
# Note: after slicing, `doc` is a spaCy Span rather than a Doc.
if len(doc) > 0 and doc[-1].pos_ == "SPACE":
    doc = doc[0:len(doc)-1]

# Noun chunks are printed as (start, end) token-index pairs.
noun_chunks = modifyNounChunks(doc)
if displayNounPhrases == 1:
    print("Noun Phrases: {}".format(noun_chunks))

# RuleStat collects the extraction state that all rules below read and update.
output = RuleStat(noun_chunks)
output = output.checkNounChunks()

# 4. Apply Rules
# 4.1 Apply R- rules: -----------------------------------------------------------------------------------------------------
output = rMinus(doc, output)

# Print out additional information (after R-, so the noun chunks shown are the ones stored on `output`): ----------------
if displayNounPhrases == 1:
    print("Noun Phrases: {}".format(output.noun_chunks))
if displayAnnotations == 1:
    outputSentenceData(doc)
if displayParseTree == 1:
    displacy.render(doc, jupyter=True, style='dep', options={'distance': 80})

# 4.2 Apply R+ rules: -----------------------------------------------------------------------------------------------------
# Every rule is called as rule(doc, output) and returns the updated output, so the functions
# can be listed directly; no forwarding lambdas are needed.
fullFunctions = [bagOfWordsSetFull, comparativeAdjective]
findConditionsFunctions = [bagOfWordsSetCondition]
# The findAspectFunctions are special RMinus functions that are applied after the other rules for better results.
findAspectFunctions = [significantExcluder, passiveAuxiliary]  # TODO: add rootThat


def _applyRulesUntil(ruleFunctions, parDoc, parOutput, isSatisfied):
    """Apply the rules in order and return the resulting output.

    Before each rule the loop stops if isSatisfied(parOutput) is truthy or if
    parOutput.state["noun_chunks"] is falsy; otherwise the rule is called as
    rule(parDoc, parOutput) and its return value becomes the new output.
    """
    for ruleFunction in ruleFunctions:
        if isSatisfied(parOutput) or not parOutput.state["noun_chunks"]:
            break
        parOutput = ruleFunction(parDoc, parOutput)
    return parOutput


# Full rules: stop as soon as either a condition or an aspect has been found.
output = _applyRulesUntil(fullFunctions, doc, output,
                          lambda current: current.state["conditions"] or current.state["aspect"])
if output.state["noun_chunks"]:
    output = findwhWords(doc, output)

# Aspect rules only run while no aspect is set; condition rules only while no condition is set.
output = _applyRulesUntil(findAspectFunctions, doc, output,
                          lambda current: current.state["aspect"])
output = _applyRulesUntil(findConditionsFunctions, doc, output,
                          lambda current: current.state["conditions"])

# 5. Check the leftover noun chunks. If they overlap conditions or aspects remove them -----------------------------------
# NOTE(review): this runs only when state["noun_chunks"] is falsy, while the comment above talks about
# leftover chunks. The condition may be inverted; confirm against RuleStat before changing it.
if not output.state["noun_chunks"]:
    output = checkExtractionNeatness(output)

# 6. If exactly one noun chunk is left, offer to assign it -----------------------------------------------------------------
# Conditions found but no aspect  -> the remaining chunk is proposed as the aspect.
# Aspect found                    -> the remaining chunk is proposed as a (further) condition.
# Neither found                   -> nothing is proposed.
# The user confirms interactively, so this step cannot run unattended.


def _printLastChunk(parDoc, chunk):
    """Print the tokens of `chunk` on one line, terminated by a newline.

    `chunk` is a (start, end) pair of token indices; `end` is inclusive.
    """
    print("The last nounChunk is:", end = " ")
    for token in parDoc[chunk[0]:chunk[1]+1]:
        print(token.text, end = " ")
    print()


def _userConfirms(prompt):
    """Ask a y/n question; only "y" (any case, surrounding whitespace ignored) counts as yes."""
    return input(prompt).strip().lower() == "y"


if len(output.noun_chunks) == 1:
    lastChunk = output.noun_chunks[0]
    if output.state["conditions"] and not output.state["aspect"]:
        # Propose the remaining chunk as the aspect.
        _printLastChunk(doc, lastChunk)
        if _userConfirms("The last NounChunk is probably an aspect. Do you want to add it? y/n"):
            output.setaspect(lastChunk[0], lastChunk[1])
            output.deleteNoun_Chunk(lastChunk[0])
    elif output.state["aspect"]:
        # Propose the remaining chunk as a condition; the wording is weaker if conditions already exist.
        _printLastChunk(doc, lastChunk)
        if output.state["conditions"]:
            question = "It could be a Condition. Do you want to add it? y/n"
        else:
            question = "The last NounChunk is probably a Condition. Do you want to add it? y/n"
        if _userConfirms(question):
            output.setconditionBegin(lastChunk[0])
            output.setconditionEnd(lastChunk[1])
            output.deleteNoun_Chunk(lastChunk[0])

# 7. Rule: Enumeration -----------------------------------------------------------------------------------------------------
# Enumerations are only looked for once at least one condition has been extracted.
if output.state["conditions"]:
    output = checkForEnumerations(doc, output)

# 8. Report ----------------------------------------------------------------------------------------------------------------
# Without any aspect or condition there is nothing to print except the unassigned noun chunks.
if not output.state["aspect"] and not output.state["conditions"]:
    print("No Conditions found, the left over noun_chunks are: '{}'".format(output.noun_chunks))
else:
    output.printOutput(doc)

In the PfSWIB vs PfSWIB∆ comparison, the qPCR data showed a significant linear correlation with the RNA-seq data
Noun Phrases: [(0, 2), (4, 5), (7, 9), (11, 20)]
Noun Phrases: [(0, 2), (4, 5), (7, 9), (11, 20)]
+----+-----------+----------+-------+------+-----------+-----------+-----------+
| Nr | Text      | dep_     | pos_  | tag_ | depEx     | posEx     | tagEx     |
+----+-----------+----------+-------+------+-----------+-----------+-----------+
| 0  | In        | prep     | ADP   | IN   | prepositi | adpositio | conjuncti |
|    |           |          |       |      | onal      | n         | on, subor |
|    |           |          |       |      | modifier  |           | dinating  |
|    |           |          |       |      |           |           | or prepos |
|    |           |          |       |      |           |           | ition     |
+----+-----------+----------+-------+------+-----------+-----------+-----------+
| 1  | the       | det      | DET   | DT   | determine | det

bagOfWordsSetFull started
Starting comparativeAdjectives
In IN
No Relative Clause
12 [(0, 2), (4, 5), (7, 9), (11, 20)]
bagofwords: 'a significant' found
Removed (11, 20) resulting in [(0, 2), (4, 5), (7, 9)]
Removed (0, 2) resulting in [(4, 5), (7, 9)]
Removed (4, 5) resulting in [(7, 9)]
[(7, 9)]
The last nounChunk is: the qPCR data 
It could be a Condition. Do you want to add it? y/nn
Starting checkForEnumerations
Aspect from 11 to 20
 a significant linear correlation with the RNA - seq data
Condition0 from 0 to 2
 In the PfSWIB
Condition1 from 4 to 5
 PfSWIB∆ comparison
1 Noun_Phrases not assigned:
(7, 9): the qPCR data  
