In [51]:
import spacy
import neuralcoref

# Load the 'en_core_web_sm' spaCy pipeline with the "ner" component disabled.
nlp = spacy.load('en_core_web_sm', disable=["ner"])
# Attach neuralcoref to the pipeline. NOTE(review): presumably this is what
# provides the `_.in_coref` / `_.coref_clusters` attributes read further down
# in parse_by_subject and get_statements -- confirm against neuralcoref docs.
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f899a075970>

In [52]:
def get_branch(t, sent, include_self=True):
    """Collect the lemmas and tags of the dependency subtree below ``t``.

    The subtree is gathered with ``recurse`` (all descendants of ``t``) and
    then re-read in the order the tokens appear in ``sent``; tokens that are
    not part of ``sent`` are left out.

    Parameters
    ----------
    t : token
        Head of the branch; must expose ``children``.
    sent : iterable of tokens
        The sentence (or doc) that fixes the output order. Its tokens must
        expose ``lemma_`` and ``tag_`` and be hashable.
    include_self : bool, default True
        Whether ``t`` itself belongs to the branch.

    Returns
    -------
    (list of str, list of str)
        Lower-cased lemmas and the matching ``tag_`` values, index-aligned.

    Notes
    -----
    Lemmas containing a digit or any of ``. , : ; -`` are dropped together
    with their tag. NOTE(review): because ``'-'`` is in that set, the
    ``-PRON-`` lemma seen in this notebook's outputs is dropped as well,
    which is why pronoun subjects show an empty ``subject_branch`` there --
    confirm that this is intended.
    """
    # A set makes the membership test below O(1) per token instead of a
    # linear scan of a list for every token of the sentence.
    members = set(recurse(t))
    if include_self:
        members.add(t)

    lemmas = []
    tags = []

    # Walk the sentence, not the traversal result, so output is in word order.
    for token in sent:
        if token not in members:
            continue
        lemma = token.lemma_.lower()
        if any(char.isdigit() for char in lemma):
            continue
        if any(punc in lemma for punc in ('.', ',', ':', ';', '-')):
            continue
        lemmas.append(lemma)
        tags.append(token.tag_)

    return lemmas, tags

def recurse(*tokens):
    """Return every descendant of the given tokens, depth-first.

    Each argument must expose a ``children`` iterable. The starting tokens
    themselves are not included. Descendants are listed in pre-order: a child
    is followed by its whole subtree before its next sibling, and the results
    for several starting tokens are concatenated in argument order.
    """
    descendants = []
    for start in tokens:
        # Explicit stack; children are pushed reversed so the first child is
        # popped (and therefore visited) first.
        pending = list(start.children)[::-1]
        while pending:
            node = pending.pop()
            descendants.append(node)
            pending.extend(list(node.children)[::-1])
    return descendants

In [53]:
from collections import defaultdict

# Dependency labels that mark a token as the subject of a statement.
subdeps = [
    'nsubj',
    'nsubjpass',
    'expl',
]

# Broader list of "main" dependency labels.
# NOTE(review): not referenced by any code visible in this part of the
# notebook -- confirm whether it is still needed.
maindeps = [
    'nsubj',
    'nsubjpass',
    'expl',
    'advmod',
    'dobj',
    'prep',
    'xcomp',
    'dative',
    'advcl',
    'agent',
    'ccomp',
    'acomp',
    'attr',
]

def parse_by_subject(sent, resolve_corefs=True):
    """Break a parsed sentence into one record per subject/verb statement.

    Every token of ``sent`` whose ``dep_`` is listed in ``subdeps`` is taken
    as the subject of a statement. The statement is kept only when the
    subject's head has a ``tag_`` starting with ``'V'``.

    Parameters
    ----------
    sent : iterable of tokens
        Parsed sentence or doc. Tokens must expose ``dep_``, ``tag_``,
        ``lemma_``, ``orth_``, ``text``, ``head`` and ``children``.
    resolve_corefs : bool, default True
        When true, ``subject._.in_coref`` and ``subject._.coref_clusters``
        are read. If the subject is in a cluster and its lemma differs from
        the root lemma of the first cluster's main mention, that lemma is
        reported as both ``'subject'`` and ``'slem'``.

    Returns
    -------
    list of dict
        One dict per statement, with keys in this order: ``orig_subject``,
        ``orig_slem``, ``in_coref``, ``subject``, ``slem``,
        ``coref_replaced``, ``modal``, ``neg``, ``verb``, ``passive``,
        ``md``, ``subject_branch``, ``subject_tags``, ``object_branches``,
        ``object_tags``, ``full_statement`` (always the empty string here).
    """
    # Children of the verb with these labels never contribute to a statement.
    ignored_deps = ('punct', 'cc', 'det', 'meta', 'intj', 'dep')

    statements = []

    # Each subject stands for one statement, so this loops over statements.
    for subject in sent:
        if subject.dep_ not in subdeps:
            continue

        verb = subject.head
        if not verb.tag_.startswith('V'):
            continue

        modal = None
        negation = ''
        verb_lemma = verb.lemma_
        children_by_dep = defaultdict(list)

        for child in verb.children:
            # Modal auxiliaries are recorded apart from the other children;
            # if there are several, the last one wins.
            if child.tag_ == 'MD':
                modal = child.orth_.lower()
                continue
            child_dep = child.dep_
            if child_dep in ignored_deps:
                continue
            if child_dep == 'neg':
                negation = 'not'
            elif child_dep == 'prt':
                # Verb particles are glued onto the verb lemma.
                verb_lemma = '_'.join((verb_lemma, child.orth_.lower()))
            else:
                children_by_dep[child_dep].append(child)

        subject_lemma = subject.lemma_
        resolved_subject = subject.text
        resolved_lemma = subject_lemma
        in_coref = False
        coref_replaced = False
        if resolve_corefs:
            in_coref = subject._.in_coref
            if in_coref:
                # Only the first cluster the subject belongs to is consulted.
                main_lemma = subject._.coref_clusters[0].main.root.lemma_
                if main_lemma != subject_lemma:
                    resolved_subject = main_lemma
                    resolved_lemma = main_lemma
                    coref_replaced = True

        record = {
            'orig_subject': subject.text,
            'orig_slem': subject_lemma,
            'in_coref': in_coref,
            'subject': resolved_subject,
            'slem': resolved_lemma,
            'coref_replaced': coref_replaced,
            'modal': modal,
            'neg': negation,
            'verb': verb_lemma,
            'passive': 1 if subject.dep_ == 'nsubjpass' else 0,
            'md': 0 if modal is None else 1,
        }

        record['subject_branch'], record['subject_tags'] = get_branch(subject, sent)

        # Every remaining non-subject child of the verb contributes one branch;
        # branches are grouped by dependency label in first-seen order.
        object_branches = []
        object_tags = []
        for child_dep, members in children_by_dep.items():
            if child_dep in subdeps:
                continue
            for member in members:
                branch_lemmas, branch_tags = get_branch(member, sent)
                object_branches.append(branch_lemmas)
                object_tags.append(branch_tags)
        record['object_branches'] = object_branches
        record['object_tags'] = object_tags

        record['full_statement'] = ""

        statements.append(record)

    return statements

In [54]:
def get_statements(art_nlp, contract_id, art_num, args):
    """Extract statement records from every sentence of a parsed article.

    Parameters
    ----------
    art_nlp : parsed document
        Must expose ``sents`` and, when coreference is enabled,
        ``_.coref_clusters``.
    contract_id, art_num
        Identifiers copied unchanged into each record as ``'contract_id'``
        and ``'article_num'``.
    args : object
        Only ``args.use_neural_coref`` is read. NOTE(review): presumably an
        argparse namespace -- confirm against the caller.

    Returns
    -------
    list of dict
        The records produced by ``parse_by_subject`` for each sentence, each
        extended with ``contract_id``, ``article_num``, ``sentence_num``,
        ``statement_num`` and ``full_sentence``.

    Sentences containing a whitespace-separated word that is entirely upper
    case and longer than three characters are skipped, but they still consume
    a ``sentence_num``.
    """
    # Workaround noted by the author for buggy neural coref: only resolve
    # coreferences when the doc actually carries cluster information.
    any_corefs = (art_nlp._.coref_clusters is not None) if args.use_neural_coref else False

    statement_list = []
    for sentence_num, sent in enumerate(art_nlp.sents):
        sentence_text = str(sent)
        if any(len(word) > 3 and word.isupper() for word in sentence_text.split()):
            # Don't parse this sentence
            continue

        parsed = parse_by_subject(sent, resolve_corefs=any_corefs)
        for statement_num, statement_data in enumerate(parsed):
            statement_list.append({
                **statement_data,
                'contract_id': contract_id,
                'article_num': art_num,
                'sentence_num': sentence_num,
                'statement_num': statement_num,
                'full_sentence': sentence_text,
            })
    return statement_list

## Tests for Parsing

In [55]:
# Two coordinated clauses; the second verb ("drinks") has no auxiliary.
sentence = "The boy is eating an apple, and the girl drinks juice."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'boy', 'orig_slem': 'boy', 'in_coref': False, 'subject': 'boy', 'slem': 'boy', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'eat', 'passive': 0, 'md': 0, 'subject_branch': ['the', 'boy'], 'subject_tags': ['DT', 'NN'], 'object_branches': [['be'], ['an', 'apple'], ['the', 'girl', 'drink', 'juice']], 'object_tags': [['VBZ'], ['DT', 'NN'], ['DT', 'NN', 'NNS', 'NN']], 'full_statement': ''}
The det DT DET [] [boy, eating]
boy nsubj NN NOUN [The] [eating]
is aux VBZ AUX [] [eating]
eating ROOT VBG VERB [boy, is, apple, ,, and, drinks] []
an det DT DET [] [apple, eating]
apple dobj NN NOUN [an] [eating]
, punct , PUNCT [] [eating]
and cc CC CCONJ [] [eating]
the det DT DET [] [girl, drinks, eating]
girl nsubj NN NOUN [the] [drinks, eating]
drinks conj NNS NOUN [girl, juice, .] [eating]
juice dobj NN NOUN [] [drinks, eating]
. punct . PUNCT [] [drinks, eating]


In [56]:
# Active clause with the modal "must".
sentence = "Employees must have at least one break during the working day."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'Employees', 'orig_slem': 'employee', 'in_coref': False, 'subject': 'Employees', 'slem': 'employee', 'coref_replaced': False, 'modal': 'must', 'neg': '', 'verb': 'have', 'passive': 0, 'md': 1, 'subject_branch': ['employee'], 'subject_tags': ['NNS'], 'object_branches': [['at', 'least', 'one', 'break', 'during', 'the', 'work', 'day']], 'object_tags': [['RB', 'RBS', 'CD', 'NN', 'IN', 'DT', 'VBG', 'NN']], 'full_statement': ''}
Employees nsubj NNS NOUN [] [have]
must aux MD VERB [] [have]
have ROOT VB AUX [Employees, must, break, .] []
at advmod RB ADV [] [least, one, break, have]
least advmod RBS ADV [at] [one, break, have]
one nummod CD NUM [least] [break, have]
break dobj NN NOUN [one, during] [have]
during prep IN ADP [day] [break, have]
the det DT DET [] [day, during, break, have]
working amod VBG VERB [] [day, during, break, have]
day pobj NN NOUN [the, working] [during, break, have]
. punct . PUNCT [] [have]


In [57]:
# Sentence containing a date and an amount (tokens with digits).
sentence = "The minimum salary in effect as of April 1, 2010 will be 952.20 per month."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'salary', 'orig_slem': 'salary', 'in_coref': False, 'subject': 'salary', 'slem': 'salary', 'coref_replaced': False, 'modal': 'will', 'neg': '', 'verb': 'be', 'passive': 0, 'md': 1, 'subject_branch': ['the', 'minimum', 'salary', 'in', 'effect', 'as', 'of', 'april'], 'subject_tags': ['DT', 'JJ', 'NN', 'IN', 'NN', 'IN', 'IN', 'NNP'], 'object_branches': [['per', 'month']], 'object_tags': [['IN', 'NN']], 'full_statement': ''}
The det DT DET [] [salary, be]
minimum amod JJ ADJ [] [salary, be]
salary nsubj NN NOUN [The, minimum, in, as] [be]
in prep IN ADP [effect] [salary, be]
effect pobj NN NOUN [] [in, salary, be]
as prep IN SCONJ [of] [salary, be]
of prep IN ADP [April] [as, salary, be]
April pobj NNP PROPN [1, ,, 2010] [of, as, salary, be]
1 nummod CD NUM [] [April, of, as, salary, be]
, punct , PUNCT [] [April, of, as, salary, be]
2010 nummod CD NUM [] [April, of, as, salary, be]
will aux MD VERB [] [be]
be ROOT VB AUX [salary, will, 952.20, .] []
952.20 attr CD NUM [pe

In [58]:
# Subject followed by parenthetical adjectives and a relative clause.
sentence = "Every employee, qualified or unskilled, who permanently replaces a professional, must have his/her professional card classified."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'employee', 'orig_slem': 'employee', 'in_coref': False, 'subject': 'employee', 'slem': 'employee', 'coref_replaced': False, 'modal': 'must', 'neg': '', 'verb': 'have', 'passive': 0, 'md': 1, 'subject_branch': ['every', 'employee', 'qualified', 'or', 'unskilled', 'who', 'permanently', 'replace', 'a', 'professional'], 'subject_tags': ['DT', 'NN', 'JJ', 'CC', 'JJ', 'WP', 'RB', 'VBZ', 'DT', 'NN'], 'object_branches': [['/', 'professional', 'card', 'classify']], 'object_tags': [['SYM', 'JJ', 'NN', 'VBD']], 'full_statement': ''}
{'orig_subject': 'who', 'orig_slem': 'who', 'in_coref': False, 'subject': 'who', 'slem': 'who', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'replace', 'passive': 0, 'md': 0, 'subject_branch': ['who'], 'subject_tags': ['WP'], 'object_branches': [['permanently'], ['a', 'professional']], 'object_tags': [['RB'], ['DT', 'NN']], 'full_statement': ''}
{'orig_subject': 'card', 'orig_slem': 'card', 'in_coref': False, 'subject': 'card', 'slem': '

**TODO:** May need to include `appos` (appositional modifier) dependents?

In [59]:
# Fronted prepositional phrase, then a subject modified by "referred to ...".
sentence = "In the cases of supervisory and command positions, the term referred to in item 1 above will be 90 days."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'term', 'orig_slem': 'term', 'in_coref': False, 'subject': 'term', 'slem': 'term', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'refer', 'passive': 0, 'md': 0, 'subject_branch': ['the', 'term'], 'subject_tags': ['DT', 'NN'], 'object_branches': [['in', 'the', 'case', 'of', 'supervisory', 'and', 'command', 'position'], ['to'], ['in', 'item'], ['above']], 'object_tags': [['IN', 'DT', 'NNS', 'IN', 'JJ', 'CC', 'NN', 'NNS'], ['IN'], ['IN', 'NN'], ['RB']], 'full_statement': ''}
In prep IN ADP [cases] [referred, be]
the det DT DET [] [cases, In, referred, be]
cases pobj NNS NOUN [the, of] [In, referred, be]
of prep IN ADP [positions] [cases, In, referred, be]
supervisory amod JJ ADJ [and, command] [positions, of, cases, In, referred, be]
and cc CC CCONJ [] [supervisory, positions, of, cases, In, referred, be]
command conj NN NOUN [] [supervisory, positions, of, cases, In, referred, be]
positions pobj NNS NOUN [supervisory] [of, cases, In, referred, be]
, punct , 

In [60]:
# Two coordinated clauses, each with its own auxiliary.
sentence = "The boy is eating an apple and the girl is drinking juice."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'boy', 'orig_slem': 'boy', 'in_coref': False, 'subject': 'boy', 'slem': 'boy', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'eat', 'passive': 0, 'md': 0, 'subject_branch': ['the', 'boy'], 'subject_tags': ['DT', 'NN'], 'object_branches': [['be'], ['an', 'apple'], ['the', 'girl', 'be', 'drink', 'juice']], 'object_tags': [['VBZ'], ['DT', 'NN'], ['DT', 'NN', 'VBZ', 'VBG', 'NN']], 'full_statement': ''}
{'orig_subject': 'girl', 'orig_slem': 'girl', 'in_coref': False, 'subject': 'girl', 'slem': 'girl', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'drink', 'passive': 0, 'md': 0, 'subject_branch': ['the', 'girl'], 'subject_tags': ['DT', 'NN'], 'object_branches': [['be'], ['juice']], 'object_tags': [['VBZ'], ['NN']], 'full_statement': ''}
The det DT DET [] [boy, eating]
boy nsubj NN NOUN [The] [eating]
is aux VBZ AUX [] [eating]
eating ROOT VBG VERB [boy, is, apple, and, drinking] []
an det DT DET [] [apple, eating]
apple dobj NN NOUN [an] [eating]
an

**TODO:** Should `obj` dependents be included?

In [61]:
# Passive clause with a modal and a negation.
sentence = "Replacement employees may not be entitled to a pension."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'employees', 'orig_slem': 'employee', 'in_coref': False, 'subject': 'employees', 'slem': 'employee', 'coref_replaced': False, 'modal': 'may', 'neg': 'not', 'verb': 'entitle', 'passive': 1, 'md': 1, 'subject_branch': ['replacement', 'employee'], 'subject_tags': ['NN', 'NNS'], 'object_branches': [['be'], ['to', 'a', 'pension']], 'object_tags': [['VB'], ['IN', 'DT', 'NN']], 'full_statement': ''}
Replacement compound NN NOUN [] [employees, entitled]
employees nsubjpass NNS NOUN [Replacement] [entitled]
may aux MD VERB [] [entitled]
not neg RB PART [] [entitled]
be auxpass VB AUX [] [entitled]
entitled ROOT VBN VERB [employees, may, not, be, to, .] []
to prep IN ADP [pension] [entitled]
a det DT DET [] [pension, to, entitled]
pension pobj NN NOUN [a] [to, entitled]
. punct . PUNCT [] [entitled]


In [62]:
# Pronoun subject with two stacked auxiliaries.
sentence = "I have been reading a book."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'I', 'orig_slem': '-PRON-', 'in_coref': False, 'subject': 'I', 'slem': '-PRON-', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'read', 'passive': 0, 'md': 0, 'subject_branch': [], 'subject_tags': [], 'object_branches': [['have'], ['be'], ['a', 'book']], 'object_tags': [['VBP'], ['VBN'], ['DT', 'NN']], 'full_statement': ''}
I nsubj PRP PRON [] [reading]
have aux VBP AUX [] [reading]
been aux VBN AUX [] [reading]
reading ROOT VBG VERB [I, have, been, book, .] []
a det DT DET [] [book, reading]
book dobj NN NOUN [a] [reading]
. punct . PUNCT [] [reading]


In [63]:
# Plain passive with a "by" agent.
sentence = "The book was read by her."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'book', 'orig_slem': 'book', 'in_coref': False, 'subject': 'book', 'slem': 'book', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'read', 'passive': 1, 'md': 0, 'subject_branch': ['the', 'book'], 'subject_tags': ['DT', 'NN'], 'object_branches': [['be'], ['by']], 'object_tags': [['VBD'], ['IN']], 'full_statement': ''}
The det DT DET [] [book, read]
book nsubjpass NN NOUN [The] [read]
was auxpass VBD AUX [] [read]
read ROOT VBN VERB [book, was, by, .] []
by agent IN ADP [her] [read]
her pobj PRP PRON [] [by, read]
. punct . PUNCT [] [read]


In [64]:
# Passive with a modal and a "by" agent.
sentence = "The book must be read by her."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'book', 'orig_slem': 'book', 'in_coref': False, 'subject': 'book', 'slem': 'book', 'coref_replaced': False, 'modal': 'must', 'neg': '', 'verb': 'read', 'passive': 1, 'md': 1, 'subject_branch': ['the', 'book'], 'subject_tags': ['DT', 'NN'], 'object_branches': [['be'], ['by']], 'object_tags': [['VB'], ['IN']], 'full_statement': ''}
The det DT DET [] [book, read]
book nsubjpass NN NOUN [The] [read]
must aux MD VERB [] [read]
be auxpass VB AUX [] [read]
read ROOT VBN VERB [book, must, be, by, .] []
by agent IN ADP [her] [read]
her pobj PRP PRON [] [by, read]
. punct . PUNCT [] [read]


In [65]:
# Passive with a modal, a negation and a perfect auxiliary.
sentence = "The book should not have been read by her."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'book', 'orig_slem': 'book', 'in_coref': False, 'subject': 'book', 'slem': 'book', 'coref_replaced': False, 'modal': 'should', 'neg': 'not', 'verb': 'read', 'passive': 1, 'md': 1, 'subject_branch': ['the', 'book'], 'subject_tags': ['DT', 'NN'], 'object_branches': [['have'], ['be'], ['by']], 'object_tags': [['VB'], ['VBN'], ['IN']], 'full_statement': ''}
The det DT DET [] [book, read]
book nsubjpass NN NOUN [The] [read]
should aux MD VERB [] [read]
not neg RB PART [] [read]
have aux VB AUX [] [read]
been auxpass VBN AUX [] [read]
read ROOT VBN VERB [book, should, not, have, been, by, .] []
by agent IN ADP [her] [read]
her pobj PRP PRON [] [by, read]
. punct . PUNCT [] [read]


In [66]:
# Passive permission with a "for ..." phrase.
sentence = "A break is allowed for employees."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'break', 'orig_slem': 'break', 'in_coref': False, 'subject': 'break', 'slem': 'break', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'allow', 'passive': 1, 'md': 0, 'subject_branch': ['a', 'break'], 'subject_tags': ['DT', 'NN'], 'object_branches': [['be'], ['for', 'employee']], 'object_tags': [['VBZ'], ['IN', 'NNS']], 'full_statement': ''}
A det DT DET [] [break, allowed]
break nsubjpass NN NOUN [A] [allowed]
is auxpass VBZ AUX [] [allowed]
allowed ROOT VBN VERB [break, is, for, .] []
for prep IN ADP [employees] [allowed]
employees pobj NNS NOUN [] [for, allowed]
. punct . PUNCT [] [allowed]


Possible changes?

In [67]:
# Passive prohibition followed by "from" + gerund.
sentence = "Employees are prohibited from eating in the workplace."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'Employees', 'orig_slem': 'employee', 'in_coref': False, 'subject': 'Employees', 'slem': 'employee', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'prohibit', 'passive': 1, 'md': 0, 'subject_branch': ['employee'], 'subject_tags': ['NNS'], 'object_branches': [['be'], ['from', 'eat', 'in', 'the', 'workplace']], 'object_tags': [['VBP'], ['IN', 'VBG', 'IN', 'DT', 'NN']], 'full_statement': ''}
Employees nsubjpass NNS NOUN [] [prohibited]
are auxpass VBP AUX [] [prohibited]
prohibited ROOT VBN VERB [Employees, are, from, .] []
from prep IN ADP [eating] [prohibited]
eating pcomp VBG VERB [in] [from, prohibited]
in prep IN ADP [workplace] [eating, from, prohibited]
the det DT DET [] [workplace, in, eating, from, prohibited]
workplace pobj NN NOUN [the] [in, eating, from, prohibited]
. punct . PUNCT [] [prohibited]


In [68]:
# Same modal sentence as an earlier cell, repeated here for comparison.
sentence = "Employees must have at least one break during the working day."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'Employees', 'orig_slem': 'employee', 'in_coref': False, 'subject': 'Employees', 'slem': 'employee', 'coref_replaced': False, 'modal': 'must', 'neg': '', 'verb': 'have', 'passive': 0, 'md': 1, 'subject_branch': ['employee'], 'subject_tags': ['NNS'], 'object_branches': [['at', 'least', 'one', 'break', 'during', 'the', 'work', 'day']], 'object_tags': [['RB', 'RBS', 'CD', 'NN', 'IN', 'DT', 'VBG', 'NN']], 'full_statement': ''}
Employees nsubj NNS NOUN [] [have]
must aux MD VERB [] [have]
have ROOT VB AUX [Employees, must, break, .] []
at advmod RB ADV [] [least, one, break, have]
least advmod RBS ADV [at] [one, break, have]
one nummod CD NUM [least] [break, have]
break dobj NN NOUN [one, during] [have]
during prep IN ADP [day] [break, have]
the det DT DET [] [day, during, break, have]
working amod VBG VERB [] [day, during, break, have]
day pobj NN NOUN [the, working] [during, break, have]
. punct . PUNCT [] [have]


ROOT IN CONDITIONAL?

In [69]:
# Long sentence with a modal, a negation, a relative clause and clock times.
sentence = "This system will not apply to hours worked with a night shift that cover the period from 10:00 pm of one day to 5:00 am of the following day."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'system', 'orig_slem': 'system', 'in_coref': False, 'subject': 'system', 'slem': 'system', 'coref_replaced': False, 'modal': 'will', 'neg': 'not', 'verb': 'apply', 'passive': 0, 'md': 1, 'subject_branch': ['this', 'system'], 'subject_tags': ['DT', 'NN'], 'object_branches': [['to', 'hour'], ['work', 'with', 'a', 'night', 'shift', 'that', 'cover', 'the', 'period', 'from', 'pm', 'of', 'one', 'day', 'to', 'am', 'of', 'the', 'follow', 'day']], 'object_tags': [['IN', 'NNS'], ['VBN', 'IN', 'DT', 'NN', 'NN', 'WDT', 'VBP', 'DT', 'NN', 'IN', 'NN', 'IN', 'CD', 'NN', 'IN', 'NN', 'IN', 'DT', 'VBG', 'NN']], 'full_statement': ''}
{'orig_subject': 'that', 'orig_slem': 'that', 'in_coref': False, 'subject': 'that', 'slem': 'that', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'cover', 'passive': 0, 'md': 0, 'subject_branch': ['that'], 'subject_tags': ['WDT'], 'object_branches': [['the', 'period', 'from', 'pm', 'of', 'one', 'day'], ['to', 'am', 'of', 'the', 'follow', 'day']]

In [70]:
# Long sentence with an "if there are ..." clause and all-caps party names.
sentence = "In the event of termination of the employment contract, whatever the cause, if there are hours to the EMPLOYEE's credit, these will be paid by the EMPLOYER as overtime, plus the additional ones provided for in the Collective Bargaining Agreement together with the other severance pay."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'there', 'orig_slem': 'there', 'in_coref': False, 'subject': 'there', 'slem': 'there', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'be', 'passive': 0, 'md': 0, 'subject_branch': ['there'], 'subject_tags': ['EX'], 'object_branches': [['if'], ['hour', 'to', 'the', 'employee', "'s", 'credit']], 'object_tags': [['IN'], ['NNS', 'IN', 'DT', 'NNP', 'POS', 'NN']], 'full_statement': ''}
{'orig_subject': 'these', 'orig_slem': 'these', 'in_coref': False, 'subject': 'these', 'slem': 'these', 'coref_replaced': False, 'modal': 'will', 'neg': '', 'verb': 'pay', 'passive': 1, 'md': 1, 'subject_branch': ['these'], 'subject_tags': ['DT'], 'object_branches': [['in', 'the', 'event', 'of', 'termination', 'of', 'the', 'employment', 'contract'], ['as', 'overtime'], ['if', 'there', 'be', 'hour', 'to', 'the', 'employee', "'s", 'credit'], ['be'], ['by', 'the', 'employer'], ['the', 'additional', 'one', 'provide', 'for', 'in', 'the', 'collective', 'bargaining', 'agreement', 'togeth

In [71]:
# "need to" followed by a passive infinitive.
sentence = "Employees need to be guaranteed certain rights."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'Employees', 'orig_slem': 'employee', 'in_coref': False, 'subject': 'Employees', 'slem': 'employee', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'need', 'passive': 0, 'md': 0, 'subject_branch': ['employee'], 'subject_tags': ['NNS'], 'object_branches': [['to', 'be', 'guarantee', 'certain', 'right']], 'object_tags': [['TO', 'VB', 'VBN', 'JJ', 'NNS']], 'full_statement': ''}
Employees nsubj NNS NOUN [] [need]
need ROOT VBP VERB [Employees, guaranteed, .] []
to aux TO PART [] [guaranteed, need]
be auxpass VB AUX [] [guaranteed, need]
guaranteed xcomp VBN VERB [to, be, rights] [need]
certain amod JJ ADJ [] [rights, guaranteed, need]
rights dobj NNS NOUN [certain] [guaranteed, need]
. punct . PUNCT [] [need]


In [76]:
# "It is ensured ..." opening, with the guaranteed thing at the end of the sentence.
sentence = "It is also ensured, for employees, retirees and pensioners linked to them, the current system of granting and funding of medicines."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'It', 'orig_slem': '-PRON-', 'in_coref': False, 'subject': 'It', 'slem': '-PRON-', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'ensure', 'passive': 1, 'md': 0, 'subject_branch': ['the', 'current', 'system', 'of', 'granting', 'and', 'funding', 'of', 'medicine'], 'subject_tags': ['DT', 'JJ', 'NN', 'IN', 'NN', 'CC', 'NN', 'IN', 'NNS'], 'object_branches': [['be'], ['also'], ['for', 'employee', 'retiree', 'and', 'pensioner', 'link', 'to']], 'object_tags': [['VBZ'], ['RB'], ['IN', 'NNS', 'NNS', 'CC', 'NNS', 'VBN', 'IN']], 'full_statement': ''}
It nsubjpass PRP PRON [system] [ensured]
is auxpass VBZ AUX [] [ensured]
also advmod RB ADV [] [ensured]
ensured ROOT VBN VERB [It, is, also, ,, for, ,, .] []
, punct , PUNCT [] [ensured]
for prep IN ADP [employees] [ensured]
employees pobj NNS NOUN [,, retirees, linked] [for, ensured]
, punct , PUNCT [] [employees, for, ensured]
retirees conj NNS NOUN [and, pensioners] [employees, for, ensured]
and cc CC CCONJ [] [retir

In [73]:
# Coordinated list of subjects with a passive verb.
sentence = "Doctors, nurses, and all other employees are permitted vacation time."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'Doctors', 'orig_slem': 'doctor', 'in_coref': False, 'subject': 'Doctors', 'slem': 'doctor', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'permit', 'passive': 1, 'md': 0, 'subject_branch': ['doctor', 'nurse', 'and', 'all', 'other', 'employee'], 'subject_tags': ['NNS', 'NNS', 'CC', 'DT', 'JJ', 'NNS'], 'object_branches': [['be'], ['vacation', 'time']], 'object_tags': [['VBP'], ['NN', 'NN']], 'full_statement': ''}
Doctors nsubjpass NNS NOUN [,, nurses] [permitted]
, punct , PUNCT [] [Doctors, permitted]
nurses conj NNS NOUN [,, and, employees] [Doctors, permitted]
, punct , PUNCT [] [nurses, Doctors, permitted]
and cc CC CCONJ [] [nurses, Doctors, permitted]
all det DT DET [] [employees, nurses, Doctors, permitted]
other amod JJ ADJ [] [employees, nurses, Doctors, permitted]
employees conj NNS NOUN [all, other] [nurses, Doctors, permitted]
are auxpass VBP AUX [] [permitted]
permitted ROOT VBN VERB [Doctors, are, time, .] []
vacation compound NN NOUN [] [time

In [74]:
# "It is prohibited for X to ..." construction.
sentence = "It is prohibited for apprentices and interns to work overtime beyond the daily schedule."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'It', 'orig_slem': '-PRON-', 'in_coref': False, 'subject': 'It', 'slem': '-PRON-', 'coref_replaced': False, 'modal': None, 'neg': '', 'verb': 'prohibit', 'passive': 1, 'md': 0, 'subject_branch': [], 'subject_tags': [], 'object_branches': [['be'], ['for', 'apprentice', 'and', 'intern'], ['to', 'work', 'overtime', 'beyond', 'the', 'daily', 'schedule']], 'object_tags': [['VBZ'], ['IN', 'NNS', 'CC', 'NNS'], ['TO', 'VB', 'NN', 'IN', 'DT', 'JJ', 'NN']], 'full_statement': ''}
It nsubjpass PRP PRON [] [prohibited]
is auxpass VBZ AUX [] [prohibited]
prohibited ROOT VBN VERB [It, is, for, work, .] []
for prep IN ADP [apprentices] [prohibited]
apprentices pobj NNS NOUN [and, interns] [for, prohibited]
and cc CC CCONJ [] [apprentices, for, prohibited]
interns conj NNS NOUN [] [apprentices, for, prohibited]
to aux TO PART [] [work, prohibited]
work advcl VB VERB [to, overtime, beyond] [prohibited]
overtime dobj NN NOUN [] [work, prohibited]
beyond prep IN ADP [schedule] [work, proh

In [75]:
# Two coordinated clauses; the second subject is the pronoun "they" (coreference case).
sentence = "Employees are not allowed to eat in the workplace, and they cannot drink in the workplace."
doc = nlp(sentence)

# Extracted statement records, one per line.
sent_statements = parse_by_subject(doc)
for data in sent_statements:
    print(data)

# Raw parse per token: text, dependency, tag, POS, children, ancestors.
for token in doc:
    print(token.text, token.dep_, token.tag_, token.pos_,
          list(token.children), list(token.ancestors))

{'orig_subject': 'Employees', 'orig_slem': 'employee', 'in_coref': True, 'subject': 'Employees', 'slem': 'employee', 'coref_replaced': False, 'modal': None, 'neg': 'not', 'verb': 'allow', 'passive': 1, 'md': 0, 'subject_branch': ['employee'], 'subject_tags': ['NNS'], 'object_branches': [['be'], ['to', 'eat', 'in', 'the', 'workplace'], ['can', 'not', 'drink', 'in', 'the', 'workplace']], 'object_tags': [['VBP'], ['TO', 'VB', 'IN', 'DT', 'NN'], ['MD', 'RB', 'VB', 'IN', 'DT', 'NN']], 'full_statement': ''}
{'orig_subject': 'they', 'orig_slem': '-PRON-', 'in_coref': True, 'subject': 'employee', 'slem': 'employee', 'coref_replaced': True, 'modal': 'can', 'neg': 'not', 'verb': 'drink', 'passive': 0, 'md': 1, 'subject_branch': [], 'subject_tags': [], 'object_branches': [['in', 'the', 'workplace']], 'object_tags': [['IN', 'DT', 'NN']], 'full_statement': ''}
Employees nsubjpass NNS NOUN [] [allowed]
are auxpass VBP AUX [] [allowed]
not neg RB PART [] [allowed]
allowed ROOT VBN VERB [Employees, ar