In [286]:
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span 
from spacy import displacy 

pd.set_option('display.max_colwidth', 100)

In [287]:
grt_synset = """above
over
more than 
beyond 
surpassing 
upwards of 
more so 
exceeding 
outnumbering 
in excess of 
distinct 
superiors 
betters 
more
higher ups 
outstripping 
outpacing 
outweighing 
upward of 
degree 
higher than 
larger than 
faster than 
heavier than 
longer than 
further 
major
outperforming 
older than 
taller than 
one more 
separate 
some other 
upper
superior 
higher 
high
oversight 
good
finest 
your betters 
older
greater 
down
upstairs 
advance 
mountain 
outracing 
overcome 
overcoming 
exceed 
outstrip 
overstepping 
outpace 
beats
surpass 
surpasses 
exceeds 
precedes 
crossing 
outperformed 
outdone
exceeded 
excellence 
superiority 
outweigh 
predominates 
trumps 
much
most
excess 
just
only"""
grtlist=grt_synset.split('\n')
#print(grtlist)
ii = 0
while ii < len(grtlist):
    if(grtlist[ii].strip()==""):
        grtlist.pop(ii)
    else:
        grtlist[ii] = grtlist[ii].strip()
    ii+=1

In [288]:
testdata = """get me documents with publish date greater than 10-Oct-2010.
get me companies with revenue greater than 100000.
companies with revenue above 210000.
companies with revenue in excess of 300000.
companies with revenue exceeding 400000.
get me companies with revenue less than 500000.
get me companies with revenue lesser than $ 6b.
get me companies with revenue lesser than twenty seven billion.
what deals have revenue higher than $8b ?
get me companies with revenue more than 900000.
Which are the companies having revenue of more than 10000?
Which are the companies having annual revenues of more than $ 1000000
deals with deal size more than 110000."""

In [289]:
# load spaCy model
nlp = spacy.load("en_core_web_sm")

matcher = PhraseMatcher(nlp.vocab)
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in grtlist]
matcher.add("TerminologyList", None, *patterns)


text = "get me companies with revenue greater than 500000" 

# Plot the dependency graph 
doc = nlp(text) 
displacy.render(doc, style='dep',jupyter=True)

In [290]:
def getmatches(doc):
    matches = matcher(doc)
    return matches

In [291]:
#text = "Tableau was recently acquired by Salesforce." 
text="get me documents with publish date greater than 10-Oct-2010"
#text="get me companies with revenue greater than $1b"
#text="get me companies with revenue greater than ten thousand"
doc = nlp(text) 

for tok in doc:
    print(tok.text,"-->",tok.dep_,"-->",tok.pos_, tok.head, tok.lefts, tok.tag_)
    print([w for w in tok.lefts])

get --> ROOT --> VERB get <generator object at 0x000002E588BEF488> VB
[]
me --> dative --> PRON get <generator object at 0x000002E58E00B9D8> PRP
[]
documents --> dobj --> NOUN get <generator object at 0x000002E58E00B488> NNS
[]
with --> prep --> ADP documents <generator object at 0x000002E58E00B9D8> IN
[]
publish --> amod --> NOUN date <generator object at 0x000002E58E00B488> NN
[]
date --> pobj --> NOUN with <generator object at 0x000002E5874FE378> NN
[publish]
greater --> amod --> ADJ date <generator object at 0x000002E588BEF488> JJR
[]
than --> prep --> ADP greater <generator object at 0x000002E588BEF488> IN
[]
10-Oct-2010 --> pobj --> NUM than <generator object at 0x000002E5874FE378> CD
[]


In [292]:
def subtree_matcher(doc):
	subjpass = 0

	for i,tok in enumerate(doc):
		# find dependency tag that contains the text "subjpass"
		print(tok.text,"-->",tok.dep_,"-->",tok.pos_)
		if tok.dep_.find("subjpass") == True:
			subjpass = 1

	x = ''
	y = ''

	# if subjpass == 1 then sentence is passive
	if subjpass == 1:
		for i,tok in enumerate(doc):
			if tok.dep_.find("subjpass") == True:
				y = tok.text

			if tok.dep_.endswith("obj") == True:
				x = tok.text
	
	# if subjpass == 0 then sentence is not passive
	else:
		for i,tok in enumerate(doc):
			if tok.dep_.endswith("subj") == True:
				x = tok.text

			if tok.dep_.endswith("obj") == True:
				y = tok.text

	return x,y

In [293]:
def subtree_matcher_text(text,renderchart=False):
    doc = nlp(text)
    if (renderchart):
        displacy.render(doc, style='dep',jupyter=True)
    print("**********************************")
    return subtree_matcher(doc)

In [294]:
testdata_lines=testdata.split('\n')
for l in testdata_lines:
    print(subtree_matcher_text(l,True))


**********************************
get --> ROOT --> VERB
me --> dative --> PRON
documents --> dobj --> NOUN
with --> prep --> ADP
publish --> amod --> NOUN
date --> pobj --> NOUN
greater --> amod --> ADJ
than --> prep --> ADP
10-Oct-2010 --> pobj --> NUM
. --> punct --> PUNCT
('', '10-Oct-2010')


**********************************
get --> ROOT --> VERB
me --> dative --> PRON
companies --> dobj --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
greater --> amod --> ADJ
than --> prep --> ADP
100000 --> pobj --> NUM
. --> punct --> PUNCT
('', '100000')


**********************************
companies --> ROOT --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
above --> prep --> ADP
210000 --> pobj --> NUM
. --> punct --> PUNCT
('', '210000')


**********************************
companies --> ROOT --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
in --> prep --> ADP
excess --> pobj --> NOUN
of --> prep --> ADP
300000 --> pobj --> NUM
. --> punct --> PUNCT
('', '300000')


**********************************
companies --> ROOT --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
exceeding --> acl --> VERB
400000 --> dobj --> NUM
. --> punct --> PUNCT
('', '400000')


**********************************
get --> ROOT --> VERB
me --> dative --> PRON
companies --> dobj --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
less --> amod --> ADJ
than --> quantmod --> ADP
500000 --> npadvmod --> NUM
. --> punct --> PUNCT
('', 'revenue')


**********************************
get --> ROOT --> VERB
me --> dative --> PRON
companies --> dobj --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
lesser --> amod --> ADJ
than --> prep --> ADP
$ --> nmod --> SYM
6b --> pobj --> NUM
. --> punct --> PUNCT
('', '6b')


**********************************
get --> ROOT --> VERB
me --> dative --> PRON
companies --> dobj --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
lesser --> amod --> ADJ
than --> prep --> ADP
twenty --> quantmod --> NUM
seven --> compound --> NUM
billion --> pobj --> NUM
. --> punct --> PUNCT
('', 'billion')


**********************************
what --> det --> NOUN
deals --> nsubj --> NOUN
have --> ROOT --> VERB
revenue --> dobj --> NOUN
higher --> amod --> ADJ
than --> prep --> ADP
$ --> nmod --> SYM
8b --> pobj --> NUM
? --> punct --> PUNCT
('deals', '8b')


**********************************
get --> ROOT --> VERB
me --> dative --> PRON
companies --> dobj --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
more --> amod --> ADJ
than --> quantmod --> ADP
900000 --> npadvmod --> NUM
. --> punct --> PUNCT
('', 'revenue')


**********************************
Which --> nsubj --> ADJ
are --> ROOT --> VERB
the --> det --> DET
companies --> attr --> NOUN
having --> acl --> VERB
revenue --> dobj --> NOUN
of --> prep --> ADP
more --> amod --> ADJ
than --> quantmod --> ADP
10000 --> pobj --> NUM
? --> punct --> PUNCT
('Which', '10000')


**********************************
Which --> nsubj --> ADJ
are --> ROOT --> VERB
the --> det --> DET
companies --> attr --> NOUN
having --> acl --> VERB
annual --> amod --> ADJ
revenues --> dobj --> NOUN
of --> prep --> ADP
more --> amod --> ADJ
than --> quantmod --> ADP
$ --> quantmod --> SYM
1000000 --> pobj --> NUM
('Which', '1000000')


**********************************
deals --> ROOT --> NOUN
with --> prep --> ADP
deal --> compound --> NOUN
size --> pobj --> NOUN
more --> amod --> ADJ
than --> quantmod --> ADP
110000 --> npadvmod --> NUM
. --> punct --> PUNCT
('', 'size')


In [295]:
import spacy
from nltk import Tree


#en_nlp = spacy.load('en')

doc = nlp("get me companies with revenue greater than 500000")
#doc = nlp("get me documents publishing tomorrow")
doc=nlp("get me documents with publish date greater than 10-Oct-2010")
doc=nlp("The annual revenues of company such as Apple is more than 1000000")
doc=nlp("Which are the companies having annual revenues of more than $ 1000000")
doc=nlp("companies with revenue in excess of 300000.")
doc=nlp(testdata)

def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_


[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

      get                            
  _____|___________________________   
 |          documents              | 
 |              |                  |  
 |             with                | 
 |              |                  |  
 |             date                | 
 |      ________|__________        |  
 |     |                greater    | 
 |     |                   |       |  
 |     |                  than     . 
 |     |                   |       |  
 me publish           10-Oct-2010    
                                     

       get       
  ______|______   
 |  companies  | 
 |      |      |  
 |     with    | 
 |      |      |  
 |   revenue   | 
 |      |      |  
 |   greater   | 
 |      |      |  
 |     than    . 
 |      |      |  
 me   100000     
                 

        companies    
    ________|______   
  with             | 
   |               |  
revenue            | 
   |               |  
 above             . 
   |               |  
 210000              


[None, None, None, None, None, None, None, None, None, None, None, None]

In [296]:
def get_relation(sent):
    doc = nlp(sent)
    # Matcher class object
    matcher = Matcher(nlp.vocab)
    
    #define the pattern
    pattern = [{'DEP':'pobj'}, 
            {'DEP':'amod','OP':"?"},
                {'DEP':'prep','OP':"?"},
              {'DEP':'pobj'}]
    pattern = [{'ENT_TYPE': 'CARDINAL'}]
    
    matcher.add("matching_1", None, pattern)
    matches = matcher(doc)
    k = len(matches) - 1
    if k<0:
        return "--"
    print("k",k)
    print(matches[k][1])
    print(matches[k][2])
    if(matches[k][1] == matches[k][2]):
        span=doc[matches[k][1]]
        return(span.text)
    span = doc[matches[k][1]:matches[k][2]]
    return(span.text)

In [297]:
def extract_gtr_relation(sent):
    doc = nlp(sent)
    num2compare=""
    field2compare = ""
    comparesign=""
    matches = getmatches(doc)
    for tok in doc:
        #print(tok.pos_)
        if(tok.pos_ == "NUM"):
            num2compare = tok.text
            #print("Head of num: {0} is {1}".format(tok.text, tok.head))
            childrentxt = [token.text for token in tok.children]
            children = [token for token in tok.children]
            if(len(children) > 0):
                #print("Child of num: {0} is {1}".format(tok.text, " ".join(childrentxt)))
                #print("len",len(children))
                for j in range(len(children)-1,-1,-1):
                    #print("j", j)
                    #print("Inside range loop", children[j].text)
                    if(children[j].pos_ == "NUM" or children[j].pos_ == "SYM" ):
                        #concatenate - prefix to num2compare
                        #print("concatenating children : " + children[j].text)
                        num2compare = children[j].text + " " + num2compare
                        
            is_obj = tok.head.dep_ == "pobj"
            newtok = tok.head
            #print("newtok.dep_", newtok.dep_)
            i=0
            skiptok = False
            while is_obj == False:
                newtok = newtok.head
                #print("newtok.dep_", newtok.dep_)
                for match_id, start, end in matches:
                    if start <= newtok.i <= end:
                        skiptok = True
                        break;
                    
                is_obj = (newtok.dep_ == "pobj" or newtok.dep_ == "dobj") #and (skiptok == False))
                i+=1
                if i> 5:
                    #print("Dont go beyond 5 levels")
                    break
            lefts = [w.text for w in newtok.lefts]
            field2compare = ''
            if(len(lefts) == 0):
                #print("Field to compare : {0}".format(newtok.text))
                field2compare = newtok.text
            else:
                if(len(lefts) == 1):
                    field2compare = lefts[0] + " " + str(newtok.text)
                else:
                    #print(lefts)
                    s = str(" ".join(lefts))
                    #print("Field to compare : {0} - {1}".format(s, newtok.text))
                    field2compare = " ".join(lefts) + " " + newtok.text
            
            
            #Check for greater than/less than sign
            comparesign=''
            #print(">>>>>>>>>>>>>>><<<<<<<<<<<<<<<")
            is_sign = (tok.head.dep_ == "amod" or tok.head.lemma_ in set(grtlist))
            newsigntok = tok.head
            #print("newsigntok.dep_", newsigntok.dep_)
            i=0
            while is_sign == False:
                newsigntok = newsigntok.head
                print("newsigntok.dep_", newsigntok.dep_)
                is_sign = (newsigntok.dep_ == "amod" or newsigntok.lemma_ in set(grtlist))
                #print("is_sign is {0} for {1}".format(is_sign, newsigntok.text))
                i+=1
                if i> 5:
                    #print("Dont go beyond 5 levels")
                    break
            if(is_sign==True):
                comparesign = newsigntok.text
            
            
    return field2compare, comparesign, num2compare
        
        
    

In [298]:
testdata_lines=testdata.split('\n')
for l in testdata_lines:
    print(l)
    doc=nlp(l)
    #[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]
    print("=====================>>",extract_gtr_relation(l))

get me documents with publish date greater than 10-Oct-2010.
newsigntok.dep_ amod
get me companies with revenue greater than 100000.
newsigntok.dep_ amod
companies with revenue above 210000.
companies with revenue in excess of 300000.
newsigntok.dep_ pobj
companies with revenue exceeding 400000.
get me companies with revenue less than 500000.
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
get me companies with revenue lesser than $ 6b.
newsigntok.dep_ amod
get me companies with revenue lesser than twenty seven billion.
newsigntok.dep_ prep
newsigntok.dep_ amod
newsigntok.dep_ prep
newsigntok.dep_ amod
newsigntok.dep_ amod
what deals have revenue higher than $8b ?
newsigntok.dep_ amod
get me companies with revenue more than 900000.
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
newsigntok.dep_ ROOT
Which are the companies having revenue of more than 10