In [54]:
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

# Let pandas show up to 100 characters per column before truncating cell text.
pd.set_option('display.max_colwidth', 100)

In [55]:
# Sample natural-language queries, one per line. Later cells split this string
# on '\n' and pass each line to the extraction helpers.
# NOTE(review): "in access of" looks like a typo for "in excess of" -- confirm
# whether it is meant as a deliberately noisy input before changing it.
testdata = """get me documents with publish date greater than 10-Oct-2010.
get me companies with revenue greater than 500000.
companies with revenue above 510000.
companies with revenue in access of 500000.
companies with revenue exceeding 500000.
get me companies with revenue less than 500000.
get me companies with revenue lesser than $ 5b.
get me companies with revenue lesser than five billion.
what deals have revenue higher than $5b ?
get me companies with revenue more than 500000.
Which are the companies having revenue of more than 10000?
deals with deal size more than 200000."""

In [56]:
# Load the spaCy pipeline once; `nlp` is a notebook-wide name that the later
# cells and helper functions call directly.
nlp = spacy.load("en_core_web_sm")

# Sample query used for the dependency-graph rendering below.
text = "get me companies with revenue greater than 500000" 

# Parse the sample query and draw its dependency graph inside the notebook.
doc = nlp(text) 
displacy.render(doc, style='dep',jupyter=True)

In [57]:
# Alternative inputs are kept commented out; only the uncommented `text` is parsed.
#text = "Tableau was recently acquired by Salesforce." 
text="get me documents with publish date greater than 10-Oct-2010"
#text="get me companies with revenue greater than $1b"
#text="get me companies with revenue greater than ten thousand"
doc = nlp(text) 

# Dump every token's text, dependency label, coarse POS, head and fine-grained tag.
for tok in doc:
    # `tok.lefts` is a generator, so this line prints its object repr
    # ("<generator object at 0x...>") rather than the tokens themselves.
    print(tok.text,"-->",tok.dep_,"-->",tok.pos_, tok.head, tok.lefts, tok.tag_)
    # Materialise the generator so the left-hand children are actually shown.
    print([w for w in tok.lefts])

get --> ROOT --> VERB get <generator object at 0x0000017ECC251AE8> VB
[]
me --> dative --> PRON get <generator object at 0x0000017ECC2A4400> PRP
[]
documents --> dobj --> NOUN get <generator object at 0x0000017ECC251AE8> NNS
[]
with --> prep --> ADP documents <generator object at 0x0000017ECC2A4510> IN
[]
publish --> amod --> NOUN date <generator object at 0x0000017ECC2A4400> NN
[]
date --> pobj --> NOUN with <generator object at 0x0000017ECC2A4510> NN
[publish]
greater --> amod --> ADJ date <generator object at 0x0000017ECC2A4400> JJR
[]
than --> prep --> ADP greater <generator object at 0x0000017ECC251AE8> IN
[]
10-Oct-2010 --> pobj --> NUM than <generator object at 0x0000017ECC2A4400> CD
[]


In [58]:
def subtree_matcher(doc):
    """Extract a pair of token texts from a dependency-parsed sentence.

    As a side effect every token is printed as ``text --> dep --> pos``.

    The sentence is treated as passive when any token's dependency label
    contains the text "subjpass" (for example ``nsubjpass``).

    Args:
        doc: A re-iterable sequence of tokens exposing ``text``, ``dep_`` and
            ``pos_`` attributes (for example a spaCy ``Doc``). It is iterated
            twice.

    Returns:
        A tuple ``(x, y)`` of strings. When several tokens qualify for a slot
        the last one wins; a slot with no qualifying token stays ``''``.

        * Passive sentence: ``x`` is the text of a token whose label ends with
          "obj", ``y`` the text of a token whose label contains "subjpass".
        * Otherwise: ``x`` is the text of a token whose label ends with
          "subj", ``y`` the text of a token whose label ends with "obj".
    """
    is_passive = False

    for tok in doc:
        print(tok.text,"-->",tok.dep_,"-->",tok.pos_)
        # Substring test. The previous `find(...) == True` only matched when
        # "subjpass" started exactly at index 1 of the label.
        if "subjpass" in tok.dep_:
            is_passive = True

    x = ''
    y = ''

    for tok in doc:
        dep = tok.dep_
        if is_passive:
            # Passive voice: the grammatical subject fills the second slot.
            if "subjpass" in dep:
                y = tok.text
            if dep.endswith("obj"):
                x = tok.text
        else:
            if dep.endswith("subj"):
                x = tok.text
            if dep.endswith("obj"):
                y = tok.text

    return x, y

In [59]:
def subtree_matcher_text(text,renderchart=False):
    """Parse raw text and hand the parsed document to ``subtree_matcher``.

    Args:
        text: Sentence to parse with the notebook-wide ``nlp`` pipeline.
        renderchart: When truthy, draw the dependency graph with displacy
            before extracting.

    Returns:
        Whatever ``subtree_matcher`` returns for the parsed document.
    """
    parsed = nlp(text)
    if renderchart:
        displacy.render(parsed, style='dep',jupyter=True)
    # Separator line printed ahead of the per-token dump from subtree_matcher.
    print("**********************************")
    return subtree_matcher(parsed)

In [60]:
# Run the subject/object extractor on every sample query and print the
# returned (x, y) pair after the per-token dump that the helper emits.
testdata_lines=testdata.split('\n')
for l in testdata_lines:
    print(subtree_matcher_text(l))


**********************************
get --> ROOT --> VERB
me --> dative --> PRON
documents --> dobj --> NOUN
with --> prep --> ADP
publish --> amod --> NOUN
date --> pobj --> NOUN
greater --> amod --> ADJ
than --> prep --> ADP
10-Oct-2010 --> pobj --> NUM
. --> punct --> PUNCT
('', '10-Oct-2010')
**********************************
get --> ROOT --> VERB
me --> dative --> PRON
companies --> dobj --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
greater --> amod --> ADJ
than --> prep --> ADP
500000 --> pobj --> NUM
. --> punct --> PUNCT
('', '500000')
**********************************
companies --> ROOT --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
above --> prep --> ADP
510000 --> pobj --> NUM
. --> punct --> PUNCT
('', '510000')
**********************************
companies --> ROOT --> NOUN
with --> prep --> ADP
revenue --> pobj --> NOUN
in --> prep --> ADP
access --> pobj --> NOUN
of --> prep --> ADP
500000 --> pobj --> NUM
. --> punct --> PUNCT
('', '500000')
*******

In [61]:
import spacy
from nltk import Tree


#en_nlp = spacy.load('en')

# Candidate sentences for the tree printout. Every assignment rebinds `doc`,
# so only the last sentence is the one printed further down; the earlier
# lines still run the pipeline but their results are discarded.
doc = nlp("get me companies with revenue greater than 500000")
#doc = nlp("get me documents publishing tomorrow")
doc=nlp("get me documents with publish date greater than 10-Oct-2010")
doc=nlp("The annual revenues of company such as Apple is more than 1000000")
doc=nlp("Which are the companies having annual revenues of more than 1000000")

def to_nltk_tree(node):
    """Recursively convert a dependency-parse node into an NLTK ``Tree``.

    Args:
        node: A token-like object exposing ``n_lefts``, ``n_rights``,
            ``orth_`` and ``children``.

    Returns:
        ``node.orth_`` for a node without children, otherwise a ``Tree``
        labelled with ``node.orth_`` whose children are the converted child
        nodes.
    """
    # Leaf: nothing hangs off this token, so its surface text is the node.
    if node.n_lefts + node.n_rights <= 0:
        return node.orth_

    subtrees = [to_nltk_tree(child) for child in node.children]
    return Tree(node.orth_, subtrees)


# Pretty-print one tree per sentence of `doc`. The comprehension is used only
# for its printing side effect: the list it builds holds the pretty_print()
# return values, which is why the cell result is echoed as [None].
[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

           are                                  
   _________|________                            
  |              companies                      
  |     _____________|________                   
  |    |                    having              
  |    |                      |                  
  |    |                   revenues             
  |    |     _________________|________          
  |    |    |                          of       
  |    |    |                          |         
  |    |    |                       1000000     
  |    |    |                  ________|_____    
Which the annual             more           than



[None]

In [62]:
def get_relation(sent):
    """Return the text of the last "pobj ... pobj" span found in a sentence.

    The sentence is parsed with the notebook-wide ``nlp`` pipeline and
    searched with a token pattern made of a ``pobj`` token, an optional
    ``amod`` token, an optional ``prep`` token and a final ``pobj`` token.

    As a side effect, when at least one match exists, the index of the chosen
    match and its start and end token offsets are printed on separate lines.

    Args:
        sent: Sentence text to analyse.

    Returns:
        The text of the span covered by the last match reported by the
        matcher, or the string ``"--"`` when nothing matches.
    """
    doc = nlp(sent)
    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # Pattern: pobj, optional amod, optional prep, pobj.
    pattern = [{'DEP':'pobj'},
               {'DEP':'amod','OP':"?"},
               {'DEP':'prep','OP':"?"},
               {'DEP':'pobj'}]

    # NOTE(review): (key, on_match, *patterns) is the spaCy v2 call signature;
    # spaCy v3 expects matcher.add("matching_1", [pattern]) -- confirm the
    # installed version before changing this call.
    matcher.add("matching_1", None, pattern)
    matches = matcher(doc)
    if not matches:
        return "--"

    # Only the last reported match is used.
    k = len(matches) - 1
    _, start, end = matches[-1]
    print("k",k)
    print(start)
    print(end)

    # The pattern has two mandatory tokens, so start < end always holds and a
    # plain slice is sufficient (no separate single-token case is needed).
    return doc[start:end].text

In [63]:
# Run the Matcher-based extractor on every sample query; queries without a
# matching span print "--".
testdata_lines=testdata.split('\n')
for l in testdata_lines:
    print(get_relation(l))

k 0
5
9
date greater than 10-Oct-2010
k 0
4
8
revenue greater than 500000
k 0
2
5
revenue above 510000
k 1
4
7
access of 500000
--
--
--
--
--
--
--
--
