#### importing libraries

In [1]:
import re
import nltk
import string
import math
import spacy
import numpy as np
import pandas as pd
from tqdm import tqdm #process bars

from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

#### import spacy model

In [2]:
# Name of the spaCy pipeline to load; kept in one place so it is easy to swap.
SPACY_MODEL = "en_core_web_sm"

# Shared pipeline object: the cells below call nlp(...) to parse text and
# pass nlp.vocab to the Matcher.
nlp = spacy.load(SPACY_MODEL)

#### data mining

In [3]:
# One example sentence per cue phrase targeted by the patterns defined below.
text1 = "GDP in developing countries such as India will continue decreasing during Covid-19."  # cue: "such as"
text2 = "Here is how you can keep your car and other vehicles clean."  # cue: "and/or other"
text3 = "Eight people, including two children, were injured in the explosion"  # cue: "including"
text4 = "A healthy eating pattern includes fruits, especially whole fruits"  # cue: "especially"

# Parse the sentences in order. The individual doc1..doc4 names are kept
# because the pattern-matching cell refers to them directly.
doc1, doc2, doc3, doc4 = (nlp(sentence) for sentence in (text1, text2, text3, text4))
docList = [doc1, doc2, doc3, doc4]

In [4]:
# For every parsed sentence, list each token with its dependency label
# (subject, object, modifier, ...) and its part-of-speech tag.
for doc in docList:
    print(f"\n {doc}")
    for token in doc:
        print(f"{token.text} --> {token.dep_} --> {token.pos_}")


 GDP in developing countries such as India will continue decreasing during Covid-19.
GDP --> nsubj --> PROPN
in --> prep --> ADP
developing --> amod --> VERB
countries --> pobj --> NOUN
such --> amod --> ADJ
as --> prep --> SCONJ
India --> pobj --> PROPN
will --> aux --> VERB
continue --> ROOT --> VERB
decreasing --> xcomp --> VERB
during --> prep --> ADP
Covid-19 --> pobj --> PROPN
. --> punct --> PUNCT

 Here is how you can keep your car and other vehicles clean.
Here --> advmod --> ADV
is --> ROOT --> AUX
how --> advmod --> ADV
you --> nsubj --> PRON
can --> aux --> VERB
keep --> ccomp --> VERB
your --> poss --> DET
car --> dobj --> NOUN
and --> cc --> CCONJ
other --> amod --> ADJ
vehicles --> conj --> NOUN
clean --> oprd --> ADJ
. --> punct --> PUNCT

 Eight people, including two children, were injured in the explosion
Eight --> nummod --> NUM
people --> nsubjpass --> NOUN
, --> punct --> PUNCT
including --> prep --> VERB
two --> nummod --> NUM
children --> pobj --> NOUN
, --> pun

In [5]:
# Token patterns for the Matcher. Each pattern is a list of per-token specs and
# the order of the specs matters. The simplest "such as" pattern would be
# NOUN, "such", "as", PROPN; the versions below also capture an optional
# modifier (and, for patterns 3 and 4, an optional numeric modifier).

def _optional(attr, value):
    """Return a token spec matching `attr == value` zero or one time."""
    return {attr: value, 'OP': '?'}


def _noun_phrase(with_count=False):
    """Return specs for `[nummod?] amod? NOUN`; the nummod slot is opt-in."""
    specs = [_optional('DEP', 'amod'), {'POS': 'NOUN'}]
    if with_count:
        specs.insert(0, _optional('DEP', 'nummod'))
    return specs


def _punct_cue_pattern(cue):
    """Return `<noun phrase> <punctuation> <cue> <noun phrase>` for a cue word."""
    return (
        _noun_phrase(with_count=True)
        + [{'IS_PUNCT': True}, {'LOWER': cue}]
        + _noun_phrase(with_count=True)
    )


# "<noun> such as <proper noun>"
pattern1 = _noun_phrase() + [
    {'LOWER': 'such'},
    {'LOWER': 'as'},
    {'POS': 'PROPN'},
]

# "<noun> [and] [or] other <noun>"
pattern2 = _noun_phrase() + [
    _optional('LOWER', 'and'),
    _optional('LOWER', 'or'),
    {'LOWER': 'other'},
    {'POS': 'NOUN'},
]

# "<noun>, including <noun>" and "<noun>, especially <noun>"
pattern3 = _punct_cue_pattern('including')
pattern4 = _punct_cue_pattern('especially')

patternList = [pattern1, pattern2, pattern3, pattern4]

In [6]:
# Run each pattern against the example sentence it was written for.
# patternList and docList are index-aligned (pattern N targets sentence N), so
# they are paired with zip instead of comparing against pattern1..pattern3 and
# picking doc1..doc4 by name: those doc names are rebound to other sentences
# by later cells, which would silently pair a pattern with the wrong text.
for pattern, doc in zip(patternList, docList):
    # A fresh Matcher per pattern keeps one pattern's matches apart from another's.
    matcher = Matcher(nlp.vocab)
    # Patterns are passed as a list of patterns. The older
    # add(key, None, pattern) call form is rejected by spaCy v3.
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)  # each match is (match_id, start_token, end_token)
    if not matches:
        # Report instead of failing with IndexError on matches[0].
        print("no match for:", doc)
        continue

    _, start, end = matches[0]
    span = doc[start:end]
    print(span)
    # In "developing countries such as India", "developing countries" is the
    # hypernym and "India" is the hyponym (Hearst patterns).

developing countries such as India
car and other vehicles
Eight people, including two children
fruits, especially whole fruits


#### Drawback of rule-based matching: the patterns do not generalize. The solution is subtree (dependency-tree) matching.

In [7]:
# Passive-voice example sentence.
# NOTE: this rebinds text1/doc1, which earlier held the "such as" example.
text1 = "Tableau was recently acquired by Salesforce"
doc1 = nlp(text1)

# Render the dependency graph inline in the notebook.
displacy.render(doc1, jupyter=True, style="dep")

In [8]:
# A longer passive-voice sentence with extra words between subject and verb.
# NOTE: this rebinds text2/doc2, which earlier held the "and other" example.
text2 = "Careem, a ride hailing major in middle east was acquired by Uber"
doc2 = nlp(text2)

# Render the dependency graph inline in the notebook.
displacy.render(doc2, jupyter=True, style="dep")

In [9]:
# List every token of doc1 with its dependency label and part-of-speech tag.
for token in doc1:
    print(f"{token.text} --> {token.dep_} --> {token.pos_}")

Tableau --> nsubjpass --> PROPN
was --> auxpass --> AUX
recently --> advmod --> ADV
acquired --> ROOT --> VERB
by --> agent --> ADP
Salesforce --> pobj --> NOUN


#### Subtree matching: we have to check which dependency paths are common between multiple sentences.

In [29]:
def subtree_match(doc):
    """Extract the (agent, patient) pair from a simple active or passive clause.

    The tokens are scanned once, reading only each token's ``dep_`` label and
    ``text``:

    * a passive subject (label ending in ``subjpass``) is the patient, and an
      object that follows it (label ending in ``obj``, e.g. the ``pobj`` after
      "by") is taken as the agent;
    * an active subject (any other label ending in ``subj``) is the agent, and
      an object that follows it is taken as the patient.

    The voice of the most recent subject decides how later objects are
    assigned, and a later token overwrites an earlier one in the same role.

    Parameters
    ----------
    doc : iterable
        Tokens exposing ``dep_`` and ``text`` (for example a spaCy ``Doc``).

    Returns
    -------
    tuple of (str, str)
        ``(agent, patient)``; a role that was never found is ``''``.
    """
    agent = patient = ''
    passive = False  # voice of the most recently seen subject

    for tok in doc:
        dep = tok.dep_
        # Check the passive label first; it does not end in 'subj', but testing
        # it up front keeps the two subject cases clearly separated.
        if dep.endswith('subjpass'):
            patient = tok.text
            passive = True
        elif dep.endswith('subj'):
            agent = tok.text
            # A new active subject resets the voice so that the objects that
            # follow are read as patients, not as agents of an earlier passive.
            passive = False
        elif dep.endswith('obj'):
            if passive:
                agent = tok.text
            else:
                patient = tok.text

    return agent, patient

In [11]:
# Passive sentence "Tableau was recently acquired by Salesforce";
# the recorded result is ('Salesforce', 'Tableau').
subtree_match(doc1)

('Salesforce', 'Tableau')

In [12]:
# Longer passive sentence about Careem and Uber;
# the recorded result is ('Uber', 'Careem').
subtree_match(doc2)

('Uber', 'Careem')

In [13]:
# Active-voice counterpart of the acquisition sentence.
# NOTE: this rebinds text3/doc3, which earlier held the "including" example.
text3 = "Salesforce acquired Tableau"
doc3 = nlp(text3)

# Render the dependency graph inline in the notebook.
displacy.render(doc3, jupyter=True, style="dep")

In [14]:
# Active sentence "Salesforce acquired Tableau".
# NOTE(review): the ('Tableau', '') result recorded for this cell does not match
# what the subtree_match definition shown in this notebook returns; that
# definition cell carries a later execution count (In [29]). Re-run to refresh.
subtree_match(doc3)

('Tableau', '')

In [15]:
# The result above is wrong because the sentence is in active voice, which
# motivates changing subtree_match(). Inspect the labels of doc3 to see what
# the function has to handle.
for token in doc3:
    print(f"{token.text} --> {token.dep_} --> {token.pos_}")

Salesforce --> nsubj --> NOUN
acquired --> ROOT --> VERB
Tableau --> dobj --> PROPN


In [30]:
# Active sentence again, after subtree_match was changed to handle active voice;
# the recorded result is ('Salesforce', 'Tableau').
subtree_match(doc3)

junk


('Salesforce', 'Tableau')

In [31]:
# Re-check the passive sentence with the changed function;
# the recorded result is still ('Salesforce', 'Tableau').
subtree_match(doc1)

('Salesforce', 'Tableau')