In [1]:
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

# Let pandas print up to 200 characters per column before truncating cell text.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Load spaCy's "en_core_web_sm" pipeline once; later cells call `nlp(...)` to parse text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample sentence containing a "<noun> such as <proper noun>" construction.
text = "GDP in developing countries such as Vietnam will continue growing at a high rate." 

# Parse the sentence with the loaded pipeline; `doc` is reused by the following cells.
doc = nlp(text)

In [4]:
# Print each token next to its dependency label and part-of-speech tag.
for tok in doc: 
  print(tok.text, "-->",tok.dep_,"-->", tok.pos_)

GDP --> nsubj --> NOUN
in --> prep --> ADP
developing --> amod --> VERB
countries --> pobj --> NOUN
such --> amod --> ADJ
as --> prep --> SCONJ
Vietnam --> pobj --> PROPN
will --> aux --> VERB
continue --> ROOT --> VERB
growing --> xcomp --> VERB
at --> prep --> ADP
a --> det --> DET
high --> amod --> ADJ
rate --> pobj --> NOUN
. --> punct --> PUNCT


In [5]:
# Token-by-token pattern: <noun> "such" "as" <proper noun>.
pattern = [
    {"POS": "NOUN"},    # a common noun
    {"LOWER": "such"},  # token whose lowercase form is "such"
    {"LOWER": "as"},    # token whose lowercase form is "as"
    {"POS": "PROPN"},   # a proper noun
]

In [6]:
# Register the pattern under the key "matching_1" and run it over the parsed sentence.
matcher = Matcher(nlp.vocab)
matcher.add("matching_1", None, pattern)
matches = matcher(doc)

# Positions 1 and 2 of a match are used as the start/end token offsets of the span.
first_match = matches[0]
span = doc[first_match[1]:first_match[2]]

print(span.text)

countries such as Vietnam


In [7]:
# Same "such as" pattern, now allowing an optional adjectival modifier before the noun.
pattern = [
    {"DEP": "amod", "OP": "?"},  # adjectival modifier (optional)
    {"POS": "NOUN"},
    {"LOWER": "such"},
    {"LOWER": "as"},
    {"POS": "PROPN"},
]

# Fresh matcher so that only the extended pattern is registered.
matcher = Matcher(nlp.vocab)
matcher.add("matching_1", None, pattern)
matches = matcher(doc)

# Positions 1 and 2 of a match are used as the start/end token offsets of the span.
first_match = matches[0]
span = doc[first_match[1]:first_match[2]]
print(span.text)

developing countries such as Vietnam


In [8]:

# Passive-voice example sentence.
text = "Tableau was recently acquired by Salesforce." 

# Parse it and plot the dependency graph inline (jupyter=True).
doc = nlp(text) 
displacy.render(doc, style='dep',jupyter=True)

In [9]:
# Re-parse the passive sentence and list each token's dependency and POS tags.
text = "Tableau was recently acquired by Salesforce." 
doc = nlp(text) 

for tok in doc: 
  print(tok.text,"-->",tok.dep_,"-->",tok.pos_)

Tableau --> nsubjpass --> PROPN
was --> auxpass --> AUX
recently --> advmod --> ADV
acquired --> ROOT --> VERB
by --> agent --> ADP
Salesforce --> pobj --> PROPN
. --> punct --> PUNCT


In [10]:

def subtree_matcher(doc):
  """Extract the (object, passive subject) pair from a parsed sentence.

  Args:
    doc: Iterable of tokens exposing ``dep_`` and ``text`` (e.g. a spaCy Doc).

  Returns:
    Tuple ``(x, y)``: ``x`` is the text of the last token whose dependency
    label ends with "obj", ``y`` the text of the last token whose label
    contains "subjpass". Either element is ``''`` when no such token exists.
  """
  x = ''
  y = ''

  # iterate through all the tokens in the input sentence
  for tok in doc:
    # extract subject
    # Substring test instead of `str.find(...) == True`: find() returns an
    # index, so the old comparison only held when the match began at offset 1.
    if "subjpass" in tok.dep_:
      y = tok.text

    # extract object
    if tok.dep_.endswith("obj"):
      x = tok.text

  return x, y

In [11]:
# Run the matcher on the passive sentence parsed in the previous cell.
subtree_matcher(doc)

('Salesforce', 'Tableau')

In [12]:

# Second passive sentence, with extra words between the subject and the verb.
text_2 = "Careem, a ride hailing major in middle east, was acquired by Uber." 

doc_2 = nlp(text_2) 
subtree_matcher(doc_2)

('Uber', 'Careem')

In [13]:
# Active-voice variant: subtree_matcher only looks for a passive subject,
# so the second slot of the result comes back empty here.
text_3 = "Salesforce recently acquired Tableau." 
doc_3 = nlp(text_3) 
subtree_matcher(doc_3)

('Tableau', '')

In [14]:
# Inspect the parse of the active sentence to see which dependency labels it carries.
for tok in doc_3:    
  print(tok.text, "-->",tok.dep_, "-->",tok.pos_)

Salesforce --> nsubj --> NOUN
recently --> advmod --> ADV
acquired --> ROOT --> VERB
Tableau --> dobj --> PROPN
. --> punct --> PUNCT


In [15]:
# Plot the dependency graph of the active sentence inline.
displacy.render(doc_3, style='dep',jupyter=True)

In [16]:
def new_subtree_matcher(doc):
  """Extract an (x, root, y) triple from a parsed sentence, handling passive voice.

  Args:
    doc: Re-iterable sequence of tokens exposing ``dep_`` and ``text``
      (e.g. a spaCy Doc); it is scanned twice.

  Returns:
    Tuple ``(x, z, y)`` of token texts, each ``''`` when nothing matched:
      * passive sentence (some label contains "subjpass"): ``x`` is the last
        token whose label ends with "obj", ``y`` the last "subjpass" token;
      * otherwise: ``x`` is the last token whose label ends with "subj" and
        ``y`` the last token whose label ends with "obj";
      * ``z`` is the last token whose label ends with "ROOT" in either case.
  """
  # Substring test instead of `str.find(...) == True`: find() returns an index,
  # so the old comparison only held when the match began at offset 1.
  is_passive = any("subjpass" in tok.dep_ for tok in doc)

  x = ''
  y = ''
  z = ''

  for tok in doc:
    dep = tok.dep_

    if is_passive:
      # Passive: the "subjpass" token fills y and the object (agent) fills x,
      # so the returned triple reads in active-voice order.
      if "subjpass" in dep:
        y = tok.text
      if dep.endswith("obj"):
        x = tok.text
    else:
      if dep.endswith("subj"):
        x = tok.text
      if dep.endswith("obj"):
        y = tok.text

    # The root token is collected the same way for both voices.
    if dep.endswith("ROOT"):
      z = tok.text

  return x, z, y

In [17]:
# Active-voice sentence: the result is (subject, root verb, object).
new_subtree_matcher(doc_3)

('Salesforce', 'acquired', 'Tableau')

In [18]:
# Passive-voice sentence: the object after "by" is returned first, giving the same triple as the active form.
new_subtree_matcher(nlp("Tableau was recently acquired by Salesforce."))

('Salesforce', 'acquired', 'Tableau')

In [19]:
# Sentence with a prepositional object instead of a direct object.
new_subtree_matcher(nlp("Chirag is staying at Taj."))

('Chirag', 'staying', 'Taj')

In [20]:
from spacy.lemmatizer import Lemmatizer
# Reuse the lemmatizer attached to the loaded pipeline's vocab; new_subtree_matcher1
# calls it as lemmatizer(word, pos) and takes the first returned lemma.
# NOTE(review): `spacy.lemmatizer` / `vocab.morphology.lemmatizer` look like the
# spaCy v2 API — confirm the spaCy version this notebook is pinned to.
lemmatizer = nlp.vocab.morphology.lemmatizer

def _merge_adjacent(value, pending, is_match, text):
  """Fold one token into a running ``(value, pending)`` pair.

  A matching token starts a new value, unless the previous token also matched
  and started one; in that case its text is appended directly (no separator)
  and the pending state is cleared. A non-matching token clears the state.
  """
  if not is_match:
    return value, False
  if pending:
    return value + text, False
  return text, True


def new_subtree_matcher1(doc1):
  """Extract an (x, root, y) triple from raw text when the root verb is a listed relation.

  The text is parsed with the module-level ``nlp`` pipeline and the root token
  is lemmatized with the module-level ``lemmatizer``.

  Args:
    doc1: Sentence text to parse.

  Returns:
    Tuple ``(x, z, y)`` of strings when the root's lemma is in ``pos_words``
    and all three parts are non-empty, otherwise ``('', '', '')``.
      * passive sentence (some label contains "subjpass"): ``x`` comes from
        tokens whose label ends with "obj", ``y`` from "subjpass" tokens;
      * otherwise: ``x`` comes from tokens whose label ends with "subj" and
        ``y`` from tokens whose label ends with "obj";
      * ``z`` is the text of the last token whose label ends with "ROOT".
  """
  # Lemmas of the root verb that count as a relation of interest.
  # (An entity-type filter on PERSON vs DATE/LOC/ORG/GPE was sketched here
  # earlier but is not applied.)
  pos_words = ["born","bear", "live", "locate", "situate", "establish", "residence", "resident", "reside", "birthplace", "national", "countryman",
               "citizen", "employee", "work", "job", "do", "educate", "study"]

  doc = nlp(doc1)

  # Substring test instead of `str.find(...) == True`: find() returns an index,
  # so the old comparison only held when the match began at offset 1.
  is_passive = any("subjpass" in tok.dep_ for tok in doc)

  x = ''
  flag_x = False
  y = ''
  flag_y = False
  z = ''
  lemma_z = ''

  for tok in doc:
    dep = tok.dep_
    is_obj = dep.endswith("obj")

    # NOTE(review): two adjacent matching tokens are concatenated without a
    # space and a third adjacent one starts over — behaviour kept as-is;
    # confirm whether multi-word spans were meant to be joined differently.
    if is_passive:
      y, flag_y = _merge_adjacent(y, flag_y, "subjpass" in dep, tok.text)
      x, flag_x = _merge_adjacent(x, flag_x, is_obj, tok.text)
    else:
      x, flag_x = _merge_adjacent(x, flag_x, dep.endswith("subj"), tok.text)
      y, flag_y = _merge_adjacent(y, flag_y, is_obj, tok.text)

    if dep.endswith("ROOT"):
      z = tok.text
      lemma_z = lemmatizer(z, tok.pos_)[0]

  if lemma_z in pos_words and x != '' and y != '' and z != '':
    return x, z, y
  return '', '', ''

In [21]:
# Quick check on a single sentence: a non-empty triple means the root verb's lemma is in pos_words.
new_subtree_matcher1("Rahul is studying in IITJ")

('Rahul', 'studying', 'IITJ')

In [22]:
# Colab-specific: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [23]:
# Location of train.json under the mounted Drive folder.
path = "/content/drive/MyDrive/NLU_Project/train.json"

In [24]:
# Read the whole file into memory. The context manager closes the handle even if
# read() raises, and the encoding is stated explicitly instead of relying on the
# platform default.
with open(path, encoding="utf-8") as file:
  data = file.read()

In [25]:
import json

# Decode consecutive JSON values from `data` one at a time. Decoding by offset
# leaves `data` untouched (so the cell can be re-run) and avoids copying the
# remaining text on every iteration.
decoder = json.JSONDecoder()
data1 = []
offset = 0
while True:
    # raw_decode() must start on the first character of a value, so skip any
    # whitespace separating (or surrounding) the values first.
    while offset < len(data) and data[offset].isspace():
        offset += 1
    if offset == len(data):
        break
    value, offset = decoder.raw_decode(data, offset)
    data1.append(value)

In [26]:
# Inspect the first decoded record.
data1[0]

{'documentId': '13205',
 'documentText': "Documents Assist\n\nFlorida Cheap Divorce - Simple Divorce $280 flat fee - No Court Divorce\nEstablished in 2006.\nAfter a few years working as an Independent Associate of Legal Shield, formerly known as Pre-Paid Legal, Yamil Fuentes made it her mission to help her clients take advantage of their access to the legal system. After going through her own personal divorce she realized there was a need of affordable divorce services in the State of Florida.\nWhen she founded Apex Legal Document Preparation Services she became a Premium Member of the FALDP (Florida Association of Legal Document Preparers) and educated herself on the different types of divorce been offered across the State of Florida and learned from the best in the industry of legal document preparation services.\nThrough her membership she embarked on a journey to learn how to make the divorce process as easy as possible for her clients, making her company the most sought after divo

In [27]:
# Collect the 'documentText' field of every record. The comprehension keeps its
# loop variable local, so the spaCy `doc` created in earlier cells is no longer
# overwritten by a record dict.
paras = [record['documentText'] for record in data1]
paras[0]

"Documents Assist\n\nFlorida Cheap Divorce - Simple Divorce $280 flat fee - No Court Divorce\nEstablished in 2006.\nAfter a few years working as an Independent Associate of Legal Shield, formerly known as Pre-Paid Legal, Yamil Fuentes made it her mission to help her clients take advantage of their access to the legal system. After going through her own personal divorce she realized there was a need of affordable divorce services in the State of Florida.\nWhen she founded Apex Legal Document Preparation Services she became a Premium Member of the FALDP (Florida Association of Legal Document Preparers) and educated herself on the different types of divorce been offered across the State of Florida and learned from the best in the industry of legal document preparation services.\nThrough her membership she embarked on a journey to learn how to make the divorce process as easy as possible for her clients, making her company the most sought after divorce preparer in Florida touting the follo

In [28]:
from nltk.tokenize import sent_tokenize

# Download the 'punkt' tokenizer data before splitting into sentences.
nltk.download('punkt')

# Flatten every paragraph into one running list of sentences.
sents = []
for para in paras:
  sents += sent_tokenize(para)
sents[0]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


'Documents Assist\n\nFlorida Cheap Divorce - Simple Divorce $280 flat fee - No Court Divorce\nEstablished in 2006.'

In [29]:
# The first 20% of the sentences form the train split; the next 5% form the test split.
n_sents = len(sents)
train_end = int(n_sents * 0.2)
test_end = int(n_sents * 0.25)

train_data = sents[:train_end]
test_data = sents[train_end:test_end]

In [30]:
# Label each training sentence with the triple returned by new_subtree_matcher1.
train_labels = [new_subtree_matcher1(sent) for sent in train_data]

In [31]:
# Label each test sentence with the triple returned by new_subtree_matcher1.
test_labels = [new_subtree_matcher1(sent) for sent in test_data]

In [36]:
# Show the non-empty relations among training sentences 400-499. Slicing, rather
# than indexing a fixed range, avoids an IndexError when fewer than 500
# sentences are available.
for sent_text, relation in zip(train_data[400:500], train_labels[400:500]):
  if relation != ('','',''):
    print("Text:", sent_text)
    print("Relation:", relation)
    print()

Text: The mathematical properties of the catenary curve were first studied by Robert Hooke in the 1670s, and its equation was derived by Leibniz, Huygens and Johann Bernoulli in 1691.
Relation: ('1691', 'studied', 'equation')

Text: She was born in Edmundston, New Brunswick.
Relation: ('Edmundston', 'born', 'She')

Text: She has lived in Moncton since the 1970s.
Relation: ('She', 'lived', '1970s')

Text: Couturier has also worked as a script writer and researcher for Radio-Canada.
Relation: ('Couturier', 'worked', 'Canada')

Text: He was born (1944) and raised in Downers Grove, IL.
Relation: ('Grove', 'born', 'He')

Text: He was born in Orkanger as a son of a builder.
Relation: ('builder', 'born', 'He')

Text: He worked as a laborer until 1923, first at a sawmill, then as a carpenter.
Relation: ('He', 'worked', 'carpenter')

