In [1]:
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 

# Raise pandas' column display width to 200 characters so long text cells
# are not truncated when frames are rendered.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Load spaCy's small English pipeline once; `nlp` is the callable the later
# cells use to parse their sample sentences.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample sentence whose per-token annotations are inspected in the next cell
text = "GDP in developing countries such as Vietnam will continue growing at a high rate." 

# Run the pipeline on the sentence; `doc` holds the annotated tokens
doc = nlp(text)

In [4]:
# One line per token: text --> dependency label --> coarse POS tag
for tok in doc:
  print(tok.text, tok.dep_, tok.pos_, sep=" --> ")

GDP --> nsubj --> PROPN
in --> prep --> ADP
developing --> amod --> VERB
countries --> pobj --> NOUN
such --> amod --> ADJ
as --> prep --> ADP
Vietnam --> pobj --> PROPN
will --> aux --> AUX
continue --> ROOT --> VERB
growing --> xcomp --> VERB
at --> prep --> ADP
a --> det --> DET
high --> amod --> ADJ
rate --> pobj --> NOUN
. --> punct --> PUNCT


In [20]:
# Reuse the `nlp` pipeline loaded near the top of the notebook; calling
# spacy.load("en_core_web_sm") a second time only costs time and memory.

# Process the example sentence
text = "Little girl dancing in the park."
doc = nlp(text)

# Analyze syntax: noun chunks found by the parser, and the lemma of every
# token tagged as a verb
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])


Noun phrases: ['the park']
Verbs: ['dance']


In [21]:
# Dump the main attributes of every token, space-separated, in the order:
# text, lemma, coarse POS, fine-grained tag, dependency label, shape,
# is_alpha flag, is_stop flag.
for token in doc:
    print(*(getattr(token, attr) for attr in (
        "text", "lemma_", "pos_", "tag_", "dep_",
        "shape_", "is_alpha", "is_stop",
    )))

Little little ADJ JJ amod Xxxxx True False
girl girl NOUN NN compound xxxx True False
dancing dance VERB VBG ROOT xxxx True False
in in ADP IN prep xx True True
the the DET DT det xxx True True
park park NOUN NN pobj xxxx True False
. . PUNCT . punct . False False


In [73]:
# The recorded output of this cell (Dog / eats / icecream) came from a `doc`
# built in a cell that is no longer part of the notebook. Rebuild that
# sentence here so a fresh Restart & Run All reproduces the output instead of
# silently printing whatever `doc` the previous cell left behind.
text = "Dog eats icecream"
doc = nlp(text)

# print each token next to its dependency label
for token in doc:
  print(token, ' ', token.dep_)

Dog   nsubj
eats   ROOT
icecream   dobj


In [94]:
# Parse a new example sentence
text = "A boy studies maths in carpark"
doc = nlp(text)

# Plot the dependency graph inline in the notebook
displacy.render(doc, style="dep", jupyter=True)

In [95]:
# List every token with its coarse POS tag, separated by three spaces
for token in doc:
  print(token, token.pos_, sep='   ')

A   DET
boy   NOUN
studies   VERB
maths   NOUN
in   ADP
carpark   NOUN


In [96]:
def subtree_matcher(doc):
  """Pick the background, action and object words out of a parsed sentence.

  Every token of ``doc`` is checked against three independent rules that
  look only at its dependency label (``dep_``) and coarse POS tag (``pos_``):

  * background -- the token's dependency is ``pobj``
  * action     -- the token's dependency is ``acl``, or it is the ``ROOT``
                  and tagged ``VERB``
  * object     -- the token's dependency is ``nsubj``, or it is the ``ROOT``
                  and tagged ``NOUN``

  If several tokens satisfy the same rule, the last one in iteration order
  wins. A rule that never matches leaves its slot as an empty string, so an
  empty ``doc`` yields ``('', '', '')``.

  Args:
    doc: iterable of tokens exposing ``text``, ``dep_`` and ``pos_``
      (for example a spaCy ``Doc``).

  Returns:
    Tuple ``(background, action, object)`` of token texts.
  """
  background = ''
  action = ''
  obj = ''
  # iterate through all the tokens in the input sentence
  for tok in doc:
    # extract action -- parentheses make explicit the grouping that
    # `and` binding tighter than `or` already produced
    if tok.dep_ == "acl" or (tok.dep_ == 'ROOT' and tok.pos_ == 'VERB'):
      action = tok.text

    # extract background
    if tok.dep_ == "pobj":
      background = tok.text

    # extract object -- a subject of any POS, or a noun acting as the root
    if tok.dep_ == 'nsubj' or (tok.dep_ == 'ROOT' and tok.pos_ == 'NOUN'):
      obj = tok.text
  return background, action, obj

In [97]:
# Run the extractor on the most recently parsed sentence; the result should
# be a (background, action, object) tuple.
subtree_matcher(doc)

('carpark', 'studies', 'boy')