In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import nltk
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

In [2]:
# Get treelib
!pip install -U treelib

# Setup Stanford Parser
!wget https://nlp.stanford.edu/software/stanford-parser-4.0.0.zip
!unzip stanford-parser-4.0.0.zip
!mkdir stanford-nlp-jars
!cp stanford-parser-4.0.0/stanford-parser.jar stanford-nlp-jars/
!cp stanford-parser-4.0.0/stanford-parser-4.0.0-models.jar stanford-nlp-jars/
!unzip stanford-nlp-jars/stanford-parser-4.0.0-models.jar -d stanford-nlp-jars/stanford-parser-4.0.0-models/
!pip
from nltk.parse import stanford
os.environ['STANFORD_PARSER'] = 'stanford-nlp-jars'
os.environ['STANFORD_MODELS'] = 'stanford-nlp-jars'
constituency_parser = stanford.StanfordParser(model_path="stanford-nlp-jars/stanford-parser-4.0.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
dependency_parser = stanford.StanfordDependencyParser()


# # Setup Stanford POS Tagger: Not needed
# !wget https://nlp.stanford.edu/software/stanford-tagger-4.1.0.zip
# !unzip stanford-tagger-4.1.0.zip
# !cp stanford-postagger-full-2020-08-06/stanford-postagger-4.1.0.jar stanford-nlp-jars/
# !cp stanford-postagger-full-2020-08-06/models/english-bidirectional-distsim.tagger stanford-nlp-jars/

# from nltk.tag.stanford import StanfordPOSTagger
# pos_model = 'stanford-nlp-jars/english-bidirectional-distsim.tagger'
# pos_jar   = 'stanford-nlp-jars/stanford-postagger-4.1.0.jar'
# pos_tagger = StanfordPOSTagger(pos_model, pos_jar)

Collecting treelib
  Downloading treelib-1.6.1.tar.gz (24 kB)
Building wheels for collected packages: treelib
  Building wheel for treelib (setup.py) ... [?25ldone
[?25h  Created wheel for treelib: filename=treelib-1.6.1-py3-none-any.whl size=18371 sha256=87ec7abb1908ea63d719cdd2edeb29d6c586cad84d298b93063dad1ec090ee5e
  Stored in directory: /root/.cache/pip/wheels/89/be/94/2c6d949ce599d1443426d83ba4dc93cd35c0f4638260930a53
Successfully built treelib
Installing collected packages: treelib
Successfully installed treelib-1.6.1
--2020-11-01 13:21:55--  https://nlp.stanford.edu/software/stanford-parser-4.0.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182493647 (174M) [application/zip]
Saving to: ‘stanford-parser-4.0.0.zip’


2020-11-01 13:22:02 (25.7 MB/s) - ‘stanford-parser-4.0.0.zip’ saved [182493647/182493647]

Archive:  stan

In [3]:
#### Test Parser
# Smoke-test both Stanford parsers on a single sentence.
sent1 = "The boy who jumped into the river saved another boy"
print(sent1)

print("Constituency parsing")
for tree in constituency_parser.raw_parse(sent1):
    tree.pretty_print()

print("Dependency parsing")
result = dependency_parser.raw_parse(sent1)
dep = next(result)  # first dependency graph yielded for the sentence

# Each triple is indexed below as (head, relation, dependent), where head and
# dependent are themselves sequences whose first element is the word.
triples = list(dep.triples())
print(triples)

# Reduce every triple to a (head word, dependent word) pair.
dep_parse = [(head[0], dependent[0]) for head, _, dependent in triples]
print(dep_parse)

# #### Test POS Tagger
# text = "He is sincere and honest"
# words = nltk.word_tokenize(text)
# tagged_words = pos_tagger.tag(words)
# print(tagged_words)

The boy who jumped into the river saved another boy
Constituency parsing
                        ROOT                                        
                         |                                           
                         S                                          
                    _____|______________________________             
                   NP                                   |           
      _____________|_____                               |            
     |                  SBAR                            |           
     |        ___________|____                          |            
     |       |                S                         |           
     |       |                |                         |            
     |       |                VP                        |           
     |       |      __________|___                      |            
     |       |     |              PP                    VP          
     |       |     |    

In [4]:
def generateConstituencyTree(lst):
    """Render a nested list as a bracketed tree string.

    Every list becomes a parenthesised group; nested lists are rendered
    recursively and string items are emitted as-is. Each item is padded with
    one space on both sides, so ``['NN', 'boy']`` becomes ``"( NN  boy )"``.

    Args:
        lst: A list whose items are either strings or further nested lists.

    Returns:
        The bracketed string representation of ``lst``.
    """
    pieces = []
    for item in lst:
        rendered = generateConstituencyTree(item) if isinstance(item, list) else item
        pieces.append(" " + rendered + " ")
    return "(" + "".join(pieces) + ")"

In [5]:
from collections import defaultdict
from itertools import chain

def dependency_to_phrase(tree):
    """Convert a dependency graph into a nested-list phrase tree under 'ROOT'.

    Every dependent of node 0 (presumably nltk's artificial root node) is
    converted with ``convert_to_phrase_without_root`` and the results are
    collected, in the order the dependents are stored, beneath a 'ROOT' label.

    Args:
        tree: Object exposing ``nodes[i]["deps"]`` as a mapping from relation
            name to a list of dependent node indices.

    Returns:
        A list of the form ``['ROOT', subtree, ...]``.
    """
    # Snapshot the dependent groups of the root before converting them.
    root_groups = tuple(tree.nodes[0]["deps"].values())
    subtrees = [
        convert_to_phrase_without_root(tree, child)
        for group in root_groups
        for child in group
    ]
    return ['ROOT'] + subtrees


def convert_to_phrase_without_root(tree, index):
    """Recursively convert the dependency subtree headed by ``index`` into a
    nested-list phrase structure.

    Rules applied at each head:
      * The head and its dependents are laid out in surface (index) order and
        wrapped in 'NP' (noun head), 'VP' (verb head) or 'X' (anything else).
      * A noun dependent to the left of a verb head is lifted out of the VP
        and the result is wrapped in 'S' (subject + verb phrase).
      * A verb dependent to the right of a noun head is lifted out of the NP
        and the result is wrapped in an outer 'NP'.
      * A head without dependents yields ``['NP', [tag, word]]`` for a noun
        and the bare ``[tag, word]`` pair otherwise.

    Dependents are sorted by index before layout. ``deps`` groups indices by
    relation, so reading it group by group (e.g. ``{'punct': [4, 6],
    'conj': [5, 8]}``) would otherwise scramble the word order.

    Args:
        tree: Object exposing ``nodes[i]`` dicts with "tag", "word" and "deps"
            (a mapping from relation name to a list of dependent indices).
        index: Index of the node heading the subtree to convert.

    Returns:
        A nested list whose first element is a label ('S', 'NP', 'VP', 'X' or
        a POS tag) followed by child lists or, for a tagged word, the word.
    """
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS', 'PRP')
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

    head = tree.nodes[index]
    head_tag = head["tag"]
    head_pair = [head_tag, head["word"]]
    head_is_noun = head_tag in noun_tags
    head_is_verb = head_tag in verb_tags

    # Flatten the per-relation groups and order the dependents by position.
    dependents = sorted(idx for group in head["deps"].values() for idx in group)
    # Left dependents are visited nearest-first because each one is prepended.
    left_children = [idx for idx in reversed(dependents) if idx < index]
    right_children = [idx for idx in dependents if idx > index]

    if not left_children and not right_children:
        # A bare noun still projects a labeled NP, consistent with noun heads
        # that do have dependents; other words stay as plain [tag, word].
        return ['NP', head_pair] if head_is_noun else head_pair

    phrase_tree = [head_pair]
    left_ext_projections = []
    right_ext_projections = []

    for lc in left_children:
        phrase_subtree = convert_to_phrase_without_root(tree, lc)
        if head_is_verb and tree.nodes[lc]["tag"] in noun_tags:
            # Noun to the left of a verb: lift it out of the VP.
            left_ext_projections.insert(0, phrase_subtree)
        else:
            phrase_tree.insert(0, phrase_subtree)

    for rc in right_children:
        phrase_subtree = convert_to_phrase_without_root(tree, rc)
        if head_is_noun and tree.nodes[rc]["tag"] in verb_tags:
            # Verb to the right of a noun: lift it out of the NP.
            right_ext_projections.append(phrase_subtree)
        else:
            phrase_tree.append(phrase_subtree)

    if head_is_noun:
        label = 'NP'
    elif head_is_verb:
        label = 'VP'
    else:
        label = 'X'
    phrase_tree = [label] + phrase_tree

    if left_ext_projections or right_ext_projections:
        # A lifted left noun makes a clause; otherwise only right verbs were
        # lifted and the whole thing remains a noun phrase.
        outer_label = 'S' if left_ext_projections else 'NP'
        phrase_tree = (
            [outer_label] + left_ext_projections + [phrase_tree] + right_ext_projections
        )
    return phrase_tree


# Demo: derive a phrase-structure tree from the dependency parse of one
# sentence and draw it with nltk.
sent1 = "The boy who jumped into the river saved another boy"
parsed_sent1 = dependency_parser.raw_parse(sent1)
for dep_graph in parsed_sent1:
    bracketed = generateConstituencyTree(dependency_to_phrase(dep_graph))
    nltk.tree.Tree.fromstring(bracketed).pretty_print()

                       ROOT                                
                        |                                   
                        S                                  
                   _____|______________________             
                  NP                           |           
      ____________|_____                       |            
     |                  VP                     VP          
     |        __________|____            ______|_____       
     NP      |    |          NP         |            NP    
  ___|___    |    |      ____|____      |       _____|___   
 DT      NN  WP  VBD    IN   DT   NN   VBD     DT        NN
 |       |   |    |     |    |    |     |      |         |  
The     boy who jumped into the river saved another     boy

