In [1]:
import re
import pandas as pd
import nltk

from nltk.tree import Tree
# This uses corenlp server! Will need to alter code if using JAR files directly
# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
from nltk.parse.corenlp import CoreNLPParser
from nltk.tag.stanford import CoreNLPTagger, CoreNLPPOSTagger, CoreNLPNERTagger

In [2]:
# Heads-up: CoreNLPTagger, CoreNLPPOSTagger, and CoreNLPNERTagger are all slated to be
# replaced in the next NLTK version (3.2.6), so this cell will need revisiting then.
# All three clients talk to the same locally running CoreNLP server.
CORENLP_URL = 'http://localhost:9000'

parser = CoreNLPParser(url=CORENLP_URL)
# The generic tagger is selected via `tagtype`; an earlier, commented-out variant used the
# dedicated CoreNLPPOSTagger(url=...) / CoreNLPNERTagger(url=...) classes instead.
pos_tagger = CoreNLPTagger(tagtype='pos', url=CORENLP_URL)
ner_tagger = CoreNLPTagger(tagtype='ner', url=CORENLP_URL)

In [3]:
# Read the essay index (semicolon-separated), then pull in the text of every listed essay
essay_key = pd.read_csv('../data/essays_dataset/index.csv', sep=';')

essays = []
for filename in essay_key['filename']:
    essay_path = '../data/essays_dataset/essays/' + filename
    with open(essay_path, 'r') as essay_file:
        raw_text = essay_file.read()
    # Surrounding whitespace is dropped so each essay starts and ends on real content
    essays.append(raw_text.strip())

# Keep each essay's text alongside its index row
essay_key['essay'] = essays

In [4]:
# Altered behavior of NLTK so CoreNLP performs sentence splits
def constituency_parse(parser, sentences):
    """Return one bracketed parse string per sentence found in ``sentences``.

    The text is sent to the server in a single ``parser.api_call`` request with
    the ``ssplit`` annotator enabled, so CoreNLP decides where sentences split.
    Each returned string can be fed into ``Tree.fromstring()`` to create an
    NLTK Tree object.

    Parameters
    ----------
    parser : object exposing ``api_call(text, properties=...)``
        In this notebook, a ``CoreNLPParser``.
    sentences : str
        Raw text to parse (the notebook passes a whole essay as one string).

    Returns
    -------
    list of str
        Parse strings in the order the server returned them, with every run
        of whitespace (including newlines) collapsed to a single space.
    """
    default_properties = {'outputFormat': 'json', 'annotators': 'tokenize,pos,lemma,ssplit,parse'}
    parsed_data = parser.api_call(sentences, properties=default_properties)

    # Raw-string pattern: '\s' inside a plain string literal is an invalid escape
    # sequence, which Python warns about and intends to reject in the future.
    return [
        re.sub(r'\s+', ' ', parsed_sent['parse'])
        for parsed_sent in parsed_data['sentences']
    ]
        
def pos_tags(tagger, sentences):
    """Tag text via the server; return one list of (word, tag) tuples per sentence."""
    request_properties = {'annotators': 'tokenize,ssplit,pos'}
    response = tagger.api_call(sentences, properties=request_properties)

    # Outer level: sentences as split by the server; inner level: that sentence's tokens
    return [
        [(tok['word'], tok['pos']) for tok in sentence['tokens']]
        for sentence in response['sentences']
    ]

In [5]:
# Show the raw text of the first essay (row label 0) before any NLP processing.
print(essay_key.loc[0,'essay'])

This is an important aspect of today time.
This products rathen are not much better, but today is not important the really character of the product, but only the money and the client not rappresented the important actor in this process.
Every day any people buy same products that is not rappresented the your necessity, but is only important buy any product.
To explain this argoment in my nation, at the television, there is an program that discuss of the problem rappresented by this.
More people go to this program television to talk about your problem, that is very radicate in my nation.
The modern society rappresented the perfect ambient to influenced the minds of all the person.
In my self is present the reasons of this statement, that is one of the problem of the life.
But not all the people and the time is in accord with this problem, because any time the person is too according with the make products.
Thus I agree with this statement, because this event is present in my life every 

In [6]:
# POS-tag the first essay: one list of (word, tag) tuples per sentence.
essay_tags = pos_tags(pos_tagger, essay_key.loc[0,'essay'])

In [7]:
# Constituency-parse the same essay: one bracketed parse string per sentence.
trees = constituency_parse(parser, essay_key.loc[0,'essay'])

In [8]:
def tree_to_str(trees):
    """Concatenate a list of parse strings into one string, single-space separated."""
    separator = ' '
    return separator.join(trees)

def str_to_trees(tree_str):
    """Split a string of concatenated parses back into a list of parse strings.

    Every tree is expected to start with the literal ``(ROOT`` marker, which is
    used as the split point and then re-attached to each piece.

    Parameters
    ----------
    tree_str : str
        Parse strings joined together, e.g. the output of ``tree_to_str``.

    Returns
    -------
    list of str
        One stripped parse string per ``(ROOT`` occurrence; an empty list when
        the input is empty or contains only whitespace.
    """
    d = "(ROOT"
    # Filter on the stripped fragment: text before the first marker (or a
    # whitespace-only input) would otherwise yield a bogus bare "(ROOT" entry.
    return [(d + e).strip() for e in tree_str.split(d) if e.strip()]

In [9]:
# Inspect the parse strings, then (after a blank separator line) how many there are.
print(trees)
print()
print(len(trees))

['(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (DT an) (JJ important) (NN aspect)) (PP (IN of) (NP (NN today) (NN time))))) (. .)))', '(ROOT (S (S (NP (DT This) (NNS products) (NN rathen)) (VP (VBP are) (RB not) (ADVP (RB much) (JJR better)))) (, ,) (CC but) (S (S (NP (NN today)) (VP (VBZ is) (RB not) (NP (NP (JJ important) (DT the) (RB really) (NN character)) (PP (IN of) (NP (DT the) (NN product)))))) (, ,) (CC but) (S (NP (NP (RB only) (DT the) (NN money)) (CC and) (NP (DT the) (NN client))) (ADVP (RB not)) (VP (VBD rappresented) (NP (DT the) (JJ important) (NN actor)) (PP (IN in) (NP (DT this) (NN process)))))) (. .)))', '(ROOT (S (NP (NP (DT Every) (NN day)) (SBAR (S (NP (DT any) (NNS people)) (VP (VBP buy) (NP (NP (JJ same) (NNS products)) (SBAR (WHNP (WDT that)) (S (VP (VBZ is) (RB not) (VP (VBN rappresented) (NP (DT the) (PRP$ your) (NN necessity))))))))))) (, ,) (CC but) (VP (VBZ is) (ADVP (RB only) (JJ important)) (VP (VB buy) (NP (DT any) (NN product)))) (. .)))', '(ROOT (S (

In [10]:
# Join the per-sentence parse strings into one space-separated string and display it.
trees_str = tree_to_str(trees)
print(trees_str)

(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (DT an) (JJ important) (NN aspect)) (PP (IN of) (NP (NN today) (NN time))))) (. .))) (ROOT (S (S (NP (DT This) (NNS products) (NN rathen)) (VP (VBP are) (RB not) (ADVP (RB much) (JJR better)))) (, ,) (CC but) (S (S (NP (NN today)) (VP (VBZ is) (RB not) (NP (NP (JJ important) (DT the) (RB really) (NN character)) (PP (IN of) (NP (DT the) (NN product)))))) (, ,) (CC but) (S (NP (NP (RB only) (DT the) (NN money)) (CC and) (NP (DT the) (NN client))) (ADVP (RB not)) (VP (VBD rappresented) (NP (DT the) (JJ important) (NN actor)) (PP (IN in) (NP (DT this) (NN process)))))) (. .))) (ROOT (S (NP (NP (DT Every) (NN day)) (SBAR (S (NP (DT any) (NNS people)) (VP (VBP buy) (NP (NP (JJ same) (NNS products)) (SBAR (WHNP (WDT that)) (S (VP (VBZ is) (RB not) (VP (VBN rappresented) (NP (DT the) (PRP$ your) (NN necessity))))))))))) (, ,) (CC but) (VP (VBZ is) (ADVP (RB only) (JJ important)) (VP (VB buy) (NP (DT any) (NN product)))) (. .))) (ROOT (S (S (VP (TO T

In [11]:
# Round trip: split the joined string back into per-sentence parse strings.
# Note that this rebinds `trees`, replacing the list returned by constituency_parse.
trees = str_to_trees(trees_str)
print(trees)
print()
print(len(trees))

['(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (DT an) (JJ important) (NN aspect)) (PP (IN of) (NP (NN today) (NN time))))) (. .)))', '(ROOT (S (S (NP (DT This) (NNS products) (NN rathen)) (VP (VBP are) (RB not) (ADVP (RB much) (JJR better)))) (, ,) (CC but) (S (S (NP (NN today)) (VP (VBZ is) (RB not) (NP (NP (JJ important) (DT the) (RB really) (NN character)) (PP (IN of) (NP (DT the) (NN product)))))) (, ,) (CC but) (S (NP (NP (RB only) (DT the) (NN money)) (CC and) (NP (DT the) (NN client))) (ADVP (RB not)) (VP (VBD rappresented) (NP (DT the) (JJ important) (NN actor)) (PP (IN in) (NP (DT this) (NN process)))))) (. .)))', '(ROOT (S (NP (NP (DT Every) (NN day)) (SBAR (S (NP (DT any) (NNS people)) (VP (VBP buy) (NP (NP (JJ same) (NNS products)) (SBAR (WHNP (WDT that)) (S (VP (VBZ is) (RB not) (VP (VBN rappresented) (NP (DT the) (PRP$ your) (NN necessity))))))))))) (, ,) (CC but) (VP (VBZ is) (ADVP (RB only) (JJ important)) (VP (VB buy) (NP (DT any) (NN product)))) (. .)))', '(ROOT (S (

In [12]:
# Build an NLTK Tree from the first parse string and print it.
tree = Tree.fromstring(trees[0])
print(tree)

(ROOT
  (S
    (NP (DT This))
    (VP
      (VBZ is)
      (NP
        (NP (DT an) (JJ important) (NN aspect))
        (PP (IN of) (NP (NN today) (NN time)))))
    (. .)))


In [15]:
def get_productions(tree):
    """Get the non-lexical production rules of an NLTK Tree object.

    Parameters
    ----------
    tree : object exposing ``productions()``
        In this notebook, an ``nltk.tree.Tree``.

    Returns
    -------
    list of str
        Rule strings such as ``'S -> NP VP .'`` in the order ``productions()``
        yields them. Lexical rules are skipped, as is any rule whose text
        contains ``'ROOT'``.
    """
    rules = list()

    for rule in tree.productions():
        if rule.is_lexical():
            continue
        # str() replaces rule.unicode_repr(), a Python-2-era compatibility alias
        # that recent NLTK releases no longer provide.
        rule_text = str(rule)
        if 'ROOT' not in rule_text:
            rules.append(rule_text)

    return rules

In [16]:
# Production rules of the first sentence's tree, shown as the cell's output.
get_productions(tree)

['S -> NP VP .',
 'NP -> DT',
 'VP -> VBZ NP',
 'NP -> NP PP',
 'NP -> DT JJ NN',
 'PP -> IN NP',
 'NP -> NN NN']