# Sentence formation error detection
Part c.iii

In [1]:
import codecs
import json
import os
import pickle
import re
import sys

from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk

# This uses corenlp server! Will need to alter code if using JAR files directly
# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
from nltk.parse.corenlp import CoreNLPParser

In [12]:
def constituency_parse(parser, text, return_parse_obj=False, properties=None):
    """Creates parse strings for text.
    Each parse string can be fed into Tree.fromstring() to create NLTK Tree objects.

    parser (CoreNLPParser): parser to parse sentences; only its
        api_call(text, properties=...) method is used here
    text (str): essay text
    return_parse_obj (bool): if True, return the response of parser.api_call
        unchanged instead of a list of parse strings
    properties (dict): override or add CoreNLP properties
    RETURNS (list/dict): a list with one whitespace-compressed parse string per
        entry of the response's 'sentences' list, or the unmodified response
        when return_parse_obj is True
    """
    request_properties = {'annotators': 'tokenize,pos,ssplit,parse',
                          'parse.buildgraphs': 'false'}
                          #'parse.model': 'edu/stanford/nlp/models/srparser/englishSR.ser.gz'}
    # Caller-supplied properties take precedence over the defaults above.
    request_properties.update(properties or {})

    parsed_data = parser.api_call(text, properties=request_properties)
    if return_parse_obj:
        return parsed_data

    # Compress every run of whitespace (including newlines) into one space.
    # The pattern is a raw string so that "\s" is not treated as a string escape.
    return [re.sub(r'\s+', ' ', sentence['parse'])
            for sentence in parsed_data['sentences']]
    
def tree_to_str(trees):
    """Concatenates tree strings into one string, separated by single spaces"""
    separator = ' '
    return separator.join(trees)

def str_to_trees(tree_str):
    """Splits a string into a list of trees in string form

    The string is cut at every occurrence of "(ROOT"; each piece gets the
    marker put back in front and surrounding whitespace stripped.

    tree_str (str): concatenated parse strings, e.g. the output of tree_to_str()
    RETURNS (list): one string per tree, in their original order
    """
    root_marker = "(ROOT"
    # Skip pieces that are empty or whitespace-only (e.g. leading whitespace
    # before the first tree); otherwise they would come back as a bare "(ROOT".
    return [(root_marker + piece).strip()
            for piece in tree_str.split(root_marker)
            if piece.strip()]

def get_productions(tree):
    """Get productions from an NLTK Tree object.

    Lexical productions and any production whose text contains 'ROOT'
    are left out.

    tree (Tree): object exposing productions()
    RETURNS (list): production rule strings, in the order productions() yields them
    """
    rules = []

    for rule in tree.productions():
        if rule.is_lexical():
            continue
        # repr() replaces the unicode_repr() alias, a Python 2 compatibility
        # helper that newer NLTK releases no longer provide.
        rule_text = repr(rule)
        if 'ROOT' not in rule_text:
            rules.append(rule_text)

    return rules

In [13]:
# Client for a CoreNLP server expected at localhost:9000 (see the server note next to the import)
parser = CoreNLPParser(url='http://localhost:9000')

# Sentence formation functions

In [34]:
%run tree_utils.py
# Runs tree_utils.py in this namespace; create_tree and print_leaves (used below) are presumably defined there

In [38]:
def sentence_formation_errors(parsed_sentence):
    """Finds sentence formation errors in a parsed sentence

    parsed_sentence (str): CoreNLP constituency parser output for one sentence
    RETURNS (dict): Maps each detected error type ('fragment', 'sbar_without_s',
        'sbar_with_vbg') to True; error types that were not detected are absent,
        so an empty dict means no error was found
    """
    errors = {}
    # create_tree is not defined in this file; presumably provided by tree_utils.py
    root_node = create_tree(parsed_sentence)

    # fragment (membership semantics depend on the node type returned by create_tree)
    if 'FRAG' in root_node:
        errors['fragment'] = True

    for node in root_node.get_descendants('SBAR'):
        # SBAR without S: get_ancestors('S') yields nothing for this SBAR
        if not list(node.get_ancestors('S')):
            errors['sbar_without_s'] = True

        # because with VBG
        first_child = node.children[0]
        if (first_child.label == 'IN' and first_child.word.lower() == 'because' and
                'S' in [s.label for s in first_child.get_right_siblings()]):
            # Take the first VP that get_descendants yields. The None default
            # avoids a StopIteration when this SBAR has no VP descendant.
            vp_node = next(node.get_descendants('VP'), None)
            if vp_node is not None and 'VBG' in [c.label for c in vp_node.children]:
                errors['sbar_with_vbg'] = True

    # TODO: the "SBAR without CC" check is not implemented yet

    return errors

In [None]:
def essay_sentence_formation_errors(parser, essay_text):
    """Parses raw essay text and counts the sentences that contain formation errors.

    The text is parsed with constituency_parse() and every resulting parse
    string is checked with sentence_formation_errors().

    parser (CoreNLPParser): parser to parse sentences
    essay_text (str): essay text
    RETURNS (int, int): number of sentences with at least one formation error,
        total number of sentences
    """
    sentence_parses = constituency_parse(parser, essay_text)

    # A sentence counts once, no matter how many error types it triggers.
    flagged_sentences = sum(
        1 for sentence_parse in sentence_parses
        if sentence_formation_errors(sentence_parse)
    )

    return flagged_sentences, len(sentence_parses)

# Read data / Example use

In [8]:
# Get essays
# The index is a ';'-separated CSV; its 'filename' column names one essay file per row.
essay_key = pd.read_csv('../data/essays_dataset/index.csv', sep=';')

# Read every essay file in index order, stripping leading/trailing whitespace.
essays = []
for filename in essay_key['filename']:
    with open('../data/essays_dataset/essays/'+filename, 'r') as f:
        essays.append(f.read().strip())
        
# Attach the texts as a new 'essay' column (same order as the index rows).
essay_key['essay'] = essays
essay_key.head()

In [14]:
# Load the index variant whose 'parsed_essay' column is read in the cells below
essay_parses = pd.read_csv('../data/essays_dataset/index_with_parse.csv')

In [15]:
# Take the stored parse of the row labelled 0 and split it into per-sentence tree strings
parsed_essay = essay_parses.loc[0, 'parsed_essay']
parsed_sentences = str_to_trees(parsed_essay)

In [17]:
# Display the parse string at index 7 of that essay
parsed_sentences[7]

'(ROOT (FRAG (CC But) (RB not) (NP (NP (DT all) (DT the) (NNS people)) (CC and) (SBAR (S (NP (DT the) (NN time)) (VP (VBZ is) (PP (IN in) (NP (NP (NN accord)) (PP (IN with) (NP (DT this) (NN problem))))) (, ,) (SBAR (IN because) (NP (NP (DT any) (NN time)) (SBAR (S (NP (DT the) (NN person)) (VP (VBZ is) (ADJP (RB too) (PP (VBG according) (PP (IN with) (NP (DT the) (NN make) (NNS products)))))))))))))) (. .)))'

In [35]:
# Build a tree from the same parse string and print its leaves
# (create_tree / print_leaves are not defined in this file; presumably from tree_utils.py)
root_node = create_tree(parsed_sentences[7])
print_leaves(root_node)

But not all the people and the time is in accord with this problem , because any time the person is too according with the make products . 

In [None]:
# Find sentence formation errors in all parsed essays
essay_parses = pd.read_csv('../data/essays_dataset/index_with_parse.csv')

# One (sentences_with_errors, total_sentences) tuple per essay, in row order.
# The counts used to be a bare expression inside the loop and were discarded.
essay_error_counts = []

for i, row in essay_parses.iterrows():
    parsed_sentences = str_to_trees(row['parsed_essay'])

    count_sentences_with_errors = 0
    for sent in parsed_sentences:
        # sentence formation errors
        errors = sentence_formation_errors(sent)

        # A sentence counts once, no matter how many error types it has
        if len(errors) > 0:
            count_sentences_with_errors += 1

    essay_error_counts.append((count_sentences_with_errors, len(parsed_sentences)))