# Annotation Template

In [19]:
import json
import re
import pandas as pd
import nltk
import numpy as np

from nltk.tree import Tree
# This uses corenlp server! Will need to alter code if using JAR files directly
# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
from nltk.parse.corenlp import CoreNLPParser
from nltk.tag.stanford import CoreNLPTagger, CoreNLPPOSTagger, CoreNLPNERTagger

# For spelling checker
from enchant import Dict as dictionary

In [10]:
# Careful! CoreNLPTagger, CoreNLPPOSTagger, and CoreNLPNERTagger will all be replaced in the next NLTK version (3.2.6)
# Address of the running CoreNLP server; the only place to edit if it lives elsewhere.
CORENLP_URL = 'http://localhost:9000'

parser = CoreNLPParser(url=CORENLP_URL)
# Alternative constructors: CoreNLPPOSTagger(url=CORENLP_URL) / CoreNLPNERTagger(url=CORENLP_URL)
pos_tagger = CoreNLPTagger(tagtype='pos', url=CORENLP_URL)
ner_tagger = CoreNLPTagger(tagtype='ner', url=CORENLP_URL)

In [11]:
# Get essays
def _read_essay(filename):
    """Return the text of one essay file with surrounding whitespace removed."""
    with open('../data/essays_dataset/essays/'+filename, 'r') as essay_file:
        return essay_file.read().strip()

# The index file is ';'-separated and lists one essay file name per row
essay_key = pd.read_csv('../data/essays_dataset/index.csv', sep=';')
essays = [_read_essay(filename) for filename in essay_key['filename']]
essay_key['essay'] = essays

In [12]:
# Load the essay split table and preview its first rows
essay_split = pd.read_csv('../data/essays_dataset/essay_split.csv')
essay_split.head()

Unnamed: 0,filename,grade,word_len,grader
0,990384.txt,high,568,Aldo
1,395987.txt,high,508,John
2,1949465.txt,high,458,Aldo
3,38209.txt,high,456,John
4,1834502.txt,high,454,Aldo


In [13]:
# Altered behavior of NLTK so CoreNLP performs sentence splits
def constituency_parse(parser, sentences, return_parse_obj=False):
    """Creates parse strings for each sentence.
    Each parse string can be fed into Tree.fromstring() to create NLTK Tree objects.

    The whole text is sent in a single API call with the 'ssplit' annotator,
    so sentence splitting is done by CoreNLP.

    parser (CoreNLPParser): parser to parse sentences
    sentences (str): essay text
    return_parse_obj (bool): return parse object or string of trees
    RETURNS: the object returned by parser.api_call() when return_parse_obj
        is True; otherwise a list with one whitespace-compressed parse
        string per sentence
    """
    default_properties = {'outputFormat': 'json',
                          'annotators': 'tokenize,pos,lemma,ssplit,parse'}
    parsed_data = parser.api_call(sentences, properties=default_properties)
    if return_parse_obj:
        return parsed_data

    # Compress whitespace. Raw string: '\s' in a plain string literal is an
    # invalid escape sequence and triggers a warning on current Python versions.
    return [re.sub(r'\s+', ' ', parsed_sent['parse'])
            for parsed_sent in parsed_data['sentences']]

def pos_tags(tagger, sentences):
    """POS-tags the text and keeps the character offsets of every token.

    tagger (CoreNLPTagger): a tagger to tag sentences
    sentences (str): text to tag
    RETURNS (list): one list per sentence, each holding
        (word, tag, start index, end index) tuples
    """
    request_properties = {'annotators': 'tokenize,ssplit,pos'}
    response = tagger.api_call(sentences, properties=request_properties)

    def as_tuple(token):
        return (token['word'],
                token['pos'],
                token['characterOffsetBegin'],
                token['characterOffsetEnd'])

    return [[as_tuple(token) for token in sent['tokens']]
            for sent in response['sentences']]

In [14]:
def tree_to_str(trees):
    """Concatenates tree strings into one string, separated by single spaces"""
    separator = ' '
    return separator.join(trees)

def str_to_trees(tree_str):
    """Splits a string into a list of trees in string form

    The string is cut at every "(ROOT" marker and the marker is put back in
    front of each piece. Empty and whitespace-only pieces (e.g. what precedes
    the first tree) are dropped so they do not turn into a bare "(ROOT" entry.

    tree_str (str): tree strings joined into one string
    RETURNS (list): one stripped tree string per "(ROOT" marker
    """
    d = "(ROOT"
    return [(d + sent).strip() for sent in tree_str.split(d) if sent.strip()]

In [58]:
def num_sentence_annotation(parsed_data, orig_text, verbose=True):
    """Collects the character span of every sentence in a parse object.

    A sentence spans from the start offset of its first token to the end
    offset of its last token.

    parsed_data (dict): parse object with a 'sentences' list; every sentence
        has a 'tokens' list carrying 'characterOffsetBegin'/'characterOffsetEnd'
    orig_text (str): text the offsets refer to (only used for printing)
    verbose (bool): print every span with its text, and the sentence count
    RETURNS (dict): {'indices': [(start, end), ...], 'num': number of sentences}
    """
    if verbose:
        print('Format: (Start index, end index) Sentence')
        print()

    spans = []
    for sentence in parsed_data['sentences']:
        tokens = sentence['tokens']
        span = (tokens[0]['characterOffsetBegin'], tokens[-1]['characterOffsetEnd'])
        spans.append(span)
        if verbose:
            print(span, orig_text[span[0]:span[1]])
            print()

    total = len(parsed_data['sentences'])
    if verbose:
        print('Num sentence:', total)
        print()
    return {'indices': spans, 'num': total}

In [64]:
def sentence_sanity_check(sentence_dict):
    error = False
    
    if sentence_dict['num'] != len(sentence_dict['indices']):
        print('Number of sentences does not match')
        error = True
        
    prev = 0
    for i,j in enumerate(sentence_dict['indices']):
        if j[0] >= j[1]:
            print('Sentence', i, ': Start/end indices overlap')
            error = True
        if prev >= j[0] and prev != 0:
            print('Sentence', i, ': Previous end index overlaps start index')
            error = True
        if j[0] - prev > 1:
            print('Sentence', i, ': Is gap between sentences > 1 character/space?')
            error = True
        prev = j[1]
    
    if not error:
        print('No errors')

In [22]:
def find_word(orig_text, word):
    """Finds the occurrences of a word in the text and returns their character spans.

    The search is case-insensitive. `word` is passed to re.finditer() as the
    pattern, so regex metacharacters in it are interpreted as such.

    orig_text (str): text to search
    word (str): word (pattern) to look for
    RETURNS: the (start, end) span when there is exactly one match; an empty
        list when there is no match; otherwise every match is printed with up
        to 10 characters of context on each side and the user is asked for a
        match index (returns that span) or -1 (returns a list of all spans)
    """
    context_width = 10
    matches = list()
    for m in re.finditer(word, orig_text, flags=re.I):
        start, end = m.span(0)
        # Pad both sides to a fixed width so the matched word lines up in
        # every printed row, also at the very start/end of the text.
        before_context = orig_text[max(0, start - context_width):start].rjust(context_width)
        after_context = orig_text[end:end + context_width].ljust(context_width)
        matches.append(((start, end),
                        '...' + before_context + orig_text[start:end] + after_context + '...'))

    if not matches:
        # Nothing to choose from, so do not prompt
        return []
    if len(matches) == 1:
        return matches[0][0]

    for i, m in enumerate(matches):
        print('Index:', i, '-', m)

    choice = int(input('Choose a match index (number) or -1 for all: '))
    if choice == -1:
        return [m[0] for m in matches]
    return matches[choice][0]

In [17]:
def check_spelling(dictionary, parsed_sentences, origin_text, verbose=True):
    """Checks the spelling of every tagged word of an essay.

    The tokens ',' and "'s" are never checked. A word counts as misspelled
    when dictionary.check(word) returns False.

    dictionary: spell checker exposing check(word)
    parsed_sentences (list): one list per sentence of
        (word, tag, start index, end index) tuples, as returned by pos_tags()
    origin_text (str): text the offsets refer to (only used for printing)
    verbose (bool): print every misspelled span with its text, and the total
    RETURNS (dict): {'indices': [(word, tag, start index, end index), ...],
        'num': number of misspelled words}
    """
    skipped_tokens = (',', "'s")
    wrong_words = dict()
    wrong_words['indices'] = list()

    if verbose:
        print('Format: (Start index, end index) word')
        print()

    for sentence in parsed_sentences:
        for w_tuple in sentence:
            word = w_tuple[0]
            if word in skipped_tokens:
                continue
            # Use the checker and text passed in, not notebook-level globals
            if dictionary.check(word) is False:
                tag = w_tuple[1]
                start_offset = w_tuple[2]
                end_offset = w_tuple[3]
                wrong_words['indices'].append((word, tag, start_offset, end_offset))
                if verbose:
                    print((start_offset, end_offset), origin_text[start_offset:end_offset])
                    print()

    wrong_words['num'] = len(wrong_words['indices'])
    if verbose:
        print('Num wrong words:', wrong_words['num'])
        print()
    return wrong_words

# Choose file

In [22]:
# Row 0 of essay_key is the essay being annotated
selected_essay = essay_key.loc[0]
orig_filename = selected_essay['filename']
orig_text = selected_essay['essay']

# Essay length in sentences
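`num_sentence_annotation()` takes the CoreNLP parse of the essay and returns a dictionary with the number of sentences (`num`) and the character span of each sentence (`indices`).  
If you disagree with the sentence split, edit the dictionary object so that it is correct.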

In [34]:
# Show the essay text for reference while checking the sentence split
print(orig_text)

This is an important aspect of today time.
This products rathen are not much better, but today is not important the really character of the product, but only the money and the client not rappresented the important actor in this process.
Every day any people buy same products that is not rappresented the your necessity, but is only important buy any product.
To explain this argoment in my nation, at the television, there is an program that discuss of the problem rappresented by this.
More people go to this program television to talk about your problem, that is very radicate in my nation.
The modern society rappresented the perfect ambient to influenced the minds of all the person.
In my self is present the reasons of this statement, that is one of the problem of the life.
But not all the people and the time is in accord with this problem, because any time the person is too according with the make products.
Thus I agree with this statement, because this event is present in my life every 

In [59]:
# Request the raw parse object first, then derive the sentence spans from it
essay_parse = constituency_parse(parser, orig_text, return_parse_obj=True)
output = num_sentence_annotation(essay_parse, orig_text)

Format: (Start index, end index) Sentence

(0, 42) This is an important aspect of today time.

(43, 236) This products rathen are not much better, but today is not important the really character of the product, but only the money and the client not rappresented the important actor in this process.

(237, 359) Every day any people buy same products that is not rappresented the your necessity, but is only important buy any product.

(360, 487) To explain this argoment in my nation, at the television, there is an program that discuss of the problem rappresented by this.

(488, 593) More people go to this program television to talk about your problem, that is very radicate in my nation.

(594, 688) The modern society rappresented the perfect ambient to influenced the minds of all the person.

(689, 781) In my self is present the reasons of this statement, that is one of the problem of the life.

(782, 918) But not all the people and the time is in accord with this problem, because any time

'{"num": 14, "indices": [[0, 42], [43, 236], [237, 359], [360, 487], [488, 593], [594, 688], [689, 781], [782, 918], [919, 1053], [1054, 1147], [1148, 1302], [1303, 1450], [1451, 1634], [1635, 1702]]}'

In [63]:
# Check the sentence dictionary: count matches the spans, no overlaps, no gap larger than one character
sentence_sanity_check(output)

No errors


In [37]:
# Keep the sentence dictionary under its own name; `output` is reused by later sections
errors_num_sentences = output

# Spelling mistakes
Create ONE list of all misspelled word indices

In [34]:
# Show the essay text for reference while looking for misspelled words
print(orig_text)

This is an important aspect of today time.
This products rathen are not much better, but today is not important the really character of the product, but only the money and the client not rappresented the important actor in this process.
Every day any people buy same products that is not rappresented the your necessity, but is only important buy any product.
To explain this argoment in my nation, at the television, there is an program that discuss of the problem rappresented by this.
More people go to this program television to talk about your problem, that is very radicate in my nation.
The modern society rappresented the perfect ambient to influenced the minds of all the person.
In my self is present the reasons of this statement, that is one of the problem of the life.
But not all the people and the time is in accord with this problem, because any time the person is too according with the make products.
Thus I agree with this statement, because this event is present in my life every 

In [23]:
# en_US spell checker; POS-tag the essay first because check_spelling works on tagged tokens
dicc = dictionary("en_US")
tagged_sentences = pos_tags(pos_tagger, orig_text)
output = check_spelling(dicc, tagged_sentences, orig_text)

Format: (Start index, end index) word

(57, 63) rathen

(187, 199) rappresented

(288, 300) rappresented

(376, 384) argoment

(466, 478) rappresented

(571, 579) radicate

(613, 625) rappresented

(1009, 1021) rappresented

(1095, 1103) argoment

(1112, 1121) inportant

(1125, 1133) illustre

(1215, 1223) argoment

(1316, 1328) rappresented

(1505, 1515) rappresent

(1525, 1533) argoment

(1625, 1633) argoment

Num wrong words: 16



In [35]:
# Lists every occurrence with context; answer the prompt with a match index, or -1 to get all spans
find_word(orig_text, 'rappresented')

Index: 0 - ((187, 199), '...lient not rappresented the impor...')
Index: 1 - ((288, 300), '...at is not rappresented the your ...')
Index: 2 - ((466, 478), '...e problem rappresented by this.\n...')
Index: 3 - ((613, 625), '...n society rappresented the perfe...')
Index: 4 - ((1009, 1021), '... day, and rappresented the probl...')
Index: 5 - ((1316, 1328), '...s opinion rappresented my self i...')


[(187, 199), (288, 300), (466, 478), (613, 625), (1009, 1021), (1316, 1328)]

In [None]:
# Build final errors
# Every entry is a (start index, end index) span of one misspelled word.
errors_spelling = [
    (187, 199), (288, 300), (466, 478), (613, 625), (1009, 1021), (1316, 1328),
    # TODO: add the spans of the remaining misspelled words here
]

# Subject Verb Disagreement
Create ONE list of all disagreeing word indices.  
For each subject/verb pair that disagrees, record only one of the two words as the one at fault.  
There is no fixed rule for deciding which word is at fault; for the purpose of annotation, decide based on the sentence / essay context.

In [18]:
# Reuse the essay selected under "Choose file" instead of re-reading row 0 here,
# so the text being annotated always belongs to orig_filename.
print(orig_text)

This is an important aspect of today time.
This products rathen are not much better, but today is not important the really character of the product, but only the money and the client not rappresented the important actor in this process.
Every day any people buy same products that is not rappresented the your necessity, but is only important buy any product.
To explain this argoment in my nation, at the television, there is an program that discuss of the problem rappresented by this.
More people go to this program television to talk about your problem, that is very radicate in my nation.
The modern society rappresented the perfect ambient to influenced the minds of all the person.
In my self is present the reasons of this statement, that is one of the problem of the life.
But not all the people and the time is in accord with this problem, because any time the person is too according with the make products.
Thus I agree with this statement, because this event is present in my life every 

In [25]:
# Lists every occurrence with context; answer the prompt with the index of the disagreeing word
find_word(orig_text, 'rappresented')

Index: 0 - ((187, 199), '...lient not rappresented the impor...')
Index: 1 - ((288, 300), '...at is not rappresented the your ...')
Index: 2 - ((466, 478), '...e problem rappresented by this.\n...')
Index: 3 - ((613, 625), '...n society rappresented the perfe...')
Index: 4 - ((1009, 1021), '... day, and rappresented the probl...')
Index: 5 - ((1316, 1328), '...s opinion rappresented my self i...')


(187, 199)

In [None]:
# Build final errors
# Every entry is the (start index, end index) span of the word chosen as disagreeing.
errors_subj_verb = [
    (187, 199),
    # TODO: add the spans of the remaining disagreeing words here
]

# Verb tense / missing verb
Create ONE list of all word indices with the type of error

In [24]:
# Lists every occurrence with context; answer the prompt with the index of the verb to annotate
find_word(orig_text, 'rappresented')

Index: 0 - ((187, 199), '...lient not rappresented the impor...')
Index: 1 - ((288, 300), '...at is not rappresented the your ...')
Index: 2 - ((466, 478), '...e problem rappresented by this.\n...')
Index: 3 - ((613, 625), '...n society rappresented the perfe...')
Index: 4 - ((1009, 1021), '... day, and rappresented the probl...')
Index: 5 - ((1316, 1328), '...s opinion rappresented my self i...')


(187, 199)

In [None]:
# Build final errors
# Every entry records the (start index, end index) span of the word and the type of error.
errors_verb = [
    {'offset': (187, 199), 'type': 'verb tense'},
    # TODO: add the remaining verb errors here
]

# Miscellaneous
Create ONE list of all indices with the type of error

In [26]:
# Lists every occurrence with context; answer the prompt with the index of the occurrence to annotate
find_word(orig_text, 'today')

Index: 0 - ((31, 36), '...aspect of today time.\nThi...')
Index: 1 - ((89, 94), '...tter, but today is not im...')


(31, 36)

In [None]:
# Every entry records a character offset and the type of error found there.
errors_misc = [
    {'offset': 36, 'type': 'missing possessive'},
    # TODO: add the remaining miscellaneous errors here
]

# Combine results and output

In [61]:
# One row per essay: an empty 'errors' cell plus one unfilled (NaN) column per score
score_columns = ['length_score',
                 'spelling_score',
                 'subj_verb_score',
                 'verb_score',
                 'sentence_formation_score',
                 'semantic_coherent_score',
                 'semantic_topic_score']

essay_errors = pd.DataFrame({'filename': essay_key['filename'].tolist()})
essay_errors['errors'] = ''
for score_column in score_columns:
    essay_errors[score_column] = np.nan
essay_errors.head()

Unnamed: 0,filename,errors,length_score,spelling_score,subj_verb_score,verb_score,sentence_formation_score,semantic_coherent_score,semantic_topic_score
0,1004355.txt,,,,,,,,
1,1007363.txt,,,,,,,,
2,1079196.txt,,,,,,,,
3,1086343.txt,,,,,,,,
4,1096747.txt,,,,,,,,


In [None]:
# Gather every annotation built above for the selected essay
errors = {'num sentences': errors_num_sentences,
          'spelling': errors_spelling,
          'subj verb': errors_subj_verb,
          'verb': errors_verb,
          'misc': errors_misc}
# json.dumps returns the JSON string; json.dump would require a file object to write to
errors_json = json.dumps(errors)

# Scores given to the selected essay
scores = {'length_score': 0,
          'spelling_score': 4,
          'subj_verb_score': 0,
          'verb_score': 0,
          'sentence_formation_score': 0,
          'semantic_coherent_score': 0,
          'semantic_topic_score': 0}

# Select the essay's row once and fill in all of its cells
is_selected_essay = essay_errors['filename'] == orig_filename
essay_errors.loc[is_selected_essay, 'errors'] = errors_json
for score_column, score in scores.items():
    essay_errors.loc[is_selected_essay, score_column] = score

In [None]:
# Same 'essays_dataset' directory the input files are read from
essay_errors.to_csv('../data/essays_dataset/essay_errors_john.csv', index=False)