# Annotation Template

In [1]:
import json
import re
import pandas as pd
import nltk
import numpy as np

from nltk.tree import Tree
# This uses corenlp server! Will need to alter code if using JAR files directly
# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
from nltk.parse.corenlp import CoreNLPParser
from nltk.tag.stanford import CoreNLPTagger, CoreNLPPOSTagger, CoreNLPNERTagger

# For spelling checker
from enchant import Dict as dictionary
import seaborn as sns
import matplotlib.pyplot as plt

In [196]:
# Careful! CoreNLPTagger, CoreNLPPOSTagger, and CoreNLPNERTagger will all be replaced in the next NLTK version (3.2.6)
# All clients below point at a CoreNLP server expected on http://localhost:9000.
parser = CoreNLPParser(url='http://localhost:9000')
#pos_tagger = CoreNLPPOSTagger(url='http://localhost:9000')
#ner_tagger = CoreNLPNERTagger(url='http://localhost:9000')
# The generic tagger class is configured twice: once for POS tags, once for NER tags.
pos_tagger = CoreNLPTagger(tagtype='pos', url='http://localhost:9000')
ner_tagger = CoreNLPTagger(tagtype='ner', url='http://localhost:9000')

In [3]:
# Get essays
# Read the essay index (a ';'-separated CSV), then load the text of every file
# named in its 'filename' column, in index order.
essay_key = pd.read_csv('../data/essays_dataset/index.csv', sep=';')

essays = []
for filename in essay_key['filename']:
    with open('../data/essays_dataset/essays/'+filename, 'r') as f:
        # Leading/trailing whitespace is stripped from each essay's text
        essays.append(f.read().strip())
        
# Attach the texts as a new 'essay' column, one entry per filename
essay_key['essay'] = essays

In [4]:
essay_split = pd.read_csv('../data/essays_dataset/essay_split.csv')
essay_split.head()

Unnamed: 0,filename,grade,word_len,grader
0,990384.txt,high,568,Aldo
1,395987.txt,high,508,John
2,1949465.txt,high,458,Aldo
3,38209.txt,high,456,John
4,1834502.txt,high,454,Aldo


In [199]:
# Altered behavior of NLTK so CoreNLP performs sentence splits
def constituency_parse(parser, sentences, return_parse_obj=False):
    """Creates parse strings for each sentence of an essay.

    Sentence splitting is left to CoreNLP (the ``ssplit`` annotator is
    requested together with the parse), so the whole essay is sent at once.
    Each parse string can be fed into Tree.fromstring() to create NLTK Tree objects.

    parser (CoreNLPParser): parser whose ``api_call`` performs the request
    sentences (str): essay text
    return_parse_obj (bool): return the response of ``api_call`` unchanged
        instead of the list of parse strings
    RETURNS (list): one parse string per sentence, with every run of
        whitespace collapsed to a single space; or the unmodified response
        when ``return_parse_obj`` is true
    """
    default_properties = {'outputFormat': 'json',
                          'annotators': 'tokenize,pos,lemma,ssplit,parse'}
    parsed_data = parser.api_call(sentences, properties=default_properties)
    if return_parse_obj:
        return parsed_data
    # Compress whitespace. The pattern is a raw string: '\s' inside a plain
    # literal is an invalid escape sequence that newer Pythons warn about.
    return [re.sub(r'\s+', ' ', parsed_sent['parse'])
            for parsed_sent in parsed_data['sentences']]

def pos_tags(tagger, sentences, return_parse_obj=False):
    """Tags an essay with POS and NER labels, one list of tuples per sentence.

    tagger (CoreNLPTagger): a tagger whose ``api_call`` performs the request
    sentences (str): essay text
    return_parse_obj (bool): return the response of ``api_call`` unchanged
        instead of the tuples
    RETURNS (list): for every sentence of the response, a list of
        (word, pos, characterOffsetBegin, characterOffsetEnd, index,
        sentence index, ner) tuples. The sentence index counts from 0 in
        response order; all other fields are copied from the token.
    """
    #tokenize, ssplit, pos, lemma, ner, parse, dcoref
    default_properties = {'annotators': 'tokenize,ssplit,pos,ner'}
    tagged_data = tagger.api_call(sentences, properties=default_properties)
    if return_parse_obj:
        return tagged_data
    return [
        [
            (
                token['word'],
                token['pos'],
                token['characterOffsetBegin'],
                token['characterOffsetEnd'],
                token['index'],
                sent_index,
                token['ner'],
            )
            for token in sent['tokens']
        ]
        for sent_index, sent in enumerate(tagged_data['sentences'])
    ]

In [6]:
def tree_to_str(trees):
    """Concatenate tree strings into one string, separated by single spaces."""
    separator = ' '
    return separator.join(trees)

def str_to_trees(tree_str):
    """Splits a string of concatenated parses into a list of trees in string form.

    The text is cut at every "(ROOT" marker and the marker is put back in
    front of each piece; surrounding whitespace is stripped from every tree.

    tree_str (str): parses joined into one string
    RETURNS (list): one string per tree; empty when the text holds no tree
    """
    d = "(ROOT"
    # A piece that is empty or only whitespace (e.g. text before the first
    # marker) holds no tree; keeping it would yield a bare "(ROOT" entry.
    return [(d + piece).strip() for piece in tree_str.split(d) if piece.strip()]

In [7]:
def num_sentence_annotation(parsed_data, orig_text, verbose=True):
    """Collect the character span of every parsed sentence and the sentence count.

    parsed_data (dict): parse response; every item of its 'sentences' list is
        read for the offsets of its first and last token
    orig_text (str): essay text, only sliced for the verbose printout
    verbose (bool): print every sentence with its span, then the count
    RETURNS (dict): 'indices' -> list of (start, end) spans, 'num' -> count
    """
    annotation = {'indices': []}

    if verbose:
        print('Format: (Start index, end index) Sentence')
        print()

    for parsed_sent in parsed_data['sentences']:
        sent_tokens = parsed_sent['tokens']
        span = (sent_tokens[0]['characterOffsetBegin'],
                sent_tokens[-1]['characterOffsetEnd'])
        annotation['indices'].append(span)
        if verbose:
            print(span, orig_text[span[0]:span[1]])
            print()

    sentence_count = len(parsed_data['sentences'])
    annotation['num'] = sentence_count
    if verbose:
        print('Num sentence:', sentence_count)
        print()
    return annotation

In [8]:
def length_annotation(parsed_data, orig_text, verbose=True):
    """Measure an essay: word count, sentence spans and parse-tree heights.

    parsed_data (dict): parse response; every item of its 'sentences' list is
        read for its token offsets and its 'parse' string
    orig_text (str): essay text; tokenized with nltk.word_tokenize for the
        word count and sliced for the verbose printout
    verbose (bool): print every sentence with its span, then the count
    RETURNS (dict): 'indices' (list of (start, end) spans), 'constituents'
        (height of each sentence's parse tree), 'words' (number of tokens
        containing a letter or digit), 'num_sentences' and
        'num_constituents' (sum of the tree heights)
    """
    tokens = nltk.word_tokenize(orig_text)
    has_alnum = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    report = {
        'indices': [],
        'constituents': [],
        'words': sum(1 for token in tokens if has_alnum.match(token)),
    }

    if verbose:
        print('Format: (Start index, end index) Sentence')
        print()

    for parsed_sent in parsed_data['sentences']:
        sent_tokens = parsed_sent['tokens']
        span = (sent_tokens[0]['characterOffsetBegin'],
                sent_tokens[-1]['characterOffsetEnd'])
        report['indices'].append(span)

        # What gets stored per sentence is the height of its parse tree
        report['constituents'].append(Tree.fromstring(parsed_sent['parse']).height())
        if verbose:
            print(span, orig_text[span[0]:span[1]])
            print()

    report['num_sentences'] = len(parsed_data['sentences'])
    report['num_constituents'] = sum(report['constituents'])
    if verbose:
        print('Num sentence:', len(parsed_data['sentences']))
        print()
    return report

In [9]:
def sentence_sanity_check(sentence_dict):
    error = False
    
    if sentence_dict['num'] != len(sentence_dict['indices']):
        print('Number of sentences does not match')
        error = True
        
    prev = 0
    for i,j in enumerate(sentence_dict['indices']):
        if j[0] >= j[1]:
            print('Sentence', i, ': Start/end indices overlap')
            error = True
        if prev >= j[0] and prev != 0:
            print('Sentence', i, ': Previous end index overlaps start index')
            error = True
        if j[0] - prev > 1:
            print('Sentence', i, ': Is gap between sentences > 1 character/space?')
            error = True
        prev = j[1]
    
    if not error:
        print('No errors')

In [10]:
def find_word(orig_text, word):
    """Locate every case-insensitive match of ``word`` in ``orig_text``.

    orig_text (str): text to search
    word (str): pattern to look for; it is passed to ``re.finditer`` as is,
        so regular-expression metacharacters in it are interpreted
    RETURNS (tuple | list): the (start, end) span when there is exactly one
        match; otherwise the list of all spans (empty when nothing matches).
        In the "otherwise" case every match is first printed with 10
        characters of context on each side.
    """
    context_width = 10
    matches = list()
    for m in re.finditer(word, orig_text, flags=re.I):
        start, end = m.span(0)
        # Left context: up to 10 preceding characters, space-padded on the
        # left so the column width stays fixed near the start of the text.
        before_context = orig_text[max(0, start - context_width):start].rjust(context_width)
        # Right context: up to 10 following characters, space-padded on the
        # right to the same width. (The padding used to be the number of
        # remaining characters rather than the shortfall to 10.)
        after_context = orig_text[end:end + context_width].ljust(context_width)

        matches.append(((start, end), '...' + before_context + orig_text[start:end] + after_context + '...'))

    if len(matches) == 1:
        return matches[0][0]

    # Zero or several matches: show the candidates and hand back all spans
    for i, m in enumerate(matches):
        print('Index:', i, '-', m)
    return [m[0] for m in matches]

In [11]:
def check_spelling(dictionary, parsed_sentences, orig_text,verbose=True):
    """Check the spelling of the tagged words of an essay.

    dictionary: spell checker; only its ``check(word)`` method is used, and a
        word counts as misspelled when that call returns False
    parsed_sentences (list): lists (one per sentence) of tagged-word tuples
        whose first four items are (word, tag, start offset, end offset)
    orig_text (str): essay text, only sliced for the verbose printout
    verbose (bool): print every misspelled occurrence and the final count
    RETURNS (dict): 'indices' -> (word, tag, start, end) of the first
        occurrence of each distinct misspelled word, 'num' -> how many
    """
    import string  # function-scope import: the file does not import it at the top

    # Tokens that are not words and must never be reported
    false_positives = ['-LRB-','-RRB-',"''",',','\'s','\\','n\'t',':',';','!','?','\'m','\'d','\'\'','??','-','(',')','\'ve','\'','\'re','!','e.g.','[',']','_','>>','>','<','<<','!!','"','``']
    wrong_words = dict()
    wrong_words['indices'] = list()

    if verbose:
        print('Format: (Start index, end index) word')
        print()

    found_words = set()
    for sentence in parsed_sentences:
        for w_tuple in sentence:
            word = w_tuple[0]
            # Substring test against string.punctuation: skips any token that
            # is a contiguous piece of that string
            if word in string.punctuation or word in false_positives:
                continue
            if dictionary.check(word) is False:
                tag = w_tuple[1]
                start_offset = w_tuple[2]
                end_offset = w_tuple[3]
                # Record each misspelled word once, at its first occurrence
                if word not in found_words:
                    wrong_words['indices'].append((word, tag, start_offset, end_offset))
                    found_words.add(word)
                # ...but print every occurrence
                if verbose:
                    print((start_offset, end_offset), orig_text[start_offset:end_offset])
                    print()
    wrong_words['num'] = len(wrong_words['indices'])
    if verbose:
        print('Num wrong words:', len(wrong_words['indices']))
        print()
    return wrong_words

# Choose file

In [253]:
orig_filename = essay_key.loc[0,'filename']
orig_text = essay_key.loc[0,'essay']

<h3>Coherence</h3>

In [72]:
from nltk.corpus import wordnet as wn

In [249]:
# Returns (True, None) if the word is recognised as a person and (False, None) otherwise.
# The second slot is reserved for a gender ('Female', 'Male' or 'Neutral'), but gender is not determined yet, so it is always None.
def person_and_gender(word_tags):
    """Decide whether a tagged word denotes a person.

    word_tags (tuple): tagged word; item 0 is read as the word and item 6 as
        its NER label
    RETURNS (tuple): (True, None) when the NER label is 'PERSON', the word is
        "person" (any casing), or the hypernym closure of the word's first
        WordNet noun synset contains one of the hard-coded human lemmas;
        (False, None) otherwise. The second item is always None.
    """
    if word_tags[6] == 'PERSON' or word_tags[0].lower()=='person':
        return (True, None)

    noun_synsets = wn.synsets(word_tags[0], pos='n')
    if not noun_synsets:
        return (False, None)

    # Only the first noun sense is inspected
    first_sense = wn.synset(noun_synsets[0].name())
    ancestor_lemmas = {
        ancestor.name().split(".")[0]
        for ancestor in first_sense.closure(lambda synset: synset.hypernyms())
    }
    human_tags = {'professional', 'person', 'adult'}  # this is hardcoded now
    return (bool(human_tags & ancestor_lemmas), None)

In [255]:
person_and_gender(('table', 'NN', 353, 358, 4, 4, 'O'))

(False, None)

In [12]:
# Return a list of (sentence index, start offset, end offset) tuples, one per sentence
def get_sentence_indexes(constituents_parses):
    """List the character span of every sentence of a parse response.

    constituents_parses (dict): parse response; every item of its 'sentences'
        list is read for the offsets of its first and last token
    RETURNS (list): (position, start offset, end offset) tuples, where
        position counts the sentences from 0 in response order
    """
    return [
        (
            position,
            sent['tokens'][0]['characterOffsetBegin'],
            sent['tokens'][-1]['characterOffsetEnd'],
        )
        for position, sent in enumerate(constituents_parses['sentences'])
    ]
    

In [13]:
index_with_parse = pd.read_csv('../data/index_with_parse.csv')
index_with_parse
#pronouns = pd.DataFrame()
#for index, values in enumerate(index_with_parse.values):
#    print(values[2])
    

Unnamed: 0,filename,num_sentences,parsed_essay
0,1004355.txt,14,(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (NP (...
1,1007363.txt,10,(ROOT (S (NP (NP (DT A) (JJ geat) (NN challeng...
2,1079196.txt,17,"(ROOT (S (ADVP (RB Personally)) (, ,) (NP (PRP..."
3,1086343.txt,10,(ROOT (S (PP (IN In) (NP (JJ many) (NN country...
4,1096747.txt,6,(ROOT (S (NP (PRP I)) (VP (VBP disagree) (PP (...
5,1109085.txt,16,(ROOT (S (NP (NNS People)) (ADVP (RB often)) (...
6,1164913.txt,14,(ROOT (S (S (NP (NP (DT Some) (NNS tipes)) (PP...
7,1174920.txt,5,(ROOT (S (SBAR (IN In) (S (NP (PRP$ my) (NN op...
8,1181356.txt,11,(ROOT (S (S (NP (PRP We)) (VP (VBP grow) (PRT ...
9,1223368.txt,17,(ROOT (S (NP (NNS Universities)) (VP (VBP have...


In [14]:
# Get essays
# Same index file and essay folder as the earlier essay_key cell, loaded again
# into a separate frame (essay_key2) that the coherence cells work on.
essay_key2 = pd.read_csv('../data/essays_dataset/index.csv', sep=';')

essays = []
for filename in essay_key2['filename']:
    with open('../data/essays_dataset/essays/'+filename, 'r') as f:
        # Leading/trailing whitespace is stripped from each essay's text
        essays.append(f.read().strip())
        
# Attach the texts as a new 'essay' column, one entry per filename
essay_key2['essay'] = essays

In [15]:
# Parse every essay once and keep the full parse responses (not just the
# parse strings), in the row order of essay_key2.
constituency_parsed_sentences = [
    constituency_parse(parser, essay_text, return_parse_obj=True)
    for essay_text in essay_key2['essay']
]

In [16]:
constituency_parsed_sentences[0]

{'sentences': [{'basicDependencies': [{'dep': 'ROOT',
     'dependent': 5,
     'dependentGloss': 'aspect',
     'governor': 0,
     'governorGloss': 'ROOT'},
    {'dep': 'nsubj',
     'dependent': 1,
     'dependentGloss': 'This',
     'governor': 5,
     'governorGloss': 'aspect'},
    {'dep': 'cop',
     'dependent': 2,
     'dependentGloss': 'is',
     'governor': 5,
     'governorGloss': 'aspect'},
    {'dep': 'det',
     'dependent': 3,
     'dependentGloss': 'an',
     'governor': 5,
     'governorGloss': 'aspect'},
    {'dep': 'amod',
     'dependent': 4,
     'dependentGloss': 'important',
     'governor': 5,
     'governorGloss': 'aspect'},
    {'dep': 'case',
     'dependent': 6,
     'dependentGloss': 'of',
     'governor': 8,
     'governorGloss': 'time'},
    {'dep': 'compound',
     'dependent': 7,
     'dependentGloss': 'today',
     'governor': 8,
     'governorGloss': 'time'},
    {'dep': 'nmod',
     'dependent': 8,
     'dependentGloss': 'time',
     'governor': 5,


In [200]:
# Build tag_set: one row per essay holding its filename, its text, its tagged
# words and the character span of each of its sentences.
tag_set = pd.DataFrame()
pos_tags_list = list()
file_names = list()
essay_content = list()
sentences_indexes = list()
# pos_tags returns, per sentence, a list of
# (word, POS, word_start_index, word_end_index, word_index, sentence_index, NER) tuples
# NOTE(review): post_tag_list is never used in this cell — looks like a leftover.
post_tag_list = list()
for index, row in essay_key2.iterrows():
    result_count = pos_tags(pos_tagger, row['essay'])
    # The row label is used as a position into the list of parses, so this
    # assumes essay_key2 keeps its default 0..n-1 index — TODO confirm
    constituency_parsed = constituency_parsed_sentences[index]
    file_names.append(row['filename'])
    essay_content.append(row['essay'])
    pos_tags_list.append(result_count)
    sentences_indexes.append(get_sentence_indexes(constituency_parsed))
#result_count = pos_tags(pos_tagger, essay_key2['essay'][0], return_parse_obj=False)
#pos_tags_list.append(result_count)
tag_set['filename'] = file_names
tag_set['essay'] = essay_content
tag_set['pos'] = pos_tags_list
# The column name is spelled 'sentece_indexes'; later cells read it under that exact key
tag_set['sentece_indexes'] = sentences_indexes
tag_set

Unnamed: 0,filename,essay,pos,sentece_indexes
0,1004355.txt,This is an important aspect of today time.\nTh...,"[[(This, DT, 0, 4, 1, 0, O), (is, VBZ, 5, 7, 2...","[(0, 0, 42), (1, 43, 236), (2, 237, 359), (3, ..."
1,1007363.txt,A geat challenge in a person's life span is hi...,"[[(A, DT, 0, 1, 1, 0, O), (geat, JJ, 2, 6, 2, ...","[(0, 0, 115), (1, 116, 148), (2, 149, 277), (3..."
2,1079196.txt,"Personally, I agree with the statement saying ...","[[(Personally, RB, 0, 10, 1, 0, O), (,, ,, 10,...","[(0, 0, 123), (1, 125, 191), (2, 192, 365), (3..."
3,1086343.txt,In many country their are many combanies of a...,"[[(In, IN, 0, 2, 1, 0, O), (many, JJ, 3, 7, 2,...","[(0, 0, 66), (1, 67, 113), (2, 115, 223), (3, ..."
4,1096747.txt,I disagree with this idea. in order to make a ...,"[[(I, PRP, 0, 1, 1, 0, O), (disagree, VBP, 2, ...","[(0, 0, 26), (1, 27, 137), (2, 138, 217), (3, ..."
5,1109085.txt,People often argue about what is more importan...,"[[(People, NNS, 0, 6, 1, 0, O), (often, RB, 7,...","[(0, 0, 100), (1, 101, 242), (2, 243, 326), (3..."
6,1164913.txt,Some tipes of advertisements use to make diffe...,"[[(Some, DT, 0, 4, 1, 0, O), (tipes, NNS, 5, 1...","[(0, 0, 149), (1, 151, 171), (2, 172, 190), (3..."
7,1174920.txt,In my opinion people always should try new thi...,"[[(In, IN, 0, 2, 1, 0, O), (my, PRP$, 3, 5, 2,...","[(0, 0, 162), (1, 163, 337), (2, 338, 560), (3..."
8,1181356.txt,We grow up in a society wich a has the mith of...,"[[(We, PRP, 0, 2, 1, 0, O), (grow, VBP, 3, 7, ...","[(0, 0, 61), (1, 63, 193), (2, 195, 275), (3, ..."
9,1223368.txt,Universities have considerably different educa...,"[[(Universities, NNS, 0, 12, 1, 0, O), (have, ...","[(0, 0, 61), (1, 62, 179), (2, 180, 318), (3, ..."


In [241]:
# For every essay, keep the tagged words that carry a pronoun POS tag and
# whose lower-cased form is a third-person pronoun; store them in a new
# 'third_person_pronouns' column of tag_set.
pronouns_POS_list = ['PRP','PRP$','WP','WP$']
third_pronouns_list = ['he','him','his','his','himself','she','her','her','hers','herself','it','it','its','itself','they','them','their','theirs','themselves']
pronouns_list = [
    [
        tagged_word
        for tagged_sentence in essay_pos
        for tagged_word in tagged_sentence
        if tagged_word[1] in pronouns_POS_list
        and tagged_word[0].lower() in third_pronouns_list
    ]
    for essay_pos in tag_set['pos']
]
tag_set['third_person_pronouns'] = pronouns_list
# filter non third person pronouns
#filtered_list = [x for x in pronouns_list if x[0].lower() in third_pronouns_list]
#filtered_list

In [169]:
tag_set

Unnamed: 0,filename,essay,pos,sentece_indexes,third_person_pronouns
0,1004355.txt,This is an important aspect of today time.\nTh...,"[[(This, DT, 0, 4, 1, 0), (is, VBZ, 5, 7, 2, 0...","[(0, 0, 42), (1, 43, 236), (2, 237, 359), (3, ...",[]
1,1007363.txt,A geat challenge in a person's life span is hi...,"[[(A, DT, 0, 1, 1, 0), (geat, JJ, 2, 6, 2, 0),...","[(0, 0, 115), (1, 116, 148), (2, 149, 277), (3...","[(his, PRP$, 44, 47, 11, 0), (them, PRP, 620, ..."
2,1079196.txt,"Personally, I agree with the statement saying ...","[[(Personally, RB, 0, 10, 1, 0), (,, ,, 10, 11...","[(0, 0, 123), (1, 125, 191), (2, 192, 365), (3...","[(they, PRP, 107, 111, 18, 0), (their, PRP$, 2..."
3,1086343.txt,In many country their are many combanies of a...,"[[(In, IN, 0, 2, 1, 0), (many, JJ, 3, 7, 2, 0)...","[(0, 0, 66), (1, 67, 113), (2, 115, 223), (3, ...","[(their, PRP$, 17, 22, 4, 0), (their, PRP$, 12..."
4,1096747.txt,I disagree with this idea. in order to make a ...,"[[(I, PRP, 0, 1, 1, 0), (disagree, VBP, 2, 10,...","[(0, 0, 26), (1, 27, 137), (2, 138, 217), (3, ...","[(they, PRP, 114, 118, 20, 1), (they, PRP, 153..."
5,1109085.txt,People often argue about what is more importan...,"[[(People, NNS, 0, 6, 1, 0), (often, RB, 7, 12...","[(0, 0, 100), (1, 101, 242), (2, 243, 326), (3...","[(it, PRP, 176, 178, 16, 1), (their, PRP$, 409..."
6,1164913.txt,Some tipes of advertisements use to make diffe...,"[[(Some, DT, 0, 4, 1, 0), (tipes, NNS, 5, 10, ...","[(0, 0, 149), (1, 151, 171), (2, 172, 190), (3...","[(them, PRP, 74, 78, 15, 0), (it, PRP, 91, 93,..."
7,1174920.txt,In my opinion people always should try new thi...,"[[(In, IN, 0, 2, 1, 0), (my, PRP$, 3, 5, 2, 0)...","[(0, 0, 162), (1, 163, 337), (2, 338, 560), (3...","[(themselves, PRP, 102, 112, 18, 0), (they, PR..."
8,1181356.txt,We grow up in a society wich a has the mith of...,"[[(We, PRP, 0, 2, 1, 0), (grow, VBP, 3, 7, 2, ...","[(0, 0, 61), (1, 63, 193), (2, 195, 275), (3, ...","[(it, PRP, 410, 412, 8, 4), (it, PRP, 632, 634..."
9,1223368.txt,Universities have considerably different educa...,"[[(Universities, NNS, 0, 12, 1, 0), (have, VBP...","[(0, 0, 61), (1, 62, 179), (2, 180, 318), (3, ...","[(it, PRP, 295, 297, 20, 2), (them, PRP, 313, ..."


In [245]:
def check_coherence(pronoun_tuple, lookup_sentences):
    """Heuristically decide whether a third-person pronoun has a plausible antecedent.

    For singular third person pronouns and possessives he/she/his/her, a
    singular noun must exist in the lookup sentences; for he/she one of those
    nouns must also be recognised as a person by person_and_gender.

    For plural third person pronouns and possessives they/them/their, plural
    nouns are looked up first; with none, at least two singular nouns are
    required. With plural nouns present, the pronoun's sentence and the two
    before it are scanned (nearest first) and the first one containing plural
    nouns decides the outcome.

    pronoun_tuple (tuple): tagged pronoun; item 0 is the word, item 5 the
        sentence index
    lookup_sentences (list): lists of tagged-word tuples, handed to filter_nouns
    RETURNS (bool): False when a problem was found (a message is printed for
        most of them), True otherwise — including for any other pronoun
    """
    pronoun = pronoun_tuple[0].lower()

    if pronoun in ('he', 'she', 'his', 'her'):
        candidates = filter_nouns(lookup_sentences, 'singular')
        if not candidates:
            print("ERROR, POSSIBLE SINGULAR ANTECEDENT NOT FOUND!")
            return False
        gender_mismatch = {
            'he': "GENDER DOES NOT MATCH WITH PRONOUN HE",
            'she': "GENDER DOES NOT MATCH WITH PRONOUN SHE",
        }
        if pronoun in gender_mismatch:
            # One noun recognised as a person is enough
            if any(person_and_gender(noun)[0] for noun in candidates):
                return True
            print(gender_mismatch[pronoun])
            return False
        # NOTE(review): 'his'/'her' pass as soon as any singular noun exists,
        # without the person check applied to 'he'/'she' — confirm intended.
        return True

    if pronoun in ('they', 'them', 'their'):
        candidates = filter_nouns(lookup_sentences, 'plural')
        if not candidates:
            # No plural noun: several singular nouns may jointly be the antecedent
            singular_candidates = filter_nouns(lookup_sentences, 'singular')
            if not singular_candidates:
                print("ERROR, POSSIBLE ANTECEDENT NOT FOUND!")
                return False
            if len(singular_candidates) < 2:
                print("NOT ENOUGH SINGULAR NPs TO SERVE AS A PLURAL REFERENCING")
                return False
            return True

        # Number of plural nouns per sentence index
        nouns_per_sentence = {}
        for noun in candidates:
            nouns_per_sentence[noun[5]] = nouns_per_sentence.get(noun[5], 0) + 1

        sentence_id = pronoun_tuple[5]
        for distance in range(3):
            count = nouns_per_sentence.get(sentence_id)
            if count is not None:
                if count > 2:
                    print("Possible Ambiguity, 3 or more Nouns".upper())
                    return False
                # A single candidate is fine anywhere; two candidates are
                # only accepted inside the pronoun's own sentence
                if count == 1 or distance == 0:
                    return True
                print("More than one possible Antecedent in previous Sentence".upper())
                return False
            sentence_id -= 1
        # None of the three sentences holds a plural noun
        return False

    return True
            
        #print(pronoun)
        #print(filtered_words)
        #print(pronoun)
        #print(lookup_sentences)

In [22]:
def filter_nouns(words_list, number):
    """Pick the nouns of the requested grammatical number out of tagged sentences.

    words_list (list): lists of tagged-word tuples whose item 1 is the POS tag
    number (str): 'singular' selects NN/NNP; any other value selects NNS/NNPS
    RETURNS (list): the matching tuples, in input order
    """
    wanted_tags = ['NN', 'NNP'] if number == 'singular' else ['NNS', 'NNPS']
    return [
        tagged_word
        for tagged_sentence in words_list
        for tagged_word in tagged_sentence
        if tagged_word[1] in wanted_tags
    ]

In [23]:
def coherence_tags(tagger, sentence, return_parse_obj=False):
    """Requests the 'dcoref,gender' annotators and returns tagged words per sentence.

    tagger (CoreNLPTagger): a tagger whose ``api_call`` performs the request
    sentence (str): text to annotate
    return_parse_obj (bool): return the response of ``api_call`` unchanged
        instead of the tuples
    RETURNS (list): for every sentence of the response, a list of
        (word, pos, characterOffsetBegin, characterOffsetEnd, index,
        sentence index) tuples; the sentence index counts from 0 in
        response order
    """
    #tokenize, ssplit, pos, lemma, ner, parse, dcoref
    default_properties = {'annotators': 'dcoref,gender'}
    tagged_data = tagger.api_call(sentence, properties=default_properties)
    if return_parse_obj:
        return tagged_data
    # enumerate supplies the sentence counter; it used to be incremented
    # without ever being initialised, which raised UnboundLocalError.
    return [
        [
            (
                token['word'],
                token['pos'],
                token['characterOffsetBegin'],
                token['characterOffsetEnd'],
                token['index'],
                sent_index,
            )
            for token in sent['tokens']
        ]
        for sent_index, sent in enumerate(tagged_data['sentences'])
    ]

In [24]:
check_coherence('his', ["A geat challenge in a person's life span is his abbility to change to a better life where life is all about change."])

IndexError: string index out of range

In [25]:
current_essay = tag_set['essay'][1]
current_pronouns = tag_set['third_person_pronouns'][1]
current_sentence_indexes = tag_set['sentece_indexes'][1]
current_pronouns

[('his', 'PRP$', 44, 47, 11, 0),
 ('them', 'PRP', 620, 624, 14, 5),
 ('his', 'PRP$', 637, 640, 18, 5)]

In [251]:
# Count coherence errors per essay. Antecedents are searched in the tagged
# words (POS tuples) rather than in the raw sentence text: for every
# third-person pronoun the lookup covers the words of its own sentence up to
# the pronoun, plus up to two preceding sentences.
coherence_error_list = list()
for index, row in tag_set.iterrows():
    number_of_errors = 0
    current_sentence_indexes = row['sentece_indexes']
    current_essay = row['essay']
    print(row['filename'])
    print()
    current_sentence_id = None
    for pron in row['third_person_pronouns']:
        print(pron)
        sentence_id = pron[5]
        # Own sentence: only the words that start before the pronoun ends
        lookup_sentences = [[x for x in row['pos'][sentence_id] if x[2] < pron[3]]]
        # Preceding sentences, nearest first: none for the first sentence,
        # one for the second, two from the third sentence on. (The third
        # sentence, index 2, used to get only one because the test was `> 2`.)
        for distance in range(1, min(sentence_id, 2) + 1):
            lookup_sentences.append(row['pos'][sentence_id - distance])
        result = check_coherence(pron, lookup_sentences)
        # NOTE(review): only the first pronoun met in each sentence can add
        # an error; failures of later pronouns in that sentence are not
        # counted — confirm this is the intended "one error per sentence".
        if current_sentence_id is None or current_sentence_id != sentence_id:
            current_sentence_id = sentence_id
            if result is not True:
                number_of_errors = number_of_errors + 1
        sentence_span = current_sentence_indexes[sentence_id]
        print(current_essay[sentence_span[1] : sentence_span[2]])
    coherence_error_list.append(number_of_errors)
tag_set['coherence_errors'] = coherence_error_list

1004355.txt

1007363.txt

('his', 'PRP$', 44, 47, 11, 0, 'O')
A geat challenge in a person's life span is his abbility to change to a better life where life is all about change.
('them', 'PRP', 620, 624, 14, 5, 'O')
Ofcoures, there will be some risks the person has to go through them to build up his experince.
('his', 'PRP$', 637, 640, 18, 5, 'O')
Ofcoures, there will be some risks the person has to go through them to build up his experince.
1079196.txt

('they', 'PRP', 107, 111, 18, 0, 'O')
Personally, I agree with the statement saying that most advertisements make products seem much better than they really are.
('their', 'PRP$', 253, 258, 11, 2, 'O')
I have to admit that there are several companies advertising their products in an honest way, stating only facts about the product without any misleading slogans or jingles.
('their', 'PRP$', 446, 451, 18, 3, 'O')
On the other hand, the majority of today\s industries is clever enough to trick their customers into buying their products.
(

So by choosing the major subject at a university it is evident that the student knows in which position he wants to work in his or her later life.
('he', 'PRP', 1369, 1371, 20, 11, 'O')
So by choosing the major subject at a university it is evident that the student knows in which position he wants to work in his or her later life.
('his', 'PRP$', 1389, 1392, 25, 11, 'O')
So by choosing the major subject at a university it is evident that the student knows in which position he wants to work in his or her later life.
('her', 'PRP$', 1396, 1399, 27, 11, 'O')
So by choosing the major subject at a university it is evident that the student knows in which position he wants to work in his or her later life.
('he', 'PRP', 1417, 1419, 2, 12, 'O')
Will he or she go very far into detail of a subject or more have an overview about a subject.
('she', 'PRP', 1423, 1426, 4, 12, 'O')
Will he or she go very far into detail of a subject or more have an overview about a subject.
('It', 'PRP', 1595, 1597, 

In [152]:
xx = get_sentence_indexes(constituency_parse(parser, current_essay, return_parse_obj=True))


In [153]:
xx

[(0, 0, 1),
 (1, 116, 119),
 (2, 149, 155),
 (3, 278, 282),
 (4, 403, 410),
 (5, 556, 564),
 (6, 653, 658),
 (7, 840, 842),
 (8, 960, 963),
 (9, 1090, 1094)]

In [130]:
xx[0]['tokens'][0]['characterOffsetBegin']

0

In [132]:

xx[0]['tokens'][len(xx[0]['tokens'])-1]['characterOffsetEnd']

115

In [133]:
current_essay[0:115]

"A geat challenge in a person's life span is his abbility to change to a better life where life is all about change."

In [252]:
tag_set

Unnamed: 0,filename,essay,pos,sentece_indexes,third_person_pronouns,coherence_errors
0,1004355.txt,This is an important aspect of today time.\nTh...,"[[(This, DT, 0, 4, 1, 0, O), (is, VBZ, 5, 7, 2...","[(0, 0, 42), (1, 43, 236), (2, 237, 359), (3, ...",[],0
1,1007363.txt,A geat challenge in a person's life span is hi...,"[[(A, DT, 0, 1, 1, 0, O), (geat, JJ, 2, 6, 2, ...","[(0, 0, 115), (1, 116, 148), (2, 149, 277), (3...","[(his, PRP$, 44, 47, 11, 0, O), (them, PRP, 62...",0
2,1079196.txt,"Personally, I agree with the statement saying ...","[[(Personally, RB, 0, 10, 1, 0, O), (,, ,, 10,...","[(0, 0, 123), (1, 125, 191), (2, 192, 365), (3...","[(they, PRP, 107, 111, 18, 0, O), (their, PRP$...",1
3,1086343.txt,In many country their are many combanies of a...,"[[(In, IN, 0, 2, 1, 0, O), (many, JJ, 3, 7, 2,...","[(0, 0, 66), (1, 67, 113), (2, 115, 223), (3, ...","[(their, PRP$, 17, 22, 4, 0, O), (their, PRP$,...",1
4,1096747.txt,I disagree with this idea. in order to make a ...,"[[(I, PRP, 0, 1, 1, 0, O), (disagree, VBP, 2, ...","[(0, 0, 26), (1, 27, 137), (2, 138, 217), (3, ...","[(they, PRP, 114, 118, 20, 1, O), (they, PRP, ...",1
5,1109085.txt,People often argue about what is more importan...,"[[(People, NNS, 0, 6, 1, 0, O), (often, RB, 7,...","[(0, 0, 100), (1, 101, 242), (2, 243, 326), (3...","[(it, PRP, 176, 178, 16, 1, O), (their, PRP$, ...",0
6,1164913.txt,Some tipes of advertisements use to make diffe...,"[[(Some, DT, 0, 4, 1, 0, O), (tipes, NNS, 5, 1...","[(0, 0, 149), (1, 151, 171), (2, 172, 190), (3...","[(them, PRP, 74, 78, 15, 0, O), (it, PRP, 91, ...",1
7,1174920.txt,In my opinion people always should try new thi...,"[[(In, IN, 0, 2, 1, 0, O), (my, PRP$, 3, 5, 2,...","[(0, 0, 162), (1, 163, 337), (2, 338, 560), (3...","[(themselves, PRP, 102, 112, 18, 0, O), (they,...",1
8,1181356.txt,We grow up in a society wich a has the mith of...,"[[(We, PRP, 0, 2, 1, 0, O), (grow, VBP, 3, 7, ...","[(0, 0, 61), (1, 63, 193), (2, 195, 275), (3, ...","[(it, PRP, 410, 412, 8, 4, O), (it, PRP, 632, ...",0
9,1223368.txt,Universities have considerably different educa...,"[[(Universities, NNS, 0, 12, 1, 0, O), (have, ...","[(0, 0, 61), (1, 62, 179), (2, 180, 318), (3, ...","[(it, PRP, 295, 297, 20, 2, O), (them, PRP, 31...",3


In [179]:
# Split every distinct singular noun (NN / NNP) of the tagged essays into a
# "human" and a "non-human" word list, depending on whether the tags returned
# by is_animate include one of the hard-coded human categories, and write
# both lists to CSV.
# NOTE(review): is_animate is not defined in the visible part of this file —
# presumably it returns a list of category names for a word; confirm.
humans = pd.DataFrame()
no_humans = pd.DataFrame()
referencing_singular_pos = ['NN', 'NNP']
humans_list = list()
no_humans_list = list()
human_syn_list = list()
no_human_syn_list = list()
for y_pos in tag_set['pos']:  # one entry per essay
    for s_pos in y_pos:  # one entry per sentence
        for x_pos in s_pos:  # one tagged word: x_pos[0] is the word, x_pos[1] its POS tag
            if x_pos[1] in referencing_singular_pos:
                #print(x_pos[0])
                tags_result = is_animate(x_pos[0])
                human_tags = set(['professional','person','adult'])
                actual_tags = set(tags_result)
                inter = human_tags.intersection(actual_tags)
                if bool(inter):
                    # Keep only the first occurrence of each word, with its tags
                    if x_pos[0] not in humans_list:
                        humans_list.append(x_pos[0])
                        human_syn_list.append(tags_result)
                else:
                    if x_pos[0] not in no_humans_list:
                        no_humans_list.append(x_pos[0])
                        no_human_syn_list.append(tags_result)
humans['word'] =  humans_list
humans['categories'] = human_syn_list

no_humans['word'] =  no_humans_list
no_humans['categories'] = no_human_syn_list
# Output locations are absolute, machine-specific paths
humans.to_csv('D:/UIC/Spring 2018/NLP/humans.csv', index=False)
no_humans.to_csv('D:/UIC/Spring 2018/NLP/no_humans.csv', index=False)

In [184]:
is_animate('Barbara')

[]