In [1]:
# Run the Python script that contains the same code as the data-loading notebook
%run loadData.py
# Now we can access train, dev, and test,
# along with trainSents, devSents, and testSents

In [2]:
# Imports

import pprint
pp = pprint.PrettyPrinter(indent=4)

import nltk
from nltk.corpus import stopwords

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

from string import punctuation  

import re
import pickle
import os

import csv

from collections import defaultdict


In [3]:
# Core functions

# Paths (relative to the working directory) to the Stanford NER model file and the tagger jar.
classifier = './stanford/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = './stanford/stanford-ner.jar'

# NLTK wrapper around the Stanford NER tagger, built from the model and jar above.
sTagger = StanfordNERTagger(classifier,jar)

# Set of the individual items of `punctuation`.
# NOTE(review): a later cell rebinds the name `punctuation` to a list of token strings;
# re-running this cell after that one would build punct_tokens from that list instead
# of from string.punctuation.
punct_tokens = set(punctuation)
# Question words that are filtered in addition to the stop-word list.
extra_tokens = set(["what", "where", "how", "when", "who"])

# NLTK's English stop-word list, as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Union of question words, punctuation and stop words.
filter_tokens = extra_tokens.union(punct_tokens).union(stop_words)

In [4]:
# Shim function for later clean

def getStanfordTagging(datasetName):
    """Return the Stanford NER taggings for one dataset split, using an on-disk cache.

    If the pickle file for the split exists it is loaded and returned.
    Otherwise every sentence of the split is tokenised with word_tokenize,
    tagged with the module-level `sTagger`, pickled to the cache file and
    returned.

    Parameters:
        datasetName: one of 'train', 'dev' or 'test'.

    Returns:
        A list with one entry per item of the split's sentence collection;
        each entry is the result of `sTagger.tag_sents` on that item's
        tokenised sentences.

    Raises:
        ValueError: if datasetName is not one of the three known names.
    """
    fnameTrain = './preCompTags/stanfordTaggedTrain.txt'
    fnameDev = './preCompTags/stanfordTaggedDev.txt'
    fnameTest = './preCompTags/stanfordTaggedTest.txt'

    # Resolve the split name before touching any data so that a bad name fails fast.
    if datasetName == 'train':
        theFilePath = fnameTrain
        theSents = trainSents
    elif datasetName == 'dev':
        theFilePath = fnameDev
        theSents = devSents
    elif datasetName == 'test':
        theFilePath = fnameTest
        theSents = testSents
    else:
        raise ValueError('Incorrect datasetName: ' + datasetName + ', choose from - "train", "dev", "test" ')

    if os.path.exists(theFilePath):
        # NOTE(review): pickle.load can execute arbitrary code from the file;
        # only load cache files that were produced by this notebook.
        with open(theFilePath, "rb") as fp:
            return pickle.load(fp)

    # Make sure the cache directory exists *before* the expensive tagging run,
    # otherwise the final open(..., "wb") fails and all the work is lost.
    cacheDir = os.path.dirname(theFilePath)
    if cacheDir and not os.path.isdir(cacheDir):
        os.makedirs(cacheDir)

    # No cache yet: tag every sentence, one tag_sents call per group of sentences.
    taggedSentsList = []
    for sents in theSents:
        tokenisedSents = [word_tokenize(sent) for sent in sents]
        classifiedSents = sTagger.tag_sents(tokenisedSents)
        taggedSentsList.append(classifiedSents)

    # Persist the taggings so later runs can skip the tagger.
    with open(theFilePath, "wb") as fp:
        pickle.dump(taggedSentsList, fp)
    return taggedSentsList

In [5]:
# Load (or compute and cache) the Stanford NER taggings for each split.
tagged_train_set = getStanfordTagging('train')
tagged_dev_set = getStanfordTagging('dev')
tagged_test_set = getStanfordTagging('test')

In [10]:
# Shim function for later clean

# Number words, used by isNumber. List borrowed from:
# http://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers
numInWords = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"
       , "hundred", "thousand", "million", "billion", "trillion"]

# Token strings that isPunctuation treats as punctuation.
# NOTE(review): this rebinds the name `punctuation` that the imports cell took from
# the `string` module; any code that runs afterwards and expects string.punctuation
# will see this list instead.
punctuation = ["''",'``','(','.',':', ',',')']


# Month names; isNumber treats these as number-like tokens.
months = ["January","February","March","April","May","June","July","August","September","October","November","December"]

def isPunctuation(word):
    """Return True when `word` is one of the entries of the module-level `punctuation`."""
    is_punct_token = word in punctuation
    return is_punct_token

def isCapitalised(word):
    """Return True when `word` is non-empty and its first character is upper case."""
    return len(word) > 0 and word[0].isupper()

# Obtained from training data.
# postUnits: tokens that refine_word_tags marks POSTNUM, so that combineTags can merge
# them into a preceding NUMBER.
# NOTE(review): refine_word_tags tests single tokens with `word in postUnits`, so the
# multi-word entries (e.g. u'years ago', u'sq mi') presumably never match -- confirm.
postUnits = [u'%', u'century', u'years', u'percent', u'years ago', u'days', u'months', u'km', u'hours', u'times', u'inches', u'\xb0C', u'minutes', u'acres', u'\xb0F', u'weeks', u'people', u'sq mi', u'mi', u'ft', u'feet', u'metres', u'mm', u'square miles', u'miles', u'pm', u'per cent', u'year', u'copies', u'yuan', u'men', u'square feet', u'third', u'kilometres', u'nm', u'tonnes', u'species', u'decades', u'barrels', u'tons', u'largest', u'centuries', u'km2']
# preUnits: tokens that refine_word_tags marks PRENUM, so that combineTags can merge
# them into a following NUMBER.
preUnits = [u'$',u'around', u'late', u'early', u'nearly', u'since', u'approximately', u'number']

# Returns True if the word looks like a number, a number word, or a month name
def isNumber(word):
    """Decide whether `word` should be treated as a number-like token.

    The checks run in this order and stop at the first hit:
      1. the regex matches at the start of the word,
      2. the lower-cased word is in `numInWords`,
      3. the word is in `months`.

    NOTE(review): re.match only anchors at the start and the pattern begins
    with an unescaped '.', so anything with a digit in its first or second
    position (e.g. "1980s", "a1") counts as a number. Left as-is because
    downstream results depend on it.
    """
    pattern = ".?(\\d)+((,|.)(\\d)+)*"
    looks_numeric = re.match(pattern, word)
    return bool(looks_numeric) or word.lower() in numInWords or word in months

def isStopWord(word):
    """Return True when the lower-cased `word` is in the module-level `stop_words` set."""
    lowered = word.lower()
    return lowered in stop_words

In [29]:
# Chunk grammar for candidate answers (label ANS):
#   - two chunk rules ({...}): an optional adjective, or an optional determiner,
#     followed by any run of noun tags;
#   - one chink rule (}...{) that removes the listed tags from chunks again.
# NOTE(review): the chinked tags look like the "rare in answers" tags computed near the
# end of the notebook (minus the punctuation tags) -- confirm.
grammar = """ ANS: {<JJ>?<N.*>*}
                   {<DT>?<N.*>*}
                   }<UH|POS|VB|VBG|RP|DT|MD|PRP$|TO|RB|JJS|PDT|IN|PRP|VBP|VBN|RBS|WRB|WP|EX|VBZ|WDT|VBD>{
                    """
# Parser shared (as a global) by chunk().
cp = nltk.RegexpParser(grammar) 

def chunk(words):
    """Extract candidate answer phrases from a piece of text.

    The text is tokenised, POS-tagged and parsed with the module-level chunk
    parser `cp`. Every subtree labelled 'ANS' becomes a (phrase, 'O') pair,
    the phrase being the subtree's words joined by single spaces.

    A final ("Nope", "CRAP") pair is always appended, so the result is never
    empty.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(words))
    parse_tree = cp.parse(tagged_tokens)
    possAnswers = [
        (' '.join([leaf_word for (leaf_word, leaf_pos) in node.leaves()]), 'O')
        for node in parse_tree.subtrees()
        if node.label() == 'ANS'
    ]
    # Sentinel entry: guarantees callers always get at least one pair back.
    possAnswers.append(("Nope", "CRAP"))
    return possAnswers

In [47]:
nltk.pos_tag(['it'])

[('it', 'PRP')]

In [30]:
def add_non_chunked_words_as_single_tags(words):
    """Tag every token of `words`, keeping chunked phrases together.

    The text is chunked with chunk(); then its tokens are walked in order.
    Tokens that complete the next expected chunk emit that chunk's
    (phrase, tag) pair; tokens that are not part of the expected chunk are
    emitted on their own as (token, 'O').

    Parameters:
        words: the text to process (a string).

    Returns:
        A list of (text, tag) pairs in sentence order.

    NOTE(review): if a token matches the start of the expected chunk but a
    later token does not continue it, the already-matched tokens are not
    emitted and the position inside the chunk is not reset. This behaviour
    is unchanged from the original implementation.
    """
    chunked_output = chunk(words)
    # Only the token strings are compared below; chunk() already POS-tags the
    # text, so tagging it a second time here would be wasted work.
    tokens = nltk.word_tokenize(words)
    # Re-tokenise each chunk phrase so it can be matched token by token.
    chunked_words = [nltk.word_tokenize(word_tag_pair[0]) for word_tag_pair in chunked_output]

    all_word_tags = []

    current_chunk_index = 0
    current_chunk_word = 0
    # None means "no chunk left to match"; every remaining token is then single.
    current_chunk_list = chunked_words[0] if chunked_words else None

    for word in tokens:
        if current_chunk_list is not None and word == current_chunk_list[current_chunk_word]:
            if current_chunk_word == len(current_chunk_list) - 1:
                # Last word of the current chunk: emit the whole chunk and advance.
                all_word_tags.append(chunked_output[current_chunk_index])
                current_chunk_index += 1
                current_chunk_word = 0
                if current_chunk_index == len(chunked_words):
                    current_chunk_list = None
                else:
                    current_chunk_list = chunked_words[current_chunk_index]
            else:
                # Still inside the chunk: wait for its next word.
                current_chunk_word += 1
        else:
            # Not part of the expected chunk: emit the word on its own.
            all_word_tags.append((word, 'O'))
    return all_word_tags


In [32]:
# Shim function for later clean

def refine_word_tags(taggedWordList):
    """Turn raw NER (word, tag) pairs into the notebook's candidate-answer tagging.

    Each pair is first re-tagged on its own (see _retag_single_word), then
    the sequence is merged by combineTags and the remaining 'O' spans are
    split up by process_others.
    """
    retagged = [(word, _retag_single_word(word, tag)) for (word, tag) in taggedWordList]
    return process_others(combineTags(retagged))


def _retag_single_word(word, tag):
    """Return the refined tag for one word; only 'O' and 'ORGANIZATION' are ever changed."""
    if tag == 'ORGANIZATION':
        tag = 'O'
    if tag != 'O':
        return tag
    # The order of these checks matters: the first one that applies wins.
    if isNumber(word):
        return 'NUMBER'
    if word in preUnits:
        return 'PRENUM'
    if isPunctuation(word):
        return 'PUNC'
    if word in postUnits:
        return 'POSTNUM'
    if isCapitalised(word):
        return "OTHERCAP"
    return tag
        
def combineTags(wordTags):
    """Merge adjacent (word, tag) pairs into multi-word candidates.

    Pass 1 resolves the helper tags around numbers:
      * a PRENUM directly before a NUMBER becomes NUMBER, any other PRENUM
        that is followed by another entry becomes 'O';
      * a POSTNUM directly after a NUMBER becomes NUMBER, any other POSTNUM
        that has a preceding entry becomes 'O'.
    Pass 2 joins runs of identical tags into one entry. Words are joined
    with a space, except that '%' is glued to the text before it and text
    directly after a lone '$' is glued to it. A leading OTHERCAP that is not
    followed by another OTHERCAP is demoted to 'O' first.

    Parameters:
        wordTags: list of (word, tag) pairs.

    Returns:
        A new list of (text, tag) pairs; [] for empty input.
    """
    # Empty input used to raise IndexError on wordTags[0].
    if not wordTags:
        return []

    # Pass 1: resolve PRENUM / POSTNUM against their neighbours.
    newTags = []
    prevWord = wordTags[0][0]
    prevTag = wordTags[0][1]

    for (word, tag) in wordTags[1:]:
        if tag == 'NUMBER' and prevTag == 'PRENUM':
            prevTag = 'NUMBER'
        elif prevTag == 'PRENUM':
            prevTag = 'O'
        if tag == 'POSTNUM' and prevTag == "NUMBER":
            tag = "NUMBER"
        elif tag == "POSTNUM":
            tag = "O"
        newTags.append((prevWord, prevTag))
        prevWord = word
        prevTag = tag
    newTags.append((prevWord, prevTag))

    # Pass 2: join runs of identical tags.
    newNewTags = []
    prevWord = newTags[0][0]
    prevTag = newTags[0][1]
    # A single-entry list has no second tag; the unguarded newTags[1][1] used to
    # raise IndexError for a lone OTHERCAP. Treat "no next entry" as "not OTHERCAP".
    nextTag = newTags[1][1] if len(newTags) > 1 else None
    if prevTag == "OTHERCAP" and nextTag != "OTHERCAP":
        prevTag = "O"

    for (word, tag) in newTags[1:]:
        if tag == prevTag:
            if word == '%' or prevWord == '$':
                # '%' attaches to the preceding text; text attaches to a lone '$'.
                prevWord += word
            else:
                prevWord += ' ' + word
        else:
            newNewTags.append((prevWord, prevTag))
            prevWord = word
            prevTag = tag

    newNewTags.append((prevWord, prevTag))

    return newNewTags

def process_others(words_with_tags):
    """Split every 'O' entry into chunked phrases and single words.

    Entries with any other tag are copied through unchanged. An 'O' entry is
    replaced by the pairs returned from add_non_chunked_words_as_single_tags
    for its text.
    """
    new_taggings = []
    for entry in words_with_tags:
        (phrase, phrase_tag) = entry
        if phrase_tag != 'O':
            new_taggings.append((phrase, phrase_tag))
            continue
        new_taggings.extend(
            (word, word_tag)
            for (word, word_tag) in add_non_chunked_words_as_single_tags(phrase)
        )
    return new_taggings
        

## Setup functions

In [15]:
from collections import defaultdict
def getAnswerDict(qss):
    """Group answers by the sentence that contains them.

    `qss` holds one list of question dicts per document. The result maps
    (document index, q["answer_sentence"]) to the list of q["answer"] values
    recorded for that sentence, in the order they were encountered. It is a
    defaultdict(list), so unknown keys read as an empty list.
    """
    sentDicts = defaultdict(list)
    for docID, questions in enumerate(qss):
        for question in questions:
            answer = question["answer"]
            sentence_key = (docID, question["answer_sentence"])
            sentDicts[sentence_key].append(answer)
    return sentDicts
# Maps (document index, sentence index) -> list of answers found in that training sentence.
train_sentence_contained_answers = getAnswerDict(train)


In [16]:
train_sentence_contained_answers[(0,2)] # contains the answers in that sentence

[u'long playing', u'12", 10", 7"', u'rpm']

In [17]:
def getAnswerIndex(sent, answer):
    """Locate `answer` inside `sent` at token level.

    Both strings are tokenised with nltk.word_tokenize. Returns the
    (start, end) token indices (end exclusive) of the first position where
    the answer's tokens occur as a contiguous run in the sentence, or
    (-1, -1) when there is no such position.
    """
    answer_tokens = nltk.word_tokenize(answer)
    sent_tokens = nltk.word_tokenize(sent)
    width = len(answer_tokens)

    # Slide a window of the answer's length over the sentence tokens.
    for start in range(0, len(sent_tokens) - width + 1):
        end = start + width
        if sent_tokens[start:end] == answer_tokens:
            return (start, end)
    return (-1, -1)


In [18]:
# Sanity check: print the first sentence of document 0, the answers recorded for it,
# and the token span getAnswerIndex finds for the first of those answers.
the_sentence = trainSents[0][0]
the_answers_contained = train_sentence_contained_answers[(0,0)]

print the_sentence
print
print the_answers_contained
print
print getAnswerIndex (the_sentence, the_answers_contained[0])

A gramophone record (phonograph record in American English) or vinyl record, commonly known as a "record", is an analogue sound storage medium in the form of a flat polyvinyl chloride (previously shellac) disc with an inscribed, modulated spiral groove.

[u'analogue sound storage medium']

(24, 28)


In [19]:
def convert_candidates_to_range(candidates):
    """Convert a candidate list into consecutive token index ranges.

    Each candidate is a (text, tag) pair. Its text is tokenised with
    nltk.word_tokenize and it is given the half-open range (start, end)
    that starts where the previous candidate ended; the first starts at 0.
    """
    spans = []
    start = 0
    for (text, _tag) in candidates:
        end = start + len(nltk.word_tokenize(text))
        spans.append((start, end))
        start = end
    return spans

In [20]:
# Example: token ranges of the first five candidates of document 0, sentence 0.
example_candidates = refine_word_tags(tagged_train_set[0][0])

convert_candidates_to_range(example_candidates)[0:5]

[(0, 1), (1, 3), (3, 4), (4, 6), (6, 7)]

In [21]:
# Get the taggings of the (possibly overlapping) candidates list

def get_taggings_in_range((ans_start,ans_end),candidates):
    # Return the indices (into `candidates`) of the candidates whose token range
    # overlaps the answer's half-open token range (ans_start, ans_end).
    # The first argument is a 2-tuple that is unpacked in the signature
    # (Python 2 only syntax). Candidate ranges come from
    # convert_candidates_to_range, so they are consecutive and increasing.
    candidate_ranges = convert_candidates_to_range(candidates)
    
    # Index of the candidate currently being examined (incremented at loop start).
    current_considered_candidate_id = -1
    candidates_containing_answer = []
    
    for (cand_start, cand_end) in candidate_ranges:
        current_considered_candidate_id += 1
        if ans_start == cand_start and ans_end == cand_end :
            # candidate and answer cover exactly the same tokens
            candidates_containing_answer.append(current_considered_candidate_id)
            break
        if cand_start <= ans_start and cand_end >= ans_end :
            # candidate contains the answer, but has extra words
            candidates_containing_answer.append(current_considered_candidate_id)
            break

        # Candidate starts at/after the answer start, or ends after it.
        if cand_start >= ans_start or cand_end > ans_start:
            if (cand_end <= ans_end):
                # candidate ends inside the answer: record it and keep scanning
                candidates_containing_answer.append(current_considered_candidate_id)
            else :
                # candidate runs past the end of the answer, so this is the last one
                # that can overlap; record it only if it starts before the answer ends
                if (cand_start < ans_end ):
                    candidates_containing_answer.append(current_considered_candidate_id)
                break
    return candidates_containing_answer
    

In [22]:
# Example: which candidates of document 0, sentence 1 overlap the token range (3, 6)?
example_candidates = refine_word_tags(tagged_train_set[0][1])
example_answer_range = (3,6)
print example_candidates
print
print get_taggings_in_range(example_answer_range,example_candidates)

[(u'The', 'O'), (u'groove', 'O'), (u'usually', 'O'), (u'starts', 'O'), (u'near', 'O'), (u'the', 'O'), (u'periphery', 'O'), (u'and', 'O'), (u'ends', 'O'), (u'near', 'O'), (u'the', 'O'), (u'center', 'O'), (u'of', 'O'), (u'the', 'O'), (u'disc', 'O'), (u'.', 'PUNC')]

[3, 4, 5]


## Analysis

In [35]:
def get_NER_tag_analysis() :
    """Compare the candidate taggings of the training sentences with the known answers.

    For every training sentence and every answer recorded for it:
      * if the answer cannot be located in the sentence, (i, j, answer) goes
        to the third list;
      * if exactly one candidate overlaps the answer and its text equals the
        answer, (candidate, i, j) goes to the first list;
      * otherwise (overlapping candidates, answer, i, j) goes to the second.

    Returns:
        (correct_taggings, incorrect_taggings, evil_ones)
    """
    correct_taggings, incorrect_taggings, evil_ones = [], [], []
    for i, sent_set in enumerate(trainSents):
        for j, sent in enumerate(sent_set):
            answers_in_sent = train_sentence_contained_answers[(i,j)]
            candidates = refine_word_tags(tagged_train_set[i][j])
            for answer in answers_in_sent:
                answer_span = getAnswerIndex(sent, answer)
                if answer_span == (-1, -1):
                    # The answer tokens were not found in the sentence tokens.
                    evil_ones.append((i, j, answer))
                    continue

                overlapping = [
                    candidates[index]
                    for index in get_taggings_in_range(answer_span, candidates)
                ]
                if len(overlapping) == 1 and overlapping[0][0] == answer:
                    # Exactly one candidate and its text is the answer.
                    correct_taggings.append((overlapping[0], i, j))
                else:
                    incorrect_taggings.append((overlapping, answer, i, j))

    return (correct_taggings, incorrect_taggings, evil_ones)

In [36]:
# Run the tagging analysis over the whole training set.
(correct_taggings, incorrect_taggings, evil_ones) = get_NER_tag_analysis()



In [37]:
print len(evil_ones)

1051


In [38]:
print len(correct_taggings)
print len(incorrect_taggings)

33534
35574


In [39]:
from collections import Counter
# Count the correctly tagged answers per tag; each entry is ((text, tag), i, j).
correctCounter = Counter()
for correct_tags in correct_taggings:
    correctCounter[correct_tags[0][1]] += 1
    

In [40]:
correctCounter

Counter({u'LOCATION': 3503,
         'NUMBER': 9459,
         'O': 9554,
         'OTHERCAP': 6460,
         u'PERSON': 4558})

In [41]:
from collections import Counter
from collections import defaultdict
# Group the incorrect taggings by the tuple of tags of their overlapping candidates:
# wrongCounter counts each tag tuple, wrongList keeps the full entries per tag tuple.
wrongCounter = Counter()
wrongList = defaultdict(list)

for wrong_tags in incorrect_taggings:
    # wrong_tags[0] is the list of (text, tag) candidates that overlapped the answer.
    tagList = []
    for (words, tag) in wrong_tags[0]:
        tagList.append(tag)
    wrongCounter[tuple(tagList)] += 1
    wrongList[tuple(tagList)].append(wrong_tags)

In [42]:
wrongCounter.most_common(10)

[(('O',), 2875),
 (('O', 'O'), 2805),
 (('O', 'O', 'O'), 2406),
 (('NUMBER',), 1298),
 (('O', 'O', 'O', 'O'), 1223),
 (('OTHERCAP', 'O', 'OTHERCAP'), 1090),
 (('OTHERCAP', 'O'), 1074),
 (('O', 'OTHERCAP'), 939),
 (('O', 'O', 'O', 'O', 'O'), 770),
 (('OTHERCAP',), 768)]

In [46]:
wrongList[('O',)][2]

([(u'mirror-like finish', 'O')], u'mirror-like', 0, 285)

In [196]:
# For the failures where a single 'O' candidate overlapped the answer, count the
# POS-tag sequence of each answer.
other_answer_pos_tags = Counter()
for item in wrongList[('O',)]:
    answer = item[1]
    tokenisedAnswer = nltk.word_tokenize(answer)
    posTagedWord = nltk.pos_tag(tokenisedAnswer)
    just_the_tags = [pair[1] for pair in posTagedWord]
    other_answer_pos_tags[tuple(just_the_tags)] += 1
print len(wrongList[('O',)])

# Print the 20 most common POS patterns and the number of answers they cover.
total_count = 0
for (tag_tuple, count) in other_answer_pos_tags.most_common(20):
    total_count += count
    print tag_tuple
print total_count

# Fraction of these failures covered by the 20 most common patterns.
print (total_count + 0.0 )/ len(wrongList[('O',)])



75
('NN',)
('JJ',)
('NNP', 'NNP')
('NNP',)
('CD',)
('NN', 'NN')
('NNS',)
('VBN',)
('NNP', 'NNP', 'NNP')
('NNP', 'NNS')
('NNP', 'NN')
('VB',)
('NNP', 'NNP', 'NN')
('JJ', 'NN', 'NN')
('NN', 'NN', 'NNS')
('NNP', 'NNP', 'NNS')
('JJ', 'NN')
('VBG',)
('NNS', 'VBP')
75
1.0


In [75]:
# Inspect how the chunker handles one failing example: take the text of the first
# candidate of the fourth ('O',) failure, POS-tag it, parse it and chunk it.
test_sent = wrongList[('O',)][3][0][0][0]
test_sent_tokenised = nltk.word_tokenize(test_sent)
test_sent_pos_tagged = nltk.pos_tag(test_sent_tokenised)

print test_sent_pos_tagged


# Rebuilds the global parser `cp` from the current value of `grammar`.
cp = nltk.RegexpParser(grammar) 
result = cp.parse(test_sent_pos_tagged)
print result
print
print chunk(test_sent)


[(u"''", "''"), (u')', ')'), (u',', ','), (u'the', 'DT'), (u'rotational', 'JJ'), (u'speed', 'NN'), (u'in', 'IN'), (u'rpm', 'NN'), (u'at', 'IN'), (u'which', 'WDT'), (u'they', 'PRP'), (u'are', 'VBP'), (u'played', 'VBN'), (u'(', '(')]
(S
  ''/''
  )/)
  ,/,
  the/DT
  (ANS rotational/JJ speed/NN)
  in/IN
  (ANS rpm/NN)
  at/IN
  which/WDT
  they/PRP
  are/VBP
  played/VBN
  (/()

[(u'rotational speed', 'O'), (u'rpm', 'O'), ('Nope', 'CRAP')]


## Evaluation on Dev sent for all:


In [33]:
def evaluateNERonDev():
    """Check, for every dev question, whether the answer is among the candidates.

    For each question the candidates of its answer sentence are produced by
    refine_word_tags. The question counts as correct when some candidate's
    text equals the answer string exactly.

    Returns:
        (correct, wrong): two lists of
        (document index, question index, sentence index, candidates).
    """
    correct, wrong = [], []

    for i, questions in enumerate(dev):
        for j, question in enumerate(questions):
            idSent = question["answer_sentence"]
            sent = devSents[i][idSent]
            answer = question["answer"]
            possAnswers = refine_word_tags(tagged_dev_set[i][idSent])
            record = (i, j, idSent, possAnswers)
            if any(candidate[0] == answer for candidate in possAnswers):
                correct.append(record)
            else:
                wrong.append(record)
    return (correct, wrong)


# Evaluate candidate coverage on the dev set.
(correct,wrong) = evaluateNERonDev()



In [34]:
print (len(correct) + 0.0)/ (len(correct) + len(wrong))


0.524045846626


In [None]:
# Base! 0.261491196975 ... With others in one clump, ie all grouped

# 0.412146992792     With others separated into single units

# Add months: 0.416046319272 Every little helps

# POS with chunker made from top 10 rules :D
# 0.458584426326

# including others as singles:
# 0.488597424081

# slightly longer and more complicated grammar


# POS tag on others
# 900 tags.

# 0.524045846626 -> Chunking with capital tag

# adding stopwords tag reduces to 0.517074323526 UNDO


In [98]:
# Alternative, simpler chunk grammar: runs of proper nouns, an optional adjective
# followed by singular nouns, or a singular noun followed by a plural noun.
# NOTE(review): this rebinds the globals `grammar` and `cp`. chunk() reads `cp` at call
# time, so any chunk() call made after this cell has run uses this grammar instead of
# the earlier one.
grammar = """ ANS: {<NNP>*}
                   {<JJ>?<NN>*}
                   {<NN><NNS>}
"""
cp = nltk.RegexpParser(grammar) 

def chunk_dev(words):
    """Extract candidate answer phrases from `words` with the current global parser `cp`.

    Works like chunk(), but returns only the real 'ANS' chunks as
    (phrase, 'O') pairs; the result may be empty.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(words))
    parse_tree = cp.parse(tagged_tokens)
    return [
        (' '.join([leaf_word for (leaf_word, leaf_pos) in node.leaves()]), 'O')
        for node in parse_tree.subtrees()
        if node.label() == 'ANS'
    ]

In [108]:
# New plan, get chunk to return all the words :D


# NOTE(review): `chunk_mark_2` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. It presumably referred to an earlier draft of
# add_non_chunked_words_as_single_tags -- confirm and either restore or remove.
words = "The cat sat on the mat"

chunked_output =  chunk_mark_2(words)



In [None]:
# For each answer in train
# Any tags that aren't in answers?


In [243]:
# Count POS tags over the training questions:
#   answer_tag_counter  - tags of the tokens of each answer string
#   tag_counter_sents   - tags of the tokens of each answer sentence
#   ans_length_counter  - answer lengths in tokens
answer_tag_counter = Counter()
tag_counter_sents = Counter()
ans_length_counter = Counter()

for i in range(0, len(train)):
    qs = train[i]
    for j in range(0, len(qs)):
        q = qs[j]
        idSent  = q["answer_sentence"]
        sent = trainSents[i][idSent]
        answer = q["answer"]
        
        # The answer and its sentence are tagged independently of each other.
        pos_tags_answer = nltk.pos_tag(nltk.word_tokenize(answer))
        pos_tags_sent = nltk.pos_tag(nltk.word_tokenize(sent))
        ans_length_counter[len(pos_tags_answer)] += 1
        
        for (word, tag) in pos_tags_answer:
            answer_tag_counter[tag] += 1
            
        for (word, tag) in pos_tags_sent:
            tag_counter_sents[tag] += 1



In [244]:
# For each POS tag, the share of its occurrences that fall inside answers:
#   answer count / (answer count + answer-sentence count).
tag_proportion_counter = Counter()
# The original loop iterated over `tag_counter`, a name that is not defined anywhere
# in the notebook (NameError on a fresh kernel). Iterating over the tags seen in the
# answer sentences keeps every denominator positive.
# NOTE(review): presumably `tag_counter` was an earlier name of `tag_counter_sents` -- confirm.
for tag in tag_counter_sents.keys():
    percentage = (answer_tag_counter[tag]  + 0.0)/ (answer_tag_counter[tag] + tag_counter_sents[tag])
    tag_proportion_counter[tag] = percentage


In [249]:
# Collect the 30 tags with the lowest answer proportion (the tail of the ranking).
list_of_rare_tags = []
for (tag, percentage) in  tag_proportion_counter.most_common(100000)[-30:]:
    list_of_rare_tags.append(tag)


In [250]:
list_of_rare_tags

['UH',
 'POS',
 'VB',
 "''",
 '``',
 'VBG',
 'RP',
 'DT',
 'MD',
 'PRP$',
 'TO',
 'RB',
 'JJS',
 'PDT',
 'IN',
 'PRP',
 'VBP',
 'VBN',
 'RBS',
 'WRB',
 'WP',
 '(',
 'EX',
 ')',
 ',',
 'VBZ',
 'WDT',
 'VBD',
 '.',
 ':']

In [239]:
sum([val for (key,val) in ans_length_counter.most_common(10)])

66104

In [None]:
# Scratch chink rule for the chunk grammar (not executable Python on its own):
# }<,|EX|WP|WRB|VBZ|WDT|RBS|VBD|.|:>+{