In [1]:
%run loadData.py

In [2]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# The required jar files : https://nlp.stanford.edu/software/CRF-NER.shtml#Download
# It's 171mb so I've added to the gitignore
# If you download it, and rename the folder name "stanford" in the main directory
classifier = './stanford/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = './stanford/stanford-ner.jar'

sTagger = StanfordNERTagger(classifier,jar)

In [3]:
import pickle # Useful for read / write of list file
import os #Needed to check if file exists

In [4]:
# Lets store the stanford tagger output in a file
# This function returns the tagging output of stanford for each dataset
# with datasetName - 'train', 'dev', test' 

def getStanfordTagging(datasetName):
    fnameTrain = './preCompTags/stanfordTaggedTrain.txt'
    fnameDev = './preCompTags/stanfordTaggedDev.txt'
    fnameTest = './preCompTags/stanfordTaggedTest.txt'
    fnameDevAnswers = './preCompTags/stanfordTaggedDevAnswers.txt'
    fnameTrainAnswers = './preCompTags/stanfordTaggedTrainAnswers.txt'
    
    theFilePath = ''
    theSents = []
    if (datasetName == 'train'):
        theFilePath = fnameTrain
        theSents = trainSents
    elif (datasetName == 'dev'):
        theFilePath = fnameDev
        theSents = devSents
    elif (datasetName == 'test'):
        theFilePath = fnameTest
        theSents = testSents
    elif (datasetName == 'ansDev' ):
        theFilePath = fnameDevAnswers 
        devAnswers = []
        for sents in dev:
            sentsAnswers = []
            for question in sents:
                sentsAnswers.append(question["answer"])
            devAnswers.append(sentsAnswers)
        theSents = devAnswers
    elif (datasetName == 'ansTrain' ):
            theFilePath = fnameTrainAnswers 
            trainAnswers = []
            for sents in train:
                sentsAnswers = []
                for question in sents:
                    sentsAnswers.append(question["answer"])
                trainAnswers.append(sentsAnswers)
            theSents = trainAnswers
    else :
        raise ValueError('Incorrect datasetName: ' + datasetName + ', choose from - "train", "dev", "test" ') 
    if (os.path.exists(theFilePath)):
        with open(theFilePath, "rb") as fp:
            stanfordTags = pickle.load(fp)
            return stanfordTags
    
    else :
        #Need to create taggings!
        taggedSentsList = []
        for sents in theSents:
            tokenisedSents = [word_tokenize(sent) for sent in sents]
            classifiedSents = sTagger.tag_sents(tokenisedSents)
            taggedSentsList.append(classifiedSents)
        #And save them
        with open(theFilePath, "wb") as fp: 
            pickle.dump(taggedSentsList, fp)
        return taggedSentsList
    

In [5]:
taggedTrain = getStanfordTagging('train')
taggedDev = getStanfordTagging('dev')
taggedTest = getStanfordTagging('test')

taggedTrainAnswers = getStanfordTagging('ansTrain')
taggedDevAnswers = getStanfordTagging('ansDev')

In [149]:
import re
from nltk.corpus import stopwords
from collections import Counter

stop_words = set(stopwords.words('english'))


# Given a stanford tagged list, refines the list by:\n",
# Grouping all contiguous words with the same tag\n",
# Relabels Organisations as Other\n",
# Labels Number\n",
def refineWordTags(taggedWordList):
    newWordTags = []
    for (word, tag) in taggedWordList:
        if (tag == 'ORGANIZATION'):
            tag = 'O'
        if (tag == 'O'):
            #Might be a number
            if isNumber(word):
                tag = 'NUMBER'
            elif isCapitalised(word):
                tag = 'OTHERCAP'
        newWordTags.append((word, tag))
    return groupSameTags(newWordTags)       

def groupSameTags(taggedSent):
    #taggedSent = addCommasToNums(taggedSent)
    newWordTags = []
    prevWord = taggedSent[0][0]
    prevTag = taggedSent[0][1]
    numInWord = 1
    for (word,tag) in taggedSent[1:] :
        if prevTag != tag and numInWord == 1 and prevTag == 'OTHERCAP':
            prevTag = 'O'
        if tag == prevTag:
            prevWord += ' ' + word
            numInWord += 1
        else :
            #Save prevWord, make new
            newWordTags.append((prevWord, prevTag))
            numInWord = 1
            prevWord = word
            prevTag = tag
    newWordTags.append((prevWord, prevTag))
    newWordTags = groupOthers(newWordTags)
    return newWordTags
        
def groupOthers(wordTags):
    newWordTags = []
    for (word,tag) in wordTags:
        if tag != 'O':
            newWordTags.append((word,tag))
        else :
            newChunked = chunk(word)
            if newChunked != []:
                newWordTags.append(newChunked)
    return newWordTags
# Thanks : http://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers\n",
numInWords = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"
       , "hundred", "thousand", "million", "billion", "trillion"]

months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]

def isCapitalised (word):
    if len(word) == 0:
        return False
    return word[0].isupper()

# Returns true if the word represents a number\n",
def isNumber(word):
    pattern = ".?(\\d)+((,|.)(\\d)+)*"
    if re.match(pattern,word) :
        return True
    if word.lower() in numInWords:
        return True
    if word.lower() in months:
        return True
    return False

In [150]:
import nltk

chunkParser = nltk.RegexpParser(chunkRules)

def chunk(words):
    tokenWS = nltk.pos_tag(nltk.word_tokenize(words))
    chunks =  chunkParser.parse(tokenWS)
    possAnswers = []
    for subtree in chunks.subtrees():
        if subtree.label() == 'ANS':
            possAnswers.append((' '.join(word for word, pos in subtree.leaves()),'O'))
    #possAnswers.append(("Nope", "CRAP"))
    return possAnswers    

In [151]:
# For each question, evaluate if the answer is present as an entity
def evaluateNER(questionsList,documentsList, numToEval):
    correct = []
    wrong = []
    correctTagCounter = Counter()
    incorrectTagCounter = Counter()
    for i in range (0, numToEval):
        documents = documentsList[i]
        questions = questionsList[i]
        for j in range (0, len(questions)):
            answer = questionsList[i][j]["answer"]
            answerID = questionsList[i][j]["answer_sentence"]
            possAnswers = refineWordTags(taggedDev[i][answerID])
            inThere = False
            for possAnswer in possAnswers:
                if possAnswer[0] == answer:
                    inThere = True
                    correctTagCounter[possAnswer[1]] += 1
                    break
            if inThere:
                correct.append((i,j, possAnswers))
            else :
                wrong.append((i,j, answer, possAnswers))
                # Tag answer
    return (correct, wrong, correctTagCounter, incorrectTagCounter)


In [152]:
(corNER, wrongNER, correctTagCounter, incorrectTagCounter) = evaluateNER(dev, devSents,len(dev))

print("Number Correct : " + str(len(corNER)))
print("Number incorrect: " + str(len(wrongNER)))
print("Average correct : " + str((len(corNER) + 0.0) / (len(corNER)+len(wrongNER))))

Number Correct : 2760
Number incorrect: 5703
Average correct : 0.326125487416


In [None]:
# 0.326125487416 with no others!



In [162]:
# Visual Evaluation:

def visualEvaluateNER():
    numTrue = 0
    numTrials = 0
    for i in range(5):
        trainy = train[i]
        numTrials += len(trainy)
        for qa in trainy:
            answer = qa["answer"]
            answerID = qa["answer_sentence"]
            sentence = trainSents[i][answerID]
            tags = refineWordTags(taggedTrain[i][answerID])
            inThere = False
            for possAnswer in tags:
                if possAnswer[0] == answer:
                    inThere = True
                    break
            if inThere:
                numTrue += 1
    print numTrue
    print numTrials
    
        #print ("Answer: " + answer)
        #print
        #print ("Sentence: " + sentence)
        #print
        #print (tags)
        #print
        #print inThere
        #print 
        #print
visualEvaluateNER()

378
1073


In [163]:
378 / 1073.0

0.35228331780055916

In [110]:
words = "Buckingham Palace, in London, is the capital of England, and where the queen lives"
print chunk(words)
#print refineWordTags(taggedDev[1][1])
#print devSents[1][1]

[('Buckingham Palace', 'O'), ('London', 'O'), ('capital of England', 'O'), ('the queen', 'O'), ('lives', 'O'), ('Nope', 'CRAP')]


In [104]:
    tokenWS = nltk.pos_tag(nltk.word_tokenize(words))
    chunks =  chunkParser.parse(tokenWS)

In [109]:
#for subtree in chunks.subtrees():
    #print subtree.label()
#    if subtree.label() == 'S':
#        for subtree2 in subtree.subtrees():
            #print subtree2.label()
            #print (' '.join(word for word, pos in t.leaves()),'O')

(S
  Buckingham/NNP
  Palace/NNP
  ,/,
  in/IN
  London/NNP
  ,/,
  is/VBZ
  the/DT
  capital/NN
  of/IN
  England/NNP
  ,/,
  and/CC
  where/WRB
  the/DT
  queen/NN
  lives/NNS)


(('NN',), 8929)

In [25]:
mostCommon = [(('NN',), 8929),
 (('CD',), 7556),
 (('NNP', 'NNP'), 5135),
 (('NNS',), 2316),
 (('JJ',), 2056),
 (('JJ', 'NN'), 1819),
 (('NNP', 'NNP', 'NNP'), 1528),
 (('NNP',), 1471),
 (('CD', 'NN'), 1124),
 (('JJ', 'NNS'), 1069),
 (('NN', 'NN'), 1063),
 (('NN', 'NNS'), 727),
 (('NNP', 'CD'), 625),
 (('CD', 'NNS'), 610),
 (('NNP', 'NN'), 551),
 (('DT', 'NNP', 'NNP'), 543),
 (('DT', 'NN'), 527),
 (('DT', 'JJ', 'NN'), 453),
 (('NNP', 'CD', ',', 'CD'), 418),
 (('JJ', 'NNP'), 394),
 (('CD', 'NNP', 'CD'), 387),
 (('CD', 'NNP'), 379),
 (('NNP', 'NNS'), 367),
 (('NNP', 'NNP', 'NNP', 'NNP'), 347),
 (('NNP', 'CC', 'NNP'), 341),
 (('DT', 'NNP'), 336),
 (('CD', 'CD'), 330),
 (('NNP', 'IN', 'NNP'), 320),
 (('NN', 'CC', 'NN'), 296),
 (('VBG',), 251),
 (('JJ', 'NN', 'NN'), 235),
 (('IN', 'CD'), 201),
 (('JJ', 'NN', 'NNS'), 200),
 (('DT', 'NN', 'NN'), 195),
 (('$', 'CD', 'CD'), 194),
 (('DT', 'NNP', 'NNP', 'NNP'), 188),
 (('RB',), 185),
 (('DT', 'NNP', 'NN'), 185),
 (('JJ', 'NNP', 'NNP'), 181),
 (('VBN',), 176),
 (('DT', 'NNP', 'IN', 'NNP'), 160),
 (('NN', 'IN', 'NN'), 156),
 (('NN', 'NNP'), 143),
 (('NNS', 'CC', 'NNS'), 138),
 (('JJ', 'JJ', 'NN'), 137),
 (('NNP', 'NNP', 'CC', 'NNP', 'NNP'), 135),
 (('JJ', 'CC', 'JJ'), 122),
 (('$', 'CD'), 119),
 (('NN', 'IN', 'NNP'), 117),
 (('NNP', 'NNP', 'IN', 'NNP'), 116)]

In [28]:
def mostCommonToList(mostCommon):
    newList = []
    for (tagTuple, val) in mostCommon[0:50]:
        tagList = []
        for tag in tagTuple:
            tagList.append(tag)
        newList.append(tagList)
    return newList
mostCommonList = mostCommonToList(mostCommon)
mostCommonList = sorted(mostCommonList, key=len)
mostCommonList = mostCommonList[::-1]
mostCommonList[0:5]

[['NNP', 'NNP', 'CC', 'NNP', 'NNP'],
 ['NNP', 'NNP', 'IN', 'NNP'],
 ['DT', 'NNP', 'IN', 'NNP'],
 ['DT', 'NNP', 'NNP', 'NNP'],
 ['NNP', 'NNP', 'NNP', 'NNP']]

In [46]:
chunkRules = "ANS: "
for posTags in mostCommonList:
    ruleInStr = "{"
    for tag in posTags:
        ruleInStr = ruleInStr + "<"+ tag + ">"
    chunkRules += ruleInStr + '}\n'
print chunkRules

ANS: {<NNP><NNP><CC><NNP><NNP>}
{<NNP><NNP><IN><NNP>}
{<DT><NNP><IN><NNP>}
{<DT><NNP><NNP><NNP>}
{<NNP><NNP><NNP><NNP>}
{<NNP><CD><,><CD>}
{<NN><IN><NNP>}
{<JJ><CC><JJ>}
{<JJ><JJ><NN>}
{<NNS><CC><NNS>}
{<NN><IN><NN>}
{<JJ><NNP><NNP>}
{<DT><NNP><NN>}
{<$><CD><CD>}
{<DT><NN><NN>}
{<JJ><NN><NNS>}
{<JJ><NN><NN>}
{<NN><CC><NN>}
{<NNP><IN><NNP>}
{<NNP><CC><NNP>}
{<CD><NNP><CD>}
{<DT><JJ><NN>}
{<DT><NNP><NNP>}
{<NNP><NNP><NNP>}
{<$><CD>}
{<NN><NNP>}
{<IN><CD>}
{<CD><CD>}
{<DT><NNP>}
{<NNP><NNS>}
{<CD><NNP>}
{<JJ><NNP>}
{<DT><NN>}
{<NNP><NN>}
{<CD><NNS>}
{<NNP><CD>}
{<NN><NNS>}
{<NN><NN>}
{<JJ><NNS>}
{<CD><NN>}
{<JJ><NN>}
{<NNP><NNP>}
{<VBN>}
{<RB>}
{<VBG>}
{<NNP>}
{<JJ>}
{<NNS>}
{<CD>}
{<NN>}



In [82]:
chunkRules = """
ANS:    {<NNP><NNP><CC><NNP><NNP>}
        {<NNP><NNP><IN><NNP>}
        {<DT><NNP><IN><NNP>}
        {<DT><NNP><NNP><NNP>}
        {<NNP><NNP><NNP><NNP>}
        {<NNP><CD><,><CD>}
        {<NN><IN><NNP>}
        {<JJ><CC><JJ>}
        {<JJ><JJ><NN>}
        {<NNS><CC><NNS>}
        {<NN><IN><NN>}
        {<JJ><NNP><NNP>}
        {<DT><NNP><NN>}
        {<$><CD><CD>}
        {<DT><NN><NN>}
        {<JJ><NN><NNS>}
        {<JJ><NN><NN>}
        {<NN><CC><NN>}
        {<NNP><IN><NNP>}
        {<NNP><CC><NNP>}
        {<CD><NNP><CD>}
        {<DT><JJ><NN>}
        {<DT><NNP><NNP>}
        {<NNP><NNP><NNP>}
        {<$><CD>}
        {<NN><NNP>}
        {<IN><CD>}
        {<CD><CD>}
        {<DT><NNP>}
        {<NNP><NNS>}
        {<CD><NNP>}
        {<JJ><NNP>}
        {<DT><NN>}
        {<NNP><NN>}
        {<CD><NNS>}
        {<NNP><CD>}
        {<NN><NNS>}
        {<NN><NN>}
        {<JJ><NNS>}
        {<CD><NN>}
        {<JJ><NN>}
        {<NNP><NNP>}
        {<VBN>}
        {<RB>}
        {<VBG>}
        {<NNP>}
        {<JJ>}
        {<NNS>}
        {<CD>}
        {<NN>}
"""




In [80]:
import nltk

def prepareForNLP(text):
	sentences = nltk.sent_tokenize(text)
	sentences = [nltk.word_tokenize(sent) for sent in sentences]
	sentences = [nltk.pos_tag(sent) for sent in sentences]
	return sentences

def chunk(sentence):
    chunkToExtract = """
    ANS: {<NNP>*}
        {<DT>?<JJ>?<NNS>}
        {<NN><NN>}"""
    parser = nltk.RegexpParser(chunkToExtract)
    result = parser.parse(sentence)
    print result
    for subtree in result.subtrees():
		if subtree.label() == 'ANS':
			t = subtree
			t = ' '.join(word for word, pos in t.leaves())
			print(t)



sentences = prepareForNLP("A prison riot left six members of staff needing hospital treatment earlier this month, the BBC learns")
for sentence in sentences:
	chunk(sentence)

(S
  A/DT
  (ANS prison/NN riot/NN)
  left/VBD
  six/CD
  (ANS members/NNS)
  of/IN
  staff/NN
  needing/VBG
  (ANS hospital/NN treatment/NN)
  earlier/RBR
  this/DT
  month/NN
  ,/,
  the/DT
  (ANS BBC/NNP)
  (ANS learns/NNS))
prison riot
members
hospital treatment
BBC
learns
