In [1]:
# Load in the python script containing the same code as the load the data notebook
%run loadData.py
# now we can access train, dev, and test
# along with trainSents, devSents testSents

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [11]:
# Tuning functions

import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Follow lemmatize function from guide notebook: WSTA_N1B_preprocessing.ipynb
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

In [12]:
# Core functions

def vectorize_documents(text_documents):

    vectorizer = TfidfVectorizer(stop_words='english')
    vector_documents = vectorizer.fit_transform(text_documents)
    
    return [vector_documents, vectorizer]

def vectorize_query(vectorizer, text_query):
    return vectorizer.transform([text_query])

def process_neighbours(vector_documents):
    
    neighbours = NearestNeighbors(1, algorithm="brute", metric="cosine")
    neighbours.fit(vector_documents)
    
    return neighbours

def closest_document(neighbours, vector_query):

    result = neighbours.kneighbors(vector_query, 1, return_distance=True)

    result_index = result[1][0][0]
    result_distance = result[0][0][0]
    
    return [result_distance, result_index]

In [13]:
# Lets make a list of the questions part A gets right
correct = []
for i in range (0, len(dev)):
    documents = devSents[i]
    questions = dev[i]

    vector_documents, vectorizer = vectorize_documents(documents)
    analyze = vectorizer.build_analyzer()
    neighbours = process_neighbours(vector_documents)

    for j in range (0, len(questions)):
        text_query = questions[j]["question"]
        vector_query = vectorize_query(vectorizer, text_query)
        result_similarity, result_index  = closest_document(neighbours, vector_query)
        if result_index == int(questions[j]["answer_sentence"]):
            correct.append((i,j,result_index))


In [14]:
partACorrect = correct

In [15]:
print len(partACorrect)

4864


# Entity Extraction

In [4]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# The required jar files : https://nlp.stanford.edu/software/CRF-NER.shtml#Download
# It's 171mb so I've added to the gitignore
# If you download it, and rename the folder name "stanford" in the main directory
classifier = './stanford/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = './stanford/stanford-ner.jar'

sTagger = StanfordNERTagger(classifier,jar)

In [5]:
import pickle # Useful for read / write of list file
import os #Needed to check if file exists

In [6]:
# Lets store the stanford tagger output in a file
# This function returns the tagging output of stanford for each dataset
# with datasetName - 'train', 'dev', test' 

def getStanfordTagging(datasetName):
    fnameTrain = './preCompTags/stanfordTaggedTrain.txt'
    fnameDev = './preCompTags/stanfordTaggedDev.txt'
    fnameTest = './preCompTags/stanfordTaggedTest.txt'
    
    theFilePath = ''
    theSents = []
    if (datasetName == 'train'):
        theFilePath = fnameTrain
        theSents = trainSents
    elif (datasetName == 'dev'):
        theFilePath = fnameDev
        theSents = devSents
    elif (datasetName == 'test'):
        theFilePath = fnameTest
        theSents = testSents
    else :
        raise ValueError('Incorrect datasetName: ' + datasetName + ', choose from - "train", "dev", "test" ') 
    if (os.path.exists(theFilePath)):
        with open(theFilePath, "rb") as fp:
            stanfordTags = pickle.load(fp)
            return stanfordTags
    
    else :
        #Need to create taggings!
        taggedSentsList = []
        for sents in theSents:
            tokenisedSents = [word_tokenize(sent) for sent in sents]
            classifiedSents = sTagger.tag_sents(tokenisedSents)
            taggedSentsList.append(classifiedSents)
        #And save them
        with open(theFilePath, "wb") as fp: 
            pickle.dump(taggedSentsList, fp)
        return taggedSentsList
    

In [7]:
taggedTrain = getStanfordTagging('train')
taggedDev = getStanfordTagging('dev')
taggedTest = getStanfordTagging('test')

The second main part of your basic QA system is an NER system. In this initial system you should have at least four
answer types: PERSON, LOCATION, NUMBER, and OTHER. You should run the Stanford NER system over your
sentences to extract people and location entities (Hint: make use of the "tag_sents" method in the NLTK interface to do
this efficiently for multiple sentences in a single call, otherwise this will be very slow; you may also want to cache the
entity information during development of your system, rather than calling Stanford NER for each run). Note that
contiguous words tagged as the same type should be considered part of the same entity. ORGANIZATION entities
extracted by the NER system should be considered OTHER. You should also extract and treat as OTHER any other
non-sentence initial sequence of capitalized words not tagged by Stanford NER. Finally, you should label all numbers as
NUMBER. In this process, you might notice errors related to your preprocessing (e.g. tokenization), errors which can be
easily corrected should be addressed at this stage.

In [79]:
from collections import Counter
counter = Counter()
for subTrain in taggedTrain:
    for sent in subTrain:
        for (word, tag) in sent:
            counter[tag] += 1

In [80]:
counter

Counter({u'LOCATION': 59363,
         u'O': 1903787,
         u'ORGANIZATION': 46508,
         u'PERSON': 47497})

In [147]:
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


# Given a stanford tagged list, refines the list by:\n",
# Grouping all contiguous words with the same tag\n",
# Relabels Organisations as Other\n",
# Labels Number\n",
def refineWordTags(taggedWordList):
    newWordTags = []
    for (word, tag) in taggedWordList:
        if (tag == 'ORGANIZATION'):
            tag = 'O'
        if (tag == 'O'):
            #Might be a number
            if isNumber(word):
                tag = 'NUMBER'
            elif isCapitalised(word):
                tag = 'OTHERCAP'
            elif word in preUnits:
                tag = 'PRENUM'
            elif word in postUnits:
                tag = 'POSTNUM'
            elif isStopWord(word):
                tag = 'STOPWORD'
            elif isPunctuation(word):
                tag = 'PUNC'

        newWordTags.append((word, tag))
    
    newWordTags = combineTags (newWordTags)
    return newWordTags
        
def combineTags(wordTags):
    newTags = []
    prevWord = wordTags[0][0]
    prevTag = wordTags[0][1]
    for (word, tag) in wordTags[1:]:
        if tag == 'NUMBER' and prevTag == 'PRENUM':
            prevTag = 'NUMBER'
        elif prevTag == 'PRENUM':
            prevTag = 'O'
        if tag == 'POSTNUM' and prevTag == "NUMBER":
            tag = "NUMBER"
        elif tag == "POSTNUM":
            tag = "O"
        newTags.append((prevWord, prevTag))
        prevWord = word
        prevTag = tag
    newTags.append((prevWord, prevTag))
    
    newNewTags = []
    prevWord = newTags[0][0]
    prevTag = newTags[0][1]
    if (prevTag == "OTHERCAP"):
        prevTag = "O"
    for (word, tag) in wordTags[1:]:            
        if tag == prevTag :
            prevWord += ' ' + word
        else :
            newNewTags.append((prevWord, prevTag))
            prevWord = word
            prevTag = tag
    newNewTags.append((prevWord, prevTag))
    return newNewTags

# Thanks for this list to save me typing it : http://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers\n",
numInWords = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"
       , "hundred", "thousand", "million", "billion", "trillion"]

punctuation = ['.',',',';',':']

def isPunctuation(word):
    return word in punctuation
def isCapitalised (word):
    if len(word) == 0:
        return False
    return word[0].isupper()

# Obtained from training data
postUnits = [u'%', u'century', u'years', u'percent', u'years ago', u'days', u'months', u'km', u'hours', u'times', u'inches', u'\xb0C', u'minutes', u'acres', u'\xb0F', u'weeks', u'people', u'sq mi', u'mi', u'ft', u'feet', u'metres', u'mm', u'square miles', u'miles', u'pm', u'per cent', u'year', u'copies', u'yuan', u'men', u'square feet', u'third', u'kilometres', u'nm', u'tonnes', u'species', u'decades', u'barrels', u'tons', u'largest', u'centuries', u'km2']
preUnits = [u'$', u'around', u'late', u'early', u'nearly', u'since', u'approximately', u'number']

# Returns true if the word represents a number\n",
def isNumber(word):
    pattern = ".?(\\d)+((,|.)(\\d)+)*"
    if re.match(pattern,word) :
        return True
    if word.lower() in numInWords:
        return True
    return False

def isStopWord(word):
    return word.lower() in stop_words


In [145]:
# For each question, evaluate if the answer is present as an entity

def evaluateNER():
    correct = []
    wrong = []
    for (doc, questID, sentID) in partACorrect:
        answer = dev[doc][questID]["answer"]
        possAnswers = refineWordTags(taggedDev[doc][sentID])        
        inThere = False
        for possAnswer in possAnswers:
            if possAnswer[0] == answer:
                inThere = True
                break
        if inThere:
            correct.append((doc,sentID,answer, possAnswers))
        else :
            wrong.append((doc,sentID, answer, possAnswers))
    return (correct, wrong)


In [148]:
(nerCorrectList, nerWrongList) = evaluateNER()
print (len(nerCorrectList) + 0.0) / (len(nerCorrectList) +len(nerWrongList))

0.458881578947


In [None]:
0.462787828947
With mistake :D


In [136]:
def showWrong(wrongItem):
    print "ANS: " + wrongItem[2]
    print 
    print wrongItem[3]

In [138]:
showWrong(nerWrongList[0])

ANS: Infrared astronomy

[(u'Infrared astronomy uses sensor-equipped telescopes', 'O'), (u'to', 'STOPWORD'), (u'penetrate dusty regions', u'O'), (u'of', 'STOPWORD'), (u'space', u'O'), (u',', 'PUNC'), (u'such as', 'STOPWORD'), (u'molecular clouds', u'O'), (u';', 'PUNC'), (u'detect objects', u'O'), (u'such as', 'STOPWORD'), (u'planets', u'O'), (u',', 'PUNC'), (u'and to', 'STOPWORD'), (u'view highly red-shifted objects', u'O'), (u'from the', 'STOPWORD'), (u'early', 'PRENUM'), (u'days', 'POSTNUM'), (u'of the', 'STOPWORD'), (u'universe', u'O'), (u'.', 'PUNC')]
