In [1]:
# Load in the python script containing the same code as the load the data notebook
%run loadData.py
# now we can access train, dev, and test
# along with trainSents, devSents testSents
documents = testSents[0]
questions = test[0]

## Part A

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [3]:
# Tuning functions

import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Follow lemmatize function from guide notebook: WSTA_N1B_preprocessing.ipynb
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

word_tokenizer = nltk.tokenize.WordPunctTokenizer() #word_tokenize #tokenize.regexp.WordPunctTokenizer()

def pre_process(line):
    tokenized_sentence = word_tokenizer.tokenize(line.lower())
    lemmatized_sentence = [lemmatize(token) for token in tokenized_sentence]
    return lemmatized_sentence

In [4]:
# Core functions

def vectorize_documents(text_documents):

    vectorizer = TfidfVectorizer(stop_words='english', tokenizer=pre_process)
    vector_documents = vectorizer.fit_transform(text_documents)
    
    return [vector_documents, vectorizer]

def vectorize_query(vectorizer, text_query):
    return vectorizer.transform([text_query])

def process_neighbours(vector_documents):
    
    neighbours = NearestNeighbors(1, algorithm="brute", metric="cosine")
    neighbours.fit(vector_documents)
    
    return neighbours

def closest_document(neighbours, vector_query):

    result = neighbours.kneighbors(vector_query, 1, return_distance=True)

    result_index = result[1][0][0]
    result_distance = result[0][0][0]
    
    return [result_distance, result_index]

In [5]:
def generatePartAOutput(qs, sents):
    # Output for part A
    partAOutput = []
    for i in range (0, len(qs)):
        documents = sents[i]
        questions = qs[i]

        vector_documents, vectorizer = vectorize_documents(documents)
        analyze = vectorizer.build_analyzer()
        neighbours = process_neighbours(vector_documents)

        for j in range (0, len(questions)):
            text_query = questions[j]["question"]
            vector_query = vectorize_query(vectorizer, text_query)
            result_similarity, result_index  = closest_document(neighbours, vector_query)
            partAOutput.append((i,j,result_index))
    return partAOutput

partADevAnswers = generatePartAOutput(dev, devSents)
partATestAnswers = generatePartAOutput(test, testSents)

## Bigram time!


### First lets get the stanford taggings

In [8]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# The required jar files : https://nlp.stanford.edu/software/CRF-NER.shtml#Download
# It's 171mb so I've added to the gitignore
# If you download it, and rename the folder name "stanford" in the main directory
classifier = './stanford/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = './stanford/stanford-ner.jar'

sTagger = StanfordNERTagger(classifier,jar)

In [9]:
import pickle # Useful for read / write of list file
import os #Needed to check if file exists

In [10]:
# Lets store the stanford tagger output in a file
# This function returns the tagging output of stanford for each dataset
# with datasetName - 'train', 'dev', test' 

def getStanfordTagging(datasetName):
    fnameTrain = './preCompTags/stanfordTaggedTrain.txt'
    fnameDev = './preCompTags/stanfordTaggedDev.txt'
    fnameTest = './preCompTags/stanfordTaggedTest.txt'
    
    theFilePath = ''
    theSents = []
    if (datasetName == 'train'):
        theFilePath = fnameTrain
        theSents = trainSents
    elif (datasetName == 'dev'):
        theFilePath = fnameDev
        theSents = devSents
    elif (datasetName == 'test'):
        theFilePath = fnameTest
        theSents = testSents
    else :
        raise ValueError('Incorrect datasetName: ' + datasetName + ', choose from - "train", "dev", "test" ') 
    if (os.path.exists(theFilePath)):
        with open(theFilePath, "rb") as fp:
            stanfordTags = pickle.load(fp)
            return stanfordTags
    
    else :
        #Need to create taggings!
        taggedSentsList = []
        for sents in theSents:
            tokenisedSents = [word_tokenize(sent) for sent in sents]
            classifiedSents = sTagger.tag_sents(tokenisedSents)
            taggedSentsList.append(classifiedSents)
        #And save them
        with open(theFilePath, "wb") as fp: 
            pickle.dump(taggedSentsList, fp)
        return taggedSentsList
    

In [11]:
taggedTrain = getStanfordTagging('train')
taggedDev = getStanfordTagging('dev')
taggedTest = getStanfordTagging('test')

In [12]:
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


# Given a stanford tagged list, refines the list by:\n",
# Grouping all contiguous words with the same tag\n",
# Relabels Organisations as Other\n",
# Labels Number\n",
def refineWordTags(taggedWordList):
    newWordTags = []
    for (word, tag) in taggedWordList:
        if (tag == 'ORGANIZATION'):
            tag = 'O'
        if (tag == 'O'):
            #Might be a number
            if isNumber(word):
                tag = 'NUMBER'
            elif isCapitalised(word):
                tag = 'OTHERCAP'
            elif isStopWord(word):
                tag = 'STOPWORD'
            elif isPunctuation(word):
                tag = 'PUNC'

        newWordTags.append((word, tag))
    
    return newWordTags

# Thanks for this list to save me typing it : http://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers\n",
numInWords = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"
       , "hundred", "thousand", "million", "billion", "trillion"]

punctuation = ['.',',',';',':']

def isPunctuation(word):
    return word in punctuation
def isCapitalised (word):
    if len(word) == 0:
        return False
    return word[0].isupper()

# Obtained from training data
postUnits = [u'%', u'century', u'years', u'percent', u'years ago', u'days', u'months', u'km', u'hours', u'times', u'inches', u'\xb0C', u'minutes', u'acres', u'\xb0F', u'weeks', u'people', u'sq mi', u'mi', u'ft', u'feet', u'metres', u'mm', u'square miles', u'miles', u'pm', u'per cent', u'year', u'copies', u'yuan', u'men', u'square feet', u'third', u'kilometres', u'nm', u'tonnes', u'species', u'decades', u'barrels', u'tons', u'largest', u'centuries', u'km2']
preUnits = [u'$', u'around', u'late', u'early', u'nearly', u'since', u'approximately', u'number']

# Returns true if the word represents a number\n",
def isNumber(word):
    pattern = ".?(\\d)+((,|.)(\\d)+)*"
    if re.match(pattern,word) :
        return True
    if word.lower() in numInWords:
        return True
    return False

def isStopWord(word):
    return word.lower() in stop_words


In [23]:
def refineTheTags(dataset):
    tempList = []
    for doc in dataset:
        innerList = []
        for sent in doc:
            innerList.append(refineWordTags(sent))
        tempList.append(innerList)
    return tempList

refinedTrain = refineTheTags(taggedTrain)
refinedDev = refineTheTags(taggedDev)
refinedTest = refineTheTags(taggedTest)

## Bigram Creation Data


In [36]:
from collections import defaultdict
def getAnswerDict(qss):
    sentDicts = defaultdict(list)
    for docID in range(0, len(qss)):
        qs = qss[docID]
        for q in qs:
            answer = q["answer"]
            answerSent = (docID, q["answer_sentence"])
            sentDicts[answerSent].append(answer)
    return sentDicts
trainSentDicts = getAnswerDict(train)
devSentDicts = getAnswerDict(dev)

In [30]:
#This function returns a list of the indexes of words that are an answer to a question in the train set
def getAnswerPos(docID, sentID, sents, answerDict):
    sententence = sents[docID][sentID]
    listAnswers = answerDict[(docID, sentID)]
    tokenisedSent =  nltk.word_tokenize(sententence)
    answerPosList = []
    for ans in listAnswers:
        tokenisedAns = nltk.word_tokenize(ans)
        # Inefficient
        
        for i in range (0, len(tokenisedSent) - len(tokenisedAns)):
            answerFragment = tokenisedSent[i:i+len(tokenisedAns)]
            if (answerFragment == tokenisedAns):
                for j in range (0, len(tokenisedAns)):
                    answerPosList.append(i+ j)
                break
    return answerPosList

getAnswerPos(0,0,trainSents, trainSentDicts)

[24, 25, 26, 27]

In [31]:
def getModelData(docID, sentID, sents, stanfordTags, answerDict):
    combTags = []
    theSent = sents[docID][sentID]
    posTags = nltk.pos_tag(nltk.word_tokenize(theSent))
    stanfordTags =refineWordTags(stanfordTags[docID][sentID])
    if len(posTags) != len(stanfordTags):
        print(docID, sentID)
        return[]
    answerPos = getAnswerPos(docID, sentID, sents , answerDict)
    #print answerPos
    for i in range(0, len(posTags)):
        word = posTags[i][0]
        posTag = posTags[i][1]
        stanfordTag = stanfordTags[i][1]
        answer = i in answerPos
        combTags.append((posTag, stanfordTag, answer))
    return combTags

In [34]:
from collections import Counter
answerCounter = Counter()
for i in range(0, len(trainSents)):
    for j in range(0, len(trainSents[i])):
        groupTags = getModelData(i,j,trainSents, taggedTrain, trainSentDicts)
        for k in range(0, len(groupTags) - 2):
            answerCounter[tuple(groupTags[k:k+2])] += 1
        if len(groupTags) > 0:
            answerCounter[('S',groupTags[0])] += 1

(68, 172)
(209, 364)
(329, 133)


## Predicting From Model:


In [37]:
groupTags = getModelData(0,0,devSents, taggedDev, devSentDicts)

In [42]:
print devSentDicts[(0,0)]
print devSents[0][0]
groupTags

[u'scientific']
Infrared radiation is used in industrial, scientific, and medical applications.


[('VBN', 'OTHERCAP', False),
 ('NN', u'O', False),
 ('VBZ', 'STOPWORD', False),
 ('VBN', u'O', False),
 ('IN', 'STOPWORD', False),
 ('JJ', u'O', False),
 (',', 'PUNC', False),
 ('JJ', u'O', True),
 (',', 'PUNC', False),
 ('CC', 'STOPWORD', False),
 ('JJ', u'O', False),
 ('NNS', u'O', False),
 ('.', 'PUNC', False)]

In [43]:
print answerCounter[('S',('VBN', 'OTHERCAP', True) )]
print answerCounter[('S',('VBN', 'OTHERCAP', False) )]


31
447


In [68]:
def predictAnswer(groupTags):
    newTags = []
    if len(groupTags) == 0:
        return newTags
    prevTag = 'S'
    multiplier = 1
    for tag in groupTags:
        trueTag = (tag[0], tag[1], True)
        falseTag = (tag[0], tag[1], False)
        numTrue = answerCounter[(prevTag, trueTag)]
        numFalse = answerCounter[(prevTag, falseTag)]
        if numTrue * multiplier >= numFalse:
            newTags.append(trueTag)
            prevTag = trueTag
            multiplier = 1
        else :
            newTags.append(falseTag)
            prevTag = falseTag
            multiplier = 50
    return newTags

In [71]:
theSent =  devSents[0][1]
groupTags = getModelData(0,1,devSents, taggedDev, devSentDicts)
prediction  = predictAnswer(groupTags)
tokenSent = nltk.word_tokenize(theSent)
for i in range (0, len(tokenSent)):
    print tokenSent[i]
    print groupTags[i]
    print prediction[i]
    print

Night-vision
('NN', 'OTHERCAP', False)
('NN', 'OTHERCAP', False)

devices
('NNS', u'O', False)
('NNS', u'O', False)

using
('VBG', u'O', False)
('VBG', u'O', False)

active
('JJ', u'O', True)
('JJ', u'O', True)

near-infrared
('JJ', u'O', True)
('JJ', u'O', True)

illumination
('NN', u'O', True)
('NN', u'O', True)

allow
('IN', u'O', False)
('IN', u'O', True)

people
('NNS', u'O', False)
('NNS', u'O', True)

or
('CC', 'STOPWORD', False)
('CC', 'STOPWORD', True)

animals
('NNS', u'O', False)
('NNS', u'O', True)

to
('TO', 'STOPWORD', False)
('TO', 'STOPWORD', True)

be
('VB', 'STOPWORD', False)
('VB', 'STOPWORD', True)

observed
('VBN', u'O', False)
('VBN', u'O', True)

without
('IN', u'O', False)
('IN', u'O', True)

the
('DT', 'STOPWORD', False)
('DT', 'STOPWORD', True)

observer
('NN', u'O', False)
('NN', u'O', True)

being
('VBG', 'STOPWORD', False)
('VBG', 'STOPWORD', True)

detected
('VBN', u'O', False)
('VBN', u'O', True)

.
('.', 'PUNC', False)
('.', 'PUNC', True)

