# Load Data

In [1]:
# Load in the python script containing the same code as the load the data notebook
%run loadData.py
# now we can access train, dev, and test
# along with trainSents, devSents testSents

# Demo Data

In [2]:
trainSents[0][2]

u'Phonograph records are generally described by their diameter in inches (12", 10", 7"), the rotational speed in rpm at which they are played (16 2\u20443, 33 1\u20443, 45, 78), and their time capacity resulting from a combination of those parameters (LP \u2013 long playing 33 1\u20443 rpm, SP \u2013 78 rpm single, EP \u2013 12-inch single or extended play, 33 or 45 rpm); their reproductive quality or level of fidelity (high-fidelity, orthophonic, full-range, etc.), and the number of audio channels provided (mono, stereo, quad, etc.).'

In [3]:
train[0][0]

{u'answer': u'long playing',
 u'answer_sentence': 2,
 u'question': u'What does LP stand for when it comes to time capacity?'}

In [108]:
documents = testSents[0]
questions = test[0]

In [109]:
print len(questions)

287


# Useful Imports

In [5]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Shared Workflow Thoughts (dealing with .ipynb notebooks)

Think with each feature we do below, create generalized functions that can be easily composed with easy names, split by type.

Then create/use small demo template below using the locked document/questions above to get intuition, check sanity, iterate quickly, to help keep us all on the same page.

This way we keep everything well contained/documented/explainable, will help with report writing.

Then in separate notebook we do statistical valid/testing for error exploration/analysis, using generalized functions above - easily changeable/copyable.

Finally put it all in a python file that will do full run. Write TODOs to illustrate next steps/improvements, that way can stay on top/track/improve upon easily.

# Sentence Retreival

The first part of your basic QA system will use a bag-of-words (BOW) vector space model to identify the sentence in the Wikipedia article which is most likely to contain the answer to a question, using standard information retrieval techniques. Here the "query" is the question, the "documents" are actually sentences, and each Wikipedia article should be viewed as separate "document collection". You should apply various preprocessing steps appropriate to this situation, including term weighting; if you are at all uncertain about what choices to make, you should evaluate them using the dev data, and use the results to justify your choice in your report.

##### TODO

* Improving tuning of preprocessing/lemmatize functions for use QA case

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [12]:
# Tuning functions

import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Follow lemmatize function from guide notebook: WSTA_N1B_preprocessing.ipynb
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

In [13]:
# Core functions

def vectorize_documents(text_documents):

    vectorizer = TfidfVectorizer(stop_words='english')
    vector_documents = vectorizer.fit_transform(text_documents)
    
    return [vector_documents, vectorizer]

def vectorize_query(vectorizer, text_query):
    return vectorizer.transform([text_query])

def process_neighbours(vector_documents):
    
    neighbours = NearestNeighbors(1, algorithm="brute", metric="cosine")
    neighbours.fit(vector_documents)
    
    return neighbours

def closest_document(neighbours, vector_query):

    result = neighbours.kneighbors(vector_query, 1, return_distance=True)

    result_index = result[1][0][0]
    result_distance = result[0][0][0]
    
    return [result_distance, result_index]

In [16]:
# Demonstration function

def demo_process_set(questions, documents):
    
    vector_documents, vectorizer = vectorize_documents(documents)
    analyze = vectorizer.build_analyzer()
    neighbours = process_neighbours(vector_documents)

    print "=" * 20
    print "Vector documents shape: {0}".format(vector_documents.shape)
    print "Actual documents length: {0}".format(len(documents))
    print "=" * 20, "\n"
    
    for question in questions[10:10+3]:
        
        text_query = question["question"]

        print "Text query:\n\n\t{0}\n".format(text_query)

        vector_query = vectorize_query(vectorizer, text_query)

        print "Vector query shape:\n\n\t{0}".format(vector_query.shape)

        result_distance, result_index  = closest_document(neighbours, vector_query)
        
        print

        print "Result:\n\n\tDistance ({0}), Index ({1})\n".format(result_distance, result_index)

        print

        print "Query (text):\n\n\t{0}\n".format(text_query)
        print "Document (text):\n\n\t{0}".format(documents[result_index].encode("utf-8"))

        print

        print "Query (vector text):\n"
        pp.pprint(analyze(text_query))
        print
        
        print "Document (vector text): \n\n"
        pp.pprint(analyze(documents[result_index]))
        
        print "\n", "=" * 20, "\n"

In [17]:
demo_process_set(questions, documents)

Vector documents shape: (381, 1886)
Actual documents length: 381

Text query:

	Where did the war begin?

Vector query shape:

	(1, 1886)
  (0, 1835)	0.336166251799
  (0, 602)	0.569600436833
  (0, 309)	0.750031728336

Result:

	Distance (0.675636845783), Index (313)


Query (text):

	Where did the war begin?

Document (text):

	However, Greece did not coordinate its plans with Russia, did not declare war, and received no outside military or financial support.

Query (vector text):

[u'did', u'war', u'begin']

Document (vector text): 


[   u'greece',
    u'did',
    u'coordinate',
    u'plans',
    u'russia',
    u'did',
    u'declare',
    u'war',
    u'received',
    u'outside',
    u'military',
    u'financial',
    u'support']


Text query:

	Russian troops took over which provinces first?

Vector query shape:

	(1, 1886)
  (0, 1767)	0.464837369809
  (0, 1740)	0.565353995174
  (0, 1519)	0.303980942175
  (0, 1372)	0.609833310475

Result:

	Distance (0.644976635632), Index (10)


Que

# Entity Extraction

The second main part of your basic QA system is an NER system. In this initial system you should have at least four
answer types: PERSON, LOCATION, NUMBER, and OTHER. You should run the Stanford NER system over your
sentences to extract people and location entities (Hint: make use of the "tag_sents" method in the NLTK interface to do
this efficiently for multiple sentences in a single call, otherwise this will be very slow; you may also want to cache the
entity information during development of your system, rather than calling Stanford NER for each run). Note that
contiguous words tagged as the same type should be considered part of the same entity. ORGANIZATION entities
extracted by the NER system should be considered OTHER. You should also extract and treat as OTHER any other
non-sentence initial sequence of capitalized words not tagged by Stanford NER. Finally, you should label all numbers as
NUMBER. In this process, you might notice errors related to your preprocessing (e.g. tokenization), errors which can be
easily corrected should be addressed at this stage.

##### TODO

* Finish NER Tagging 
* Test run stanford function on another machine (+ record times)

In [32]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# The required jar files : https://nlp.stanford.edu/software/CRF-NER.shtml#Download
# It's 171mb so I've added to the gitignore
# If you download it, and rename the folder name "stanford" in the main directory
classifier = './stanford/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = './stanford/stanford-ner.jar'

sTagger = StanfordNERTagger(classifier,jar)

In [33]:
#Quick test of Tagger
text = "There were three geese, Derek, Joshi, and Alex, who lived in Melbourne, and worked at Microsoft"
tokenizedText = word_tokenize(text)
classifiedText = sTagger.tag_sents([tokenizedText])

# Warning, it takes a little while ~ 5 seconds on my comp
print(classifiedText)

[[(u'There', u'O'), (u'were', u'O'), (u'three', u'O'), (u'geese', u'O'), (u',', u'O'), (u'Derek', u'PERSON'), (u',', u'O'), (u'Joshi', u'PERSON'), (u',', u'O'), (u'and', u'O'), (u'Alex', u'PERSON'), (u',', u'O'), (u'who', u'O'), (u'lived', u'O'), (u'in', u'O'), (u'Melbourne', u'LOCATION'), (u',', u'O'), (u'and', u'O'), (u'worked', u'O'), (u'at', u'O'), (u'Microsoft', u'ORGANIZATION')]]


In [34]:
import pickle # Useful for read / write of list file
import os #Needed to check if file exists

In [35]:
# Lets store the stanford tagger output in a file
# This function returns the tagging output of stanford for each dataset
# with datasetName - 'train', 'dev', test' 

def getStanfordTagging(datasetName):
    fnameTrain = './preCompTags/stanfordTaggedTrain.txt'
    fnameDev = './preCompTags/stanfordTaggedDev.txt'
    fnameTest = './preCompTags/stanfordTaggedTest.txt'
    
    theFilePath = ''
    theSents = []
    if (datasetName == 'train'):
        theFilePath = fnameTrain
        theSents = trainSents
    elif (datasetName == 'dev'):
        theFilePath = fnameDev
        theSents = devSents
    elif (datasetName == 'test'):
        theFilePath = fnameTest
        theSents = testSents
    else :
        raise ValueError('Incorrect datasetName: ' + datasetName + ', choose from - "train", "dev", "test" ') 
    if (os.path.exists(theFilePath)):
        with open(theFilePath, "rb") as fp:
            stanfordTags = pickle.load(fp)
            return stanfordTags
    
    else :
        #Need to create taggings!
        taggedSentsList = []
        for sents in theSents:
            tokenisedSents = [word_tokenize(sent) for sent in sents]
            classifiedSents = sTagger.tag_sents(tokenisedSents)
            taggedSentsList.append(classifiedSents)
        #And save them
        with open(theFilePath, "wb") as fp: 
            pickle.dump(taggedSentsList, fp)
        return taggedSentsList
    

In [36]:
taggedTrain = getStanfordTagging('train')
taggedDev = getStanfordTagging('dev')
taggedTest = getStanfordTagging('test')

In [141]:
print taggedTest[0][2]

[(u'The', u'O'), (u'French', u'O'), (u'promoted', u'O'), (u'the', u'O'), (u'rights', u'O'), (u'of', u'O'), (u'Catholics', u'O'), (u',', u'O'), (u'while', u'O'), (u'Russia', u'LOCATION'), (u'promoted', u'O'), (u'those', u'O'), (u'of', u'O'), (u'the', u'O'), (u'Eastern', u'O'), (u'Orthodox', u'O'), (u'Christians', u'O'), (u'.', u'O')]


In [38]:
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


# Given a stanford tagged list, refines the list by:\n",
# Grouping all contiguous words with the same tag\n",
# Relabels Organisations as Other\n",
# Labels Number\n",
def refineWordTags(taggedWordList):
    newWordTags = []
    prevWord = ''
    prevTag = taggedWordList[0][1] # Ie the first tag\n",
    for (word, tag) in taggedWordList:
        if (tag == 'ORGANIZATION'):
            tag = 'O'
        if (tag == 'O'):
            #Might be a number
            if isNumber(word):
                tag = 'NUMBER'
            elif isCapitalised(word):
                tag = 'OTHERCAP'
            elif isStopWord(word):
                tag = 'STOPWORD'
        if (tag == prevTag):
            prevWord += ' ' + word
        else :
            newWordTags.append((prevWord, prevTag))
            prevWord = word
            prevTag = tag
    # Need to add the final ones
    newWordTags.append((prevWord, prevTag))
    return newWordTags
        
# Thanks : http://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers\n",
numInWords = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"
       , "hundred", "thousand", "million", "billion", "trillion"]

def isCapitalised (word):
    if len(word) == 0:
        return False
    return word[0].isupper()

# Returns true if the word represents a number\n",
def isNumber(word):
    pattern = ".?(\\d)+((,|.)(\\d)+)*"
    if re.match(pattern,word) :
        return True
    if word.lower() in numInWords:
        return True
    return False

def isStopWord(word):
    return word.lower() in stop_words


In [42]:
# Small test:
print (taggedTrain[1][0])
print 
print refineWordTags(taggedTrain[1][0])

[(u'Raleigh', u'O'), (u'(', u'O'), (u'\u02c8r\u0251\u02d0li', u'O'), (u';', u'O'), (u'RAH-lee', u'O'), (u')', u'O'), (u'is', u'O'), (u'the', u'O'), (u'capital', u'O'), (u'of', u'O'), (u'the', u'O'), (u'state', u'O'), (u'of', u'O'), (u'North', u'LOCATION'), (u'Carolina', u'LOCATION'), (u'as', u'O'), (u'well', u'O'), (u'as', u'O'), (u'the', u'O'), (u'seat', u'O'), (u'of', u'O'), (u'Wake', u'LOCATION'), (u'County', u'LOCATION'), (u'in', u'O'), (u'the', u'O'), (u'United', u'LOCATION'), (u'States', u'LOCATION'), (u'.', u'O')]

[('', u'O'), (u'Raleigh', 'OTHERCAP'), (u'( \u02c8r\u0251\u02d0li ;', u'O'), (u'RAH-lee', 'OTHERCAP'), (u')', u'O'), (u'is the', 'STOPWORD'), (u'capital', u'O'), (u'of the', 'STOPWORD'), (u'state', u'O'), (u'of', 'STOPWORD'), (u'North Carolina', u'LOCATION'), (u'as', 'STOPWORD'), (u'well', u'O'), (u'as the', 'STOPWORD'), (u'seat', u'O'), (u'of', 'STOPWORD'), (u'Wake County', u'LOCATION'), (u'in the', 'STOPWORD'), (u'United States', u'LOCATION'), (u'.', u'O')]


In [22]:
#Quick regex test:
print isNumber('.25')
print isNumber('123.123')
print isNumber('163.2342.234  ')
print isNumber('123,123,123.2  ')
print isNumber('joshi')
print isNumber('$123')
print isNumber('four') 
print isNumber('h123') # Should this be a number?
print isNumber('123m')

True
True
True
True
False
True
True
True
True
True


In [19]:
# For each question, evaluate if the answer is present as an entity

def evaluateNER(questionsList,documentsList, numToEval):
    correct = []
    wrong = []
    for i in range (0, numToEval):
        documents = documentsList[i]
        questions = questionsList[i]
        for j in range (0, len(questions)):
            answer = questionsList[i][j]["answer"]
            answerID = questionsList[i][j]["answer_sentence"]
            possAnswers = refineWordTags(taggedDev[i][answerID])
            inThere = False
            for possAnswer in possAnswers:
                if possAnswer[0] == answer:
                    inThere = True
                    break
            if inThere:
                correct.append((i,j, possAnswers))
            else :
                wrong.append((i,j, answer, possAnswers))
    return (correct, wrong)


In [20]:
(corNER, wrongNER) = evaluateNER(dev, devSents,len(dev))

print("Number Correct : " + str(len(corNER)))
print("Number incorrect: " + str(len(wrongNER)))
print("Average correct : " + str((len(corNER) + 0.0) / (len(corNER)+len(wrongNER))))

Number Correct : 3408
Number incorrect: 5055
Average correct : 0.402694080113


In [21]:
# Show the wrong taggings
for num in range (0, 10):
    (i,j, answer, possAnswers) = wrongNER[ num]
    print "ANSWER : " + answer
    for possAnswer in possAnswers:
        print possAnswer[0]
    print

ANSWER : active near-infrared illumination

Night-vision
devices using active near-infrared illumination allow people
or
animals
to be
observed without
the
observer
being
detected .

ANSWER : Infrared astronomy

Infrared
astronomy uses sensor-equipped telescopes
to
penetrate dusty regions
of
space ,
such as
molecular clouds ; detect objects
such as
planets ,
and to
view highly red-shifted objects
from the
early days
of the
universe .

ANSWER : red

Infrared
astronomy uses sensor-equipped telescopes
to
penetrate dusty regions
of
space ,
such as
molecular clouds ; detect objects
such as
planets ,
and to
view highly red-shifted objects
from the
early days
of the
universe .

ANSWER : Infrared thermal-imaging cameras

Infrared
thermal-imaging cameras
are
used
to
detect heat loss
in
insulated systems ,
to
observe changing blood flow
in the
skin ,
and to
detect overheating
of
electrical apparatus .

ANSWER : scientific

Infrared
radiation
is
used
in
industrial , scientific ,
and
medical appli

# Answer Ranking

After you have extracted a set of entities from your sentence, you will rank them to choose the best answer. The ranking
should be based on three factors. First, answers whose content words all appear in the question should be ranked
lowest. Second, answers which match the question type should be ranked higher than those that don't; for this, you
should build a simple rule-based question type classifier based on key words (e.g. questions which contain "who" are
people). Third, among entities of the same type, the prefered entity should be the one which is closer in the sentence to a
closed-class word from the question.

##### TODO

* TODO

In [78]:
# Part A gives us a most likely sentence
# Part B splits into entities


# Given a question, returns a tag for the answer form
# From PERSON, LOCATION, NUMBER, OTHER 
# Assuming question is lowercased
def getQuestionType(question):
    if 'Who' in question:
        return "PERSON"
    if 'where' in question:
        return "LOCATION"
    if 'How many' in question:
        return "NUMBER"
    if 'How much' in question:
        return "NUMBER"
    if 'When' in question:
        return "NUMBER"
    if 'what year' in question:
        return "NUMBER"
    else:
        return "O"

In [None]:
What did Herschel call the infrared spectrum?
[('', u'O'), (u'He', 'OTHERCAP'), (u'was', 'STOPWORD'), (u'surprised', u'O'), (u'at the', 'STOPWORD'), (u'result', u'O'), (u'and', 'STOPWORD'), (u'called', u'O'), (u'them', 'STOPWORD'), (u'``', u'O'), (u'Calorific Rays', 'OTHERCAP'), (u"'' .", u'O')]
Calorific Rays
was

In [152]:
question = "What did Herschel call the infrared spectrum?"
a = [('', u'O'), (u'He', 'OTHERCAP'), (u'was', 'STOPWORD'), (u'surprised', u'O'), (u'at the', 'STOPWORD'), (u'result', u'O'), (u'and', 'STOPWORD'), (u'called', u'O'), (u'them', 'STOPWORD'), (u'``', u'O'), (u'Calorific Rays', 'OTHERCAP'), (u"'' .", u'O')]

In [164]:
test = first_filter(question,a)
print test

{'ranking_list': [(u'surprised', u'O'), (u'at the', 'STOPWORD'), (u'result', u'O'), (u'called', u'O'), (u'``', u'O'), (u'Calorific Rays', 'OTHERCAP'), (u"'' .", u'O')], 'answer_entities_list': ['', u'He', u'was', u'surprised', u'at the', u'result', u'and', u'called', u'them', u'``', u'Calorific Rays', u"'' ."], 'same_word_list': [u'He']}


In [165]:
test2 = second_filter(question, test)
print test2

{'ranking_list': [u'surprised', u'at the', u'result', u'called', u'``', u'Calorific Rays', u"'' ."], 'Other_tags_list': [], 'answer_entities_list': ['', u'He', u'was', u'surprised', u'at the', u'result', u'and', u'called', u'them', u'``', u'Calorific Rays', u"'' ."], 'same_word_list': [u'He']}


In [72]:
#print cal_distance_words('Melbourne',test2["same_word_list"],test2["answer_entities_list"])
test3 = third_filter(question, test2,test2["answer_entities_list"])
print test3

NameError: name 'third_filter' is not defined

In [75]:
#first:  whose anwsers all appear in the questions rank the lowest
#assumption: input question in its dictionary value

def first_filter(question, anwser_entities):
    ranking_dict_1 = {}
    ranking_list = []
    merge_list = []
    answer_entities_list = []
    for entity in anwser_entities:
        answer_entities_list.append(entity[0])
        if entity[0] in question:
            if entity[0] not in stop_words and entity[0]!='':
                #print entity[0]
                merge_list.append(entity[0])
        else:
            if entity[0] not in stop_words and entity[0]!='':
                ranking_list.append(entity)
    ranking_dict_1["ranking_list"] = ranking_list
    ranking_dict_1["same_word_list"] = merge_list
    ranking_dict_1["answer_entities_list"] = answer_entities_list
    return ranking_dict_1

In [76]:
#second: answers which match the question type should be ranked higher than those that dont

#assumption: save questions' type in the dictionary format quesiton1 = 
#{u'answer': u'long playing',u'answer_sentence': 2, u'question':......., 'question_type:'PERSON'}

def second_filter(question, ranking_dict_1):
    question_with_type ={}
    question_with_type['question_type']= getQuestionType(question)
    question_with_type['question'] = question
    #print question_with_type
    ranking_dict_2 = {}
    ranking_list =[]
    merge_list = []
    for entity in ranking_dict_1["ranking_list"]:
        if question_with_type['question_type'] == 'O':
            ranking_list.append(entity[0])
        else:
            if entity[1] == question_with_type['question_type']:
                ranking_list.append(entity[0])
            else:
                merge_list.append(entity[0])
    ranking_dict_2["same_word_list"] = ranking_dict_1["same_word_list"]
    ranking_dict_2["ranking_list"] = ranking_list
    ranking_dict_2["Other_tags_list"] = merge_list
    ranking_dict_2["answer_entities_list"] = ranking_dict_1["answer_entities_list"]
    return ranking_dict_2
    

In [80]:
#Thrid: based on second, the prefered entity should be the one which is close in 
#the sentence to a closed-class word form the question
from collections import OrderedDict

def cal_distance_words(entity,same_words, anwser_entities):
    temp = 0
    for same_word in same_words:
        temp += abs(anwser_entities.index(entity) - anwser_entities.index(same_word))
    return float(temp)/float(len(same_words))

def sort_orderedDict(orderdict):
    return OrderedDict(sorted(orderdict.items(), key = lambda x:x[1], reverse = False))
        

def third_filter(question,second_filter,anwser_entities):
    dict_ranking ={}
    if (len(second_filter["same_word_list"])==0):
        if len(second_filter['ranking_list']) != 0:
            return second_filter['ranking_list'][0]
        else:
            return 0
    else:
        for entity in second_filter["ranking_list"]:
            dict_ranking[entity]= cal_distance_words(entity, second_filter["same_word_list"],anwser_entities)
        #print dict_ranking
        dict_ranking = sort_orderedDict(dict_ranking)
        #print dict_ranking
        if len(dict_ranking.items()) ==0:
            return 0
        else:
            return dict_ranking.items()[0][0]

In [67]:

question = "where does Jack live?"
possAnswers = [(u'Jack', u'PERSON'),("live",'O'),("in",'O'),("Melbourne",'LOCATION')]

In [47]:
print first_filter(question, a)
print second_filter(question,first_filter(question, a))
print third_filter(question,second_filter(question,first_filter(question, a)),second_filter(question,first_filter(question, a))['answer_entities_list'])

{'ranking_list': [(u'Jack', u'PERSON'), ('live', 'O'), ('in', 'O'), ('Footscray', 'LOCATION'), ('work', 'O'), ('Carlton', 'LOCATION')], 'answer_entities_list': [u'Jack', 'live', 'in', 'Footscray', 'work', 'Carlton'], 'same_word_list': []}
{'ranking_list': [u'Jack', 'Footscray', 'Carlton'], 'Other_tags_list': ['live', 'in', 'work'], 'answer_entities_list': [u'Jack', 'live', 'in', 'Footscray', 'work', 'Carlton'], 'same_word_list': []}
Jack


## Evaluation 
I've added the evaluation here to save exporting the above functions


In [239]:
# For each question, evaluate if the answer is present as an entity

def evaluateNER(questionsList,documentsList, numToEval):
    correct = []
    wrong = []
    for i in range (0, numToEval):
        documents = documentsList[i]
        questions = questionsList[i]
        for j in range (0, len(questions)):
            answer = questionsList[i][j]["answer"]
            answerID = questionsList[i][j]["answer_sentence"]
            possAnswers = refineWordTags(taggedDev[i][answerID])
            inThere = False
            for possAnswer in possAnswers:
                if possAnswer[0] == answer:
                    inThere = True
                    break
            if inThere:
                correct.append((i,j, possAnswers))
            else :
                wrong.append((i,j, answer, possAnswers))
    return (correct, wrong)


In [240]:
# For each question, evaluate if the answer is present as an entity

def evaluateAnswerRanking(questionsList,documentsList, numToEval):
    correct = []
    wrong = []
    (corNER, wrongNER) = evaluateNER(questionsList,documentsList, numToEval)
    entityListsWithAnswer = corNER
    for (i,j,possAnswers) in entityListsWithAnswer:
        question = questionsList[i][j]["question"]
        answer =  questionsList[i][j]["answer"]
        answerPredicited = third_filter(question,second_filter(question,first_filter(question, possAnswers)),second_filter(question,first_filter(question, possAnswers))['answer_entities_list']) 
        #print question
        #print possAnswers
        #print answer
        #print answerPredicited
        #print '%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%'
        # TODO add Dereks part here
        if (answerPredicited == answer):
            correct.append((i,j))
        else :
            wrong.append((i,j,answerPredicited))
        #print correct
    return (correct, wrong)
    

In [241]:
(corAns, wrongAns) = evaluateAnswerRanking(dev, devSents,len(dev))
print("Number Correct : " + str(len(corAns)))
print("Number incorrect: " + str(len(wrongAns)))
print ("Average correct : " + str((len(corAns) + 0.0) / (len(corAns)+len(wrongAns))))

Number Correct : 1000
Number incorrect: 2408
Average correct : 0.293427230047


## submission funtions
run it here.
Start with Joshi's closest sentences in each doc

In [203]:
# Demonstration function

def process_set(questions, documents):
    vector_documents, vectorizer = vectorize_documents(documents)
    analyze = vectorizer.build_analyzer()
    neighbours = process_neighbours(vector_documents)
    
    joshi_list = []
    for question in questions:
        joshi_dict = {}
        text_query = question["question"]
        vector_query = vectorize_query(vectorizer, text_query)
        result_distance, result_index  = closest_document(neighbours, vector_query)
        
        joshi_dict["query"] = text_query
        joshi_dict["answer_sentence"] =documents[result_index].encode("utf-8")
        joshi_dict["index"]=result_index
        joshi_list.append(joshi_dict)
    return joshi_list

In [204]:
joshi_list =[]
for index in range(len(testSents)):
    documents = testSents[index]
    questions = test[index]
    joshi_list.append(process_set(questions, documents))

append Alex's tagged part

In [207]:
## alex function
print joshi_list[0][0]

{'query': u'What year did the Crimean War begin?', 'answer_sentence': 'The Russians did nothing and he evacuated to Batum in February of the following year.', 'index': 281}


In [218]:
for doc_id in range(len(joshi_list)):
    for each_one_index in range(len(joshi_list[doc_id])):
        index = joshi_list[doc_id][each_one_index]['index']
        joshi_list[doc_id][each_one_index]["possAnswer"] = taggedTest[doc_id][index]

In [223]:
for doc in joshi_list:
    for ele in doc:
        ele['possAnswer'] = refineWordTags(ele['possAnswer'])

use Derek's answer ranking and output the submission file

In [227]:
refine_list =[]
for doc in joshi_list:
    for ele in doc:
        refine_list.append(ele)

In [230]:
answer_list = []
for ele in refine_list:
    answer_dict = {}
    question = ele['query']
    possAnswers = ele['possAnswer']
    answer = third_filter(question,second_filter(question,first_filter(question, possAnswers)),second_filter(question,first_filter(question, possAnswers))['answer_entities_list'])
    answer_dict['sentence'] = possAnswers
    answer_dict['question'] = question
    answer_dict['answer'] = answer
    answer_list.append(answer_dict)

In [233]:
import csv
headers = ['id', 'answer']

with open('submit.csv','w') as f:
    f_csv = csv.DictWriter(f, headers)
    f_csv.writeheader()
    for index in range(len(answer_list)):
        if isinstance( answer_list[index]['answer'], int):
            f_csv.writerows([{'id':index+1,'answer':answer_list[index]['answer']}])
        else:
            f_csv.writerows([{'id':index+1,'answer':answer_list[index]['answer'].encode("utf-8")}])