In [1]:
# Load in the python script containing the same code as the load the data notebook
%run loadData.py
# now we can access train, dev, and test
# along with trainSents, devSents testSents

In [2]:
# Shim names for later clean

train_question_set = train
train_document_set = trainSents

dev_question_set = dev
dev_document_set = devSents

test_question_set = test
test_document_set = testSents

In [3]:
rapid_size = 1

rapid_question_set = train_question_set[:rapid_size]
rapid_document_set = train_document_set[:rapid_size]

In [4]:
# Shim for easier name spacing

DATA = {
    "rapid" : {
            "question_set": rapid_question_set,
            "document_set": rapid_document_set,
    },
    "train" : {
            "question_set": train_question_set,
            "document_set": train_document_set,
    },
    "dev" : {
            "question_set": dev_question_set,
            "document_set": dev_document_set,
    },
    "test" : {
            "question_set": test_question_set,
            "document_set": test_document_set,
    }    
}

In [5]:
# Imports

import pprint
pp = pprint.PrettyPrinter(indent=4)

import nltk
from nltk.corpus import stopwords

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

from string import punctuation  

import re
import pickle
import os

import csv

from collections import defaultdict


In [6]:
# Core functions

classifier = './stanford/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = './stanford/stanford-ner.jar'

sTagger = StanfordNERTagger(classifier,jar)

punct_tokens = set(punctuation)
extra_tokens = set(["what", "where", "how", "when", "who"])

stop_words = set(stopwords.words('english'))

filter_tokens = extra_tokens.union(punct_tokens).union(stop_words)

In [7]:
# Shim function for later clean

def getStanfordTagging(datasetName):
    fnameTrain = './preCompTags/stanfordTaggedTrain.txt'
    fnameDev = './preCompTags/stanfordTaggedDev.txt'
    fnameTest = './preCompTags/stanfordTaggedTest.txt'
    
    theFilePath = ''
    theSents = []
    if (datasetName == 'train'):
        theFilePath = fnameTrain
        theSents = trainSents
    elif (datasetName == 'dev'):
        theFilePath = fnameDev
        theSents = devSents
    elif (datasetName == 'test'):
        theFilePath = fnameTest
        theSents = testSents
    else :
        raise ValueError('Incorrect datasetName: ' + datasetName + ', choose from - "train", "dev", "test" ') 
    if (os.path.exists(theFilePath)):
        with open(theFilePath, "rb") as fp:
            stanfordTags = pickle.load(fp)
            return stanfordTags
    
    else :
        #Need to create taggings!
        taggedSentsList = []
        for sents in theSents:
            tokenisedSents = [word_tokenize(sent) for sent in sents]
            classifiedSents = sTagger.tag_sents(tokenisedSents)
            taggedSentsList.append(classifiedSents)
        #And save them
        with open(theFilePath, "wb") as fp: 
            pickle.dump(taggedSentsList, fp)
        return taggedSentsList

In [8]:
tagged_train_set = getStanfordTagging('train')
tagged_dev_set = getStanfordTagging('dev')
tagged_test_set = getStanfordTagging('test')

In [9]:
tagged_rapid_set = tagged_train_set[:rapid_size]

In [10]:
# Shim for easier name spacing

DATA["rapid"]["tagged_set"] = tagged_rapid_set
DATA["train"]["tagged_set"] = tagged_train_set
DATA["dev"]["tagged_set"] = tagged_dev_set
DATA["test"]["tagged_set"] = tagged_test_set

In [11]:
# Preprocessing tuning functions

# Follow lemmatize function from guide notebook: WSTA_N1B_preprocessing.ipynb
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
word_tokenizer = nltk.tokenize.WordPunctTokenizer() #word_tokenize #tokenize.regexp.WordPunctTokenizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def pre_process_tf_idf(line):
    tokenized_sentence = word_tokenizer.tokenize(line.lower())
    lemmatized_sentence = [lemmatize(token) for token in tokenized_sentence]
    filtered_sentence = [token for token in lemmatized_sentence if token not in filter_tokens]
    return filtered_sentence

In [12]:
# Core functions

def vectorize_documents(text_documents):

    vectorizer = TfidfVectorizer(stop_words='english', tokenizer=pre_process_tf_idf)
    vector_documents = vectorizer.fit_transform(text_documents)
    
    return [vector_documents, vectorizer]

def vectorize_query(vectorizer, text_query):
    return vectorizer.transform([text_query])

def process_neighbours(vector_documents):
    
    neighbours = NearestNeighbors(1, algorithm="brute", metric="cosine")
    neighbours.fit(vector_documents)
    
    return neighbours

def closest_document(neighbours, vector_query):

    result = neighbours.kneighbors(vector_query, 1, return_distance=True)

    result_index = result[1][0][0]
    result_distance = result[0][0][0]
    
    return [result_distance, result_index]

In [13]:
def generate_part_a_output(name, data):
    
    question_set = data[name]["question_set"]
    document_set = data[name]["document_set"]
    
    part_a_output = []
    
    for i, questions in enumerate(question_set):
        
        sentences = document_set[i]

        vector_sentences, vectorizer = vectorize_documents(sentences)
        neighbours = process_neighbours(vector_sentences)

        for j, question in enumerate(questions):
            
            text_query = question["question"]
            vector_query = vectorize_query(vectorizer, text_query)
            result_similarity, result_index  = closest_document(neighbours, vector_query)
            
            result = {
                "set_index" : i,
                "question_index" : j,
                "sentence_index" : result_index
            }
            
            part_a_output.append(result)
            
    return part_a_output

In [14]:
def process_part_a(name, data):
    
    data[name]["a_output_answer_set"] = generate_part_a_output(name, data)
    print
    print "Part A Output: "
    pp.pprint(data[name]["a_output_answer_set"][:rapid_size])
    print

In [16]:
# Shim function for later clean

def evaluate_retrieval(name, data):
    
    question_set = data[name]["question_set"]
    a_output_answer_set = data[name]["a_output_answer_set"]
    
    correct = []
    wrong = []
    
    for result_a in a_output_answer_set:
        
        question = question_set[result_a["set_index"]][result_a["question_index"]]
        
        answer_sentence = question["answer_sentence"]
        predicted_answer_sentence = result_a["sentence_index"]
        
        if answer_sentence == predicted_answer_sentence:
            correct.append(result_a)
        else :
            wrong.append(result_a)
            
    return (correct, wrong)

In [17]:
def process_generic(name, data, process_type, process_func):

    (correct, wrong) = process_func(name, data)
    
    data[name][process_type + "_correct"] = correct
    data[name][process_type + "_wrong"] = wrong
#     data[name][process_type + "_full"] = full
    
    total = len(correct) + len(wrong)
    avg = len(correct) * 1.0 / total
    
    print process_type.capitalize() + " Correct: ", len(correct)
    print process_type.capitalize() + " Wrong: ", len(wrong)
    print process_type.capitalize() + " Total: ", total
    print process_type.capitalize() + " Overall Average %: ", avg

In [18]:
def process_retrieval(name, data, stats=False):
    print "Processing retrieval: ", name
    process_part_a(name, data)
    if stats:
        process_generic(name, data, "retrieval", evaluate_retrieval)
        
    print

In [19]:
process_retrieval("rapid", DATA, True)

Processing retrieval:  rapid

Part A Output: 
[{   'question_index': 0, 'sentence_index': 149, 'set_index': 0}]

Retrieval Correct:  156
Retrieval Wrong:  248
Retrieval Total:  404
Retrieval Overall Average %:  0.386138613861



In [20]:
process_retrieval("train", DATA, True)

Processing retrieval:  train

Part A Output: 
[{   'question_index': 0, 'sentence_index': 149, 'set_index': 0}]

Retrieval Correct:  43679
Retrieval Wrong:  26480
Retrieval Total:  70159
Retrieval Overall Average %:  0.622571587394



In [21]:
process_retrieval("dev", DATA, True)

Processing retrieval:  dev

Part A Output: 
[{   'question_index': 0, 'sentence_index': 71, 'set_index': 0}]

Retrieval Correct:  5060
Retrieval Wrong:  3403
Retrieval Total:  8463
Retrieval Overall Average %:  0.597896726929



In [22]:
process_retrieval("test", DATA, False)

Processing retrieval:  test

Part A Output: 
[{   'question_index': 0, 'sentence_index': 283, 'set_index': 0}]




In [23]:
# Shim function for later clean

# Thanks for this list to save me typing it : http://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers\n",
numInWords = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"
       , "hundred", "thousand", "million", "billion", "trillion"]

punctuation = ['.',',',';',':']

def isPunctuation(word):
    return word in punctuation
def isCapitalised (word):
    if len(word) == 0:
        return False
    return word[0].isupper()

# Obtained from training data
postUnits = [u'%', u'century', u'years', u'percent', u'years ago', u'days', u'months', u'km', u'hours', u'times', u'inches', u'\xb0C', u'minutes', u'acres', u'\xb0F', u'weeks', u'people', u'sq mi', u'mi', u'ft', u'feet', u'metres', u'mm', u'square miles', u'miles', u'pm', u'per cent', u'year', u'copies', u'yuan', u'men', u'square feet', u'third', u'kilometres', u'nm', u'tonnes', u'species', u'decades', u'barrels', u'tons', u'largest', u'centuries', u'km2']
preUnits = [u'$', u'around', u'late', u'early', u'nearly', u'since', u'approximately', u'number']

# Returns true if the word represents a number\n",
def isNumber(word):
    pattern = ".?(\\d)+((,|.)(\\d)+)*"
    if re.match(pattern,word) :
        return True
    if word.lower() in numInWords:
        return True
    return False

def isStopWord(word):
    return word.lower() in stop_words

In [24]:
# Shim function for later clean

def refine_word_tags(taggedWordList):
    newWordTags = []
    for (word, tag) in taggedWordList:
        if (tag == 'ORGANIZATION'):
            tag = 'O'
        if (tag == 'O'):
            #Might be a number
            if isNumber(word):
                tag = 'NUMBER'
            elif isCapitalised(word):
                tag = 'OTHERCAP'
            elif word in preUnits:
                tag = 'PRENUM'
            elif word in postUnits:
                tag = 'POSTNUM'
            elif isStopWord(word):
                tag = 'STOPWORD'
            elif isPunctuation(word):
                tag = 'PUNC'

        newWordTags.append((word, tag))
    
    newWordTags = combineTags (newWordTags)
    return newWordTags
        
def combineTags(wordTags):
    
    newTags = []
    prevWord = wordTags[0][0]
    prevTag = wordTags[0][1]
    
    for (word, tag) in wordTags[1:]:
        if tag == 'NUMBER' and prevTag == 'PRENUM':
            prevTag = 'NUMBER'
        elif prevTag == 'PRENUM':
            prevTag = 'O'
        if tag == 'POSTNUM' and prevTag == "NUMBER":
            tag = "NUMBER"
        elif tag == "POSTNUM":
            tag = "O"
        newTags.append((prevWord, prevTag))
        prevWord = word
        prevTag = tag
    newTags.append((prevWord, prevTag))
    
#     print newTags
    
    newNewTags = []
    prevWord = newTags[0][0]
    prevTag = newTags[0][1]
    if (prevTag == "OTHERCAP"):
        prevTag = "O"
        
    for (word, tag) in newTags[1:]:
#         print tag, prevTag
        if tag == prevTag :
            prevWord += ' ' + word
        else :
            newNewTags.append((prevWord, prevTag))
            prevWord = word
            prevTag = tag
            
    newNewTags.append((prevWord, prevTag))
    
    return newNewTags

In [25]:
# tags1 = [   (u'Because', 'O'),
#     (u'of', 'STOPWORD'),
#     (u'financial hardships', u'O'),
#     (u'that', 'STOPWORD'),
#     (u'plagued', u'O'),
#     (u'the', 'STOPWORD'),
#     (u'recording industry', u'O'),
#     (u'during that', 'STOPWORD'),
#     (u'period (', u'O'),
#     (u'and', 'STOPWORD'),
#     (u'RCA', 'OTHERCAP'),
#     (u"'s", u'O'),
#     (u'own', 'STOPWORD'),
#     (u'parched revenues )', u'O'),
#     (u',', 'PUNC'),
#     (u'Victor', u'PERSON'),
#     (u"'s long-playing records", u'O'),
#     (u'were', 'STOPWORD'),
#     (u'discontinued', u'O'),
#     (u'by', 'STOPWORD'),
#     (u'early', 'PRENUM'),
#     (u'1933', 'NUMBER'),
#     (u'.', 'PUNC')]

# tags2 = [   (u'At', 'O'),
#     (u'the', 'STOPWORD'),
#     (u'beginning', u'O'),
#     (u'of the', 'STOPWORD'),
#     (u'20th', 'NUMBER'),
#     (u'century', 'POSTNUM'),
#     (u',', 'PUNC'),
#     (u'the', 'STOPWORD'),
#     (u'early', 'PRENUM'),
#     (u'discs played', u'O'),
#     (u'for', 'STOPWORD'),
#     (u'two', 'NUMBER'),
#     (u'minutes', 'POSTNUM'),
#     (u',', 'PUNC'),
#     (u'the same as', 'STOPWORD'),
#     (u'early', 'PRENUM'),
#     (u'cylinder records', u'O'),
#     (u'.', 'PUNC')]

# tags3 = [       (u'early', 'PRENUM'),
#     (u'1933', 'NUMBER'), (u'for', 'STOPWORD'),
#     (u'two', 'NUMBER'),
#     (u'minutes', 'POSTNUM'), ]

# pp.pprint(combineTags(tags3))

In [26]:
def generate_part_b_output(name, data):
    
    question_set = data[name]["question_set"]
    a_output_answer_set = data[name]["a_output_answer_set"]
    tagged_set = data[name]["tagged_set"]
    
    part_b_output = []
    
    for result_a in a_output_answer_set:
        
        stanford_tags = tagged_set[result_a["set_index"]][result_a["sentence_index"]]
        
        filtered_tags = refine_word_tags(stanford_tags)
        
        question = question_set[result_a["set_index"]][result_a["question_index"]]["question"]
        
        result_b = {
            "set_index"  : result_a["set_index"],
            "question_index" : result_a["question_index"],
            "sentence_index" : result_a["sentence_index"],
            "candidates" : filtered_tags
        }
        
        part_b_output.append(result_b)
        
    return part_b_output

In [27]:
def process_part_b(name, data):
    
    data[name]["b_output_answer_set"] = generate_part_b_output(name, data)
    
    print
    print "Part B Output: "
    pp.pprint(data[name]["b_output_answer_set"][:rapid_size])
    print    

## Eval setup stuff

In [36]:
from collections import defaultdict
def getAnswerDict(qss):
    sentDicts = defaultdict(list)
    for docID in range(0, len(qss)):
        qs = qss[docID]
        for q in qs:
            answer = q["answer"]
            answerSent = (docID, q["answer_sentence"])
            sentDicts[answerSent].append(answer)
    return sentDicts
trainSentDicts = getAnswerDict(train)
devSentDicts = getAnswerDict(dev)

In [54]:
#This function returns a list of the indexes of words that are an answer to a question in the train set
def getAnswerPos(docID, sentID, sents, answerDict):
    sententence = sents[docID][sentID]
    listAnswers = answerDict[(docID, sentID)]
    tokenisedSent =  nltk.word_tokenize(sententence)
    answerPosList = []
    for ans in listAnswers:
        tokenisedAns = nltk.word_tokenize(ans)
        # Inefficient
        
        for i in range (0, len(tokenisedSent) - len(tokenisedAns)):
            answerFragment = tokenisedSent[i:i+len(tokenisedAns)]
            if (answerFragment == tokenisedAns):
                answerPosList.append((i,i+len(tokenisedAns)))

                break
    return answerPosList

getAnswerPos(0,0,trainSents, trainSentDicts)

[(24, 28)]

In [44]:
getAnswerPos(0,9,trainSents, trainSentDicts)

[[30, 31], [5, 6], [8]]

In [29]:
partBoutput = generate_part_b_output("train", DATA)

In [69]:
partBoutput[0]

{'candidates': [(u'They', 'O'),
  (u'had a', 'STOPWORD'),
  (u'playing time', u'O'),
  (u'of', 'STOPWORD'),
  (u'eight minutes', 'NUMBER'),
  (u'.', 'PUNC')],
 'question_index': 0,
 'sentence_index': 149,
 'set_index': 0}

In [94]:
print train[0][0]

{u'answer': u'long playing', u'question': u'What does LP stand for when it comes to time capacity?', u'answer_sentence': 2}


In [138]:



def convertCandidatesToRange(candidates):
    candidatesIndexRange = []
    index = 0
    for (words, tag) in candidates:
        tokenisedWords = nltk.word_tokenize(words)
        nextIndex = index + len(tokenisedWords)
        candidatesIndexRange.append((index,nextIndex ))
        index = nextIndex
    return candidatesIndexRange

def nerAdvancedEvaluation(partBoutput,train,trainSents,trainSentDicts):
    correctAnswers = []
    incorrectAnswers = []
    for partBOut in partBoutput[7:8]:
        candidates = partBOut["candidates"]
        set_index = partBOut['set_index']
        sentence_index = partBOut['sentence_index']
        question_index = partBOut['question_index']
        answer_positions = getAnswerPos(set_index,sentence_index,trainSents, trainSentDicts)
        question= train[set_index][question_index]["question"]
        correct_answer = train[set_index][question_index]["answer"]

        if train[set_index][question_index]["answer_sentence"] != sentence_index:
            continue


        candidatesIndexRange = convertCandidatesToRange(candidates)
        print answer_positions
        print candidatesIndexRange
        print

        for (ans_start, ans_end) in answer_positions:
            incorrectCandidates = []
            candidate_ID = -1
            for (cand_start, cand_end) in candidatesIndexRange:
                candidate_ID += 1
                if cand_start == ans_start and cand_end ==ans_end : #If the candidate is the answer
                    correctAnswers.append([candidates[candidate_ID]])
                    break
                if cand_start >= ans_start and cand_end >= ans_end :
                    incorrectAnswers.append([candidates[candidate_ID], correct_answer])
                    break
                if cand_start >= ans_start:
                    if (cand_end <= ans_end):
                        incorrectCandidates.append(candidates[candidate_ID])
                    else :
                        incorrectAnswers.append((incorrectCandidates, correct_answer))
                        break
        
    return (correctAnswers,incorrectAnswers)

In [139]:
(correctAnswers,incorrectAnswers) = nerAdvancedEvaluation(partBoutput,train,trainSents,trainSentDicts)

[(25, 31), (47, 48)]
[(0, 1), (1, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 9), (9, 11), (11, 12), (12, 13), (13, 14), (14, 16), (16, 19), (19, 20), (20, 22), (22, 24), (24, 25), (25, 28), (28, 29), (29, 30), (30, 31), (31, 32), (32, 33), (33, 34), (34, 35), (35, 37), (37, 38), (38, 39), (39, 40), (40, 41), (41, 44), (44, 46), (46, 47), (47, 48), (48, 49)]



In [142]:
print incorrectAnswers[0]

[(u's', 'STOPWORD'), u'disc jockeys (DJ)s']


In [252]:
# Shim function for later clean

def evaluate_ner(name, data):
    
    question_set = data[name]["question_set"]
    b_output_answer_set = data[name]["b_output_answer_set"]
    
    correct = []
    wrong = []
    
    for result_b in b_output_answer_set:
        
        answer = question_set[result_b["set_index"]][result_b["question_index"]]["answer"]
        
        possible_candidates = result_b["candidates"]
        
        answer_exists_in_candidates = False
        
        for candidate in possible_candidates:
            
            candidate_string = candidate[0]
            
            if candidate_string == answer:
                
                answer_exists_in_candidates = True
                
                break
        
        if answer_exists_in_candidates:
            correct.append(result_b)
        else :
            wrong.append(result_b)
            
    return (correct, wrong)

In [253]:
def process_ner(name, data, stats=False):
    print "Processing ner: ", name
    process_part_b(name, data)
    if stats:
        process_generic(name, data, "ner", evaluate_ner)
        
        correct_ner = len(data[name]["ner_correct"])
        correct_ret = len(data[name]["retrieval_correct"])
        
        avg = correct_ner * 1.0 / correct_ret
        
        print "ner".capitalize() + " Correct Average of Previous %: ", avg
    print

In [254]:
process_ner("rapid", DATA, True)

Processing ner:  rapid

Part B Output: 
[   {   'candidates': [   (u'They', 'O'),
                          (u'had a', 'STOPWORD'),
                          (u'playing time', u'O'),
                          (u'of', 'STOPWORD'),
                          (u'eight minutes', 'NUMBER'),
                          (u'.', 'PUNC')],
        'question_index': 0,
        'sentence_index': 149,
        'set_index': 0}]

Ner Correct:  67
Ner Wrong:  337
Ner Total:  404
Ner Overall Average %:  0.165841584158
Ner Correct Average of Previous %:  0.429487179487



In [271]:
process_ner("train", DATA, True)

Processing ner:  train

Part B Output: 
[   {   'candidates': [   (u'They', 'O'),
                          (u'had a', 'STOPWORD'),
                          (u'playing time', u'O'),
                          (u'of', 'STOPWORD'),
                          (u'eight minutes', 'NUMBER'),
                          (u'.', 'PUNC')],
        'question_index': 0,
        'sentence_index': 149,
        'set_index': 0}]

Ner Correct:  20067
Ner Wrong:  50092
Ner Total:  70159
Ner Overall Average %:  0.286021750595
Ner Correct Average of Previous %:  0.459419858513



In [255]:
process_ner("dev", DATA, True)

Processing ner:  dev

Part B Output: 
[   {   'candidates': [   (u'Infrared', 'O'),
                          (u'is', 'STOPWORD'),
                          (u'used', u'O'),
                          (u'in', 'STOPWORD'),
                          (u'night vision equipment', u'O'),
                          (u'when there is', 'STOPWORD'),
                          (u'insufficient visible light', u'O'),
                          (u'to', 'STOPWORD'),
                          (u'see', u'O'),
                          (u'.', 'PUNC')],
        'question_index': 0,
        'sentence_index': 71,
        'set_index': 0}]

Ner Correct:  2415
Ner Wrong:  6048
Ner Total:  8463
Ner Overall Average %:  0.285359801489
Ner Correct Average of Previous %:  0.477272727273



In [276]:
process_ner("test", DATA, False)

Processing ner:  test

Part B Output: 
[   {   'candidates': [   (u'a', 'STOPWORD'),
                          (u'forgotten theatre', u'O'),
                          (u'of the', 'STOPWORD'),
                          (u'Crimean War', 'OTHERCAP'),
                          (u'.', 'PUNC')],
        'question_index': 0,
        'sentence_index': 283,
        'set_index': 0}]




In [257]:
# Shim function for later clean

def getQuestionType(question):
    if 'Who' in question:
        return "PERSON"
    if 'where' in question:
        return "LOCATION"
    if 'How many' in question:
        return "NUMBER"
    if 'How much' in question:
        return "NUMBER"
    if 'When' in question:
        return "NUMBER"
    if 'what year' in question:
        return "NUMBER"
    else:
        return "O"

In [258]:
# First, answers whose content words all appear in the question should be ranked lowest.

def first_filter(question, answer_entities):
   
    ranked_list = []
    
    question = set(pre_process_tf_idf(question))
    
#     print question
#     print
    
    for entity in answer_entities:

        raw_span = entity[0]
        span_tag = entity[1]
        
        set_span = set(pre_process_tf_idf(raw_span))
        
        if span_tag != "O" and span_tag != "STOPWORD" and span_tag !="PUNC":
            
            if set_span.issubset(question):
                
                ranked_list.append([entity, 1])
#                 print "IN", raw_span, span_tag, set_span, question
                
            else:
                
                ranked_list.append([entity, 2])
#                 print "OUT", raw_span, span_tag, set_span, question
    
    return sorted(ranked_list, key=lambda x: x[1], reverse=True)

In [259]:
# First, answers whose content words all appear in the question should be ranked lowest.

def first_filter_object(question, answer_entities):
   
    ranked_list = []
    
    question = set(pre_process_tf_idf(question))
    
#     print question
#     print
    
    for entity in answer_entities:

        raw_span = entity[0]
        span_tag = entity[1]
        
        set_span = set(pre_process_tf_idf(raw_span))
        
        if span_tag != "STOPWORD" and span_tag !="PUNC": #span_tag != "O" and
            
            if span_tag == "O":
                
                if len(set_span) > 2:
                    ranked_list.append([entity, 0])
            
            elif set_span.issubset(question):
                
                ranked_list.append([entity, 1])
#                 print "IN", raw_span, span_tag, set_span, question
                
            else:
                
                ranked_list.append([entity, 2])
#                 print "OUT", raw_span, span_tag, set_span, question
    
    return sorted(ranked_list, key=lambda x: x[1], reverse=True)

In [283]:
# First, answers whose content words all appear in the question should be ranked lowest.

def first_filter_object_stop(question, answer_entities):
   
    ranked_list = []
    
    question = set(pre_process_tf_idf(question))
    
#     print question
#     print
    
    for entity in answer_entities:

        raw_span = entity[0]
        span_tag = entity[1]
        
        set_span = set(pre_process_tf_idf(raw_span))
        
        if span_tag !="PUNC": #span_tag != "O" and
            
            if span_tag == "O" or span_tag == "STOPWORD":
                
                if len(set_span) > 2:
                    
                    ranked_list.append([entity, 0])
            
            elif set_span.issubset(question):
                
                ranked_list.append([entity, 1])
#                 print "IN", raw_span, span_tag, set_span, question
                
            else:
                
                ranked_list.append([entity, 2])
#                 print "OUT", raw_span, span_tag, set_span, question
    
    return sorted(ranked_list, key=lambda x: x[1], reverse=True)

In [284]:
# Second, answers which match the question type should be ranked higher than those that don't; for this, you
# should build a simple rule-based question type classifier based on key words (e.g. questions which contain "who" are
# people).

# First, answers whose content words all appear in the question should be ranked lowest.

def second_filter(question, ranked_list):
   
    question_type = getQuestionType(question)
#     print question_type
    
    for index, answer in enumerate(ranked_list):
        
        entity_tag = answer[0][1]
        
        if entity_tag == question_type:
#             print "MATCH", answer[0], question_type, question
            ranked_list[index].append(2)
#             ranked_list[index][1] += 1
        else:
            ranked_list[index].append(1)
#             ranked_list[index][1] -= 1
            
    return ranked_list

In [285]:
def pre_process_open_class(line):
    tokenized_sentence = word_tokenizer.tokenize(line.lower())
    lemmatized_sentence = [lemmatize(token) for token in tokenized_sentence]
    filtered_sentence = [token for token in lemmatized_sentence if token not in filter_tokens]
    tagged_sent = nltk.pos_tag(lemmatized_sentence)
    final = []
    for word, tag in tagged_sent:
        if "V" in tag or "NN" in tag:
#             final.append((word,tag))
            final.append(word)
            
#     print "RESULT: ", final
    return final

In [286]:
# Third, among entities of the same type, the prefered entity should be the one which is closer in the sentence to a
# closed-class word from the question.

def third_filter(question, possAnswers, ranked_list):
    
    question = pre_process_open_class(question)

    answer_sent = " ".join([x[0] for x in possAnswers])
    answer_sent = pre_process_tf_idf(answer_sent)
    raw_answer_sent = " ".join(answer_sent)
    
#     print "QUESTION: "
#     pp.pprint(question)
#     print "ANSWER: "
#     pp.pprint(answer_sent)
#     pp.pprint(raw_answer_sent)
    
    for index, answer in enumerate(ranked_list):

        span_tag = answer[0][1]
        raw_span = answer[0][0]

        proc_span = pre_process_tf_idf(raw_span)

        raw_proc_span = " ".join(proc_span)
        new_raw_proc_span = "-".join(proc_span)

        raw_answer_sent = raw_answer_sent.replace(raw_proc_span, new_raw_proc_span)
    
    answer_sent = raw_answer_sent.split(" ")
    
    avg_dict = defaultdict(float)
    
    for open_class in question:
        
        if open_class in answer_sent:
            
            open_class_locations = [i for i, x in enumerate(answer_sent) if x == open_class]
            
#             print "OPEN CLASS: ", repr(open_class)

            for index, answer in enumerate(ranked_list):

                span_tag = answer[0][1]
                raw_span = answer[0][0]

                proc_span = pre_process_tf_idf(raw_span)
                
                raw_proc_span = " ".join(proc_span)
                new_raw_proc_span = "-".join(proc_span)
                
                proc_span_locations = [i for i, x in enumerate(answer_sent) if x == new_raw_proc_span]
                
                min_dist = len(answer_sent)
                min_dist_ind = (None, None)
                
                for loc1 in proc_span_locations:
                    
                    for loc2 in open_class_locations:
                        
                        dist = abs(loc1 - loc2)
                        
                        if dist < min_dist:
                            
                            min_dist = dist
                            min_dist_ind = (loc1, loc2)
                
#                 print "PROC: ", proc_span_locations
#                 print "OPEN CLASS: ", open_class_locations                
                scale = (len(answer_sent) - min_dist) * 1.0 / len(answer_sent)
#                 print "JOINT: ", min_dist_ind, scale
                avg_dict[index] += scale
#                 ranked_list[index][1] *= scale
    
    for key, value in avg_dict.iteritems():
        ranked_list[key].append(value / len(question))

    return ranked_list

In [287]:
def reduce_rank(ranking_list):
    
    new_ranking = []
    
    for rank in ranking_list:
        
        new_rank = ( rank[1] + rank[2] )
        
        if len(rank) == 4:
             new_rank *= rank[3]
        
        new_ranking.append([rank[0], new_rank])
        
    return sorted(new_ranking, key=lambda x: x[1], reverse=True)

In [288]:
import random


In [298]:
def generate_part_c_output(name, data):
    
    part_c_output = []

    question_set = data[name]["question_set"]
    document_set = data[name]["document_set"]
        
    b_output_answer_set = data[name]["b_output_answer_set"]
    
    for result_b in b_output_answer_set:
        
        question = question_set[result_b["set_index"]][result_b["question_index"]]["question"]
        
        first_pass = first_filter(question, result_b["candidates"])
        
        second_pass = second_filter(question, first_pass)
        
        third_pass = third_filter(question, result_b["candidates"], second_pass)
        
        fourth_pass = reduce_rank(third_pass)
                
#         pp.pprint(third_pass)

        predicted_answer = None

        if len(fourth_pass) > 0:

            top_answer = fourth_pass[0]
    #         pp.pprint(top_answer)        
            predicted_answer = top_answer[0][0]
        
        else:
            
            first_pass = first_filter_object(question, result_b["candidates"])

            second_pass = second_filter(question, first_pass)

            third_pass = third_filter(question, result_b["candidates"], second_pass)

            fourth_pass = reduce_rank(third_pass)   
            
            if len(fourth_pass) > 0:

                top_answer = fourth_pass[0]
        #         pp.pprint(top_answer)        
                predicted_answer = top_answer[0][0]
            
            else:
                
                first_pass = first_filter_object_stop(question, result_b["candidates"])

                second_pass = second_filter(question, first_pass)

                third_pass = third_filter(question, result_b["candidates"], second_pass)

                fourth_pass = reduce_rank(third_pass)   

                if len(fourth_pass) > 0:

                    top_answer = fourth_pass[0]
            #         pp.pprint(top_answer)        
                    predicted_answer = top_answer[0][0]                
                
                else:

        #         pp.pprint(top_answer)        
                    predicted_answer = random.choice(result_b["candidates"])[0]
            
        
        result_c = {
            "set_index"  : result_b["set_index"],
            "question_index" : result_b["question_index"],
            "sentence_index" : result_b["sentence_index"],
            "candidates": result_b["candidates"],
            "ranked_answers": fourth_pass,
            "predicted_answer" : predicted_answer
        }
        
        part_c_output.append(result_c)        

    return part_c_output

In [299]:
def process_part_c(name, data):
    
    data[name]["c_output_answer_set"] = generate_part_c_output(name, data)
    
    print
    print "Part C Output: "
    pp.pprint(data[name]["c_output_answer_set"][:rapid_size])
    print    

In [300]:
# For each question, evaluate if the answer is present as an entity

def evaluate_rank(name, data):
    
    question_set = data[name]["question_set"]
    document_set = data[name]["document_set"]
    
    correct = []
    wrong = []
    
    c_output_answer_set = data[name]["c_output_answer_set"]
    
    for result_c in c_output_answer_set:
        
        question = question_set[result_c["set_index"]][result_c["question_index"]]["question"]
        answer =  question_set[result_c["set_index"]][result_c["question_index"]]["answer"]
        
        predicted_answer = result_c["predicted_answer"]

        if (predicted_answer == answer):
            correct.append(result_c)
        else :
            wrong.append(result_c)
#         break
        #print correct
    return (correct, wrong)
    

In [301]:
def process_rank(name, data, stats=False):
    print "Processing rank: ", name
    process_part_c(name, data)
    if stats:
        process_generic(name, data, "rank", evaluate_rank)
        
        
        correct_rank = len(data[name]["rank_correct"])
        correct_ner = len(data[name]["ner_correct"])
        
        avg = correct_rank * 1.0 / correct_ner
        
        print "rank".capitalize() + " Correct Average of Previous %: ", avg        
        
    print

In [302]:
process_rank("rapid", DATA, True)

Processing rank:  rapid

Part C Output: 
[   {   'candidates': [   (u'They', 'O'),
                          (u'had a', 'STOPWORD'),
                          (u'playing time', u'O'),
                          (u'of', 'STOPWORD'),
                          (u'eight minutes', 'NUMBER'),
                          (u'.', 'PUNC')],
        'predicted_answer': u'eight minutes',
        'question_index': 0,
        'ranked_answers': [[(u'eight minutes', 'NUMBER'), 0.4]],
        'sentence_index': 149,
        'set_index': 0}]

Rank Correct:  36
Rank Wrong:  368
Rank Total:  404
Rank Overall Average %:  0.0891089108911
Rank Correct Average of Previous %:  0.537313432836



In [272]:
process_rank("train", DATA, True)

Processing rank:  train

Part C Output: 
[   {   'candidates': [   (u'They', 'O'),
                          (u'had a', 'STOPWORD'),
                          (u'playing time', u'O'),
                          (u'of', 'STOPWORD'),
                          (u'eight minutes', 'NUMBER'),
                          (u'.', 'PUNC')],
        'predicted_answer': u'eight minutes',
        'question_index': 0,
        'ranked_answers': [[(u'eight minutes', 'NUMBER'), 0.4]],
        'sentence_index': 149,
        'set_index': 0}]

Rank Correct:  9895
Rank Wrong:  60264
Rank Total:  70159
Rank Overall Average %:  0.141036787868
Rank Correct Average of Previous %:  0.493098121294



In [303]:
process_rank("dev", DATA, True)

Processing rank:  dev

Part C Output: 
[   {   'candidates': [   (u'Infrared', 'O'),
                          (u'is', 'STOPWORD'),
                          (u'used', u'O'),
                          (u'in', 'STOPWORD'),
                          (u'night vision equipment', u'O'),
                          (u'when there is', 'STOPWORD'),
                          (u'insufficient visible light', u'O'),
                          (u'to', 'STOPWORD'),
                          (u'see', u'O'),
                          (u'.', 'PUNC')],
        'predicted_answer': u'night vision equipment',
        'question_index': 0,
        'ranked_answers': [   [   (u'night vision equipment', u'O'),
                                  0.26666666666666666],
                              [   (u'insufficient visible light', u'O'),
                                  0.19999999999999998]],
        'sentence_index': 71,
        'set_index': 0}]

Rank Correct:  1135
Rank Wrong:  7328
Rank Total:  8463
Rank Overal

In [15]:
def log_wrong_debug(name, data):
    
    question_set = data[name]["question_set"]
    document_set = data[name]["document_set"]
    rank_wrong = data[name]["rank_wrong"]
    
    for result_wrong in rank_wrong:
        
        question = question_set[result_wrong["set_index"]][result_wrong["question_index"]]
        candidate_sentence = document_set[result_wrong["set_index"]][result_wrong["sentence_index"]]
        correct_sentence = document_set[result_wrong["set_index"]][question["answer_sentence"]]
        
        candidates = result_wrong["candidates"]
        ranked_answers = result_wrong["ranked_answers"]
        predicted_answer = result_wrong["predicted_answer"]
        
        if question["answer_sentence"] == result_wrong["sentence_index"]:
            
            print "=" * 20
            print "=" * 20
        
            print "Question: "
            print
            pp.pprint(question["question"])

            print
            print "Correct Sentence: (Part A)"
            print
            pp.pprint(correct_sentence)
            print
            print "Chosen Sentence: (Part A)"
            print
            pp.pprint(candidate_sentence)
            print

            print "Candidate Answers: (Part B)"
            print
            pp.pprint(candidates)
            print
            print "Ranked Answers: (Part C)"
            print
            pp.pprint(ranked_answers)
            print
            print "Predicted Answer: (Part C)"
            print
            pp.pprint(predicted_answer)
            print
            print "Correct Answer: (Part C)"
            print
            pp.pprint(question["answer"])     
            break

In [282]:
log_wrong_debug("rapid", DATA)

Question: 

u'What was the primary use of a phonographic disc record?'

Correct Sentence: (Part A)

u'The phonograph disc record was the primary medium used for music reproduction until late in the 20th century, replacing the phonograph cylinder record\u2013with which it had co-existed from the late 1880s through to the 1920s\u2013by the late 1920s.'

Chosen Sentence: (Part A)

u'The phonograph disc record was the primary medium used for music reproduction until late in the 20th century, replacing the phonograph cylinder record\u2013with which it had co-existed from the late 1880s through to the 1920s\u2013by the late 1920s.'

Candidate Answers: (Part B)

[   (u'The phonograph disc record', 'O'),
    (u'was the', 'STOPWORD'),
    (u'primary medium used', u'O'),
    (u'for', 'STOPWORD'),
    (u'music reproduction', u'O'),
    (u'until', 'STOPWORD'),
    (u'late', 'O'),
    (u'in the', 'STOPWORD'),
    (u'20th century', 'NUMBER'),
    (u',', 'PUNC'),
    (u'replacing', u'O'),
    (u'the'

In [304]:
process_rank("test", DATA, False)

Processing rank:  test

Part C Output: 
[   {   'candidates': [   (u'a', 'STOPWORD'),
                          (u'forgotten theatre', u'O'),
                          (u'of the', 'STOPWORD'),
                          (u'Crimean War', 'OTHERCAP'),
                          (u'.', 'PUNC')],
        'predicted_answer': u'Crimean War',
        'question_index': 0,
        'ranked_answers': [[(u'Crimean War', 'OTHERCAP'), 2]],
        'sentence_index': 283,
        'set_index': 0}]




In [278]:
def process_submit(name, data):
    
    headers = ['id', 'answer']
    
    c_output_answer_set = data[name]["c_output_answer_set"]       

    with open(name + '.submit.csv', 'w') as f:

        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()

        for index, result_c in enumerate(c_output_answer_set):
            
            predicted_answer = result_c["predicted_answer"]
            
            if predicted_answer is not None:
                f_csv.writerows([{'id':index+1,'answer':predicted_answer.encode("utf-8")}])
            else:
                f_csv.writerows([{'id':index+1,'answer':"NONE"}])
            
#             if isinstance( answer_list[index]['answer'], int):
                
#                 f_csv.writerows([{'id':index+1,'answer':answer_list[index]['answer'][0][0]}])
                
#             else:
                
#                 f_csv.writerows([{'id':index+1,'answer':answer_list[index]['answer'][0][0].encode("utf-8")}])        
                

In [279]:
process_submit("rapid", DATA)

In [305]:
process_submit("test", DATA)