In [127]:
# Load in the python script containing the same code as the load the data notebook
%run loadData.py
# now we can access train, dev, and test
# along with trainSents, devSents testSents

In [128]:
documents = testSents[0]
questions = test[0]

## Joshi part

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [13]:
# Tuning functions

import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

# Follow lemmatize function from guide notebook: WSTA_N1B_preprocessing.ipynb
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

word_tokenizer = nltk.tokenize.WordPunctTokenizer() #word_tokenize #tokenize.regexp.WordPunctTokenizer()

def pre_process(line):
    tokenized_sentence = word_tokenizer.tokenize(line.lower())
    lemmatized_sentence = [lemmatize(token) for token in tokenized_sentence]
    return lemmatized_sentence

In [14]:
# Core functions

def vectorize_documents(text_documents):

    vectorizer = TfidfVectorizer(stop_words='english', tokenizer=pre_process)
    vector_documents = vectorizer.fit_transform(text_documents)
    
    return [vector_documents, vectorizer]

def vectorize_query(vectorizer, text_query):
    return vectorizer.transform([text_query])

def process_neighbours(vector_documents):
    
    neighbours = NearestNeighbors(1, algorithm="brute", metric="cosine")
    neighbours.fit(vector_documents)
    
    return neighbours

def closest_document(neighbours, vector_query):

    result = neighbours.kneighbors(vector_query, 1, return_distance=True)

    result_index = result[1][0][0]
    result_distance = result[0][0][0]
    
    return [result_distance, result_index]

In [15]:
def generatePartAOutput(qs, sents):
    # Output for part A
    partAOutput = []
    for i in range (0, len(qs)):
        documents = sents[i]
        questions = qs[i]

        vector_documents, vectorizer = vectorize_documents(documents)
        analyze = vectorizer.build_analyzer()
        neighbours = process_neighbours(vector_documents)

        for j in range (0, len(questions)):
            text_query = questions[j]["question"]
            vector_query = vectorize_query(vectorizer, text_query)
            result_similarity, result_index  = closest_document(neighbours, vector_query)
            partAOutput.append((i,j,result_index))
    return partAOutput

partADevAnswers = generatePartAOutput(dev, devSents)
partATestAnswers = generatePartAOutput(test, testSents)

## Alex part

In [16]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

# The required jar files : https://nlp.stanford.edu/software/CRF-NER.shtml#Download
# It's 171mb so I've added to the gitignore
# If you download it, and rename the folder name "stanford" in the main directory
classifier = './stanford/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = './stanford/stanford-ner.jar'

sTagger = StanfordNERTagger(classifier,jar)

In [17]:
import pickle # Useful for read / write of list file
import os #Needed to check if file exists

In [18]:
# Lets store the stanford tagger output in a file
# This function returns the tagging output of stanford for each dataset
# with datasetName - 'train', 'dev', test' 

def getStanfordTagging(datasetName):
    fnameTrain = './preCompTags/stanfordTaggedTrain.txt'
    fnameDev = './preCompTags/stanfordTaggedDev.txt'
    fnameTest = './preCompTags/stanfordTaggedTest.txt'
    
    theFilePath = ''
    theSents = []
    if (datasetName == 'train'):
        theFilePath = fnameTrain
        theSents = trainSents
    elif (datasetName == 'dev'):
        theFilePath = fnameDev
        theSents = devSents
    elif (datasetName == 'test'):
        theFilePath = fnameTest
        theSents = testSents
    else :
        raise ValueError('Incorrect datasetName: ' + datasetName + ', choose from - "train", "dev", "test" ') 
    if (os.path.exists(theFilePath)):
        with open(theFilePath, "rb") as fp:
            stanfordTags = pickle.load(fp)
            return stanfordTags
    
    else :
        #Need to create taggings!
        taggedSentsList = []
        for sents in theSents:
            tokenisedSents = [word_tokenize(sent) for sent in sents]
            classifiedSents = sTagger.tag_sents(tokenisedSents)
            taggedSentsList.append(classifiedSents)
        #And save them
        with open(theFilePath, "wb") as fp: 
            pickle.dump(taggedSentsList, fp)
        return taggedSentsList
    

In [19]:
taggedTrain = getStanfordTagging('train')
taggedDev = getStanfordTagging('dev')
taggedTest = getStanfordTagging('test')

In [20]:
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


# Given a stanford tagged list, refines the list by:\n",
# Grouping all contiguous words with the same tag\n",
# Relabels Organisations as Other\n",
# Labels Number\n",
def refineWordTags(taggedWordList):
    newWordTags = []
    for (word, tag) in taggedWordList:
        if (tag == 'ORGANIZATION'):
            tag = 'O'
        if (tag == 'O'):
            #Might be a number
            if isNumber(word):
                tag = 'NUMBER'
            elif isCapitalised(word):
                tag = 'OTHERCAP'
            elif word in preUnits:
                tag = 'PRENUM'
            elif word in postUnits:
                tag = 'POSTNUM'
            elif isStopWord(word):
                tag = 'STOPWORD'
            elif isPunctuation(word):
                tag = 'PUNC'

        newWordTags.append((word, tag))
    
    newWordTags = combineTags (newWordTags)
    return newWordTags
        
def combineTags(wordTags):
    newTags = []
    prevWord = wordTags[0][0]
    prevTag = wordTags[0][1]
    for (word, tag) in wordTags[1:]:
        if tag == 'NUMBER' and prevTag == 'PRENUM':
            prevTag = 'NUMBER'
        elif prevTag == 'PRENUM':
            prevTag = 'O'
        if tag == 'POSTNUM' and prevTag == "NUMBER":
            tag = "NUMBER"
        elif tag == "POSTNUM":
            tag = "O"
        newTags.append((prevWord, prevTag))
        prevWord = word
        prevTag = tag
    newTags.append((prevWord, prevTag))
    
    newNewTags = []
    prevWord = newTags[0][0]
    prevTag = newTags[0][1]
    if (prevTag == "OTHERCAP"):
        prevTag = "O"
    for (word, tag) in wordTags[1:]:            
        if tag == prevTag :
            prevWord += ' ' + word
        else :
            newNewTags.append((prevWord, prevTag))
            prevWord = word
            prevTag = tag
    newNewTags.append((prevWord, prevTag))
    return newNewTags

# Thanks for this list to save me typing it : http://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers\n",
numInWords = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"
       , "hundred", "thousand", "million", "billion", "trillion"]

punctuation = ['.',',',';',':']

def isPunctuation(word):
    return word in punctuation
def isCapitalised (word):
    if len(word) == 0:
        return False
    return word[0].isupper()

# Obtained from training data
postUnits = [u'%', u'century', u'years', u'percent', u'years ago', u'days', u'months', u'km', u'hours', u'times', u'inches', u'\xb0C', u'minutes', u'acres', u'\xb0F', u'weeks', u'people', u'sq mi', u'mi', u'ft', u'feet', u'metres', u'mm', u'square miles', u'miles', u'pm', u'per cent', u'year', u'copies', u'yuan', u'men', u'square feet', u'third', u'kilometres', u'nm', u'tonnes', u'species', u'decades', u'barrels', u'tons', u'largest', u'centuries', u'km2']
preUnits = [u'$', u'around', u'late', u'early', u'nearly', u'since', u'approximately', u'number']

# Returns true if the word represents a number\n",
def isNumber(word):
    pattern = ".?(\\d)+((,|.)(\\d)+)*"
    if re.match(pattern,word) :
        return True
    if word.lower() in numInWords:
        return True
    return False

def isStopWord(word):
    return word.lower() in stop_words


In [26]:
# Get the output for part B:
# In the form [(docID, questID, entities)]
def generatePartBOutput(qs, partAOutput, taggedSents):   
    partBOutput = []
    for (docIndex, questionIndex, sentenceIndex) in partAOutput:
        stanfordTags = taggedSents[docIndex][sentenceIndex]
        newTags = refineWordTags(stanfordTags)
        question = qs[docIndex][questionIndex]["question"]
        partBOutput.append((docIndex, questionIndex, sentenceIndex, newTags))
    return partBOutput

partBDevAnswers = generatePartBOutput(dev, partADevAnswers, taggedDev)
partBTestAnswers = generatePartBOutput(test, partATestAnswers, taggedTest)

In [27]:
partBDevAnswers[0]

(0,
 0,
 1,
 [(u'Night-vision devices using active near-infrared illumination allow', 'O'),
  (u'people', 'POSTNUM'),
  (u'or', 'STOPWORD'),
  (u'animals', u'O'),
  (u'to be', 'STOPWORD'),
  (u'observed without', u'O'),
  (u'the', 'STOPWORD'),
  (u'observer', u'O'),
  (u'being', 'STOPWORD'),
  (u'detected', u'O'),
  (u'.', 'PUNC')])

In [34]:
print partBTestAnswers



In [35]:
## Example output:
print "\nDocIndex: " + str(partBTestAnswers[0][0])
print "\nQuestionIndex: " + str(partBTestAnswers[0][1])
print "\nSentenceIndex: " + str(partBTestAnswers[0][2])
print "\nQuestion: " + test[0][0]["question"]
print "\nPossibleAnswers: " 
print partBTestAnswers[0][3]


DocIndex: 0

QuestionIndex: 0

SentenceIndex: 283

Question: What year did the Crimean War begin?

PossibleAnswers: 
[(u'a', 'STOPWORD'), (u'forgotten theatre', u'O'), (u'of the', 'STOPWORD'), (u'Crimean War', 'OTHERCAP'), (u'.', 'PUNC')]


## Quick evaluation of the first two parts on the dev set:


In [29]:
def evaluateNERonDev():
    correct = []
    wrong = []
    for (docIndex, questionIndex, sentenceIndex, newTags) in partBDevAnswers:
        answer = dev[docIndex][questionIndex]["answer"]
        possAnswers = newTags 
        inThere = False
        for possAnswer in newTags:
            if possAnswer[0] == answer:
                inThere = True
                break
        if inThere:
            correct.append((docIndex, questionIndex, sentenceIndex, newTags))
        else :
            wrong.append((docIndex, questionIndex, sentenceIndex, newTags))
    return (correct, wrong)

In [30]:
(nerDevCorrect, nerDevWrong) = evaluateNERonDev()
print len(nerDevCorrect)
print len(nerDevWrong)

2409
6054


In [31]:
# For each question, evaluate if the answer is present as an entity

def evaluateAnswerRanking(questionsList,documentsList, numToEval):
    correct = []
    wrong = []
    #(corNER, wrongNER) = evaluateNER(questionsList,documentsList, numToEval)
    (nerDevCorrect, nerDevWrong) = evaluateNERonDev()
    entityListsWithAnswer = nerDevCorrect
    for (i,j,possAnswers) in entityListsWithAnswer:
        question = questionsList[i][j]["question"]
        answer =  questionsList[i][j]["answer"]
        print question
        print possAnswers
        #print answer
        answerPredicited = third_filter(question,second_filter(question,first_filter(question, possAnswers)),second_filter(question,first_filter(question, possAnswers))['answer_entities_list']) 
        #print answerPredicited
        #print '%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%'
        # TODO add Dereks part here
        if (answerPredicited == answer):
            correct.append((i,j))
        else :
            wrong.append((i,j,answerPredicited))
        #print correct
    return (correct, wrong)
    

In [32]:
(corAns, wrongAns) = evaluateAnswerRanking(dev, devSents,len(dev))
print("Number Correct : " + str(len(corAns)))
print("Number incorrect: " + str(len(wrongAns)))
print ("Average correct : " + str((len(corAns) + 0.0) / (len(corAns)+len(wrongAns))))

ValueError: too many values to unpack

## Derek part

In [112]:
def detect_openclass_word(sameword_list):
    tagged_text = nltk.pos_tag(sameword_list)
    is_noun = lambda pos: pos[:2] == 'NN'
    is_verb = lambda pos: pos[:2] == 'VB'
    #is_adjective = lambda pos: pos[:2] == 'JJ'
    #is_adverb = lambda pos: pos[:2] == 'RB'
    
    nouns = [word for (word, pos) in tagged_text if is_noun(pos)] 
    verbs = [word for (word, pos) in tagged_text if is_verb(pos)] 
    #adjectives = [word for (word, pos) in tagged_text if is_adjective(pos)] 
    #adverbs = [word for (word, pos) in tagged_text if is_adverb(pos)]
    return nouns+verbs

In [43]:
# Given a question, returns a tag for the answer form
# From PERSON, LOCATION, NUMBER, OTHER 
# Assuming question is lowercased
def getQuestionType(question):
    if 'Who' in question:
        return "PERSON"
    if 'where' in question:
        return "LOCATION"
    if 'How many' in question:
        return "NUMBER"
    if 'How much' in question:
        return "NUMBER"
    if 'When' in question:
        return "NUMBER"
    if 'what year' in question:
        return "NUMBER"
    if 'What year' in question:
        return "NUMBER"
    else:
        return "O"

In [44]:
#remove the non-word entities from answer-entities
def process_answer_entities(anwser_entities):
    new_anwser_entities =[]
    for (entity,entity_type) in anwser_entities:
        tokenized_entity = word_tokenize(entity)
        #print tokenized_entity
        temp = []
        for ele in tokenized_entity:
            if ele not in non_words:
                temp.append(ele)
        temp = ' '.join(temp)
        new_anwser_entities.append((temp,entity_type))
    return new_anwser_entities      


In [146]:

#first:  whose anwsers all appear in the questions rank the lowest
#assumption: input question in its dictionary value
from string import punctuation  
non_words = list(punctuation)

def first_filter(question, anwser_entities):
    ranking_dict_1 = {}
    ranking_list = []
    merge_list = []
    answer_entities_list = []
    anwser_entities = process_answer_entities(anwser_entities)
    
    for entity in anwser_entities:
        answer_entities_list.append(entity[0])
        if entity[0] in question:
            #print entity[0]
            #if entity[0].lower() not in stop_words and entity[0]!='' and entity[0].lower() not in non_words:
            if entity[1] != 'STOPWORD' and entity[1] !='PUNC' and entity[0]!='':   
                #print entity[0]
                merge_list.append(entity[0])
        else:
            if entity[0].lower() not in stop_words and entity[0]!='' and entity[0].lower() not in non_words:
            #if entity[1] != 'STOPWORD' and entity[1] !='PUNC' and entity[0]!='':  
                ranking_list.append(entity)
                
    #get rid of the not noun phrases   
    #print merge_list
    
    final_merge_list = []
    for phrase in merge_list:
        tokenized_phrase = word_tokenize(phrase)
        for word in tokenized_phrase:
            if word not in stop_words and word.lower() not in non_words:
                #print phrase
                final_merge_list.append(phrase)
                
    final_ranking_list = []            
    for entity in ranking_list:
        if entity[1] != 'STOPWORD':
            final_ranking_list.append(entity)
    
    #detect the open-class word in here for easier process   
    #print merge_list
    final_merge_list = set(detect_openclass_word(final_merge_list))
    ranking_dict_1["ranking_list"] = final_ranking_list
    ranking_dict_1["same_word_list"] = final_merge_list
    ranking_dict_1["answer_entities_list"] = answer_entities_list
    return ranking_dict_1


In [147]:
(corAns, wrongAns) = evaluateAnswerRanking(dev, devSents,len(dev))
print("Number Correct : " + str(len(corAns)))
print("Number incorrect: " + str(len(wrongAns)))
print ("Average correct : " + str((len(corAns) + 0.0) / (len(corAns)+len(wrongAns))))

Number Correct : 879
Number incorrect: 1530
Average correct : 0.364881693649


In [39]:

#second: answers which match the question type should be ranked higher than those that dont

#assumption: save questions' type in the dictionary format quesiton1 = 
#{u'answer': u'long playing',u'answer_sentence': 2, u'question':......., 'question_type:'PERSON'}

def second_filter(question, ranking_dict_1):
    question_with_type ={}
    question_with_type['question_type']= getQuestionType(question)
    question_with_type['question'] = question
    #print question_with_type
    ranking_dict_2 = {}
    ranking_list =[]
    merge_list = []
    for entity in ranking_dict_1["ranking_list"]:
        if question_with_type['question_type'] == 'O':
            ranking_list.append(entity[0])
        else:
            if entity[1] == question_with_type['question_type']:
                ranking_list.append(entity[0])
            else:
                merge_list.append(entity[0])
    ranking_dict_2["same_word_list"] = ranking_dict_1["same_word_list"]
    ranking_dict_2["ranking_list"] = ranking_list
    ranking_dict_2["Other_tags_list"] = merge_list
    ranking_dict_2["answer_entities_list"] = ranking_dict_1["answer_entities_list"]
    return ranking_dict_2
    

In [40]:

#Thrid: based on second, the prefered entity should be the one which is close in 
#the sentence to a closed-class word form the question
from collections import OrderedDict

def cal_distance_words(entity,same_words, anwser_entities):
    temp = 0
    for same_word in same_words:
        temp += abs(anwser_entities.index(entity) - anwser_entities.index(same_word))
    return float(temp)/float(len(same_words))

def sort_orderedDict(orderdict):
    return OrderedDict(sorted(orderdict.items(), key = lambda x:x[1], reverse = False))
        

def third_filter(question,second_filter,anwser_entities):
    dict_ranking ={}
    if (len(second_filter["same_word_list"])==0):
        if len(second_filter['ranking_list']) != 0:
            return second_filter['ranking_list'][0]
        else:
            return second_filter["Other_tags_list"][0]
            #for entity in second_filter["Other_tags_list"]:
                #dict_ranking[entity]= cal_distance_words(entity, second_filter["same_word_list"],anwser_entities)
            #dict_ranking = sort_orderedDict(dict_ranking)
            #if len(dict_ranking.items()) ==0:
                #return 0
            #else:
            #return dict_ranking.items()[0][0]
    else:
        #if len(second_filter['ranking_list']) != 0:
        for entity in second_filter["ranking_list"]:
            dict_ranking[entity]= cal_distance_words(entity, second_filter["same_word_list"],anwser_entities)
        #else:
            #if len(second_filter["Other_tags_list"]) !=0:
                #for entity in second_filter["Other_tags_list"]:
                    #dict_ranking[entity]= cal_distance_words(entity, second_filter["same_word_list"],anwser_entities)
            #print dict_ranking
            #else:
                #return 0
        dict_ranking = sort_orderedDict(dict_ranking)
            #print dict_ranking
        if len(dict_ranking.items()) ==0:
            return 0
        else:
            return dict_ranking.items()[0][0]


In [None]:
What country is Guam a territory of?
[(u'Guam ( i\u02c8\u0261w\u0251\u02d0m', 'O'), (u'or', 'STOPWORD'), (u'\u02c8\u0261w\u0252m', u'O'), (u';', 'PUNC'), (u'Chamorro', u'PERSON'), (u':', 'PUNC'), (u'Gu\xe5h\xe5n', 'OTHERCAP'), (u';', 'PUNC'), (u'[ needs', u'O'), (u'IPA', 'OTHERCAP'), (u'] formally', u'O'), (u'the', 'STOPWORD'), (u'Territory', 'OTHERCAP'), (u'of', 'STOPWORD'), (u'Guam', 'OTHERCAP'), (u')', u'O'), (u'is an', 'STOPWORD'), (u'unincorporated', u'O'), (u'and', 'STOPWORD'), (u'organized territory', u'O'), (u'of the', 'STOPWORD'), (u'United States', u'LOCATION'), (u'.', 'PUNC')]

In [92]:
question = "What country is Guam a territory of?"
a = [(u'Guam ( i\u02c8\u0261w\u0251\u02d0m', 'O'), (u'or', 'STOPWORD'), (u'\u02c8\u0261w\u0252m', u'O'), (u';', 'PUNC'), (u'Chamorro', u'PERSON'), (u':', 'PUNC'), (u'Gu\xe5h\xe5n', 'OTHERCAP'), (u';', 'PUNC'), (u'[ needs', u'O'), (u'IPA', 'OTHERCAP'), (u'] formally', u'O'), (u'the', 'STOPWORD'), (u'Territory', 'OTHERCAP'), (u'of', 'STOPWORD'), (u'Guam', 'OTHERCAP'), (u')', u'O'), (u'is an', 'STOPWORD'), (u'unincorporated', u'O'), (u'and', 'STOPWORD'), (u'organized territory', u'O'), (u'of the', 'STOPWORD'), (u'United States', u'LOCATION'), (u'.', 'PUNC')]

In [109]:
test = first_filter(question,a)
print test

{'ranking_list': [(u'Guam i\u02c8\u0261w\u0251\u02d0m', 'O'), (u'\u02c8\u0261w\u0252m', u'O'), (u'Chamorro', u'PERSON'), (u'Gu\xe5h\xe5n', 'OTHERCAP'), (u'needs', u'O'), (u'IPA', 'OTHERCAP'), (u'formally', u'O'), (u'Territory', 'OTHERCAP'), (u'unincorporated', u'O'), (u'organized territory', u'O'), (u'United States', u'LOCATION')], 'answer_entities_list': [u'Guam i\u02c8\u0261w\u0251\u02d0m', u'or', u'\u02c8\u0261w\u0252m', '', u'Chamorro', '', u'Gu\xe5h\xe5n', '', u'needs', u'IPA', u'formally', u'the', u'Territory', u'of', u'Guam', '', u'is an', u'unincorporated', u'and', u'organized territory', u'of the', u'United States', ''], 'same_word_list': set([u'Guam'])}


In [72]:
test2 = second_filter(question, test)
print test2

{'ranking_list': [u'Infrared radiation', u'scientific', u'medical applications'], 'Other_tags_list': [], 'answer_entities_list': [u'Infrared radiation', u'is', u'used', u'in', u'industrial', '', u'scientific', '', u'and', u'medical applications', ''], 'same_word_list': set([u'industrial'])}


In [73]:
#print cal_distance_words('Melbourne',test2["same_word_list"],test2["answer_entities_list"])
test3 = third_filter(question, test2,test2["answer_entities_list"])
print test3

scientific


In [84]:
(nerDevCorrect, nerDevWrong) = evaluateNERonDev()

In [85]:
print nerDevCorrect[0]

(0, 3, 3, [(u'Infrared thermal-imaging cameras', 'O'), (u'are', 'STOPWORD'), (u'used', u'O'), (u'to', 'STOPWORD'), (u'detect heat loss', u'O'), (u'in', 'STOPWORD'), (u'insulated systems', u'O'), (u',', 'PUNC'), (u'to', 'STOPWORD'), (u'observe changing blood flow', u'O'), (u'in the', 'STOPWORD'), (u'skin', u'O'), (u',', 'PUNC'), (u'and to', 'STOPWORD'), (u'detect overheating', u'O'), (u'of', 'STOPWORD'), (u'electrical apparatus', u'O'), (u'.', 'PUNC')])


## Evaluation

In [110]:
# For each question, evaluate if the answer is present as an entity

def evaluateAnswerRanking(questionsList,documentsList, numToEval):
    correct = []
    wrong = []
    (nerDevCorrect, nerDevWrong) = evaluateNERonDev()
    #(corNER, wrongNER) = evaluateNER()
    entityListsWithAnswer = nerDevCorrect
    for (i,j,x,possAnswers) in entityListsWithAnswer:
        question = questionsList[i][j]["question"]
        answer =  questionsList[i][j]["answer"]
        #print question
        #print possAnswers
        #print answer
        answerPredicited = third_filter(question,second_filter(question,first_filter(question, possAnswers)),second_filter(question,first_filter(question, possAnswers))['answer_entities_list']) 
        #print answerPredicited
        #print '%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%'
        # TODO add Dereks part here
        if (answerPredicited == answer):
            correct.append((i,j))
        else :
            wrong.append((i,j,answerPredicited))
        #print correct
    return (correct, wrong)
    

In [134]:
(corAns, wrongAns) = evaluateAnswerRanking(dev, devSents,len(dev))
print("Number Correct : " + str(len(corAns)))
print("Number incorrect: " + str(len(wrongAns)))
print ("Average correct : " + str((len(corAns) + 0.0) / (len(corAns)+len(wrongAns))))

Number Correct : 878
Number incorrect: 1531
Average correct : 0.364466583645


## Derek submission

In [129]:
answer_list = []
for ele in partBTestAnswers:
    answer_dict = {}
    #print ele
    question = test[ele[0]][ele[1]]['question']
    
    possAnswers = ele[3]
    answer = third_filter(question,second_filter(question,first_filter(question, possAnswers)),second_filter(question,first_filter(question, possAnswers))['answer_entities_list'])
    answer_dict['sentence'] = possAnswers
    answer_dict['question'] = question
    answer_dict['answer'] = answer
    answer_list.append(answer_dict)

In [130]:
import csv
headers = ['id', 'answer']

with open('submit.csv','w') as f:
    f_csv = csv.DictWriter(f, headers)
    f_csv.writeheader()
    for index in range(len(answer_list)):
        if isinstance( answer_list[index]['answer'], int):
            f_csv.writerows([{'id':index+1,'answer':answer_list[index]['answer']}])
        else:
            f_csv.writerows([{'id':index+1,'answer':answer_list[index]['answer'].encode("utf-8")}])

## Joshi Submission

In [57]:
# Part A gives us a most likely sentence
# Part B splits into entities

# Given a question, returns a tag for the answer form
# From PERSON, LOCATION, NUMBER, OTHER 
# Assuming question is lowercased
def getQuestionType(question):
    if 'Who' in question:
        return "PERSON"
    if 'where' in question:
        return "LOCATION"
    if 'How many' in question:
        return "NUMBER"
    if 'How much' in question:
        return "NUMBER"
    if 'When' in question:
        return "NUMBER"
    if 'what year' in question:
        return "NUMBER"
    else:
        return "O"

In [58]:
def filter_entitites(entities):
    return filter(lambda x: x[0] == "O" or x[0] == "STOPWORD", entities)

In [59]:
# First, answers whose content words all appear in the question should be ranked lowest.

def first_filter2(question, answer_entities):
   
    ranked_list = []
    
    question = set(pre_process(question))
    
#     print question
#     print
    
    for entity in answer_entities:

        raw_span = entity[0]
        span_tag = entity[1]
        
        set_span = set(pre_process(raw_span))
        
        if span_tag != "O" and span_tag != "STOPWORD" and span_tag !="O":
            
            if set_span.issubset(question):
                
                ranked_list.append([entity, 1])
#                 print "IN", raw_span, span_tag, set_span, question
                
            else:
                
                ranked_list.append([entity, 2])
#                 print "OUT", raw_span, span_tag, set_span, question
    
    return sorted(ranked_list, key=lambda x: x[1], reverse=True)

In [60]:
# Second, answers which match the question type should be ranked higher than those that don't; for this, you
# should build a simple rule-based question type classifier based on key words (e.g. questions which contain "who" are
# people).

# First, answers whose content words all appear in the question should be ranked lowest.

def second_filter2(question, ranked_list):
   
    question_type = getQuestionType(question)
#     print question_type
    
    for index, answer in enumerate(ranked_list):
        
        entity_tag = answer[0][1]
        
        if entity_tag == question_type:
#             print "MATCH", answer[0], question_type, question
            ranked_list[index].append(2)
#             ranked_list[index][1] += 1
        else:
            ranked_list[index].append(1)
#             ranked_list[index][1] -= 1
            
    return ranked_list

In [61]:
import nltk
# Follow lemmatize function from guide notebook: WSTA_N1B_preprocessing.ipynb
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
word_tokenizer = nltk.tokenize.WordPunctTokenizer() #word_tokenize #tokenize.regexp.WordPunctTokenizer()

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma

def pre_process2(line):
    tokenized_sentence = word_tokenizer.tokenize(line.lower())
    lemmatized_sentence = [lemmatize(token) for token in tokenized_sentence]
    filtered_sentence = [token for token in lemmatized_sentence if token not in filter_tokens]
    tagged_sent = nltk.pos_tag(lemmatized_sentence)
    final = []
    for word, tag in tagged_sent:
        if "V" in tag or "NN" in tag:
#             final.append((word,tag))
            final.append(word)
            
#     print "RESULT: ", final
    return final

In [62]:
# Third, among entities of the same type, the prefered entity should be the one which is closer in the sentence to a
# closed-class word from the question.

from collections import defaultdict

def third_filter2(question, possAnswers, ranked_list):
    
    question = pre_process2(question)

    answer_sent = " ".join([x[0] for x in possAnswers])
    answer_sent = pre_process(answer_sent)
    raw_answer_sent = " ".join(answer_sent)
    
#     print "QUESTION: "
#     pp.pprint(question)
#     print "ANSWER: "
#     pp.pprint(answer_sent)
#     pp.pprint(raw_answer_sent)
    
    for index, answer in enumerate(ranked_list):

        span_tag = answer[0][1]
        raw_span = answer[0][0]

        proc_span = pre_process(raw_span)

        raw_proc_span = " ".join(proc_span)
        new_raw_proc_span = "-".join(proc_span)

        raw_answer_sent = raw_answer_sent.replace(raw_proc_span, new_raw_proc_span)
    
    answer_sent = raw_answer_sent.split(" ")
    
    avg_dict = defaultdict(float)
    
    for open_class in question:
        
        if open_class in answer_sent:
            
            open_class_locations = [i for i, x in enumerate(answer_sent) if x == open_class]
            
#             print "OPEN CLASS: ", repr(open_class)

            for index, answer in enumerate(ranked_list):

                span_tag = answer[0][1]
                raw_span = answer[0][0]

                proc_span = pre_process(raw_span)
                
                raw_proc_span = " ".join(proc_span)
                new_raw_proc_span = "-".join(proc_span)
                
                proc_span_locations = [i for i, x in enumerate(answer_sent) if x == new_raw_proc_span]
                
                min_dist = len(answer_sent)
                min_dist_ind = (None, None)
                
                for loc1 in proc_span_locations:
                    
                    for loc2 in open_class_locations:
                        
                        dist = abs(loc1 - loc2)
                        
                        if dist < min_dist:
                            
                            min_dist = dist
                            min_dist_ind = (loc1, loc2)
                
#                 print "PROC: ", proc_span_locations
#                 print "OPEN CLASS: ", open_class_locations                
                scale = (len(answer_sent) - min_dist) * 1.0 / len(answer_sent)
#                 print "JOINT: ", min_dist_ind, scale
                avg_dict[index] += scale
#                 ranked_list[index][1] *= scale
    
    
    for key, value in avg_dict.iteritems():
        ranked_list[key].append(value / len(question))

    return ranked_list

In [63]:
def reduce_rank(ranking_list):
    
    new_ranking = []
    
    for rank in ranking_list:
        
        new_rank = ( rank[1] + rank[2] )
        
        if len(rank) == 4:
             new_rank *= rank[3]
        
        new_ranking.append([rank[0], new_rank])
        
    return sorted(new_ranking, key=lambda x: x[1], reverse=True)

In [74]:
answer_list = []
for ele in partBTestAnswers:
    answer_dict = {}
    #print ele
    question = test[ele[0]][ele[1]]['question']
    
    possAnswers = ele[3]
    #print possAnswers
    #answer = third_filter(question,second_filter(question,first_filter(question, possAnswers)),second_filter(question,first_filter(question, possAnswers))['answer_entities_list'])
    first_pass = first_filter2(question, possAnswers)
    second_pass = second_filter2(question, first_pass)
    third_pass = third_filter2(question, possAnswers, second_pass)
    fourth_pass = reduce_rank(third_pass)
    
    answer_dict['sentence'] = possAnswers
    answer_dict['question'] = question
    if len(fourth_pass)==0:
        answer_dict['answer'] = 'no result found'
    else:
        answer_dict['answer'] = fourth_pass.pop(0)
    answer_list.append(answer_dict)

In [79]:
print answer_list[2000]

{'answer': [(u'five', 'NUMBER'), 3.6923076923076925], 'question': u'How many councilors are there?', 'sentence': [(u'It', 'O'), (u'is', 'STOPWORD'), (u'composed', u'O'), (u'of', 'STOPWORD'), (u'five', 'NUMBER'), (u'councilors (', u'O'), (u'German', 'OTHERCAP'), (u':', 'PUNC'), (u'Gemeinderat-r\xe4tin', 'OTHERCAP'), (u')', u'O'), (u',', 'PUNC'), (u'each', 'STOPWORD'), (u'presiding', u'O'), (u'over a', 'STOPWORD'), (u'directorate (', u'O'), (u'Direktion', 'OTHERCAP'), (u') comprising several departments', u'O'), (u'and', 'STOPWORD'), (u'bureaus', u'O'), (u'.', 'PUNC')]}


In [66]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

import nltk
from nltk.corpus import stopwords

import pickle # Useful for read / write of list file
import os #Needed to check if file exists

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

stop_words = set(stopwords.words('english'))

from string import punctuation  
punct_tokens = set(punctuation)
extra_tokens = set(["what", "where", "how", "when", "who"])

filter_tokens = extra_tokens.union(punct_tokens).union(stop_words)

In [82]:
import csv
headers = ['id', 'answer']

with open('submit.csv','w') as f:
    f_csv = csv.DictWriter(f, headers)
    f_csv.writeheader()
    for index in range(len(answer_list)):
        if isinstance( answer_list[index]['answer'], int):
            f_csv.writerows([{'id':index+1,'answer':answer_list[index]['answer'][0][0]}])
        else:
            f_csv.writerows([{'id':index+1,'answer':answer_list[index]['answer'][0][0].encode("utf-8")}])