In [14]:
import xml.etree.ElementTree as et


def get_element_texts(parent_element, xpath):
    """Return the non-empty text content of every element matching ``xpath``.

    Args:
        parent_element: Element (or tree) whose ``findall`` is used for the search.
        xpath: Path expression handed unchanged to ``findall``.

    Returns:
        A list of strings in document order. Each string is the concatenation
        of all text inside one matched element, including text of nested
        children; matches whose concatenated text is empty are left out.
    """
    joined_texts = (
        ''.join(node.itertext()) for node in parent_element.findall(xpath)
    )
    return [content for content in joined_texts if content]


def get_expressions_dict(root):
    """Map each ``answer_section`` id to the texts of its expressions.

    Args:
        root: Object providing ``iter(tag)``; every ``answer_section``
            element it yields is processed.

    Returns:
        A dict keyed by the section's ``id`` attribute. Each value is the list
        of concatenated texts (nested text included, empty strings kept) of the
        elements found under ``answer_set/answer/expression_set/expression``.
        If two sections share an id, the later one replaces the earlier one.
    """
    expression_path = 'answer_set/answer/expression_set/expression'
    return {
        section.get('id'): [
            ''.join(node.itertext()) for node in section.findall(expression_path)
        ]
        for section in root.iter('answer_section')
    }

In [15]:
import MeCab


# Module-level MeCab tagger shared by get_tokenized_line_ja.
# It is created with the '-Owakati' option; callers split its output on
# whitespace, i.e. they expect space-separated tokens from this mode.
mecab = MeCab.Tagger('-Owakati')


def get_tokenized_line_ja(line):
    """Run ``line`` through the module-level ``mecab`` tagger.

    Args:
        line: Text passed to ``mecab.parse``.

    Returns:
        The tagger output with leading and trailing whitespace removed.
    """
    parsed = mecab.parse(line)
    return parsed.strip()

In [43]:
from subprocess import Popen, PIPE


def tokenize_en(text):
    """Tokenize English text with the Stanford PTBTokenizer command-line tool.

    Starts ``java edu.stanford.nlp.process.PTBTokenizer -preserveLines`` as a
    child process, sends ``text`` (UTF-8 encoded) on its stdin and returns what
    the process printed on stdout.

    Args:
        text: The string to tokenize.

    Returns:
        The tokenizer's standard output decoded as UTF-8.

    Raises:
        RuntimeError: If the Java process exits with a non-zero status. The
            message carries the exit status and the captured stderr, so a
            broken setup is not mistaken for "no tokens".
    """
    # NOTE(review): no classpath is passed here, so the PTBTokenizer class is
    # presumably expected to be reachable through the environment -- confirm.
    command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-preserveLines']
    with Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE) as tokenizer_proc:
        out, err = tokenizer_proc.communicate(input=text.encode('UTF-8'))
        exit_status = tokenizer_proc.returncode
    if exit_status != 0:
        # Without this check a failed JVM run silently returns '' and the
        # caller would score an empty token list.
        raise RuntimeError(
            'PTBTokenizer exited with status {}: {}'.format(
                exit_status, err.decode('UTF-8', errors='replace').strip()))
    return out.decode('UTF-8')

In [37]:
def write_rank_score_ja(eval_tree, gold_dict, model_ja, name_prefix):
    """Score and rank the Japanese answers of every topic, then save the tree.

    For each ``TOPIC`` element, every ``ANSWER_SET/ANSWER`` element is
    tokenized with ``get_tokenized_line_ja`` and compared against each gold
    expression of that topic using ``wmdistance``. The answer's score is the
    mean distance over the topic's gold expressions. Answers are ranked within
    their topic by ascending score (rank 1 = smallest mean distance; ties keep
    document order) and receive ``RANK`` and ``SCORE`` attributes.

    Args:
        eval_tree: ElementTree holding the ``TOPIC`` elements; modified in place.
        gold_dict: Mapping from a topic's ``ID`` attribute to a list of gold
            expression strings.
        model_ja: Object providing ``wmdistance(tokens_a, tokens_b)`` either
            directly or through a ``wv`` attribute.
        name_prefix: Prefix of the output file; the tree is written to
            ``'<name_prefix>_DGLab_01.xml'``.

    Raises:
        KeyError: If a topic's ``ID`` is not a key of ``gold_dict``.
    """
    # gensim 4 exposes wmdistance only on the model's keyed vectors (.wv);
    # fall back to the object itself when there is no such attribute.
    vectors = getattr(model_ja, 'wv', model_ja)
    for topic in eval_tree.getroot().iter('TOPIC'):
        topic_id = topic.get('ID')
        answers = topic.findall('ANSWER_SET/ANSWER')
        gold_tokens = [
            get_tokenized_line_ja(gold).split() for gold in gold_dict[topic_id]
        ]
        scored_answers = []
        for ans in answers:
            ans_tokens = get_tokenized_line_ja(''.join(ans.itertext())).split()
            # The total starts from zero for every answer, so one answer's
            # score never leaks into the next one.
            total = sum(
                vectors.wmdistance(ans_tokens, gold) for gold in gold_tokens
            )
            # Average over the actual number of gold expressions; with no gold
            # expressions the score stays 0.0 instead of dividing by zero.
            score = total / len(gold_tokens) if gold_tokens else 0.0
            scored_answers.append((ans, score))
        # list.sort is stable, so equal scores keep document order.
        scored_answers.sort(key=lambda pair: pair[1])
        for rank, (ans, score) in enumerate(scored_answers, start=1):
            ans.set('RANK', str(rank))
            ans.set('SCORE', str(score))
    eval_tree.write('{}_DGLab_01.xml'.format(name_prefix), encoding='UTF-8', xml_declaration=True)

In [44]:
def write_rank_score_en(eval_tree, gold_dict, model_en, name_prefix):
    """Score and rank the English answers of every topic, then save the tree.

    For each ``TOPIC`` element, every ``ANSWER_SET/ANSWER`` element is
    tokenized with ``tokenize_en`` and compared against each gold expression
    of that topic using ``wmdistance``. The answer's score is the mean
    distance over the topic's gold expressions. Answers are ranked within
    their topic by ascending score (rank 1 = smallest mean distance; ties keep
    document order) and receive ``RANK`` and ``SCORE`` attributes.

    Args:
        eval_tree: ElementTree holding the ``TOPIC`` elements; modified in place.
        gold_dict: Mapping from a topic's ``ID`` attribute to a list of gold
            expression strings.
        model_en: Object providing ``wmdistance(tokens_a, tokens_b)`` either
            directly or through a ``wv`` attribute.
        name_prefix: Prefix of the output file; the tree is written to
            ``'<name_prefix>_DGLab_01.xml'``.

    Raises:
        KeyError: If a topic's ``ID`` is not a key of ``gold_dict``.
    """
    # gensim 4 exposes wmdistance only on the model's keyed vectors (.wv);
    # fall back to the object itself when there is no such attribute.
    vectors = getattr(model_en, 'wv', model_en)
    for topic in eval_tree.getroot().iter('TOPIC'):
        topic_id = topic.get('ID')
        answers = topic.findall('ANSWER_SET/ANSWER')
        # NOTE(review): gold expressions are only split on whitespace while
        # answers go through tokenize_en -- presumably the gold texts are
        # already tokenized; confirm against the gold-standard file.
        gold_tokens = [gold.split() for gold in gold_dict[topic_id]]
        scored_answers = []
        for ans in answers:
            ans_tokens = tokenize_en(''.join(ans.itertext())).split()
            # The total starts from zero for every answer, so one answer's
            # score never leaks into the next one.
            total = sum(
                vectors.wmdistance(ans_tokens, gold) for gold in gold_tokens
            )
            # Average over the actual number of gold expressions; with no gold
            # expressions the score stays 0.0 instead of dividing by zero.
            score = total / len(gold_tokens) if gold_tokens else 0.0
            scored_answers.append((ans, score))
        # list.sort is stable, so equal scores keep document order.
        scored_answers.sort(key=lambda pair: pair[1])
        for rank, (ans, score) in enumerate(scored_answers, start=1):
            ans.set('RANK', str(rank))
            ans.set('SCORE', str(score))
    eval_tree.write('{}_DGLab_01.xml'.format(name_prefix), encoding='UTF-8', xml_declaration=True)

In [13]:
from gensim.models.word2vec import Word2Vec


# Load the Japanese Word2Vec model from 'w2v_ja.model' (path relative to the
# working directory); it is later passed to write_rank_score_ja.
model_ja = Word2Vec.load('w2v_ja.model')

In [17]:
# Parse the Japanese phase-2 gold-standard essay XML into an ElementTree.
tree_ja = et.parse('qalab3-essay-phase2/_references/qalab3-ja-essay-phase2/qalab3-ja-phase2-goldstandard-essay.xml')

In [18]:
# Build {answer_section id: [expression texts]} from the Japanese gold tree.
# The ElementTree itself is passed as `root`; get_expressions_dict only calls
# .iter() on it.
gold_dict_ja = get_expressions_dict(tree_ja)

In [19]:
# Parse the Japanese evaluation-method XML; write_rank_score_ja reads its
# TOPIC / ANSWER_SET / ANSWER elements and annotates them.
eval_ja = et.parse('qalab3-essay-phase2/_references/qalab3-ja-essay-phase2/qalab3-ja-phase2-essay-evaluationmethod.xml')

In [38]:
# Score and rank the Japanese answers; eval_ja is modified in place and written
# to 'qalab3-ja-phase2-essay-evaluationmethod_DGLab_01.xml' in the working directory.
write_rank_score_ja(eval_ja, gold_dict_ja, model_ja, 'qalab3-ja-phase2-essay-evaluationmethod')

In [39]:
# Load the English Word2Vec model from 'w2v_en.model' (path relative to the
# working directory); it is later passed to write_rank_score_en.
model_en = Word2Vec.load('w2v_en.model')

In [40]:
# Parse the English phase-2 gold-standard essay XML into an ElementTree.
tree_en = et.parse('qalab3-essay-phase2/_references/qalab3-en-essay-phase2/qalab3-en-phase2-goldstandard-essay.xml')

In [41]:
# Build {answer_section id: [expression texts]} from the English gold tree.
# The ElementTree itself is passed as `root`; get_expressions_dict only calls
# .iter() on it.
gold_dict_en = get_expressions_dict(tree_en)

In [42]:
# Parse the English evaluation-method XML; write_rank_score_en reads its
# TOPIC / ANSWER_SET / ANSWER elements and annotates them.
eval_en = et.parse('qalab3-essay-phase2/_references/qalab3-en-essay-phase2/qalab3-en-phase2-essay-evaluationmethod.xml')

In [45]:
# Score and rank the English answers; eval_en is modified in place and written
# to 'qalab3-en-phase2-essay-evaluationmethod_DGLab_01.xml' in the working directory.
write_rank_score_en(eval_en, gold_dict_en, model_en, 'qalab3-en-phase2-essay-evaluationmethod')