In [1]:
from os import mkdir, path
from subprocess import Popen, PIPE
import re
import xml.etree.ElementTree as et

In [2]:
import MeCab


# Dictionary directory handed to MeCab through its -d option.
mecab_ipadic_path = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd/'
assert path.exists(mecab_ipadic_path), 'mecab-ipadic-neologd not found. Exitting.'

# Shared tagger used by tokenize_ja_word(); callers split its output on
# single spaces, which relies on the -Owakati output format.
mecab = MeCab.Tagger(' '.join(('-Owakati', '-d', mecab_ipadic_path)))
# The result of this first parse is discarded.
# NOTE(review): presumably a warm-up call for the Python binding - confirm.
mecab.parse('')


def tokenize_ja_word(text):
    """Segment ``text`` with the module-level MeCab tagger.

    Returns the tagger output with leading and trailing whitespace
    (including the trailing newline) stripped.
    """
    segmented = mecab.parse(text)
    return segmented.strip()


def tokenize_ja(text):
    """Split ``text`` into single characters separated by one space each.

    Surrounding whitespace is stripped first; whitespace inside the text is
    treated like any other character.
    """
    characters = text.strip()
    return ' '.join(characters)


def tokenize_en(text):
    """Tokenize English text with the Stanford PTBTokenizer run in a JVM.

    ``text`` is sent UTF-8 encoded on the tokenizer's stdin and the
    tokenizer's stdout is returned decoded as UTF-8, unmodified (any trailing
    newline is kept). No ``-cp`` option is passed, so the PTBTokenizer class
    must be resolvable from the JVM's default class path.

    Raises:
        RuntimeError: if the java process exits with a non-zero status. The
            message carries the process's stderr so the cause (e.g. class not
            found) is visible instead of silently producing an empty result.
    """
    command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-preserveLines']
    with Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE) as tokenizer_proc:
        out, err = tokenizer_proc.communicate(input=text.encode('UTF-8'))
        status = tokenizer_proc.returncode
    # Only the exit status is checked: stderr may contain informational
    # output even on success, so its mere presence is not treated as failure.
    if status != 0:
        raise RuntimeError(
            'PTBTokenizer exited with status {}: {}'.format(
                status, err.decode('UTF-8', errors='replace').strip()))
    return out.decode('UTF-8')

In [3]:
def get_parameters_ja(query_string_list_ja, index_name='passages_ja'):
    """Render an Indri parameter file (XML text) for Japanese queries.

    Args:
        query_string_list_ja: Indri query strings; each becomes one
            ``<query>`` element numbered by its position in the list
            (starting at 0). The strings are inserted verbatim, without
            XML escaping.
        index_name: name of the index directory placed under
            ``../../qalab3-essay-phase2/indri/indexes/``.

    Returns:
        The parameter file content as a single string.
    """
    def render_query(number, text):
        # One <query> block; the rendered block ends with a newline.
        return (
            '  <query>\n'
            '   <type>indri</type>\n'
            '   <number>{}</number>\n'
            '   <text>\n'
            '    {}\n'
            '   </text>\n'
            '  </query>\n'
        ).format(number, text)

    rendered_queries = '\n'.join(
        render_query(number, text)
        for number, text in enumerate(query_string_list_ja)
    )
    return (
        '<parameters>\n'
        ' <index>../../qalab3-essay-phase2/indri/indexes/{}</index>\n'
        '\n{}\n'
        '</parameters>\n'
    ).format(index_name, rendered_queries)


def get_parameters_en(query_string_list_en, index_name='passages_en'):
    """Render an Indri parameter file (XML text) for English queries.

    Same layout as the Japanese variant, plus a ``<stemmer>`` element
    naming the krovetz stemmer.

    Args:
        query_string_list_en: Indri query strings; each becomes one
            ``<query>`` element numbered by its position in the list
            (starting at 0). The strings are inserted verbatim, without
            XML escaping.
        index_name: name of the index directory placed under
            ``../../qalab3-essay-phase2/indri/indexes/``.

    Returns:
        The parameter file content as a single string.
    """
    query_block = (
        '  <query>\n'
        '   <type>indri</type>\n'
        '   <number>{}</number>\n'
        '   <text>\n'
        '    {}\n'
        '   </text>\n'
        '  </query>\n'
    )
    document = (
        '<parameters>\n'
        ' <index>../../qalab3-essay-phase2/indri/indexes/{}</index>\n'
        ' <stemmer><name>krovetz</name></stemmer>\n'
        '\n{}\n'
        '</parameters>\n'
    )

    blocks = [
        query_block.format(position, query_text)
        for position, query_text in enumerate(query_string_list_en)
    ]
    # Each block already ends with a newline, so joining with '\n' leaves
    # one blank line between consecutive queries.
    return document.format(index_name, '\n'.join(blocks))

In [4]:
def run_indri_query(parameter_file_path, count=37):
    """Run ``IndriRunQuery`` on a parameter file and return its stdout.

    Args:
        parameter_file_path: path of the Indri parameter file to run.
        count: value passed as ``-count=``; defaults to 37, the value that
            was previously hard-coded.
            NOTE(review): presumably the maximum number of results per
            query - confirm against the IndriRunQuery documentation.

    Returns:
        The process's standard output decoded as UTF-8 (requested in TREC
        format via ``-trecFormat=true``).

    Raises:
        RuntimeError: if IndriRunQuery exits with a non-zero status; the
            message includes the process's stderr. Previously a failed run
            returned an empty string and only surfaced later as a missing
            key when looking up results.
    """
    command = [
        'IndriRunQuery',
        parameter_file_path,
        '-trecFormat=true',
        '-count={}'.format(count),
    ]
    with Popen(command, stdout=PIPE, stderr=PIPE) as indri_proc:
        out, err = indri_proc.communicate()
        status = indri_proc.returncode
    if status != 0:
        raise RuntimeError(
            'IndriRunQuery exited with status {} for {}: {}'.format(
                status, parameter_file_path,
                err.decode('UTF-8', errors='replace').strip()))
    return out.decode('UTF-8')

In [5]:
def parse_result(result):
    """Group whitespace-separated result lines by their first field.

    Each non-blank line of ``result`` is split on whitespace; field 0 is the
    query serial, field 2 the document id and field 4 the score.
    NOTE(review): this matches the usual TREC run layout
    ``<query> Q0 <docno> <rank> <score> <run-tag>`` - confirm against the
    IndriRunQuery output actually produced.

    Args:
        result: the raw text output of the query run.

    Returns:
        dict mapping serial (str) to a list, in input order, of
        ``{'doc_id': str, 'score': str}`` dicts. Scores are left as strings.

    Raises:
        IndexError: if a non-blank line has fewer than five fields.
    """
    result_set = {}
    for line in result.splitlines():
        # split() with no argument tolerates tabs and repeated spaces, which
        # would otherwise shift the field positions.
        fields = line.split()
        if not fields:
            # Blank lines carry no result and used to raise IndexError.
            continue
        serial, doc_id, score = fields[0], fields[2], fields[4]
        result_set.setdefault(serial, []).append({'doc_id': doc_id, 'score': score})
    return result_set

In [6]:
def get_query_string_en(prop_text):
    """Build an Indri ``#combine`` query from an English proposition text.

    Task boilerplate and punctuation are deleted from ``prop_text``, the
    remainder is tokenized with ``tokenize_en`` and the tokenizer output is
    wrapped, unmodified, in ``#combine(...)``.

    Args:
        prop_text: the proposition text.

    Returns:
        The query string.
    """
    def pattern_filter(text):
        # The patterns are applied in order, each one to the text left by
        # the previous ones, so the order is significant.
        patterns = [
            r'(\d+ English words or less)',
            r'[\(\)]',
            # Parentheses were deleted by the previous pattern, so the
            # section number is matched bare; the old form required literal
            # parentheses here and therefore never matched.
            r'(underlined section \d+)',
            r'(Plate \w)',
            r'\b[XYZ]\b',
            # Must run before the punctuation class below: that class deletes
            # the apostrophe, after which the possessive could never match.
            r'(’s)',
            r'[,.?\'"\-`“”’]',
        ]
        for pattern in patterns:
            text = re.sub(pattern, '', text)

        return text
    return '#combine({})'.format(tokenize_en(pattern_filter(prop_text)))

In [7]:
def get_query_string_ja(prop_text):
    """Build an Indri ``#combine`` query from a Japanese proposition text.

    Task boilerplate and punctuation are deleted, the remainder is segmented
    with ``tokenize_ja_word`` and every resulting term is turned into an
    ``#1(...)`` operator over its individual characters.

    Args:
        prop_text: the proposition text.

    Returns:
        The query string ``#combine(#1(c c ...) #1(c c ...) ...)``.
    """
    def pattern_filter(text):
        # Applied in order; each pattern sees the output of the previous one.
        for noise in (r'\d+字以内', r'[\(\)。、]', r'下線部\w', r'図版\w', r'[ＸＹＺ]'):
            text = re.sub(noise, '', text)
        return text

    cleaned = pattern_filter(prop_text)
    phrases = [
        '#1({})'.format(' '.join(keyterm))
        for keyterm in tokenize_ja_word(cleaned).split(' ')
    ]
    return '#combine({})'.format(' '.join(phrases))

In [8]:
def get_prop_id_text_pairs(root, exam_id):
    """Collect (proposition id, proposition text) pairs for one exam.

    The ``exam`` child of ``root`` whose ``id`` attribute equals ``exam_id``
    is looked up, and every ``proposition`` found below a ``semantic_unit``
    below an ``answer`` is visited in document order.

    Args:
        root: element whose direct children include the ``exam`` elements.
        exam_id: value of the wanted exam's ``id`` attribute.

    Returns:
        List of ``(prop_id, prop_text)`` tuples, where ``prop_id`` is
        ``<annotator>_<semantic_unit id>_<proposition id>`` and
        ``prop_text`` is the proposition's ``value`` attribute.
    """
    exam = root.find('exam/[@id="{}"]'.format(exam_id))
    return [
        (
            '{}_{}_{}'.format(ans.get('annotator'), sem.get('id'), prop.get('id')),
            prop.get('value'),
        )
        for ans in exam.iter('answer')
        for sem in ans.iter('semantic_unit')
        for prop in sem.iter('proposition')
    ]

In [9]:
# Folder holding the English nugget XML files.
ref_folder_path_en = '../../qalab3-essay-phase2/_references/qalab3-en-essay-phase2/qalab3-en-phase2-nugget-essay'

# File names and exam ids are paired by position via zip() below.
nugget_file_names_en = ['B792W10.xml', 'C792W10.xml']
exam_id_list_en = ['B792W10_[1]', 'C792W10_[1]']
# Maps exam id -> list of (proposition id, proposition text) pairs.
conditions_en = {}
for file, exam_id in zip(nugget_file_names_en, exam_id_list_en):
    tree_en = et.parse('{}/{}'.format(ref_folder_path_en, file))
    root_en = tree_en.getroot()
    prop_list = get_prop_id_text_pairs(root_en, exam_id)
    conditions_en[exam_id] = prop_list

In [10]:
# For every English exam: turn its propositions into Indri queries, run them
# against the passage index and then the essay index, and write one
# tab-separated line (proposition id, score, document id) per retained hit.
# A blank line is written after each proposition's hits.
for exam_en, condition_en in conditions_en.items():
    query_string_list_en = [get_query_string_en(prop_text) for prop_id, prop_text in condition_en]

    # --- passage index (default index of get_parameters_en) ---
    param_en = get_parameters_en(query_string_list_en)
    param_file_en = '../../qalab3-essay-phase2/indri/parameter_files/query-nugget_en_{}.xml'.format(exam_en)
    # Written as UTF-8 explicitly: the query output is decoded as UTF-8, so
    # the files should not depend on the locale's default encoding.
    with open(param_file_en, 'w', encoding='utf-8') as f:
        f.write(param_en)
    result = run_indri_query(param_file_en)
    result_set = parse_result(result)
    with open('../../qalab3-essay-phase2/workspace/candidates_en_{}.txt'.format(exam_en),
              'w', encoding='utf-8') as f:
        for i, prop in enumerate(condition_en):
            # Query numbers are the positions in the proposition list. A
            # number absent from the parsed result (no result lines for that
            # query) means "no candidates", not an error.
            passage_list = result_set.get(str(i), [])
            prop_id = prop[0]
            for passage in passage_list:
                # Keep only documents whose id starts with the same character
                # as the exam id.
                if passage['doc_id'][0] == exam_en[0]:
                    f.write('{}\t{:.5f}\t{}\n'.format(prop_id, float(passage['score']), passage['doc_id']))
            f.write('\n')

    # --- essay index ---
    essay_param_en = get_parameters_en(query_string_list_en, 'essays_en')
    essay_param_file_en = '../../qalab3-essay-phase2/indri/parameter_files/query-essay-nugget_en_{}.xml'.format(
        exam_en)
    with open(essay_param_file_en, 'w', encoding='utf-8') as f:
        f.write(essay_param_en)
    result = run_indri_query(essay_param_file_en)
    result_set = parse_result(result)
    with open('../../qalab3-essay-phase2/workspace/candidates_essay_nugget_en_{}.txt'.format(exam_en),
              'w', encoding='utf-8') as f:
        for i, prop in enumerate(condition_en):
            ans_list = result_set.get(str(i), [])
            prop_id = prop[0]
            for ans in ans_list:
                if ans['doc_id'][0] == exam_en[0]:
                    f.write('{}\t{:.5f}\t{}\n'.format(prop_id, float(ans['score']), ans['doc_id']))
            f.write('\n')

In [11]:
# Folder holding the Japanese nugget XML files.
ref_folder_path_ja = '../../qalab3-essay-phase2/_references/qalab3-ja-essay-phase2/qalab3-ja-phase2-nugget-essay'

# File names and exam ids are paired by position via zip() below.
nugget_file_names_ja = ['B792W10.xml', 'C792W10.xml']
# The bracketed part of these ids uses full-width characters.
exam_id_list_ja = ['B792W10_【１】', 'C792W10_【１】']
# Maps exam id -> list of (proposition id, proposition text) pairs.
conditions_ja = {}
for file, exam_id in zip(nugget_file_names_ja, exam_id_list_ja):
    tree_ja = et.parse('{}/{}'.format(ref_folder_path_ja, file))
    root_ja = tree_ja.getroot()
    prop_list = get_prop_id_text_pairs(root_ja, exam_id)
    conditions_ja[exam_id] = prop_list

In [12]:
# For every Japanese exam: turn its propositions into Indri queries, run them
# against the passage index and then the essay index, and write one
# tab-separated line (proposition id, score, document id) per retained hit.
for exam_ja, condition_ja in conditions_ja.items():
    query_string_list_ja = [get_query_string_ja(prop_text) for prop_id, prop_text in condition_ja]

    # --- passage index (default index of get_parameters_ja) ---
    param_ja = get_parameters_ja(query_string_list_ja)
    param_file_ja = '../../qalab3-essay-phase2/indri/parameter_files/query-nugget_ja_{}.xml'.format(exam_ja)
    # The parameter file contains Japanese query text and the query output is
    # decoded as UTF-8, so write UTF-8 explicitly, not the locale default.
    with open(param_file_ja, 'w', encoding='utf-8') as f:
        f.write(param_ja)
    result = run_indri_query(param_file_ja)
    result_set = parse_result(result)
    # NOTE(review): unlike every other candidates file, this path has no
    # 'workspace/' component - confirm whether that is intended before
    # changing it, since downstream readers may depend on the location.
    with open('../../qalab3-essay-phase2/candidates_ja_{}.txt'.format(exam_ja),
              'w', encoding='utf-8') as f:
        for i, prop in enumerate(condition_ja):
            # Query numbers are the positions in the proposition list. A
            # number absent from the parsed result (no result lines for that
            # query) means "no candidates", not an error.
            passage_list = result_set.get(str(i), [])
            prop_id = prop[0]
            for passage in passage_list:
                # Keep only documents whose id starts with the same character
                # as the exam id.
                if passage['doc_id'][0] == exam_ja[0]:
                    f.write('{}\t{:.5f}\t{}\n'.format(prop_id, float(passage['score']), passage['doc_id']))
            # NOTE(review): no blank separator line is written between
            # propositions here, unlike the English passage file - confirm.

    # --- essay index ---
    # Built with the Japanese parameter builder: the English one adds a
    # krovetz stemmer element, which does not belong in a Japanese query file.
    essay_param_ja = get_parameters_ja(query_string_list_ja, 'essays_ja')
    essay_param_file_ja = '../../qalab3-essay-phase2/indri/parameter_files/query-essay-nugget_ja_{}.xml'.format(
        exam_ja)
    with open(essay_param_file_ja, 'w', encoding='utf-8') as f:
        f.write(essay_param_ja)
    result = run_indri_query(essay_param_file_ja)
    result_set = parse_result(result)
    with open('../../qalab3-essay-phase2/workspace/candidates_essay_nugget_ja_{}.txt'.format(exam_ja),
              'w', encoding='utf-8') as f:
        for i, prop in enumerate(condition_ja):
            ans_list = result_set.get(str(i), [])
            prop_id = prop[0]
            for ans in ans_list:
                if ans['doc_id'][0] == exam_ja[0]:
                    f.write('{}\t{:.5f}\t{}\n'.format(prop_id, float(ans['score']), ans['doc_id']))
            # NOTE(review): two blank lines follow each proposition here,
            # whereas the English essay file writes one - kept as is because
            # the expected file format is not visible from this notebook.
            f.write('\n')
            f.write('\n')