In [1]:
import xml.etree.ElementTree as et
import re
from subprocess import Popen, PIPE

In [2]:
class MecabNode:
    def __init__(
            self,
            surface: str=None,
            syntax_type: str=None,
            sub_syntax_type: str=None,
            semantic_type: str=None,
            sub_semantic_type: str=None,
            reading: str=None):
        self.surface = surface
        self.syntax_type = syntax_type
        self.sub_syntax_type = sub_syntax_type
        self.semantic_type = semantic_type
        self.sub_semantic_type = sub_semantic_type
        self.reading = reading

    def __repr__(self):
        return '[{}]({}/{}/{}/{}/{})'.format(self.surface,
                                             self.syntax_type, self.sub_syntax_type,
                                             self.semantic_type, self.sub_semantic_type,
                                             self.reading)

In [3]:
import os
from typing import List

import MeCab


class MecabWrapper:
    def __init__(self):
        mecab_ipadic_path = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd/'
        assert os.path.exists(mecab_ipadic_path), 'mecab-ipadic-neologd not found. Exitting.'
        self.tagger = MeCab.Tagger('-d {}'.format(mecab_ipadic_path))
        self.tagger.parse('')

    def get_nodes(self, sentence: str) -> List[MecabNode]:
        """Get a list of `MecabNode`s from the given sentence

        Args:
            sentence (str): input sentence

        Returns:
            nodes (list): a list of `MecabNode` objects

        :param sentence: str
        :return: List[MecabNode]
        """
        node_ptr = self.tagger.parseToNode(sentence)
        nodes = []
        while node_ptr:
            features = node_ptr.feature.split(',')
            if 'BOS/EOS' != features[0]:
                reading = features[7] if len(features) > 7 else node_ptr.surface
                nodes.append(MecabNode(node_ptr.surface,
                                       features[0], features[1], features[2], features[3], reading))
            node_ptr = node_ptr.next
        return nodes

    def get_readings(self, sentence: str) -> List[str]:
        nodes = self.get_nodes(sentence)
        return [node.reading for node in nodes]

    def get_words(self, sentence: str) -> List[str]:
        nodes = self.get_nodes(sentence)
        return [node.surface for node in nodes]

In [4]:
def get_element_texts(parent_element, xpath):
    texts = []
    for element in parent_element.findall(xpath):
        text = ''.join(element.itertext())
        if text:
            texts.append(text)
    return texts

In [5]:
def get_length_limit(parent_element):
    return int(re.findall(r'\d+', parent_element.find('answer_set/answer/[@length_limit]').get('length_limit'))[0])

In [6]:
def get_conditions_list(root):
    conditions_list = []
    for answer_section in root.iter('answer_section'):
        section_id = answer_section.get('id')
        grands = get_element_texts(answer_section, 'grand_question_set/grand_question')
        instructions = get_element_texts(answer_section, 'instruction/p')
        refs = get_element_texts(answer_section, 'reference_set/reference/[@is_directly_referred="0"]')
        direction = get_element_texts(answer_section, 'reference_set/reference/[@is_directly_referred="1"]')
        keywords = get_element_texts(answer_section, 'keyword_set/keyword')
        viewpoints = get_element_texts(answer_section, 'viewpoint_set/viewpoint')
        answer_length_limit = get_length_limit(answer_section)
        conditions = {
            'section_id': section_id,
            'grands': grands,
            'instructions': instructions,
            'refs': refs,
            'direction': direction,
            'keywords': keywords,
            'viewpoints': viewpoints,
            'answer_length_limit': answer_length_limit
        }
        conditions_list.append(conditions)
    return conditions_list

In [44]:
def filter_by_noun_ja(text):
    tokens = []
    tagger = MecabWrapper()
    nodes = tagger.get_nodes(text)
    for node in nodes:
        if '名詞' == node.syntax_type:
            tokens.append(node.surface)
            # if '固有名詞' == node.sub_syntax_type:
    return tokens


def get_query_string_list_ja(conditions_list_ja):
    def pattern_filter(text):
        patterns = [r'\d+字以内', r'[\(\)]', r'下線部\w', r'図版\w', r'[ＸＹＺ]']
        for pattern in patterns:
            text = re.sub(pattern, '', text)
        return text

    query_string_list = []
    for conditions in conditions_list_ja:
        query_string_template = '#combine({})'
        phrases = []
        keyterms = conditions['keywords']
        if keyterms:
            for keyterm in keyterms:
                phrase = '#1({})'.format(' '.join(list(
                    keyterm
                )).replace('(', '').replace(')', ''))
                phrases.append(phrase)
        else:
            raw_phrases = []
            if conditions['instructions']:
                raw_phrases.append(
                    pattern_filter(
                        conditions['instructions'][0]
                    ))
                instruction_phrase = ' '.join(list(
                    pattern_filter(
                        conditions['instructions'][0]
                    )
                ))
                phrases.append(instruction_phrase)
            if conditions['refs']:
                raw_phrases.append(
                    pattern_filter(
                        conditions['refs'][0]
                    ))
                ref_phrase = ' '.join(list(
                    pattern_filter(
                        conditions['refs'][0]
                    )
                ))
                phrases.append(ref_phrase)
            if conditions['direction']:
                raw_phrases.append(
                    pattern_filter(
                        conditions['direction'][0]
                    ))
                direction_phrase = ' '.join(list(
                    pattern_filter(
                        conditions['direction'][0]
                    )
                ))
                phrases.append(direction_phrase)
            if conditions['viewpoints']:
                raw_phrases.append(
                    pattern_filter(
                        conditions['viewpoints'][0]
                    ))
                viewpoint_phrase = ' '.join(list(
                    pattern_filter(
                        conditions['viewpoints'][0]
                    )
                ))
                phrases.append(viewpoint_phrase)
            new_phrases = []
            for raw_phrase in raw_phrases:
                nouns = filter_by_noun_ja(raw_phrase)
                for noun in nouns:
                    term = '#1({})'.format(' '.join(list(
                        noun
                    )).replace('(', '').replace(')', ''))
                    new_phrases.append(term)
            phrases.extend(new_phrases)
        query_string = query_string_template.format(' '.join(phrases))
        query_string_list.append(query_string)
    return query_string_list


def tokenize_en(text):
    with Popen(['java', 'edu.stanford.nlp.process.PTBTokenizer', '-preserveLines'],
               stdin=PIPE, stdout=PIPE, stderr=PIPE) as tokenizer_proc:
        out, err = tokenizer_proc.communicate(input=text.encode('UTF-8'))
        return out.decode('UTF-8')


def tag_pos_en(text):
    with Popen([
        'java', 'edu.stanford.nlp.tagger.maxent.MaxentTagger', '--model',
        'edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger'],
            stdin=PIPE, stdout=PIPE, stderr=PIPE) as tokenizer_proc:
        out, err = tokenizer_proc.communicate(input=text.encode('UTF-8'))
        return out.decode('UTF-8')


def filter_by_noun_en(text):
    return re.findall(r'\b(\w+)_NN\b', tag_pos_en(text))


def get_query_string_list_en(conditions_list_en):
    def pattern_filter(text):
        patterns = [
            r'(\d+ English words or less)',
            r'[\(\)]',
            r'(underlined section \(\d+\))',
            r'(Plate \w)',
            r'\b[XYZ]\b',
            r'[,.?\'"\-`“”’]',
            r'(’s)'
        ]
        for pattern in patterns:
            text = re.sub(pattern, '', text)

        return text


    query_string_list = []
    for conditions in conditions_list_en:
        query_string_template = '#combine({})'
        phrases = []
        keyterms = conditions['keywords']
        if keyterms:
            for keyterm in keyterms:
                phrase = '#1({})'.format(
                    pattern_filter(
                        keyterm
                    )
                )
                phrases.append(phrase)
        else:
            raw_phrases = []
            if conditions['instructions']:
                instruction_phrase = conditions['instructions'][0]
                raw_phrases.append(
                    pattern_filter(
                        conditions['instructions'][0]
                    ))
                phrases.append(
                    pattern_filter(
                        instruction_phrase
                    )
                )
            if conditions['refs']:
                ref_phrase = conditions['refs'][0]
                raw_phrases.append(
                    pattern_filter(
                        conditions['refs'][0]
                    ))
                phrases.append(
                    pattern_filter(
                        ref_phrase
                    )
                )
            if conditions['direction']:
                direction_phrase = conditions['direction'][0]
                raw_phrases.append(
                    pattern_filter(
                        conditions['direction'][0]
                    ))
                phrases.append(
                    pattern_filter(
                        direction_phrase
                    )
                )
            if conditions['viewpoints']:
                viewpoint_phrase = conditions['viewpoints'][0]
                raw_phrases.append(
                    pattern_filter(
                        conditions['viewpoints'][0]
                    ))
                phrases.append(
                    pattern_filter(
                        viewpoint_phrase
                    )
                )
            new_phrases = []
            for raw_phrase in raw_phrases:
                nouns = filter_by_noun_en(raw_phrase)
                for noun in nouns:
                    term = '#1({})'.format(
                        pattern_filter(
                            noun
                        )
                    )
                new_phrases.append(term)
            phrases.extend(new_phrases)
        query_string = query_string_template.format(' '.join(phrases))
        query_string_list.append(query_string)
    return query_string_list

In [18]:
def get_parameters_ja(query_string_list_ja):
    parameter_template = (
        '<parameters>\n'
        ' <index>qalab3-essay-phase2/indri/indexes/tokyoshoseki</index>\n'
        ' <index>qalab3-essay-phase2/indri/indexes/yamakawa</index>\n'
        ' <index>qalab3-essay-phase2/indri/indexes/jawiki</index>\n'
        '\n{}\n'
        '</parameters>\n'
    )
    query_element_template = (
        '  <query>\n'
        '   <type>indri</type>\n'
        '   <number>{}</number>\n'
        '   <text>\n'
        '    {}\n'
        '   </text>\n'
        '  </query>\n'
    )

    query_elements = []
    for i, query in enumerate(query_string_list_ja):
        query_element = query_element_template.format(i, query)
        query_elements.append(query_element)
    parameters = parameter_template.format('\n'.join(query_elements))
    return parameters


def get_parameters_en(query_string_list_en):
    parameter_template = (
        '<parameters>\n'
        ' <index>qalab3-essay-phase2/indri/indexes/enwiki</index>\n'
        ' <stemmer><name>krovetz</name></stemmer>\n'
        '\n{}\n'
        '</parameters>\n'
    )
    query_element_template = (
        '  <query>\n'
        '   <type>indri</type>\n'
        '   <number>{}</number>\n'
        '   <text>\n'
        '    {}\n'
        '   </text>\n'
        '  </query>\n'
    )

    query_elements = []
    for i, query in enumerate(query_string_list_en):
        query_element = query_element_template.format(i, query)
        query_elements.append(query_element)
    parameters = parameter_template.format('\n'.join(query_elements))
    return parameters

In [9]:
def run_indri_query(parameter_file_path):
    with Popen(['IndriRunQuery', parameter_file_path, '-trecFormat=true', '-count=10'],
               stdout=PIPE, stderr=PIPE) as indri_proc:
        out, err = indri_proc.communicate()
        return out.decode('UTF-8')

In [10]:
from math import pow


def parse_result(result):
    result_set = {}
    result_lines = result.splitlines()
    for line in result_lines:
        fields = line.split(' ')
        serial = fields[0]
        doc_id = fields[2]
        rank = fields[3]
        score = fields[4]
        prob = pow(2, float(score))
        if serial not in result_set:
            result_set[serial] = []
        result_set[serial].append({'doc_id': doc_id, 'rank': rank, 'score': score, 'prob': prob})
    return result_set

In [20]:
import sqlite3


def attach_doc_ja(result_set, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    set_size = len(result_set)
    for i in range(set_size):
        candidates = result_set[str(i)]
        for j, candidate in enumerate(candidates):
            doc_id = candidate['doc_id']
            cursor.execute('SELECT doc FROM ja_docs WHERE doc_no=?', (doc_id,))
            all_rows = []
            for row in cursor:
                all_rows.append(row[0])
            result_set[str(i)][j]['doc'] = '\n'.join(all_rows)
    return result_set


def attach_doc_en(result_set, db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    set_size = len(result_set)
    for i in range(set_size):
        candidates = result_set[str(i)]
        for j, candidate in enumerate(candidates):
            doc_id = candidate['doc_id']
            cursor.execute('SELECT doc FROM en_docs WHERE doc_no=?', (doc_id,))
            for row in cursor:
                result_set[str(i)][j]['doc'] = row[0]
                break
    return result_set

In [12]:
def attach_sentence(result_set_with_docs, query_string_list):
    set_size = len(result_set_with_docs)
    for i in range(set_size):
        query_string = query_string_list[i]
        query_string = query_string.replace('#combine(', '').replace('#1(', '').replace(')', '')
        tokens = map(lambda x: '({})'.format(x), query_string.split(' '))
        pattern = '|'.join(tokens)
        candidates = result_set_with_docs[str(i)]
        for j, candidate in enumerate(candidates):
            doc = candidate['doc']
            doc_lines = doc.splitlines()
            sentences = []
            sentence_set = set()
            for line in doc_lines:
                if line.startswith('<DOC') or line.startswith('</DOC') \
                        or line.startswith('<TEXT>') or line.startswith('</TEXT>'):
                    continue
                sentence = line.replace('<Title>', '').replace('</Title>', '').strip()
                if sentence in sentence_set:
                    continue
                sentence_set.add(sentence)
                matches = re.findall(pattern, sentence)
                if matches:
                    sentences.append({'sentence': sentence, 'match_count': len(matches)})
            result_set_with_docs[str(i)][j]['sentences'] = sentences
    return result_set_with_docs

In [13]:
def attach_passage(result_set_with_sentences):
    set_size = len(result_set_with_sentences)
    for i in range(set_size):
        candidates = result_set_with_sentences[str(i)]
        for j, candidate in enumerate(candidates):
            sorted_sentences = sorted(candidate['sentences'], key=lambda x: x['match_count'], reverse=True)
            passage = '\n'.join([s['sentence'] for s in sorted_sentences[:7]])
            result_set_with_sentences[str(i)][j]['passage'] = passage
    return result_set_with_sentences

In [14]:
def get_extraction_xml(result_set_with_passages, conditions_list):
    topic_element_template = (
        '<TOPIC ID="{}">\n'
        ' <PASSAGE_SET>\n'
        '{}'
        ' </PASSAGE_SET>\n'
        '</TOPIC>\n'
    )
    passage_element_template = (
        '  <PASSAGE RANK="{}" SOURCE_ID="{}" SOURCE_ID_TYPE="QALab3" SCORE="{}" NORMALIZED_SCORE="{}">\n'
        '   {}\n'
        '  </PASSAGE>\n'
    )

    topic_elements = []
    set_size = len(result_set_with_passages)
    for i in range(set_size):
        section_id = conditions_list[i]['section_id']
        candidates = result_set_with_passages[str(i)]
        passage_elements = []
        for candidate in candidates:
            rank = candidate['rank']
            doc_id = candidate['doc_id']
            score = candidate['score']
            prob = candidate['prob']
            passage = candidate['passage']
            passage_element = passage_element_template.format(rank, doc_id, score, prob, passage)
            passage_elements.append(passage_element)
        passage_set = ''.join(passage_elements)
        topic_element = topic_element_template.format(section_id, passage_set)
        topic_elements.append(topic_element)
    topic_set = ''.join(topic_elements)
    extraction_xml = '<?xml version="1.0" encoding="UTF-8"?>\n{}'.format(topic_set)
    return extraction_xml

In [34]:
tree_ja = et.parse('qalab3-essay-phase2/_references/qalab3-ja-essay-phase2/qalab3-ja-phase2-answersheet-essay.xml')
conditions_list_ja = get_conditions_list(tree_ja.getroot())

In [46]:
query_string_list_ja = get_query_string_list_ja(conditions_list_ja)
query_string_list_ja

['#combine(#1(ア ク テ ィ ウ ム の 海 戦) #1(イ ス ラ ム 教) #1(オ ス マ ン 帝 国) #1(サ ラ デ ィ ン) #1(ナ イ ル 川) #1(ナ セ ル) #1(ナ ポ レ オ ン) #1(ム ハ ン マ ド ・ ア リ ー))',
 '#combine(日 本 は の 連 合 組 織 に 参 加 し ， 後 に 脱 退 し た 。 脱 退 の 経 緯 を で 記 せ 。 連 合 組 織 #1(日 本) #1(連 合) #1(組 織) #1(参 加) #1(，) #1(後) #1(脱 退) #1(脱 退) #1(経 緯) #1(連 合) #1(組 織))',
 '#combine(の 戦 争 の な か に は ， 1 9 4 8 年 ５ 月 に 始 ま っ た 第 一 次 中 東 戦 争 パ レ ス テ ィ ナ 戦 争 が あ る 。 こ の 戦 争 の 結 果 ど の よ う な こ と が 起 こ っ た か ， で 説 明 せ よ 。 一 連 の 戦 争 #1(戦 争) #1(な か) #1(1 9 4 8 年) #1(５ 月) #1(第 一 次 中 東 戦 争) #1(パ レ ス テ ィ ナ) #1(戦 争) #1(戦 争) #1(結 果) #1(よ う) #1(こ と) #1(説 明) #1(一 連) #1(戦 争))',
 '#combine(#1(植 民 地 奴 隷 制 の 廃 止) #1(サ ト ウ キ ビ ・ プ ラ ン テ ー シ ョ ン) #1(ゴ ー ル ド ・ ラ ッ シ ュ) #1(海 禁) #1(ア ヘ ン 戦 争) #1(海 峡 植 民 地) #1(利 権 回 収 運 動) #1(孫 文))',
 '#combine(は ， 1 6 ～ 1 7 世 紀 の 間 に 宗 教 政 策 を 大 き く 変 え た 。 そ の 変 化 を ， 関 係 す る 二 人 の 皇 帝 の 名 を 用 い ， で 説 明 せ よ 。 関 係 す る 二 人 の 皇 帝 の 名 を 用 い #1(1 6) #1(1 7 世 紀) #1(間) #1(宗 教) #1(政 策) #1(変 化) #1(関 係) #1(二 人) #1(皇 帝) #1(名) #1(説 明) #1(関 係) #1(二 人) #1(皇 帝)

In [48]:
parameters_ja = get_parameters_ja(query_string_list_ja)
with open('qalab3-essay-phase2/indri/parameter_files/query-ja-phase2.xml', 'w') as f:
    f.write(parameters_ja)
result_ja = run_indri_query('qalab3-essay-phase2/indri/parameter_files/query-ja-phase2.xml')
result_set_ja = parse_result(result_ja)
result_set_with_docs_ja = attach_doc_ja(result_set_ja, 'qalab3-essay-phase2/ja_doc.db')
result_set_with_sentences_ja = attach_sentence(result_set_with_docs_ja, query_string_list_ja)
result_set_with_passages_ja = attach_passage(result_set_with_sentences_ja)
extraction_xml_ja = get_extraction_xml(result_set_with_passages_ja, conditions_list_ja)
with open('qalab3-essay-phase2/qalab3-ja-phase2-answersheet-essay_DGLab_extraction_01.xml', 'w') as f:
    f.write(extraction_xml_ja)

In [50]:
tree_en = et.parse('qalab3-essay-phase2/_references/qalab3-en-essay-phase2/qalab3-en-phase2-answersheet-essay.xml')
conditions_list_en = get_conditions_list(tree_en.getroot())

In [52]:
query_string_list_en = get_query_string_list_en(conditions_list_en)
query_string_list_en

['#combine(#1(Battle of Actium) #1(Islam) #1(Ottoman Empire) #1(Saladin) #1(Nile River) #1(Nasser) #1(Napoleon) #1(Muhammed Ali))',
 '#combine(Japan participated in the federation in underlined section 1 but then left it Explain in  what led to leaving the federation federation #1(federation) #1(federation))',
 '#combine(One of the wars in underlined section 5 was the First Arab–Israeli War Palestine War which began in May 1948 Explain in  the outcome of this war series of battles #1(war) #1(series))',
 '#combine(#1(Abolition of the colonial slave system) #1(sugar cane plantation) #1(gold rush) #1(Haijin) #1(Opium Wars) #1(Straits Settlements) #1(rights recovery movement) #1(Sun Yatsen))',
 '#combine( changed its religious policy greatly during the 16th and 17th centuries Explain in  this change Include the names of the two popes involved Include the names of the two popes involved #1(change) #1(change))',
 '#combine( had a system for treating hereticsaWrite the name of this system and

In [53]:
parameters_en = get_parameters_en(query_string_list_en)
with open('qalab3-essay-phase2/indri/parameter_files/query-en-phase2.xml', 'w') as f:
    f.write(parameters_en)
result_en = run_indri_query('qalab3-essay-phase2/indri/parameter_files/query-en-phase2.xml')
result_set_en = parse_result(result_en)
result_set_with_docs_en = attach_doc_en(result_set_en, 'qalab3-essay-phase2/en_doc.db')
result_set_with_sentences_en = attach_sentence(result_set_with_docs_en, query_string_list_en)
result_set_with_passages_en = attach_passage(result_set_with_sentences_en)
extraction_xml_en = get_extraction_xml(result_set_with_passages_en, conditions_list_en)
with open('qalab3-essay-phase2/qalab3-en-phase2-answersheet-essay_DGLab_extraction_01.xml', 'w') as f:
    f.write(extraction_xml_en)