In [1]:
from os import mkdir, path
from subprocess import Popen, PIPE
import xml.etree.ElementTree as et

In [2]:
# Skeleton of one TREC-text document. The first `{}` is filled with the
# document number and the second with the document body (see build_trec_text).
TREC_TEXT_TEMPLATE = '\n'.join([
    '<DOC>',
    '<DOCNO>{}</DOCNO>',
    '<TEXT>',
    '{}',
    '</TEXT>',
    '</DOC>',
    '',  # trailing empty item yields the final newline
])


def build_trec_text(serial, text):
    """Render one document in TREC text markup.

    `serial` is substituted into the first placeholder of TREC_TEXT_TEMPLATE
    (the <DOCNO> element) and `text` into the second (the <TEXT> body).
    Returns the filled-in template as a string.
    """
    trec_doc = TREC_TEXT_TEMPLATE.format(serial, text)
    return trec_doc


def tokenize_en(text):
    """Tokenize text by piping it through the Stanford PTBTokenizer.

    Runs `java edu.stanford.nlp.process.PTBTokenizer -preserveLines`, sends
    `text` to its stdin encoded as UTF-8, and returns the process's stdout
    decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if the Java process exits with a
            non-zero status. The captured stdout and stderr are attached to
            the exception so the cause is visible instead of silently
            producing an empty tokenization.
    """
    # Imported here so the function does not rely on an edit to the import cell.
    from subprocess import CalledProcessError

    command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-preserveLines']
    with Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE) as tokenizer_proc:
        out, err = tokenizer_proc.communicate(input=text.encode('UTF-8'))
        returncode = tokenizer_proc.returncode

    # stderr alone is not treated as failure; only the exit status is.
    if returncode != 0:
        raise CalledProcessError(returncode, command, output=out, stderr=err)
    return out.decode('UTF-8')


def tokenize_ja(text):
    """Split text into single characters separated by spaces.

    Leading and trailing whitespace is stripped first; every remaining
    character, interior whitespace included, becomes its own token.
    """
    stripped = text.strip()
    return ' '.join(stripped)


def write_files(doc_set):
    """Write every document in `doc_set` to disk as UTF-8 text.

    Args:
        doc_set: mapping whose values are dicts with a 'filepath' entry (the
            file to create or overwrite) and a 'trec_text' entry (the string
            to write). The mapping's keys are not used.
    """
    for doc in doc_set.values():
        # Explicit encoding: the platform default may be unable to encode
        # non-ASCII passages, and tokenize_en already works in UTF-8.
        with open(doc['filepath'], 'w', encoding='UTF-8') as f:
            f.write(doc['trec_text'])

In [3]:
def get_passage_dict(root, topic_id_set):
    """Collect the text of every passage belonging to the requested topics.

    Args:
        root: element whose direct TOPIC children carry an ID attribute.
        topic_id_set: iterable of topic IDs to extract.

    Returns:
        dict mapping '<topic id>_<PASSAGE_SET FILE_NAME>_<PASSAGE RANK>' to the
        passage's full text content (nested element text included), stripped
        of leading and trailing whitespace. If two passages produce the same
        key, the later one replaces the earlier one.

    Raises:
        ValueError: if a requested topic ID has no matching TOPIC element
            (previously this surfaced as an AttributeError on None).
    """
    passage_dict = {}
    for topic_id in topic_id_set:
        # Documented ElementTree predicate form: tag[@attrib="value"].
        topic = root.find('TOPIC[@ID="{}"]'.format(topic_id))
        if topic is None:
            raise ValueError('no TOPIC element with ID "{}" found'.format(topic_id))
        for passage_set in topic.iter('PASSAGE_SET'):
            passage_set_name = passage_set.get('FILE_NAME')
            for passage in passage_set.iter('PASSAGE'):
                passage_id = '{}_{}_{}'.format(topic_id, passage_set_name, passage.get('RANK'))
                passage_dict[passage_id] = ''.join(passage.itertext()).strip()
    return passage_dict

In [4]:
# Where the English reference data lives and which file holds the passages.
ref_folder_path_en = '../../qalab3-essay-phase2/_references/qalab3-en-essay-phase2'
passage_file_name_en = 'qalab3-en-phase2-essay-extraction-for-evaluation.xml'

# Parse the passage file, then extract the passages of the selected topics.
tree_en = et.parse('/'.join((ref_folder_path_en, passage_file_name_en)))
root_en = tree_en.getroot()
topic_id_set_en = {'B792W10-1', 'C792W10-1'}
passage_dict_en = get_passage_dict(root=root_en, topic_id_set=topic_id_set_en)

In [None]:
# Output directory for the tokenized English passages; created on first run.
TOKENIZED_OUTPUT_DIR = '../../qalab3-essay-phase2/passages-en-tokenized/'
if not path.isdir(TOKENIZED_OUTPUT_DIR):
    mkdir(TOKENIZED_OUTPUT_DIR)

# Tokenize each passage and keep the rendered TREC text together with the
# path it should be written to; the files themselves are written later.
tokenized_docs_en = {}
total = len(passage_dict_en)
count = 0
print('tokenizing {} docs...'.format(total))
for serial, doc in passage_dict_en.items():
    tokenized_text = tokenize_en(doc)
    tokenized_trec_text = build_trec_text(serial, tokenized_text)
    filepath = path.join(TOKENIZED_OUTPUT_DIR, '{}-seg.xml'.format(serial))
    tokenized_docs_en[serial] = dict(trec_text=tokenized_trec_text, filepath=filepath)
    count = count + 1
    # Report progress every 100 documents and once more at the very end.
    if count == total or count % 100 == 0:
        print('tokenized {}/{} docs'.format(count, total), end='\r')

tokenizing 477 docs...
tokenized 100/477 docs

In [33]:
# Write each tokenized English document to the file path stored alongside it.
write_files(tokenized_docs_en)

In [37]:
# Where the Japanese reference data lives and which file holds the passages.
ref_folder_path_ja = '../../qalab3-essay-phase2/_references/qalab3-ja-essay-phase2'
passage_file_name_ja = 'qalab3-ja-phase2-essay-extraction-for-evaluation.xml'

# Parse the passage file, then extract the passages of the selected topics.
tree_ja = et.parse('/'.join((ref_folder_path_ja, passage_file_name_ja)))
root_ja = tree_ja.getroot()
topic_id_set_ja = {'B792W10-1', 'C792W10-1'}
passage_dict_ja = get_passage_dict(root=root_ja, topic_id_set=topic_id_set_ja)

In [39]:
# Output directory for the tokenized Japanese passages; created on first run.
TOKENIZED_OUTPUT_DIR = '../../qalab3-essay-phase2/passages-ja-tokenized/'
if not path.isdir(TOKENIZED_OUTPUT_DIR):
    mkdir(TOKENIZED_OUTPUT_DIR)

# Tokenize each passage (character-level, via tokenize_ja) and keep the
# rendered TREC text together with the path it should be written to.
tokenized_docs_ja = {}
total = len(passage_dict_ja)
count = 0
print('tokenizing {} docs...'.format(total))
for serial, doc in passage_dict_ja.items():
    tokenized_text = tokenize_ja(doc)
    tokenized_trec_text = build_trec_text(serial, tokenized_text)
    filepath = path.join(TOKENIZED_OUTPUT_DIR, '{}-seg.xml'.format(serial))
    tokenized_docs_ja[serial] = dict(trec_text=tokenized_trec_text, filepath=filepath)
    count = count + 1
    # Report progress every 100 documents and once more at the very end.
    if count == total or count % 100 == 0:
        print('tokenized {}/{} docs'.format(count, total), end='\r')

tokenizing 692 docs...
tokenized 100/692 docstokenized 200/692 docstokenized 300/692 docstokenized 400/692 docstokenized 500/692 docstokenized 600/692 docstokenized 692/692 docs

In [40]:
# Write each tokenized Japanese document to the file path stored alongside it.
write_files(tokenized_docs_ja)