In [None]:
from os import mkdir, path
from subprocess import Popen, PIPE
import xml.etree.ElementTree as et

In [None]:
# Skeleton of a single TREC text document: the first placeholder receives the
# document number, the second receives the document body.
TREC_TEXT_TEMPLATE = ''.join([
    '<DOC>\n',
    '<DOCNO>{}</DOCNO>\n',
    '<TEXT>\n',
    '{}\n',
    '</TEXT>\n',
    '</DOC>\n',
])


def build_trec_text(serial, text):
    """Render one TREC text document.

    Args:
        serial: Value placed inside the ``<DOCNO>`` element.
        text: Value placed inside the ``<TEXT>`` element.

    Returns:
        ``TREC_TEXT_TEMPLATE`` with both placeholders filled in. The values
        are inserted as-is; no escaping is applied.
    """
    document = TREC_TEXT_TEMPLATE.format(serial, text)
    return document


def tokenize_en(text):
    """Tokenize English text with the Stanford PTBTokenizer.

    Runs ``java edu.stanford.nlp.process.PTBTokenizer -preserveLines`` as a
    child process, sends ``text`` (UTF-8 encoded) to its standard input and
    returns whatever the process writes to standard output.

    Args:
        text: The string to tokenize.

    Returns:
        The tokenizer's standard output decoded as UTF-8.

    Raises:
        RuntimeError: If the Java process exits with a non-zero status. The
            message carries the exit status and the captured standard error.
    """
    command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-preserveLines']
    with Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE) as tokenizer_proc:
        out, err = tokenizer_proc.communicate(input=text.encode('UTF-8'))
        exit_status = tokenizer_proc.returncode
    # Without this check a failed launch would silently return an empty
    # string and be written out as an empty document.
    if exit_status != 0:
        raise RuntimeError(
            'PTBTokenizer exited with status {}: {}'.format(
                exit_status, err.decode('UTF-8', errors='replace').strip()))
    return out.decode('UTF-8')


def tokenize_ja(text):
    """Split text into single characters separated by one space each.

    Leading and trailing whitespace is removed first. Every remaining
    character, including any inner whitespace, becomes its own token.

    Args:
        text: The string to split.

    Returns:
        The characters of the stripped text joined with single spaces.
    """
    stripped = text.strip()
    return ' '.join(stripped)


def write_files(doc_set):
    """Write every document's TREC text to its target file as UTF-8.

    Args:
        doc_set: Mapping whose values are dicts with a ``'filepath'`` entry
            (where to write) and a ``'trec_text'`` entry (what to write).
            The mapping's keys are not used.
    """
    for doc in doc_set.values():
        # Pin the encoding: the default for text-mode open() depends on the
        # platform locale, which can fail on or mis-encode non-ASCII text.
        with open(doc['filepath'], 'w', encoding='utf-8') as f:
            f.write(doc['trec_text'])

In [None]:
def get_nugget_dict(root, topic_id_set):
    """Collect nugget texts for the requested topics from a parsed XML tree.

    For each topic id, the first ``exam`` child of ``root`` whose ``id``
    attribute equals that id is searched for ``answer`` elements, their
    ``semantic_unit`` descendants and, below those, ``proposition`` elements.
    Each proposition yields one nugget.

    Args:
        root: ElementTree element whose direct ``exam`` children are searched.
        topic_id_set: Iterable of topic ids to look up. Ids with no matching
            ``exam`` element are skipped.

    Returns:
        Dict mapping ``'<topic>_<annotator>_<semantic unit id>_<proposition
        id>'`` to the proposition's ``value`` attribute with surrounding
        whitespace stripped.
    """
    nugget_dict = {}
    for topic_id in topic_id_set:
        exam = root.find('exam/[@id="{}"]'.format(topic_id))
        # find() returns None when nothing matches. Compare against None
        # explicitly: an Element's truth value reflects whether it has
        # children and testing it is deprecated.
        if exam is None:
            continue
        for ans in exam.iter('answer'):
            annotator_id = ans.get('annotator')
            for sem in ans.iter('semantic_unit'):
                sem_id = sem.get('id')
                for prop in sem.iter('proposition'):
                    prop_id = prop.get('id')
                    nugget_id = '{}_{}_{}_{}'.format(topic_id, annotator_id, sem_id, prop_id)
                    nugget_text = prop.get('value').strip()
                    nugget_dict[nugget_id] = nugget_text
    return nugget_dict

In [None]:
# Directory containing the English reference nugget XML files.
ref_folder_path_en = '../../qalab3-essay-phase2/_references/qalab3-en-essay-phase2/qalab3-en-phase2-nugget-essay'

# Each topic id is the XML file's base name followed by '_[1]'.
topic_id_set_en = {'B792W10_[1]', 'C792W10_[1]'}
nugget_dict_en = {}
for topic_id in topic_id_set_en:
    # Drop the suffix to recover the file name, then parse that topic's XML.
    tree_en = et.parse(
        '{}/{}'.format(
            ref_folder_path_en,
            '{}.xml'.format(topic_id.replace('_[1]', ''))))
    root_en = tree_en.getroot()
    # The whole id set is passed for every file; ids with no matching exam
    # element in the file are skipped by get_nugget_dict.
    nugget_dict_en.update(get_nugget_dict(root_en, topic_id_set_en))

In [None]:
# Output folder for the tokenized English nuggets; created on first use.
TOKENIZED_OUTPUT_DIR = '../../qalab3-essay-phase2/nuggets-en-tokenized/'
if not path.isdir(TOKENIZED_OUTPUT_DIR):
    mkdir(TOKENIZED_OUTPUT_DIR)

tokenized_docs_en = {}
total = len(nugget_dict_en)
count = 0  # stays 0 when there is nothing to tokenize
print('tokenizing {} docs...'.format(total))
for count, (serial, doc) in enumerate(nugget_dict_en.items(), start=1):
    # Tokenize the nugget text and wrap it in a TREC document.
    tokenized_text = tokenize_en(doc)
    tokenized_trec_text = build_trec_text(serial, tokenized_text)
    filepath = path.join(TOKENIZED_OUTPUT_DIR, '{}-seg.xml'.format(serial))
    tokenized_docs_en[serial] = {
        'trec_text': tokenized_trec_text,
        'filepath': filepath,
    }
    # Report progress every tenth document and once more at the end,
    # overwriting the same output line via the carriage return.
    if count == total or count % 10 == 0:
        print('tokenized {}/{} docs'.format(count, total), end='\r')

In [None]:
# Write each tokenized English TREC document to the file path stored with it.
write_files(tokenized_docs_en)

In [None]:
# Directory containing the Japanese reference nugget XML files.
ref_folder_path_ja = '../../qalab3-essay-phase2/_references/qalab3-ja-essay-phase2/qalab3-ja-phase2-nugget-essay'

# Each topic id is the XML file's base name followed by '_【１】'.
topic_id_set_ja = {'B792W10_【１】', 'C792W10_【１】'}
nugget_dict_ja = {}
for topic_id in topic_id_set_ja:
    # Drop the suffix to recover the file name, then parse that topic's XML.
    tree_ja = et.parse(
        '{}/{}'.format(
            ref_folder_path_ja,
            '{}.xml'.format(topic_id.replace('_【１】', ''))))
    root_ja = tree_ja.getroot()
    # The whole id set is passed for every file; ids with no matching exam
    # element in the file are skipped by get_nugget_dict.
    nugget_dict_ja.update(get_nugget_dict(root_ja, topic_id_set_ja))

In [None]:
# Output folder for the tokenized Japanese nuggets; created on first use.
TOKENIZED_OUTPUT_DIR = '../../qalab3-essay-phase2/nuggets-ja-tokenized/'
if not path.isdir(TOKENIZED_OUTPUT_DIR):
    mkdir(TOKENIZED_OUTPUT_DIR)

tokenized_docs_ja = {}
total = len(nugget_dict_ja)
count = 0  # stays 0 when there is nothing to tokenize
print('tokenizing {} docs...'.format(total))
for count, (serial, doc) in enumerate(nugget_dict_ja.items(), start=1):
    # Split the nugget text into characters and wrap it in a TREC document.
    tokenized_text = tokenize_ja(doc)
    tokenized_trec_text = build_trec_text(serial, tokenized_text)
    filepath = path.join(TOKENIZED_OUTPUT_DIR, '{}-seg.xml'.format(serial))
    tokenized_docs_ja[serial] = {
        'trec_text': tokenized_trec_text,
        'filepath': filepath,
    }
    # Report progress every tenth document and once more at the end,
    # overwriting the same output line via the carriage return.
    if count == total or count % 10 == 0:
        print('tokenized {}/{} docs'.format(count, total), end='\r')

In [None]:
# Write each tokenized Japanese TREC document to the file path stored with it.
write_files(tokenized_docs_ja)