In [1]:
from os import mkdir, path
from subprocess import Popen, PIPE
import xml.etree.ElementTree as et

In [2]:
# Skeleton of a single TREC-text record: the first slot takes the DOCNO,
# the second slot takes the document body.
TREC_TEXT_TEMPLATE = '\n'.join([
    '<DOC>',
    '<DOCNO>{}</DOCNO>',
    '<TEXT>',
    '{}',
    '</TEXT>',
    '</DOC>',
    '',  # trailing empty item yields the final newline
])


def build_trec_text(serial, text):
    """Wrap ``text`` in a TREC-text ``<DOC>`` record labelled with ``serial``.

    Both values are substituted into ``TREC_TEXT_TEMPLATE`` with
    ``str.format``; no escaping is applied to either of them.
    """
    document = TREC_TEXT_TEMPLATE.format(serial, text)
    return document


def tokenize_en(text, command=('java', 'edu.stanford.nlp.process.PTBTokenizer', '-preserveLines')):
    """Tokenize ``text`` by piping it through an external tokenizer process.

    Args:
        text: String to tokenize; it is sent to the process's stdin as UTF-8.
        command: Argument vector of the tokenizer to launch. Defaults to the
            ``edu.stanford.nlp.process.PTBTokenizer`` Java class run with
            ``-preserveLines``.

    Returns:
        Whatever the process wrote to stdout, decoded as UTF-8.

    Raises:
        RuntimeError: If the process exits with a non-zero status. The
            message carries the exit code and the captured stderr, so a
            broken tokenizer setup is reported instead of silently
            producing an empty result.
    """
    with Popen(list(command), stdin=PIPE, stdout=PIPE, stderr=PIPE) as tokenizer_proc:
        out, err = tokenizer_proc.communicate(input=text.encode('UTF-8'))
        # Only the exit status counts as failure; stderr content alone does
        # not, because a process may write diagnostics there on success too.
        if tokenizer_proc.returncode != 0:
            raise RuntimeError(
                'tokenizer command {!r} exited with status {}: {}'.format(
                    list(command),
                    tokenizer_proc.returncode,
                    err.decode('UTF-8', errors='replace').strip()))
        return out.decode('UTF-8')


def tokenize_ja(text):
    """Split ``text`` into single characters separated by one space each.

    Leading and trailing whitespace is stripped first; every remaining
    character, interior whitespace included, becomes its own token.
    """
    trimmed = text.strip()
    return ' '.join(trimmed)


def write_files(doc_set):
    """Write every document in ``doc_set`` to its own file, encoded as UTF-8.

    Args:
        doc_set: Mapping whose values are dicts with a ``'filepath'`` entry
            (destination path) and a ``'trec_text'`` entry (string to write).
            The mapping's keys are not used.
    """
    for doc in doc_set.values():
        # Pin the encoding: without it open() falls back to the locale's
        # default, which can mangle or reject non-ASCII (e.g. Japanese) text.
        with open(doc['filepath'], 'w', encoding='utf-8') as f:
            f.write(doc['trec_text'])

In [3]:
def get_essay_dict(root, topic_id_set):
    """Collect the answer texts of the requested topics, keyed by answer id.

    Args:
        root: ElementTree element whose direct ``TOPIC`` children carry an
            ``ID`` attribute and contain an ``ANSWER_SET`` child.
        topic_id_set: Iterable of topic ids to extract.

    Returns:
        Dict mapping ``'<topic_id>_<name>'`` to the answer text, where
        ``<name>`` is the ``ANSWER`` element's ``FILE_NAME`` attribute with
        spaces and ``.xml`` removed, and the text is all text inside the
        element concatenated and stripped.

    Raises:
        ValueError: If a requested topic has no ``ANSWER_SET`` under
            ``root``, or an ``ANSWER`` element has no ``FILE_NAME`` attribute.
    """
    essay_dict = {}
    for topic_id in topic_id_set:
        ans_set = root.find('TOPIC[@ID="{}"]/ANSWER_SET'.format(topic_id))
        if ans_set is None:
            # find() returns None on a miss; fail with the offending id
            # instead of an AttributeError on None further down.
            raise ValueError(
                'no ANSWER_SET found for topic id {!r}'.format(topic_id))
        for ans in ans_set.iter('ANSWER'):
            file_name = ans.get('FILE_NAME')
            if file_name is None:
                raise ValueError(
                    'ANSWER without FILE_NAME attribute in topic {!r}'.format(topic_id))
            ans_id = '{}_{}'.format(topic_id, file_name.replace(' ', '').replace('.xml', ''))
            # itertext() also yields text nested in child elements.
            ans_text = ''.join(ans.itertext()).strip()
            essay_dict[ans_id] = ans_text
    return essay_dict

In [4]:
# Where the English reference essays live.
ref_folder_path_en = '../../qalab3-essay-phase2/_references/qalab3-en-essay-phase2'
essay_file_name_en = 'qalab3-en-phase2-essay-e2e-and-summarization-for-evaluation.xml'

# Topics whose answers are extracted.
topic_id_set_en = {'B792W10-1', 'C792W10-1'}

# Parse the reference XML and collect the answer texts of those topics.
tree_en = et.parse('/'.join((ref_folder_path_en, essay_file_name_en)))
root_en = tree_en.getroot()
essay_dict_en = get_essay_dict(root_en, topic_id_set_en)

In [5]:
# Destination folder for the tokenized English essays; created on first use.
TOKENIZED_OUTPUT_DIR = '../../qalab3-essay-phase2/essays-en-tokenized/'
if not path.isdir(TOKENIZED_OUTPUT_DIR):
    mkdir(TOKENIZED_OUTPUT_DIR)

tokenized_docs_en = {}
total = len(essay_dict_en)
count = 0  # stays 0 when there is nothing to tokenize
print('tokenizing {} docs...'.format(total))
for count, (serial, doc) in enumerate(essay_dict_en.items(), start=1):
    tokenized_text = tokenize_en(doc)
    tokenized_trec_text = build_trec_text(serial, tokenized_text)
    filepath = path.join(TOKENIZED_OUTPUT_DIR, '{}-seg.xml'.format(serial))
    tokenized_docs_en[serial] = {'trec_text': tokenized_trec_text, 'filepath': filepath}
    # Report every tenth document and the last one; '\r' returns the cursor
    # to the start of the line so the next report overwrites this one.
    if count == total or count % 10 == 0:
        print('tokenized {}/{} docs'.format(count, total), end='\r')

tokenizing 30 docs...
tokenized 30/30 docs

In [6]:
# Write each tokenized English document to the 'filepath' stored alongside its text.
write_files(tokenized_docs_en)

In [7]:
# Where the Japanese reference essays live.
ref_folder_path_ja = '../../qalab3-essay-phase2/_references/qalab3-ja-essay-phase2'
essay_file_name_ja = 'qalab3-ja-phase2-essay-e2e-and-summarization-for-evaluation.xml'

# Topics whose answers are extracted.
topic_id_set_ja = {'B792W10-1', 'C792W10-1'}

# Parse the reference XML and collect the answer texts of those topics.
tree_ja = et.parse('/'.join((ref_folder_path_ja, essay_file_name_ja)))
root_ja = tree_ja.getroot()
essay_dict_ja = get_essay_dict(root_ja, topic_id_set_ja)

In [8]:
# Destination folder for the tokenized Japanese essays; created on first use.
TOKENIZED_OUTPUT_DIR = '../../qalab3-essay-phase2/essays-ja-tokenized/'
if not path.isdir(TOKENIZED_OUTPUT_DIR):
    mkdir(TOKENIZED_OUTPUT_DIR)

tokenized_docs_ja = {}
total = len(essay_dict_ja)
count = 0  # stays 0 when there is nothing to tokenize
print('tokenizing {} docs...'.format(total))
for count, (serial, doc) in enumerate(essay_dict_ja.items(), start=1):
    tokenized_text = tokenize_ja(doc)
    tokenized_trec_text = build_trec_text(serial, tokenized_text)
    filepath = path.join(TOKENIZED_OUTPUT_DIR, '{}-seg.xml'.format(serial))
    tokenized_docs_ja[serial] = {'trec_text': tokenized_trec_text, 'filepath': filepath}
    # Report every tenth document and the last one; '\r' returns the cursor
    # to the start of the line so the next report overwrites this one.
    if count == total or count % 10 == 0:
        print('tokenized {}/{} docs'.format(count, total), end='\r')

tokenizing 37 docs...
tokenized 10/37 docstokenized 20/37 docstokenized 30/37 docstokenized 37/37 docs

In [9]:
# Write each tokenized Japanese document to the 'filepath' stored alongside its text.
write_files(tokenized_docs_ja)