In [1]:
from copy import deepcopy
import re
import xml.etree.ElementTree as et


def get_length_limit(parent_element):
    """Return the numeric length limit declared on an answer element.

    Finds the first ``answer_set/answer`` element under ``parent_element``
    that carries a ``length_limit`` attribute and returns the first run of
    digits found in that attribute value.

    Args:
        parent_element: ElementTree element to search below.

    Returns:
        The first digit run of the ``length_limit`` attribute as an ``int``.

    Raises:
        AttributeError: if no matching element is found (``find`` gives None).
        IndexError: if the attribute value contains no digits.
    """
    answer = parent_element.find('answer_set/answer/[@length_limit]')
    raw_limit = answer.get('length_limit')
    digit_runs = re.findall(r'\d+', raw_limit)
    return int(digit_runs[0])


def _select_passage_sentence(passage_and_src_pair_list, keywords, get_sentence_func):
    keyword_pattern = '|'.join(map(lambda x: '({})'.format(x), keywords))
    keyword_regex = re.compile(keyword_pattern)
    matched_sentence_list = []
    sentence_dict = {}
    for passage, src in passage_and_src_pair_list:
        sentences = get_sentence_func(passage)
        for i, sentence in enumerate(sentences):
            matches = keyword_regex.findall(sentence)
            if matches:
                if sentence not in sentence_dict:
                    matched_sentence_list.append({
                        'src': '{}[{}]'.format(src, i),
                        'sentence': sentence,
                        'rate': len(matches) / len(sentence)})
                    sentence_dict[sentence] = len(matched_sentence_list) - 1
                else:
                    matched_sentence_list[sentence_dict[sentence]]['src'] += ',{}[{}]'.format(src, i)
    sorted(matched_sentence_list, key=lambda x: x['rate'], reverse=True)
    return [(entry['sentence'], entry['src']) for entry in matched_sentence_list]


sentence_stop_ja = re.compile(r'([。？！])')
def _get_sentence_ja(text):
    pieces = sentence_stop_ja.split(text)
    return [x + y for x, y in zip(pieces[::2], pieces[1::2])]


import spacy
nlp = spacy.load('en')
def _get_sentence_en(text):
    """Split English ``text`` into sentence strings.

    Runs ``text`` through the module-level spaCy pipeline ``nlp`` and returns
    the ``.text`` of every span in the resulting document's ``sents``.
    """
    return [span.text for span in nlp(text).sents]

In [11]:
def summarize_func_truncate_ja(passage_and_src_pair_list, len_limit):
    """Build a summary by concatenating leading passages up to a character limit.

    The first non-empty passage is cut to ``len_limit`` characters. Each
    following passage is appended on a new line as long as the result,
    including the newline separator, still fits within ``len_limit``; the
    first passage that does not fit ends the loop.

    Args:
        passage_and_src_pair_list: iterable of ``(passage_text, source_id)`` pairs.
        len_limit: maximum length of the summary in characters.

    Returns:
        ``(summary_text, src_list)`` where ``src_list`` holds the source id of
        every passage consumed. ``summary_text`` is ``None`` when no passage
        was supplied.
    """
    summary_text = None
    src_list = []
    for passage, src in passage_and_src_pair_list:
        if not summary_text:
            # Nothing accumulated yet: take this passage, truncated to the limit.
            summary_text = passage[:len_limit]
            src_list.append(src)
        elif len(summary_text) + 1 + len(passage) <= len_limit:
            # The "+ 1" counts the '\n' separator added below, so the summary
            # can never end up longer than len_limit.
            summary_text += '\n' + passage
            src_list.append(src)
        else:
            break
    return summary_text, src_list


def summarize_func_truncate_en(passage_and_src_pair_list, len_limit):
    """Build a summary by concatenating leading passages up to a token limit.

    Tokens are obtained by splitting each passage on single spaces. The first
    passage is always taken and cut to ``len_limit`` tokens if it is longer.
    Later passages are appended only while the running token count stays
    within ``len_limit``; the first one that does not fit ends the loop.

    Args:
        passage_and_src_pair_list: iterable of ``(passage_text, source_id)`` pairs.
        len_limit: maximum number of space-separated tokens in the summary.

    Returns:
        ``(summary_text, src_list)``: the tokens joined by single spaces and
        the source ids of the passages that were consumed.
    """
    words = []
    sources = []
    for text, source in passage_and_src_pair_list:
        new_words = text.split(' ')
        # Any passage after the first must fit completely or it is rejected.
        if words and len(words) + len(new_words) > len_limit:
            break
        words += new_words
        sources.append(source)
        # Can only trigger for the first passage; it gets cut by the slice below.
        if len(words) > len_limit:
            break
    return ' '.join(words[:len_limit]), sources


def get_summary_set_by_participant(exp_root, ans_root, summarize_func):
    """Summarize every passage set of every topic separately.

    Args:
        exp_root: root element of the extraction file; its ``TOPIC`` elements
            contain ``PASSAGE_SET`` elements, which contain ``PASSAGE`` elements.
        ans_root: root element of the answer sheet; the ``answer_section``
            whose ``id`` equals the topic ``ID`` supplies the length limit.
        summarize_func: callable ``(passage_and_src_pairs, len_limit)`` returning
            ``(summary_text, passage_id_list)``.

    Returns:
        A dict ``{passage_set FILE_NAME: {topic ID: (summary_text, passage_id_list)}}``.
    """
    summaries = {}
    for topic in exp_root.iter('TOPIC'):
        topic_id = topic.get('ID')
        # Look the section up in the answer sheet that was passed in; relying
        # on a notebook-level variable here would tie every call to whichever
        # sheet happened to be loaded last.
        ans_section = ans_root.find('answer_section/[@id="{}"]'.format(topic_id))
        ans_len_limit = get_length_limit(ans_section)
        for passage_set in topic.iter('PASSAGE_SET'):
            passage_set_name = passage_set.get('FILE_NAME')
            if passage_set_name not in summaries:
                summaries[passage_set_name] = {}

            # (text, SOURCE_ID) for each passage, text taken from all nested nodes.
            passage_and_src_pairs = [
                (''.join(passage.itertext()).strip(), passage.get('SOURCE_ID'))
                for passage in passage_set.iter('PASSAGE')]
            summary_text, passage_id_list = summarize_func(passage_and_src_pairs, ans_len_limit)
            summaries[passage_set_name][topic_id] = (summary_text, passage_id_list)
    return summaries


def write_xml_by_participant(summaries, ans_tree, tgt_folder_path, ans_file_name):
    """Write one filled-in answer sheet per passage set.

    For every passage set in ``summaries`` a deep copy of ``ans_tree`` is
    filled in: the root's ``src`` attribute is set to the passage set name
    and, for each answer section id, the ``expression`` element receives the
    summary as text and the comma-joined passage ids as ``source_id``.

    Args:
        summaries: ``{passage_set_name: {answer_section_id: (summary, passage_ids)}}``.
        ans_tree: answer-sheet ``ElementTree`` used as template; it is not modified.
        tgt_folder_path: directory the files are written into.
        ans_file_name: prefix of the output file names.
    """
    expression_xpath = 'answer_section/[@id="{}"]/answer_set/answer/expression_set/expression'
    for passage_set_name in summaries:
        # Fill in a fresh copy so the template stays clean for the next set.
        sheet = deepcopy(ans_tree)
        sheet_root = sheet.getroot()
        for ans_section_id, summary_tuple in summaries[passage_set_name].items():
            summary_text = summary_tuple[0]
            joined_ids = ','.join(summary_tuple[1])
            sheet_root.set('src', passage_set_name)
            target = sheet_root.find(expression_xpath.format(ans_section_id))
            target.text = summary_text
            target.set('source_id', joined_ids)
        out_path = '{}/{}_DGLab_summarization_ExP10_{}_01.xml'.format(
            tgt_folder_path, ans_file_name, passage_set_name)
        sheet.write(out_path, encoding='UTF-8', xml_declaration=True)

In [85]:
# Japanese baseline: truncation summaries, one output file per passage set.
# All paths are relative to the notebook's working directory.
temp_run_folder_path = 'qalab3-essay-phase2/'
ref_folder_path_ja = 'qalab3-essay-phase2/_references/qalab3-ja-essay-phase2'

# Answer-sheet template: supplies the length limits and is copied for each output file.
ans_file_name_ja = 'qalab3-ja-phase2-answersheet-essay' 
ans_tree_ja = et.parse('{}/{}.xml'.format(ref_folder_path_ja, ans_file_name_ja))
ans_root_ja = ans_tree_ja.getroot()

# Extraction file whose TOPIC / PASSAGE_SET / PASSAGE elements get summarized.
passage_file_name_ja = 'qalab3-ja-phase2-essay-extraction-ExP10_revised.xml'
exp_tree_ja = et.parse('{}/{}'.format(ref_folder_path_ja, passage_file_name_ja))
exp_root_ja = exp_tree_ja.getroot()

# Summarize per passage set, then write the XML files into temp_run_folder_path.
summary_set_ja = get_summary_set_by_participant(exp_root_ja, ans_root_ja, summarize_func_truncate_ja)
write_xml_by_participant(summary_set_ja, ans_tree_ja, temp_run_folder_path, ans_file_name_ja)

In [82]:
# English baseline: truncation summaries, one output file per passage set.
# All paths are relative to the notebook's working directory.
temp_run_folder_path = 'qalab3-essay-phase2/'
ref_folder_path_en = 'qalab3-essay-phase2/_references/qalab3-en-essay-phase2'

# Answer-sheet template: supplies the length limits and is copied for each output file.
ans_file_name_en = 'qalab3-en-phase2-answersheet-essay' 
ans_tree_en = et.parse('{}/{}.xml'.format(ref_folder_path_en, ans_file_name_en))
ans_root_en = ans_tree_en.getroot()

# Extraction file whose TOPIC / PASSAGE_SET / PASSAGE elements get summarized.
passage_file_name_en = 'qalab3-en-phase2-essay-extraction-ExP10_revised.xml'
exp_tree_en = et.parse('{}/{}'.format(ref_folder_path_en, passage_file_name_en))
exp_root_en = exp_tree_en.getroot()

# Summarize per passage set, then write the XML files into temp_run_folder_path.
summary_set_en = get_summary_set_by_participant(exp_root_en, ans_root_en, summarize_func_truncate_en)
write_xml_by_participant(summary_set_en, ans_tree_en, temp_run_folder_path, ans_file_name_en)

In [12]:
def passage_filter_ja(passage_and_src_pair_list, keywords):
    """Select keyword-bearing sentences from Japanese passages.

    Delegates to ``_select_passage_sentence`` with ``_get_sentence_ja`` as
    the sentence splitter and returns its result unchanged.
    """
    return _select_passage_sentence(
        passage_and_src_pair_list, keywords, get_sentence_func=_get_sentence_ja)


def passage_filter_en(passage_and_src_pair_list, keywords):
    """Select keyword-bearing sentences from English passages.

    Delegates to ``_select_passage_sentence`` with ``_get_sentence_en`` as
    the sentence splitter and returns its result unchanged.
    """
    return _select_passage_sentence(
        passage_and_src_pair_list, keywords, get_sentence_func=_get_sentence_en)


def get_element_texts(parent_element, xpath):
    """Collect the non-empty text content of all elements matching ``xpath``.

    Args:
        parent_element: ElementTree element to search from.
        xpath: path expression handed to ``findall``.

    Returns:
        A list with the concatenated text (including text of nested elements)
        of each match, in document order; matches with no text are left out.
    """
    full_texts = (''.join(node.itertext()) for node in parent_element.findall(xpath))
    return [content for content in full_texts if content]


def get_summary_set_for_all(exp_root, ans_root, summarize_func, passage_filter_func):
    """Summarize each topic from the pooled passages of all its passage sets.

    For every ``TOPIC`` the passages of all ``PASSAGE_SET`` elements are
    gathered (their ids prefixed with the set's ``FILE_NAME``), filtered with
    the topic's keywords, and handed to ``summarize_func`` together with the
    topic's length limit.

    Args:
        exp_root: root element of the extraction file.
        ans_root: root element of the answer sheet; the ``answer_section`` with
            the topic's id provides the length limit and the keywords.
        summarize_func: callable ``(pairs, len_limit)`` returning
            ``(summary_text, passage_id_list)``.
        passage_filter_func: callable ``(pairs, keywords)`` returning the pairs
            to summarize.

    Returns:
        A dict ``{topic ID: (summary_text, passage_id_list)}``.
    """
    section_xpath = 'answer_section/[@id="{}"]'
    per_topic = {}
    for topic in exp_root.iter('TOPIC'):
        topic_id = topic.get('ID')
        section = ans_root.find(section_xpath.format(topic_id))
        limit = get_length_limit(section)
        topic_keywords = get_element_texts(section, 'keyword_set/keyword')

        # Pool the passages of every passage set; the set name keeps ids unique.
        pooled = []
        for passage_set in topic.iter('PASSAGE_SET'):
            set_name = passage_set.get('FILE_NAME')
            for passage in passage_set.iter('PASSAGE'):
                qualified_id = set_name + '_' + passage.get('SOURCE_ID')
                pooled.append((''.join(passage.itertext()).strip(), qualified_id))

        selected = passage_filter_func(pooled, topic_keywords)
        per_topic[topic_id] = tuple(summarize_func(selected, limit))
    return per_topic


def write_xml_by_extracton_file(summaries, ans_tree, tgt_folder_path, ans_file_name, extraction_file_id):
    """Write a single answer sheet holding the summaries of one extraction file.

    A deep copy of ``ans_tree`` is used. Top-level ``answer_section`` elements
    whose ``id`` is not a key of ``summaries`` are removed from the copy. For
    the remaining ids the root's ``src`` attribute is set to
    ``extraction_file_id`` and the ``expression`` element receives the summary
    as text and the comma-joined passage ids as ``source_id``.

    Args:
        summaries: ``{answer_section_id: (summary, passage_ids)}``.
        ans_tree: answer-sheet ``ElementTree`` used as template; it is not modified.
        tgt_folder_path: directory the file is written into.
        ans_file_name: prefix of the output file name.
        extraction_file_id: identifier used for ``src`` and in the file name.
    """
    sheet = deepcopy(ans_tree)
    sheet_root = sheet.getroot()

    # findall() returns a list, so removing sections while looping is safe.
    for section in sheet_root.findall('answer_section'):
        if section.get('id') not in summaries:
            sheet_root.remove(section)

    expression_xpath = 'answer_section/[@id="{}"]/answer_set/answer/expression_set/expression'
    for ans_section_id in summaries:
        summary_tuple = summaries[ans_section_id]
        summary_text = summary_tuple[0]
        joined_ids = ','.join(summary_tuple[1])
        sheet_root.set('src', extraction_file_id)
        target = sheet_root.find(expression_xpath.format(ans_section_id))
        target.text = summary_text
        target.set('source_id', joined_ids)

    out_path = '{}/{}_DGLab_summarization_{}_01.xml'.format(
        tgt_folder_path, ans_file_name, extraction_file_id)
    sheet.write(out_path, encoding='UTF-8', xml_declaration=True)

In [88]:
# Japanese run: keyword-filtered sentences + truncation, one output file per extraction file.
# All paths are relative to the notebook's working directory.
temp_run_folder_path = 'qalab3-essay-phase2/'
ref_folder_path_ja = 'qalab3-essay-phase2/_references/qalab3-ja-essay-phase2'

# Answer-sheet template: supplies length limits and keywords, and is copied for each output.
ans_file_name_ja = 'qalab3-ja-phase2-answersheet-essay' 
ans_tree_ja = et.parse('{}/{}.xml'.format(ref_folder_path_ja, ans_file_name_ja))
ans_root_ja = ans_tree_ja.getroot()

# Each id selects one extraction XML file and also names the corresponding output file.
passage_file_id_list = ['ExP10_revised', 'GSN+ExP10_revised', 'GSN']
for passage_file_id in passage_file_id_list:
    passage_file_name_ja = 'qalab3-ja-phase2-essay-extraction-{}.xml'.format(passage_file_id)
    exp_tree_ja = et.parse('{}/{}'.format(ref_folder_path_ja, passage_file_name_ja))
    exp_root_ja = exp_tree_ja.getroot()

    # Summarize every topic from its pooled passages, then write the filled-in sheet.
    summary_set_ja = get_summary_set_for_all(exp_root_ja, ans_root_ja, summarize_func_truncate_ja, passage_filter_ja)
    write_xml_by_extracton_file(summary_set_ja, ans_tree_ja, temp_run_folder_path, ans_file_name_ja, passage_file_id)

In [89]:
# English run: keyword-filtered sentences + truncation, one output file per extraction file.
# All paths are relative to the notebook's working directory.
temp_run_folder_path = 'qalab3-essay-phase2/'
ref_folder_path_en = 'qalab3-essay-phase2/_references/qalab3-en-essay-phase2'

# Answer-sheet template: supplies length limits and keywords, and is copied for each output.
ans_file_name_en = 'qalab3-en-phase2-answersheet-essay' 
ans_tree_en = et.parse('{}/{}.xml'.format(ref_folder_path_en, ans_file_name_en))
ans_root_en = ans_tree_en.getroot()

# Each id selects one extraction XML file and also names the corresponding output file.
passage_file_id_list = ['ExP10_revised', 'GSN+ExP10_revised', 'GSN']
for passage_file_id in passage_file_id_list:
    passage_file_name_en = 'qalab3-en-phase2-essay-extraction-{}.xml'.format(passage_file_id)
    exp_tree_en = et.parse('{}/{}'.format(ref_folder_path_en, passage_file_name_en))
    exp_root_en = exp_tree_en.getroot()

    # Summarize every topic from its pooled passages, then write the filled-in sheet.
    summary_set_en = get_summary_set_for_all(exp_root_en, ans_root_en, summarize_func_truncate_en, passage_filter_en)
    write_xml_by_extracton_file(summary_set_en, ans_tree_en, temp_run_folder_path, ans_file_name_en, passage_file_id)

In [13]:
from subprocess import Popen, PIPE, TimeoutExpired


def shuca_ja(text, length):
    """Summarize Japanese ``text`` with the external Shuca summarizer.

    Runs the shell-style pipeline ``echo text | juman | knp -anaphora -case -tab``
    and feeds KNP's raw output to ``shuca/lib/Shuca.py -l <length>``.

    Side effects: appends KNP's stdout to ``temp.txt`` and its stderr to
    ``err.txt`` in the current working directory, and prints Shuca's stderr
    (if any) and the final summary.

    Args:
        text: the text to summarize; it is passed to ``echo`` as a single
            command-line argument.
        length: value passed to Shuca's ``-l`` option (presumably the summary
            length limit -- confirm against the Shuca documentation).

    Returns:
        Shuca's stdout decoded as UTF-8, with blank lines dropped and every
        remaining line stripped, joined by newlines; ``''`` if Shuca wrote
        nothing to stdout.
    """
    # Stage 1: echo -> juman -> knp, chained through OS pipes.
    echo = Popen(['echo', text], stdout=PIPE)
    juman = Popen(['juman'], stdin=echo.stdout, stdout=PIPE)
    # Close our copy of the upstream pipe end once the consumer has it.
    echo.stdout.close()
    knp = Popen(['knp', '-anaphora', '-case', '-tab'], stdin=juman.stdout, stdout=PIPE, stderr=PIPE)
    # communicate() reads both streams to EOF and waits for knp to exit.
    knp_output, knp_error = knp.communicate()
    knp_text = knp_output.decode('UTF-8')
    juman.stdout.close()
    # Debug logs; opened in append mode, so they grow with every call.
    with open('temp.txt', 'a') as f:
        f.write(knp_text)
    with open('err.txt', 'a') as f:
        f.write(knp_error.decode('UTF-8'))
    knp.stdout.close()
    # NOTE(review): echo and juman are terminated but never wait()ed on -- confirm
    # whether that leaves defunct child processes around in long runs.
    knp.terminate()
    juman.terminate()
    echo.terminate()

    # Stage 2: hand the undecoded KNP bytes to Shuca on stdin.
    shuca = Popen(['shuca/lib/Shuca.py', '-l', str(length)], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    output = None
    error = None
    output, error = shuca.communicate(input=knp_output)
    if error:
        print('Shuca error: {}'.format(error.decode('UTF-8')))
    if output:
        summary = output.decode('UTF-8')
    else:
        summary = ''
    shuca.stdout.close()
    shuca.terminate()

    # Normalize: strip each line and drop the empty ones.
    summary = '\n'.join([line.strip() for line in summary.splitlines() if line.strip()])
    print('Shuca output: {}'.format(summary))
    return summary


def summarize_func_shuca_ja(passage_and_src_pair_list, len_limit, max_passages=25):
    """Summarize the leading passages with Shuca.

    Prints the number of passages received, keeps the first ``max_passages``
    of them, joins their texts with newlines and passes the result to
    ``shuca_ja`` together with ``len_limit``.

    Args:
        passage_and_src_pair_list: list of ``(passage_text, source_id)`` pairs.
        len_limit: length value forwarded to ``shuca_ja``.
        max_passages: how many leading passages are handed to Shuca.

    Returns:
        ``(summary_text, src_ids)`` where ``src_ids`` is a tuple with the
        source ids of the passages that were passed to Shuca. For an empty
        input the result is ``('', ())`` and Shuca is not invoked.
    """
    print('\noriginal sentence count={}'.format(len(passage_and_src_pair_list)))
    passage_and_src_pair_list = passage_and_src_pair_list[:max_passages]
    if not passage_and_src_pair_list:
        # zip(*[]) produces nothing, so the two-name unpacking below would
        # raise ValueError; there is also nothing to summarize.
        return '', ()
    passages, src_list = zip(*passage_and_src_pair_list)
    passage_text = '\n'.join(passages)
    summary_text = shuca_ja(passage_text, len_limit)
    return summary_text, src_list

In [96]:
# Japanese run: keyword-filtered sentences summarized with Shuca, one output file per extraction file.
# All paths are relative to the notebook's working directory.
temp_run_folder_path = 'qalab3-essay-phase2/'
ref_folder_path_ja = 'qalab3-essay-phase2/_references/qalab3-ja-essay-phase2'

# Answer-sheet template: supplies length limits and keywords, and is copied for each output.
ans_file_name_ja = 'qalab3-ja-phase2-answersheet-essay' 
ans_tree_ja = et.parse('{}/{}.xml'.format(ref_folder_path_ja, ans_file_name_ja))
ans_root_ja = ans_tree_ja.getroot()

# Each id selects one extraction XML file and also names the corresponding output file.
passage_file_id_list = ['ExP10_revised', 'GSN+ExP10_revised', 'GSN']
for passage_file_id in passage_file_id_list:
    passage_file_name_ja = 'qalab3-ja-phase2-essay-extraction-{}.xml'.format(passage_file_id)
    exp_tree_ja = et.parse('{}/{}'.format(ref_folder_path_ja, passage_file_name_ja))
    exp_root_ja = exp_tree_ja.getroot()

    # summarize_func_shuca_ja prints the sentence count and Shuca's output for every topic.
    summary_set_ja = get_summary_set_for_all(exp_root_ja, ans_root_ja, summarize_func_shuca_ja, passage_filter_ja)
    write_xml_by_extracton_file(summary_set_ja, ans_tree_ja, temp_run_folder_path, ans_file_name_ja, passage_file_id)


original sentence count=206
Shuca output: オスマン帝国（オスマントルコ語：دولتعليهعثمانیه、Devlet-i'Alīye-i'Osmānīye、現代トルコ語：OsmanlıİmparatorluğuまたはOsmanlıDevleti）は、トルコ帝国、オスマントルコとしても知られています。
また、タハリール広場から東に400mほどのところにムハンマド・アリー朝の王宮であり、現大統領府であるアブディーン宮殿がある。
タハリール広場から南のナイル川沿いはガーデン・シティと呼ばれ、イギリス統治時代にエジプト総督府がおかれ開発が進められたエリアである。
とくに19世紀後半のエジプト太守イスマーイール・パシャは近代化に熱心であり、スエズ運河の開通にあわせてナイル川東岸の低湿地を開発して、パリの都市計画に倣った新市街を旧市街の西側に建設した。
やがて西方を拠点としたオクタヴィアヌスと東方に拠るアントニウスの対立がおこり，オクタヴィアヌスは，プトレマイオス朝エジプトのクレオパトラと結んだアントニウスを前３１年アクティウムの海戦でやぶった。
しかし，アントニウスがプトレマイオス朝エジプトの女王クレオパトラと結んだため，オクタヴィアヌスはこの連合軍を，前３１年，アクティウムの海戦でやぶった。

original sentence count=173
Shuca output: １４世紀後半，倭寇の活動と，交易の自由化がもたらす国内経済の動揺に悩まされた明の洪武帝は，国内経済の復活と治安の安定のために，中国商人の対外交易と渡航を禁止した（海禁）。
また16世紀後半、中国の海禁政策がゆるむと、多数の中国商人が来航するようになり、東南アジアの海の交易は活況を呈した。
また中国の対外貿易は、あくまでも中華帝国側が恩恵的に許可する朝貢貿易であり、明初には倭寇対策もあって、民間人の海洋貿易や海上交通が法令として禁止(海禁)された。
16世紀にはいり、それまで海禁政策のもとにあった中国商人がさかんに密貿易をおこなうようになると、琉球王国による中継貿易は急速におとろえ、日本との貿易も堺や博多などの商人にとってかわられた。
16世紀に入ると、中国人や日本人、朝鮮人の私貿易商人たちは武装して海禁に抵抗し、中国や朝鮮の沿岸都市を襲った。
孫文の中国は

In [114]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words


def sumy_lex_rank_en(text, length):
    """Summarize English ``text`` with sumy's LexRank, capped at ``length`` tokens.

    Up to 20 sentences are requested from the summarizer. The first returned
    sentence is always used and cut to ``length`` space-separated tokens if it
    is longer; further sentences are appended only while the running token
    count stays within ``length``, and the first one that does not fit ends
    the loop.

    Args:
        text: plain text to summarize.
        length: maximum number of space-separated tokens in the result.

    Returns:
        The selected tokens joined by single spaces.
    """
    language = 'english'
    requested_sentences = 20
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    ranker = Summarizer(Stemmer(language))
    ranker.stop_words = get_stop_words(language)

    words = []
    for ranked_sentence in ranker(parser.document, requested_sentences):
        new_words = str(ranked_sentence).split(' ')
        # Any sentence after the first must fit completely or it is rejected.
        if words and len(words) + len(new_words) > length:
            break
        words += new_words
        # Can only trigger for the first sentence; it gets cut by the slice below.
        if len(words) > length:
            break
    return ' '.join(words[:length])


def summarize_func_sumy_lex_rank_en(passage_and_src_pair_list, len_limit, max_passages=30):
    """Summarize the leading passages with sumy's LexRank summarizer.

    Prints the number of passages received, keeps the first ``max_passages``
    of them, joins their texts with newlines and passes the result to
    ``sumy_lex_rank_en`` together with ``len_limit``.

    Args:
        passage_and_src_pair_list: list of ``(passage_text, source_id)`` pairs.
        len_limit: token limit forwarded to ``sumy_lex_rank_en``.
        max_passages: how many leading passages are handed to the summarizer.

    Returns:
        ``(summary_text, src_ids)`` where ``src_ids`` is a tuple with the
        source ids of the passages that were passed on. For an empty input
        the result is ``('', ())`` and the summarizer is not invoked.
    """
    print('\noriginal sentence count={}'.format(len(passage_and_src_pair_list)))
    passage_and_src_pair_list = passage_and_src_pair_list[:max_passages]
    if not passage_and_src_pair_list:
        # zip(*[]) produces nothing, so the two-name unpacking below would
        # raise ValueError; there is also nothing to summarize.
        return '', ()
    passages, src_list = zip(*passage_and_src_pair_list)
    passage_text = '\n'.join(passages)
    summary_text = sumy_lex_rank_en(passage_text, len_limit)
    return summary_text, src_list

In [115]:
# English run: keyword-filtered sentences summarized with sumy LexRank, one output file per extraction file.
# All paths are relative to the notebook's working directory.
temp_run_folder_path = 'qalab3-essay-phase2/'
ref_folder_path_en = 'qalab3-essay-phase2/_references/qalab3-en-essay-phase2'

# Answer-sheet template: supplies length limits and keywords, and is copied for each output.
ans_file_name_en = 'qalab3-en-phase2-answersheet-essay' 
ans_tree_en = et.parse('{}/{}.xml'.format(ref_folder_path_en, ans_file_name_en))
ans_root_en = ans_tree_en.getroot()

# Each id selects one extraction XML file and also names the corresponding output file.
passage_file_id_list = ['ExP10_revised', 'GSN+ExP10_revised', 'GSN']
for passage_file_id in passage_file_id_list:
    passage_file_name_en = 'qalab3-en-phase2-essay-extraction-{}.xml'.format(passage_file_id)
    exp_tree_en = et.parse('{}/{}'.format(ref_folder_path_en, passage_file_name_en))
    exp_root_en = exp_tree_en.getroot()

    # summarize_func_sumy_lex_rank_en prints the sentence count for every topic.
    summary_set_en = get_summary_set_for_all(exp_root_en, ans_root_en, summarize_func_sumy_lex_rank_en, passage_filter_en)
    write_xml_by_extracton_file(summary_set_en, ans_tree_en, temp_run_folder_path, ans_file_name_en, passage_file_id)


original sentence count=107

original sentence count=34

original sentence count=54

original sentence count=74

original sentence count=78

original sentence count=151

original sentence count=57

original sentence count=86

original sentence count=104

original sentence count=105

original sentence count=44

original sentence count=23

original sentence count=32

original sentence count=30

original sentence count=27


In [23]:
def get_passage_list_for_all(exp_root, ans_root, passage_filter_func):
    """Collect the keyword-filtered passages of every topic.

    For every ``TOPIC`` the passages of all ``PASSAGE_SET`` elements are
    gathered (their ids prefixed with the set's ``FILE_NAME``) and filtered
    with the keywords of the ``answer_section`` whose ``id`` equals the topic
    ``ID``.

    Args:
        exp_root: root element of the extraction file.
        ans_root: root element of the answer sheet.
        passage_filter_func: callable ``(pairs, keywords)`` returning the pairs
            to keep.

    Returns:
        A list of ``(topic ID, filtered passage/source pairs)`` tuples in
        document order.
    """
    passage_list = []
    for topic in exp_root.iter('TOPIC'):
        topic_id = topic.get('ID')
        ans_section = ans_root.find('answer_section/[@id="{}"]'.format(topic_id))
        # Only the keywords are needed here; the length limit is not read, so
        # a section without one no longer makes this function fail.
        keywords = get_element_texts(ans_section, 'keyword_set/keyword')
        passage_and_src_pairs = []
        for passage_set in topic.iter('PASSAGE_SET'):
            passage_set_name = passage_set.get('FILE_NAME')

            for passage in passage_set.iter('PASSAGE'):
                # Prefix with the set name so ids stay unique across sets.
                passage_id = passage_set_name + '_' + passage.get('SOURCE_ID')
                passage_text = ''.join(passage.itertext()).strip()
                passage_and_src_pairs.append((passage_text, passage_id))
        passage_and_src_pairs = passage_filter_func(passage_and_src_pairs, keywords)
        passage_list.append((topic_id, passage_and_src_pairs))
    return passage_list

In [33]:
import struct
from tensorflow.core.example import example_pb2

# Feature keys used inside the serialized tf.Example records.
SUMMARY_FIELD_NAME = 'abstract'
TEXT_FIELD_NAME = 'article'

def get_pb2_str(text, summary):
    """Serialize a text/summary pair as a ``tf.Example`` protobuf.

    Args:
        text: article text, stored UTF-8 encoded under ``TEXT_FIELD_NAME``.
        summary: summary text, stored UTF-8 encoded under ``SUMMARY_FIELD_NAME``.

    Returns:
        The serialized ``Example`` message as bytes.
    """
    example = example_pb2.Example()
    feature_map = example.features.feature
    for field_name, field_text in ((TEXT_FIELD_NAME, text), (SUMMARY_FIELD_NAME, summary)):
        feature_map[field_name].bytes_list.value.extend([field_text.encode('utf-8')])
    return example.SerializeToString()


def pack_tensorflow_pb2_str(tensorflow_pb2_str):
    """Prefix a serialized record with its length for writing to a ``.bin`` file.

    Args:
        tensorflow_pb2_str: the serialized record as bytes.

    Returns:
        Bytes made of the record length packed with ``struct`` format ``q``
        (native 8-byte integer) followed by the record itself.
    """
    payload_size = len(tensorflow_pb2_str)
    layout = 'q%ds' % payload_size
    return struct.pack(layout, payload_size, tensorflow_pb2_str)



def tokenize_en(text):
    """Tokenize English ``text`` with the Stanford PTBTokenizer.

    Starts ``java edu.stanford.nlp.process.PTBTokenizer -preserveLines``,
    sends ``text`` UTF-8 encoded on stdin and returns the process's stdout
    decoded as UTF-8. The process's stderr is captured and discarded.
    """
    command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-preserveLines']
    with Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE) as tokenizer_proc:
        stdout_bytes, _ = tokenizer_proc.communicate(input=text.encode('UTF-8'))
    return stdout_bytes.decode('UTF-8')


import MeCab


def tokenize_ja(text):
    """Tokenize Japanese ``text`` with MeCab.

    Creates a ``MeCab.Tagger`` with the ``-Owakati`` option, parses ``text``
    and returns the parser output with surrounding whitespace stripped.
    """
    wakati_tagger = MeCab.Tagger('-Owakati')
    tokenized = wakati_tagger.parse(text)
    return tokenized.strip()

In [66]:
import collections
import csv


# Japanese export: write tokenized passages as length-prefixed tf.Example records
# and build a merged vocabulary file.
# NOTE(review): the paths here start at '../', unlike the earlier cells that use
# 'qalab3-essay-phase2/...' -- this cell appears to expect a different working directory; confirm.
temp_run_folder_path = '../'
ref_folder_path_ja = '../_references/qalab3-ja-essay-phase2'

ans_file_name_ja = 'qalab3-ja-phase2-answersheet-essay' 
ans_tree_ja = et.parse('{}/{}.xml'.format(ref_folder_path_ja, ans_file_name_ja))
ans_root_ja = ans_tree_ja.getroot()

# Seed the counter from an existing vocabulary file with one "<word> <count>" entry per line.
old_vocab = '../../summarization_corpus_to_tensorflow_input/finished_files/vocab-livedoor_jawikinews.txt'
old_vocab_dict = {}
with open(old_vocab) as ov:
    # quoting=3 is csv.QUOTE_NONE: quote characters in words are kept as plain text.
    reader = csv.reader(ov, delimiter=' ', quoting=3)
    old_vocab_dict = {rows[0]: int(rows[1]) for rows in reader}
vocab_counter = collections.Counter(old_vocab_dict)
passage_file_id_list = ['ExP10_revised', 'GSN+ExP10_revised', 'GSN']
for passage_file_id in passage_file_id_list:
    passage_file_name_ja = 'qalab3-ja-phase2-essay-extraction-{}.xml'.format(passage_file_id)
    exp_tree_ja = et.parse('{}/{}'.format(ref_folder_path_ja, passage_file_name_ja))
    exp_root_ja = exp_tree_ja.getroot()

    passage_list_ja = get_passage_list_for_all(exp_root_ja, ans_root_ja, passage_filter_ja)
    for topic_id, passage_and_src_pairs in passage_list_ja:
        # passage_src_pair[0] is the sentence text; the source id is not used here.
        tokenized_sentences =[tokenize_ja(passage_src_pair[0]) for passage_src_pair in passage_and_src_pairs]
        # Add every space-separated token of this topic to the vocabulary counts.
        vocab_counter.update([str(t) for s in tokenized_sentences for t in str(s).split(' ')])
        # One .bin file per (extraction file, topic); it is created in the current working directory.
        with open('test-{}-{}-ja.bin'.format(passage_file_id, topic_id), 'wb') as f:
            # Group three tokenized sentences into one record; the summary field is a placeholder.
            for i in range(0, len(tokenized_sentences), 3):
                passage_text = ' '.join(tokenized_sentences[i:i+3])
                f.write(pack_tensorflow_pb2_str(get_pb2_str(passage_text, 'dummy_summary')))

# Write the 50000 most frequent entries (old vocabulary plus new tokens) as "<word> <count>" lines.
with open('vocab-qalab.txt', 'w') as v:
    for word, count in vocab_counter.most_common(50000):
        v.write(word + ' ' + str(count) + '\n')
    # write_xml_by_extracton_file(summary_set_ja, ans_tree_ja, temp_run_folder_path, ans_file_name_ja, passage_file_id)

In [65]:
import collections
import csv


# English export: write tokenized passages as length-prefixed tf.Example records
# and build a merged vocabulary file.
# NOTE(review): the paths here start at '../', unlike the earlier cells that use
# 'qalab3-essay-phase2/...' -- this cell appears to expect a different working directory; confirm.
temp_run_folder_path = '../'
ref_folder_path_en = '../_references/qalab3-en-essay-phase2'

ans_file_name_en = 'qalab3-en-phase2-answersheet-essay' 
ans_tree_en = et.parse('{}/{}.xml'.format(ref_folder_path_en, ans_file_name_en))
ans_root_en = ans_tree_en.getroot()

# Seed the counter from an existing vocabulary file with one "<word> <count>" entry per line.
old_vocab = 'vocab-en.txt'
old_vocab_dict = {}
with open(old_vocab) as ov:
    # quoting=3 is csv.QUOTE_NONE: quote characters in words are kept as plain text.
    reader = csv.reader(ov, delimiter=' ', quoting=3)
    old_vocab_dict = {rows[0]: int(rows[1]) for rows in reader}
vocab_counter = collections.Counter(old_vocab_dict)
passage_file_id_list = ['ExP10_revised', 'GSN+ExP10_revised', 'GSN']
for passage_file_id in passage_file_id_list:
    passage_file_name_en = 'qalab3-en-phase2-essay-extraction-{}.xml'.format(passage_file_id)
    exp_tree_en = et.parse('{}/{}'.format(ref_folder_path_en, passage_file_name_en))
    exp_root_en = exp_tree_en.getroot()

    passage_list_en = get_passage_list_for_all(exp_root_en, ans_root_en, passage_filter_en)
    for topic_id, passage_and_src_pairs in passage_list_en:
        # passage_src_pair[0] is the sentence text; the source id is not used here.
        # NOTE(review): tokenize_en returns the tokenizer's stdout unstripped, so tokens split
        # on ' ' below may carry newline characters -- verify the resulting vocabulary.
        tokenized_sentences =[tokenize_en(passage_src_pair[0]) for passage_src_pair in passage_and_src_pairs]
        # Add every space-separated token of this topic to the vocabulary counts.
        vocab_counter.update([str(t) for s in tokenized_sentences for t in str(s).split(' ')])
        # One .bin file per (extraction file, topic); it is created in the current working directory.
        with open('test-{}-{}-en.bin'.format(passage_file_id, topic_id), 'wb') as f:
            # Group three tokenized sentences into one record; the summary field is a placeholder.
            for i in range(0, len(tokenized_sentences), 3):
                passage_text = ' '.join(tokenized_sentences[i:i+3])
                f.write(pack_tensorflow_pb2_str(get_pb2_str(passage_text, 'dummy_summary')))

# Write the 50000 most frequent entries (old vocabulary plus new tokens) as "<word> <count>" lines.
with open('vocab-en-qalab.txt', 'w') as v:
    for word, count in vocab_counter.most_common(50000):
        v.write(word + ' ' + str(count) + '\n')
    # write_xml_by_extracton_file(summary_set_ja, ans_tree_ja, temp_run_folder_path, ans_file_name_ja, passage_file_id)