In [18]:
from os import makedirs, mkdir, path
import sqlite3
from subprocess import Popen, PIPE

from pandas import read_csv

In [2]:
# Skeleton of a single TREC-text document. The three ``{}`` slots are filled,
# in order, with the document number, the title and the body text.
TREC_TEXT_TEMPLATE = '\n'.join((
    '<DOC>',
    '<DOCNO>{}</DOCNO>',
    '<Title>{}</Title>',
    '<TEXT>',
    '{}',
    '</TEXT>',
    '</DOC>',
    '',  # trailing empty piece so the template ends with a newline
))


def build_trec_text(serial, title, text):
    """Render one document in TREC-text markup.

    Args:
        serial: Value placed inside the ``<DOCNO>`` element.
        title: Value placed inside the ``<Title>`` element.
        text: Body placed on its own line(s) inside the ``<TEXT>`` element.

    Returns:
        The filled-in ``TREC_TEXT_TEMPLATE`` as a string. The values are
        inserted verbatim; no markup escaping is applied.
    """
    fields = (serial, title, text)
    return TREC_TEXT_TEMPLATE.format(*fields)


def tokenize(text, command=None):
    """Pipe ``text`` through an external tokenizer process and return its output.

    By default this launches ``java edu.stanford.nlp.process.PTBTokenizer
    -preserveLines``. The input is sent on the child's stdin encoded as UTF-8
    and the child's stdout is decoded as UTF-8.
    NOTE(review): the default command presumably needs the Stanford NLP jar on
    the Java classpath of the kernel's environment -- confirm.

    Args:
        text: The string to tokenize.
        command: Optional argv list of the tokenizer process to run instead of
            the default Java command.

    Returns:
        The tokenizer's standard output as a string.

    Raises:
        RuntimeError: If the tokenizer process exits with a non-zero status.
            The message carries the captured stderr so the cause is visible
            instead of silently producing empty text.
    """
    if command is None:
        command = ['java', 'edu.stanford.nlp.process.PTBTokenizer', '-preserveLines']
    with Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE) as tokenizer_proc:
        out, err = tokenizer_proc.communicate(input=text.encode('UTF-8'))
        status = tokenizer_proc.returncode
    if status != 0:
        # A failed run would otherwise look like a document with no text.
        raise RuntimeError('tokenizer command {!r} exited with status {}: {}'.format(
            command, status, err.decode('UTF-8', errors='replace').strip()))
    return out.decode('UTF-8')


# Directory that receives one tokenized TREC-text file per document.
TOKENIZED_OUTPUT_DIR = 'qalab3-essay-phase2/enwiki_doc_by_id-tokenized/'
# makedirs(..., exist_ok=True) also creates a missing parent directory and does
# not fail if the directory already exists, so re-running the cell is safe.
makedirs(TOKENIZED_OUTPUT_DIR, exist_ok=True)


def write_files(doc_set):
    """Write every document's TREC text to its own file.

    Args:
        doc_set: Mapping whose values are dicts holding ``'filepath'`` (the
            destination path) and ``'trec_text'`` (the string to write). The
            mapping's keys are not used.

    Files are opened in text mode with an explicit UTF-8 encoding so the result
    does not depend on the platform's default locale encoding. An existing
    file at the same path is overwritten.
    """
    for doc in doc_set.values():
        with open(doc['filepath'], 'w', encoding='utf-8') as f:
            f.write(doc['trec_text'])

In [8]:
def write_to_db(doc_set, db_cursor):
    """Insert every document of ``doc_set`` into the ``en_docs`` table.

    Args:
        doc_set: Mapping from document number to a dict that holds the
            document's TREC text under ``'trec_text'``.
        db_cursor: Cursor used to run the parameterized bulk insert. This
            function does not commit; that is left to the caller.
    """
    insert_sql = 'INSERT INTO en_docs (doc_no, doc) VALUES (?, ?)'
    # Materialize all rows first so a malformed entry fails before any insert.
    rows = []
    for doc_no, doc in doc_set.items():
        rows.append((doc_no, doc['trec_text']))
    db_cursor.executemany(insert_sql, rows)

In [5]:
# Load the bz2-compressed English Wikipedia dump into a DataFrame. Later cells
# read its 'Title', 'Text' and 'ID' columns.
df = read_csv(
    'qalab3-essay-phase2/_references/training_set/enwiki_dump.csv.bz2',
    compression='bz2',
)

In [6]:
def _unescape_field(value):
    r"""Undo the dump's escaping in one field and trim surrounding whitespace.

    A literal backslash followed by ``n`` becomes a real newline, and a literal
    backslash followed by a comma becomes a plain comma.
    """
    # '\\,' spells backslash-comma explicitly; the bare form '\,' is an invalid
    # escape sequence that newer Python versions warn about.
    return value.replace('\\n', '\n').replace('\\,', ',').strip()


# Build a mapping: serial ('enwiki-<ID>') -> cleaned title/text plus the
# document rendered as TREC text.
docs = {}
for raw_title, raw_text, doc_id in zip(df['Title'], df['Text'], df['ID']):
    title = _unescape_field(raw_title)
    text = _unescape_field(raw_text)
    serial = 'enwiki-{}'.format(doc_id)
    trec_text = build_trec_text(serial, title, text)
    docs[serial] = {'trec_text': trec_text, 'title': title, 'text': text}
# Drop the DataFrame now that its contents live in `docs`. Re-running this cell
# therefore requires re-running the cell that loads `df` first.
del df

In [9]:
# Store every untokenized TREC document in a SQLite database.
DB_PATH = path.join('qalab3-essay-phase2', 'en_doc.db')
conn = sqlite3.connect(DB_PATH)
try:
    cursor = conn.cursor()
    # `with conn:` commits when the block succeeds and rolls back when it raises.
    with conn:
        # Recreate the table from scratch so re-running the cell does not
        # accumulate duplicate rows.
        cursor.execute('DROP TABLE IF EXISTS en_docs')
        cursor.execute('CREATE TABLE en_docs (doc_no TEXT, doc TEXT)')
    with conn:
        write_to_db(docs, cursor)
finally:
    # The connection context manager does not close the connection, so close it
    # explicitly even if one of the statements above failed.
    conn.close()

In [None]:
# Tokenize the title and body of every document, rebuild its TREC text from the
# tokenized pieces, and remember the file each result is to be written to.
total = len(docs)
tokenized_docs = {}
count = 0  # stays 0 when `docs` is empty
print('tokenizing {} docs...'.format(total))
for count, (serial, doc) in enumerate(docs.items(), start=1):
    tokenized_title = tokenize(doc['title'])
    tokenized_text = tokenize(doc['text'])
    filepath = path.join(TOKENIZED_OUTPUT_DIR, '{}-seg.xml'.format(serial))
    tokenized_docs[serial] = {
        'trec_text': build_trec_text(serial, tokenized_title, tokenized_text),
        'filepath': filepath,
    }
    # Report progress every 100 documents and once more at the very end; the
    # carriage return makes each report overwrite the previous one.
    if count == total or not count % 100:
        print('tokenized {}/{} docs'.format(count, total), end='\r')
# The untokenized documents are not used after this point.
del docs

tokenizing 11217 docs...
tokenized 11200/11217 docs

In [None]:
# Write each tokenized document to the file path recorded for it.
write_files(doc_set=tokenized_docs)