In [1]:
import sqlite3
from os import path
from zipfile import ZipFile

In [2]:
# SQLite file that receives the documents extracted from the zip archives.
DB_PATH = path.join('qalab3-essay-phase2', 'ja_doc.db')

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Rebuild the target table from scratch so re-running this cell never keeps
# rows left over from an earlier run.
for ddl_statement in (
    'DROP TABLE IF EXISTS ja_docs',
    'CREATE TABLE ja_docs (doc_no TEXT, doc TEXT)',
):
    cursor.execute(ddl_statement)
conn.commit()

In [3]:
def write_doc_to_db(zip_file_path, db_cursor, inner_folder_path=None):
    """Insert every ``</DOC>``-terminated block of a zip archive into ``ja_docs``.

    Members are selected by path prefix: the zip file's base name without its
    extension, optionally joined with ``inner_folder_path``. Directory entries
    are skipped. Each remaining member is read line by line as UTF-8; lines are
    accumulated until a line that equals ``</DOC>`` after stripping whitespace,
    at which point the accumulated text (including the closing line) becomes
    one document. Text after the last ``</DOC>`` of a member is discarded.

    A document's ID comes from its ``<DOCNO>`` line. A document without a
    non-empty ``<DOCNO>`` is stored under the member name with ``.txt``
    removed. Within a single member, a later document with the same ID
    replaces the earlier one.

    Args:
        zip_file_path: Path of the zip archive to read.
        db_cursor: DB-API cursor on a database that already has a
            ``ja_docs (doc_no, doc)`` table. Rows are inserted with
            ``executemany`` once per member.
        inner_folder_path: Optional sub-folder (relative to the archive's
            top-level folder) to restrict the import to.

    Returns:
        None. This function does not commit; the caller must do so.
    """
    subtree = path.splitext(path.basename(zip_file_path))[0]
    if inner_folder_path:
        subtree = path.join(subtree, inner_folder_path)
    with ZipFile(zip_file_path) as z:
        # Keep only the members located at or below the selected subtree.
        names = [name for name in z.namelist()
                 if path.commonpath([subtree, name]) == subtree]
        for filename in names:
            if z.getinfo(filename).is_dir():
                continue
            # ID used for documents that carry no <DOCNO> of their own.
            fallback_id = filename.replace('.txt', '')
            docs = {}
            doc_lines = []
            doc_id = None
            with z.open(filename) as f:
                for raw_line in f:
                    line = raw_line.decode('UTF-8')
                    doc_lines.append(line)
                    stripped_line = line.strip()
                    if stripped_line == '</DOC>':
                        docs[doc_id or fallback_id] = ''.join(doc_lines)
                        doc_lines = []
                        # Forget the ID so that the next document cannot
                        # inherit it and overwrite this one when it has no
                        # <DOCNO> line.
                        doc_id = None
                    elif stripped_line.startswith('<DOCNO>'):
                        doc_id = stripped_line.replace('<DOCNO>', '').replace('</DOCNO>', '').strip()
                        # Shorten one known over-long document number.
                        # NOTE(review): corpus-specific fix-up; confirm it is still needed.
                        doc_id = doc_id.replace('2部 ユーラシア諸地域の交流と再', '2')
            db_cursor.executemany('INSERT INTO ja_docs (doc_no, doc) VALUES (?, ?)',
                                  list(docs.items()))

In [4]:
# Load make-index-Tokyoshoseki.zip, restricted to its 'tokyoshoseki-newxml' folder.
tokyoshoseki_zip = ('qalab3-essay-phase2/_references/training_set/qalab3-ja-knowledgesource/'
                    'qalab3-ja-knowledgesource/make-index-Tokyoshoseki.zip')
write_doc_to_db(zip_file_path=tokyoshoseki_zip,
                db_cursor=cursor,
                inner_folder_path='tokyoshoseki-newxml')
# write_doc_to_db only inserts; persist the rows here.
conn.commit()

In [6]:
# Load make-index-Yamakawa.zip, restricted to its 'yamakawa-Sekaishi-newxml' folder.
yamakawa_zip = ('qalab3-essay-phase2/_references/training_set/qalab3-ja-knowledgesource/'
                'qalab3-ja-knowledgesource/make-index-Yamakawa.zip')
write_doc_to_db(zip_file_path=yamakawa_zip,
                db_cursor=cursor,
                inner_folder_path='yamakawa-Sekaishi-newxml')
# write_doc_to_db only inserts; persist the rows here.
conn.commit()

In [9]:
# Load jawiki-100M-newxml.zip; with no inner folder given, everything under
# the archive's 'jawiki-100M-newxml' folder is imported.
jawiki_zip = 'qalab3-essay-phase2/_references/training_set/jawiki-100M-newxml.zip'
write_doc_to_db(zip_file_path=jawiki_zip, db_cursor=cursor)
# write_doc_to_db only inserts; persist the rows here.
conn.commit()

In [7]:
# Release the SQLite connection; every import cell has already committed its inserts.
conn.close()