In [5]:
import os
import json
import re
from rethinkdb import RethinkDB
from dotenv import dotenv_values

In [6]:
# PATHS
# All paths are relative, so they resolve against the notebook's working
# directory. Directory paths end with '/' because file names are appended to
# them with plain string concatenation.

# directory that is scanned for the source .txt documents
PATH_DATA_IN = './../data/documents/'
# directory that receives the rewritten .txt documents (created on demand)
PATH_DATA_OUT = './../data/out/'
# not referenced by any of the cells visible in this notebook
PATH_ENTITIES = './../../all-parsed/entities.json'
# directory holding the JSON dumps of entity ids fetched from the database
PATH_ENTITY_DATA = './../data/entities/'
# final JSON file containing the list of document records
PATH_OUT_FILE = './../documents.json'

In [7]:
# Build txt_files: one {'name', 'content'} dict per .txt file in PATH_DATA_IN.
# - sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
#   order of the generated documents stable between runs.
# - os.path.splitext(): strips only the final '.txt', so 'trial.part-2.txt'
#   keeps the name 'trial.part-2' instead of collapsing to 'trial' and clashing
#   with every other 'trial.*' file.
txt_files = []
for file in sorted(os.listdir(PATH_DATA_IN)):
    if not file.endswith('.txt'):
        continue
    with open(os.path.join(PATH_DATA_IN, file), 'r') as f:
        txt_files.append({
            'name': os.path.splitext(file)[0],
            'content': f.read()
        })
            

In [8]:
# Connect to the RethinkDB database and refresh the local entity-id lookups.
# This needs a tunnel to be initialized with the command in tunnel.txt.
r = RethinkDB()
config = dotenv_values("./../../../env/.env.tunnel")

conn = r.connect(host=config["DB_HOST"], port=config["DB_PORT"], password=config["DB_PASS"], db=config["DB_NAME"])
try:
    # NOTE(review): the database name is hard-coded here although the connection
    # was opened with db=config["DB_NAME"] — confirm the two are meant to agree.
    entities_table = r.db('inkvisitor').table('entities')

    # get all entities with legacyId
    entities_with_legacy_id = entities_table.with_fields(["id", "legacyId"]).coerce_to('array').run(conn)

    # get the id of every entity
    entities_all_ids = entities_table.pluck('id').coerce_to('array').run(conn)
finally:
    # always release the connection, even when one of the queries fails;
    # nothing below needs the database any more
    conn.close()

# make sure the dump directory exists before writing into it
os.makedirs(PATH_ENTITY_DATA, exist_ok=True)

# legacyId -> id lookup, saved to PATH_ENTITY_DATA + 'entities_with_legacy_id.json'
entities_with_legacy_id_dict = {
    entity['legacyId']: entity['id'] for entity in entities_with_legacy_id
}

with open(PATH_ENTITY_DATA + 'entities_with_legacy_id.json', 'w') as f:
    json.dump(entities_with_legacy_id_dict, f)


# membership lookup of all entity ids in the form "id": True,
# saved to PATH_ENTITY_DATA + 'entities_all_ids.json'
entities_all_ids_dict = {entity['id']: True for entity in entities_all_ids}

with open(PATH_ENTITY_DATA + 'entities_all_ids.json', 'w') as f:
    json.dump(entities_all_ids_dict, f)


In [9]:
# Read the two lookup tables back from the JSON dumps in PATH_ENTITY_DATA:
#   lagacy_dict   - legacyId -> id
#   entities_dict - id -> True
legacy_lookup_path = PATH_ENTITY_DATA + 'entities_with_legacy_id.json'
with open(legacy_lookup_path, 'r') as legacy_fh:
    lagacy_dict = json.load(legacy_fh)

all_ids_lookup_path = PATH_ENTITY_DATA + 'entities_all_ids.json'
with open(all_ids_lookup_path, 'r') as all_ids_fh:
    entities_dict = json.load(all_ids_fh)
    

def parse_file(file):
    """Replace legacy entity ids in a document's tags and save the result.

    References are marked in the text as ``<id>text text</id>``. Every id that
    appears in an opening tag is looked up:

    * found in ``lagacy_dict`` (legacyId -> id): its opening and closing tags
      are rewritten to the mapped id;
    * found in ``entities_dict``: left untouched;
    * otherwise: reported once with an ``entity not found`` message.

    All tags are rewritten in a single pass over the original text, so an id
    produced by one replacement is never replaced again when it also happens
    to be a legacy id used elsewhere in the document.

    The rewritten text is written to ``PATH_DATA_OUT + name + '.txt'``; the
    directory is created when missing.

    Reads the module-level ``lagacy_dict``, ``entities_dict`` and
    ``PATH_DATA_OUT``.

    Args:
        file: dict with the keys 'name' and 'content'.

    Returns:
        dict with the keys 'name' (unchanged) and 'content' (rewritten text).
    """
    name = file['name']
    content = file['content']

    # ids used in opening tags, de-duplicated but kept in order of first use
    references = list(dict.fromkeys(re.findall(r'<([\w-]+)>', content)))

    replacements = {}
    for ref in references:
        if ref in lagacy_dict:
            replacements[ref] = lagacy_dict[ref]
        elif ref not in entities_dict:
            print(f'entity not found: {ref} in {name}')

    if replacements:
        def _translate(match):
            # group(1) is '/' for a closing tag and '' for an opening tag
            slash, ref = match.group(1), match.group(2)
            if ref in replacements:
                return f'<{slash}{replacements[ref]}>'
            return match.group(0)

        content = re.sub(r'<(/?)([\w-]+)>', _translate, content)

    # make sure PATH_DATA_OUT exists
    os.makedirs(PATH_DATA_OUT, exist_ok=True)

    # write the new content to PATH_DATA_OUT
    # NOTE(review): no explicit encoding is given, so the locale default is used
    # here just like where the documents are read — change both together.
    with open(os.path.join(PATH_DATA_OUT, name + '.txt'), 'w') as f:
        f.write(content)

    return {
        'name': name,
        'content': content
    }
    

# Run every loaded document through parse_file (which also writes the rewritten
# text to PATH_DATA_OUT) and shape each result into a document record.
# The name doubles as id and title; both timestamps are a fixed placeholder.
parsed_documents = [
    {
        "id": parsed['name'],
        "content": parsed['content'],
        "title": parsed['name'],
        "createdAt": '2021-01-01T00:00:00.000Z',
        "updatedAt": '2021-01-01T00:00:00.000Z'
    }
    for parsed in map(parse_file, txt_files)
]

# Serialize all records to PATH_OUT_FILE as one JSON array.
with open(PATH_OUT_FILE, 'w') as out_fh:
    json.dump(parsed_documents, out_fh)

