# General

## Imports

In [None]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import pandas as pd
import numpy as np
import pickle
import re
from pprint import pprint
from random import sample

## Parameters

In [None]:
# Let pandas show up to 100 characters per cell before truncating the display.
pd.options.display.max_colwidth = 100

In [None]:
# Prefix-to-URI map handed to every XPath query.
namespaces = {
    'atom': 'http://www.w3.org/2005/Atom',
    'cei': 'http://www.monasterium.net/NS/cei',
}

# Root directory that is scanned recursively for charter files.
directoryPath = "../data/db/mom-data/metadata.charter.public"

# Only files whose name ends with this suffix are collected.
fileExtension = ".cei.xml"

# Load .xml mapping

In [None]:
# NOTE(review): pickle can execute arbitrary code while loading, so this file
# must come from a trusted source only.
mapping_file = Path('../data/mapping/CEI2CSV.pkl')
with mapping_file.open('rb') as f:
    mapping = pickle.load(f)

# Show the loaded mapping for a quick visual check.
pprint(mapping)

# Scan directory and create paths

In [None]:
def get_file_paths(directory, extension=None):
    """Recursively yield the path of every matching file below *directory*.

    Args:
        directory: Directory to scan (anything ``os.scandir`` accepts).
        extension: File-name suffix to match. When omitted, the module-level
            ``fileExtension`` is used, so existing single-argument calls
            behave as before.

    Yields:
        pathlib.Path: One path per regular file whose name ends with the
        suffix, in the order ``os.scandir`` reports the entries (unsorted).
    """
    if extension is None:
        extension = fileExtension
    # The context manager releases the scandir handle even when the caller
    # stops consuming the generator before it is exhausted.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(extension):
                yield Path(entry.path)
            elif entry.is_dir():
                # Descend into sub-directories with the same suffix filter.
                yield from get_file_paths(entry.path, extension)

In [None]:
# Collect every charter file below directoryPath as a plain string path.
paths = [str(PurePosixPath(found)) for found in get_file_paths(directoryPath)]

In [None]:
#paths_sample = sample(paths, 30000)

# Create lists, query paths, and fill lists

In [None]:
def get_xpath_result(xpath_mapping, document=None, ns=None):
    """Evaluate an XPath expression and return the raw result.

    Args:
        xpath_mapping: XPath expression string to evaluate.
        document: Object providing an ``xpath()`` method. When omitted, the
            module-level ``tree`` is used, which keeps existing
            single-argument calls working.
        ns: Prefix-to-URI mapping for the expression. When omitted, the
            module-level ``namespaces`` is used.

    Returns:
        Whatever ``document.xpath`` returns for the expression.
    """
    if document is None:
        document = tree
    if ns is None:
        ns = namespaces
    # smart_strings=False is passed through unchanged from the original call.
    return document.xpath(xpath_mapping, namespaces=ns, smart_strings=False)

In [None]:
# Names of all result columns, in collection order.
lists = [
    "atom_id",
    "cei_abstract_joined",
    "cei_abstract_foreign",
    "cei_tenor_joined",
    "cei_pTenor",
    "cei_tenor_appLem",
    "cei_placeName",
    "cei_lang_MOM",
    "cei_date",
    "cei_dateRange",
    "cei_date_ATTRIBUTE_value",
    "cei_dateRange_ATTRIBUTE_from",
    "cei_dateRange_ATTRIBUTE_to",
    "cei_graphic_ATTRIBUTE_url_orig",
    "cei_graphic_ATTRIBUTE_url_copy",
]

# One independent empty list per column name; the generator produces a fresh
# list for every target so no two names share the same object.
(
    atom_id,
    cei_abstract_joined,
    cei_abstract_foreign,
    cei_tenor_joined,
    cei_pTenor,
    cei_tenor_appLem,
    cei_placeName,
    cei_lang_MOM,
    cei_date,
    cei_dateRange,
    cei_date_ATTRIBUTE_value,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
) = ([] for _ in lists)

In [None]:
# One entry per collected column: the list that receives the result, the
# XPath expression to evaluate, and whether the matched text nodes are
# concatenated into a single string. The expressions do not depend on the
# file being parsed, so they are built once instead of once per file.
extractions = [
    (atom_id, f"{mapping['atom:id']}/text()", False),
    (cei_abstract_joined, f"{mapping['cei:abstract']}/descendant-or-self::node()[not(name()='cei:sup')]/text()", True),
    (cei_abstract_foreign, f"{mapping['cei:foreign']}/text()", False),
    (cei_tenor_joined, f"{mapping['cei:tenor']}/descendant-or-self::node()[not(name()='cei:sup')]/text()", True),
    (cei_pTenor, f"{mapping['cei:pTenor']}/text()[not(self::cei:sup)]", False),
    (cei_tenor_appLem, f"{mapping['cei:tenor']}/descendant-or-self::node()/node()", False),
    (cei_placeName, f"{mapping['cei:issued/cei:placeName']}/text()", False),
    (cei_lang_MOM, f"{mapping['cei:lang_MOM']}/text()", False),
    (cei_date, f"{mapping['cei:date']}/text()", False),
    (cei_dateRange, f"{mapping['cei:dateRange']}/text()", False),
    (cei_date_ATTRIBUTE_value, f"{mapping['cei:date']}/@value", False),
    (cei_dateRange_ATTRIBUTE_from, f"{mapping['cei:dateRange']}/@from", False),
    (cei_dateRange_ATTRIBUTE_to, f"{mapping['cei:dateRange']}/@to", False),
    (cei_graphic_ATTRIBUTE_url_orig, f"{mapping['@url']}", False),
    (cei_graphic_ATTRIBUTE_url_copy, f"{mapping['cei:graphic/@url']}", False),
]

for file in paths:
    # etree.parse() opens the file from its path, so the extra open() handle
    # the loop used to create (and never read from) is not needed.
    # `tree` has to stay a module-level name: get_xpath_result() reads it.
    tree = etree.parse(file)
    for target, expression, join_text in extractions:
        result = get_xpath_result(expression)
        # Every list gets exactly one entry per file, keeping rows aligned.
        target.append("".join(result) if join_text else result)

# Merge lists to dataframe

In [None]:
# Print one "index:'name'," line per entry of `lists`, as dictionary content
# for renaming positional dataframe columns.
# enumerate() already supplies the running index; the former `i =+ 1` was a
# typo for `+=` that assigned +1 and had no effect on the printed output.
for i, item in enumerate(lists):
    print(f"{i}:'{item}',")

In [None]:
# Column name -> collected list, in output order. Keeping each name next to
# its data means the labels cannot drift from the zip order, which a
# hand-numbered rename dictionary allows.
# NOTE(review): cei_tenor_appLem is collected but, as before, not part of the
# frame; it holds raw results of a node() XPath query. Confirm whether it
# should be exported.
export_columns = {
    'atom_id': atom_id,
    'cei_abstract_joined': cei_abstract_joined,
    'cei_abstract_foreign': cei_abstract_foreign,
    'cei_tenor_joined': cei_tenor_joined,
    'cei_pTenor': cei_pTenor,
    'cei_placeName': cei_placeName,
    'cei_lang_MOM': cei_lang_MOM,
    'cei_date': cei_date,
    'cei_dateRange': cei_dateRange,
    'cei_date_ATTRIBUTE_value': cei_date_ATTRIBUTE_value,
    'cei_dateRange_ATTRIBUTE_from': cei_dateRange_ATTRIBUTE_from,
    'cei_dateRange_ATTRIBUTE_to': cei_dateRange_ATTRIBUTE_to,
    'cei_graphic_ATTRIBUTE_url_orig': cei_graphic_ATTRIBUTE_url_orig,
    'cei_graphic_ATTRIBUTE_url_copy': cei_graphic_ATTRIBUTE_url_copy,
}

# One tuple per parsed file (row-wise view of the column lists).
contents = list(zip(*export_columns.values()))

# Passing the names via `columns=` labels the frame even when no file was
# parsed; renaming positional columns afterwards left an empty frame unnamed.
contents_full = pd.DataFrame(contents, columns=list(export_columns))

In [None]:
contents_full

In [None]:
# cei_tenor_appLem is not a column of contents_full (it is left out when the
# frame is assembled), so attribute access on the frame raises AttributeError.
# Build the series straight from the collected list instead.
test = pd.Series(cei_tenor_appLem, dtype=object, name="cei_tenor_appLem")

In [None]:
# Timestamp with minute resolution, used as part of the output file name.
run_started = datetime.today()
timemarker = run_started.strftime('%Y-%m-%d-%H%M')

# Write the test series to a timestamped CSV file.
test_csv = f"../data/output/tests/test_charters_{timemarker}.csv"
test.to_csv(test_csv)

# Export whole dataframe

In [None]:
# Timestamp with minute resolution, shared by both export files.
timemarker = datetime.today().strftime('%Y-%m-%d-%H%M')

# Write the full frame once as JSON and once as parquet.
json_target = f"../data/output/charters_{timemarker}.json"
parquet_target = f"../data/output/charters_{timemarker}.parquet"
contents_full.to_json(json_target)
contents_full.to_parquet(parquet_target)

---

# Import generated data as dataframe

In [None]:
# Alternative source: the JSON export of an earlier run.
#contents_full_json = pd.read_json("../data/output/charters_2022-08-31-2322.json")

# Load a previously generated parquet export using the pyarrow engine.
parquet_file = '../data/output/charters_2022-09-18-2211.parquet'
contents_full_parquet = pd.read_parquet(parquet_file, engine='pyarrow')

In [None]:
# Temporarily lift the row and column display limits; option_context accepts
# further option/value pairs if more settings are needed.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # to_string has to be called: printing the bare attribute shows the bound
    # method's repr instead of the rendered row. iloc[1] is the second row.
    print(contents_full_parquet.iloc[1].to_string())