# General

## Imports

In [9]:
from collections import deque
from datetime import datetime
import os
from pathlib import Path
from pathlib import PurePosixPath
import pickle
from pprint import pprint
from random import Random
from random import sample

from lxml import etree
import pandas as pd

## Parameters

In [2]:
# Show up to 100 characters per cell when pandas renders a frame.
pd.options.display.max_colwidth = 100

In [3]:
# Prefix -> namespace URI table handed to every XPath query.
namespaces = {
    'atom': 'http://www.w3.org/2005/Atom',
    'cei': 'http://www.monasterium.net/NS/cei',
}

# Root directory that is scanned recursively for charter files.
directoryPath = "../data/db/mom-data/metadata.charter.public"

# Only files whose name ends with this suffix are picked up by the scan.
fileExtension = ".cei.xml"

# Load .xml mapping

In [10]:
# Load the pickled name -> XPath mapping.
# NOTE(review): unpickling can execute arbitrary code; only load mapping files
# that come from a trusted source.
mapping = pickle.loads(Path('../data/mapping/CEI2CSV.pkl').read_bytes())

pprint(mapping)

{'@from': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:issued/cei:dateRange/@from',
 '@id': '/atom:entry/atom:content/cei:text/@id',
 '@key': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:abstract/cei:persName/@key',
 '@n': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:witnessOrig/cei:archIdentifier/cei:idno/@n',
 '@old': '/atom:entry/atom:content/cei:text/cei:body/cei:idno/@old',
 '@target': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:diplomaticAnalysis/cei:p/cei:ref/@target',
 '@to': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:issued/cei:dateRange/@to',
 '@type': '/atom:entry/atom:content/@type',
 '@url': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:witnessOrig/cei:figure/cei:graphic/@url',
 '@value': '/atom:entry/atom:content/cei:text/cei:body/cei:chDesc/cei:issued/cei:date/@value',
 'app:control': '/atom:entry/app:control',
 'app:draft': '/atom:entry/app:control/app:draft',
 'atom:author': '/atom:en

# Scan directory and create paths

In [5]:
def get_file_paths(directory, extension=None):
    """Recursively yield the files below ``directory`` whose name ends with ``extension``.

    Parameters
    ----------
    directory : str or os.PathLike
        Directory to scan; sub-directories are descended into.
    extension : str, optional
        File-name suffix to match. When omitted, the module-level
        ``fileExtension`` is used, which keeps the original one-argument
        call working.

    Yields
    ------
    pathlib.Path
        One path per matching file, in the order ``os.scandir`` reports them.
    """
    if extension is None:
        extension = fileExtension

    # The context manager closes the directory handle even when the caller
    # stops consuming the generator early.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(extension):
                yield Path(entry.path)
            elif entry.is_dir():
                yield from get_file_paths(entry.path, extension)

In [6]:
# Collect every matching file path as a plain string.
paths = [str(PurePosixPath(found)) for found in get_file_paths(directoryPath)]

In [8]:
# Draw a small working sample of charter files.
# A dedicated, seeded generator over the sorted path list makes the sample
# identical on every run; capping the size avoids a ValueError when fewer
# than SAMPLE_SIZE files were found.
SAMPLE_SIZE = 20
SAMPLE_SEED = 42
paths_sample = Random(SAMPLE_SEED).sample(sorted(paths), min(SAMPLE_SIZE, len(paths)))

# 2: Alternative path scan with an explicit extension parameter (disabled)

In [9]:
# def get_file_paths(directory, extension):
#     for entry in os.scandir(directory):
#         if entry.is_file() and entry.name.endswith(extension):
#             yield Path(entry.path)
#         elif entry.is_dir():
#             yield from get_file_paths(entry.path, extension)
#         else:
#             continue

In [16]:
# paths = [f"{PurePosixPath(path)}" for path in get_file_paths(directory=directoryPath, extension=fileExtension)]

In [17]:
# len(paths)

659170

# Create deques, query paths, and fill deques

In [11]:
# Names of the columns collected per charter, in output order.
deques = deque([
    "atom_id",
    "cei_placeName",
    "cei_lang_MOM",
    "cei_tenor",
    "cei_date",
    "cei_date_ATTRIBUTE_value",
    "cei_dateRange",
    "cei_dateRange_ATTRIBUTE_from",
    "cei_dateRange_ATTRIBUTE_to",
    "cei_abstract",
    "cei_abstract_foreign",
    "cei_graphic_ATTRIBUTE_url_orig",
    "cei_graphic_ATTRIBUTE_url_copy",
])

# One independent, initially empty list per column name.
(
    atom_id,
    cei_placeName,
    cei_lang_MOM,
    cei_tenor,
    cei_date,
    cei_date_ATTRIBUTE_value,
    cei_dateRange,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_abstract,
    cei_abstract_foreign,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
) = [[] for _ in deques]

def get_xpath_result(xpath_mapping):
    """Evaluate ``xpath_mapping`` against the module-level ``tree``.

    The query is run with the module-level ``namespaces`` table and with
    ``smart_strings`` switched off. ``tree`` must have been assigned before
    this function is called.
    """
    xpath_options = {"namespaces": namespaces, "smart_strings": False}
    return tree.xpath(xpath_mapping, **xpath_options)

# Parse every sampled charter and append one entry per column list.
# Every list receives exactly one append per file so the lists stay aligned.
for file in paths_sample:
    # etree.parse opens the path itself, so no separate open() handle is needed.
    # `tree` stays a module-level name because get_xpath_result reads it.
    tree = etree.parse(file)
    atom_id.append(get_xpath_result(f"{mapping['atom:id']}/text()"))
    # NOTE(review): text nodes never match self::cei:sup, so this predicate
    # filters nothing. If text inside <cei:sup> should be dropped,
    # not(ancestor::cei:sup) is probably what was meant -- confirm before changing.
    cei_abstract.append("".join(get_xpath_result(f"{mapping['cei:abstract']}/descendant-or-self::text()[not(self::cei:sup)]")))
    cei_abstract_foreign.append(get_xpath_result(f"{mapping['cei:foreign']}/text()"))
    # The tenor is stored as one joined string in the `cei_tenor` column list.
    cei_tenor.append("".join(get_xpath_result(f"{mapping['cei:tenor']}/descendant-or-self::text()[not(self::cei:sup)]")))
    # TODO: planned tenor variants (cei_pTenor list, cei_tenor_sup,
    # cei_tenor_firstApp, cei_tenor_noSup, cei_tenor_app) have no column list
    # yet; add them to the column definitions before collecting them here.
    cei_placeName.append(get_xpath_result(f"{mapping['cei:issued/cei:placeName']}/text()"))
    cei_lang_MOM.append(get_xpath_result(f"{mapping['cei:lang_MOM']}/text()"))
    cei_date.append(get_xpath_result(f"{mapping['cei:date']}/text()"))
    cei_dateRange.append(get_xpath_result(f"{mapping['cei:dateRange']}/text()"))
    cei_date_ATTRIBUTE_value.append(get_xpath_result(f"{mapping['cei:date']}/@value"))
    cei_dateRange_ATTRIBUTE_from.append(get_xpath_result(f"{mapping['cei:dateRange']}/@from"))
    cei_dateRange_ATTRIBUTE_to.append(get_xpath_result(f"{mapping['cei:dateRange']}/@to"))
    cei_graphic_ATTRIBUTE_url_orig.append(get_xpath_result(f"{mapping['@url']}"))
    cei_graphic_ATTRIBUTE_url_copy.append(get_xpath_result(f"{mapping['cei:graphic/@url']}"))

# Merge deques to dataframe

In [12]:
# Print one "index:'name'," line per column name so the output can be pasted
# into a rename mapping. enumerate() already supplies the running index, so no
# manual increment is needed.
for i, item in enumerate(deques):
    print(f"{i}:'{item}',")

0:'atom_id',
1:'cei_placeName',
2:'cei_lang_MOM',
3:'cei_tenor',
4:'cei_date',
5:'cei_date_ATTRIBUTE_value',
6:'cei_dateRange',
7:'cei_dateRange_ATTRIBUTE_from',
8:'cei_dateRange_ATTRIBUTE_to',
9:'cei_abstract',
10:'cei_abstract_foreign',
11:'cei_graphic_ATTRIBUTE_url_orig',
12:'cei_graphic_ATTRIBUTE_url_copy',


In [13]:
# Single source of truth: column name -> list of per-charter values.
column_values = {
    "atom_id": atom_id,
    "cei_placeName": cei_placeName,
    "cei_lang_MOM": cei_lang_MOM,
    "cei_tenor": cei_tenor,
    "cei_date": cei_date,
    "cei_date_ATTRIBUTE_value": cei_date_ATTRIBUTE_value,
    "cei_dateRange": cei_dateRange,
    "cei_dateRange_ATTRIBUTE_from": cei_dateRange_ATTRIBUTE_from,
    "cei_dateRange_ATTRIBUTE_to": cei_dateRange_ATTRIBUTE_to,
    "cei_abstract": cei_abstract,
    "cei_abstract_foreign": cei_abstract_foreign,
    "cei_graphic_ATTRIBUTE_url_orig": cei_graphic_ATTRIBUTE_url_orig,
    "cei_graphic_ATTRIBUTE_url_copy": cei_graphic_ATTRIBUTE_url_copy,
}

# zip() silently truncates to the shortest input, which would hide a column
# that was never (or only partly) filled -- fail loudly instead.
column_lengths = {name: len(values) for name, values in column_values.items()}
if len(set(column_lengths.values())) > 1:
    raise ValueError(f"Column lists differ in length: {column_lengths}")

# One tuple per charter, then a frame with the column names applied directly.
contents = list(zip(*column_values.values()))
contents_full = pd.DataFrame(contents, columns=list(column_values))

In [17]:
# Show the assembled charter table as this cell's output.
contents_full

Unnamed: 0,atom_id,cei_placeName,cei_lang_MOM,cei_tenor,cei_date,cei_date_ATTRIBUTE_value,cei_dateRange,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_abstract,cei_abstract_foreign,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy
0,"[tag:www.monasterium.net,2011:/charter/HU-MNL-DLINTI/BUDKQ397/161931]",[],[],,[9999],[99999999],[],[],[],,[],[http://archives.hungaricana.hu/tile/thumb/charters/olpic/107400/DL_107350/DL_107350_copia_1.ecw...,[]
1,"[tag:www.monasterium.net,2011:/charter/CZ-NA/AZK/1417]",[],[německy],,[22. únor 1538],[15380222],[],[],[],"Ubrmanská úmluva mezi chebským klášterem a Erhardem Wernherem, Niklasem Prunnerem a Šebestianem ...",[],"[NA-AZK_15380222_01417_r.jpg, NA-AZK_15380222_01417_v.jpg]",[]
2,"[tag:www.monasterium.net,2011:/charter/IlluminierteUrkundenKurie/1424-01-04_Linz]",[],[],,[],[],[1424-01-04],[14240104],[14240104],"Papst Martin V. erteilt dem Kloster Gleink einen Ablass, weil es ignis incendio vielen Schaden e...",[],[],[]
3,"[tag:www.monasterium.net,2011:/charter/IT-ASFi/DNMonticianoSSPietroPaolo/00030149]",[nel portico posteriore del monastero dei frati eremiti di Monticiano],[],,[],[],[1307 Giugno 17],[13070617],[13070617],,[],[],[]
4,"[tag:www.monasterium.net,2011:/charter/IT-ASFi/DNMonticianoSSPietroPaolo/00020999]",[nel luogo suddetto [comune di Monticiano]],[],,[],[],[1283 Gennaio 8],[12830108],[12830108],,[],[],[]
5,"[tag:www.monasterium.net,2011:/charter/DE-StAAm/EnsdorfOSB/277]",[],[],,[15. Juni 1415],[14150615],[],[],[],Verkauf eines Ackers zu Lammerthal (Laymtal) als freies lediges Eigen. Aussteller: A: Friedrich ...,[],[],[]
6,"[tag:www.monasterium.net,2011:/charter/AT-DOZA/Urkunden/1686_V_25b]",[],[],,[1686 Mai 25],[16860525],[],[],[],noch kein Regest vorhanden,[],"[DOZA-Urkunden_16860525_b_01.jpg, DOZA-Urkunden_16860525_b_02.jpg, DOZA-Urkunden_16860525_b_03.jpg]",[]
7,"[tag:www.monasterium.net,2011:/charter/PassauSNikolaTB/296]",[],[lat.],\n Text:\n + Require post finem libri in hoc compertorio hoc signum:a\n No...,[],[99999999],[],[],[],Dietmar und Liupold erhalten für sich und ihre Söhne den\n Besitz zu Beneuentenrut...,[],[],[]
8,"[tag:www.monasterium.net,2011:/charter/AT-NOeLA/StA_Urk/StA_Urk_1845]",[],[],,[],[],[1412 September 29.],[14120929],[14120929],"Hans von Ernfels bestätigt, daß die Verfügungen seiner Nichte Katharina, Tochter seines Bruders ...",[],[],[]
9,"[tag:www.monasterium.net,2011:/charter/HU-MNL-DLCSLP/PERCSQ148/81789]",[],[],,[1430-11-11],[14301111],[],[],[],ZSIGMOND KIRÁLY,[],[http://archives.hungaricana.hu/tile/thumb/charters/olpic/071200/DL_071101/DL_071101_orig_01.ecw...,[]


# Export whole dataframe

In [17]:
# Stamp the export with the current local time (minute resolution) so that
# successive runs write to distinct files.
timemarker = format(datetime.today(), '%Y-%m-%d-%H%M')

# Write the same frame once as JSON and once as Parquet.
json_target = f"../data/output/charters_{timemarker}.json"
parquet_target = f"../data/output/charters_{timemarker}.parquet"
contents_full.to_json(json_target)
contents_full.to_parquet(parquet_target)

---

# Import generated data as dataframe

In [18]:
# Read back one specific earlier export (file names carry its time stamp),
# once from JSON and once from Parquet.
contents_full_json = pd.read_json(
    "../data/output/charters_2022-08-31-2322.json"
)
contents_full_parquet = pd.read_parquet(
    '../data/output/charters_2022-08-31-2322.parquet',
    engine='pyarrow',
)

In [23]:
# Temporarily lift pandas' row and column display limits so that all 100
# randomly drawn tenor values are printed.
unlimited_display = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*unlimited_display):
    tenor_sample = contents_full_parquet["cei_tenor"].sample(n=100)
    print(tenor_sample)

555860                                                                                                       
203123                                                                                                       
93740                                                                                                        
60018                                                                                                        
420958                                                                                                       
349345                                                                                                       
280717                                                                                                       
193136                                                                                                       
125320                                                                                                       
118616    