# General

## Imports

In [1]:
from collections import deque
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import pandas as pd
import pickle
import pprint
import random

## Parameters

In [3]:
# Allow up to 100 characters per cell before pandas shortens the displayed text.
pd.options.display.max_colwidth = 100

In [4]:
# Prefix -> namespace URI table handed to every XPath evaluation.
namespaces = dict(
    atom='http://www.w3.org/2005/Atom',
    cei='http://www.monasterium.net/NS/cei',
)
# Root directory that is scanned (recursively) for charter files.
directoryPath = "../data/db/mom-data/metadata.charter.public"
# Only files whose name ends with this suffix are collected.
fileExtension = ".cei.xml"

# Load .xml mapping

In [9]:
# Load the pickled lookup table; its values are spliced into XPath expressions
# further down (looked up by keys such as 'atom:id').
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source.
with Path('../data/mapping/CEI2CSV.pkl').open('rb') as f:
    mapping = pickle.loads(f.read())

# Scan directory and create paths

In [10]:
def get_file_paths(directory, extension=None):
    """Recursively yield every file below ``directory`` whose name ends with ``extension``.

    Args:
        directory: Path (str or path-like) of the folder to walk.
        extension: File-name suffix to match. When ``None`` (the default) the
            module-level ``fileExtension`` is used, which keeps existing
            single-argument calls working unchanged.

    Yields:
        pathlib.Path: One path per matching file, in the (arbitrary) order
        reported by ``os.scandir``.
    """
    if extension is None:
        extension = fileExtension
    # The context manager closes the directory handle deterministically, even
    # when the consumer abandons the generator early or an error is raised.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file() and entry.name.endswith(extension):
                yield Path(entry.path)
            elif entry.is_dir():
                # Descend into sub-folders with the same suffix filter.
                yield from get_file_paths(entry.path, extension)

In [11]:
# Materialise every matching file as a path string rendered through PurePosixPath.
paths = [str(PurePosixPath(found)) for found in get_file_paths(directoryPath)]

# Create deques, query paths, and fill deques

In [13]:
# Names of the output columns, in the order used when the dataframe is assembled.
deques = deque([
    "atom_id",
    "cei_placeName",
    "cei_lang_MOM",
    "cei_tenor",
    "cei_date",
    "cei_date_ATTRIBUTE_value",
    "cei_dateRange",
    "cei_dateRange_ATTRIBUTE_from",
    "cei_dateRange_ATTRIBUTE_to",
    "cei_abstract",
    "cei_abstract_foreign",
    "cei_graphic_ATTRIBUTE_url_orig",
    "cei_graphic_ATTRIBUTE_url_copy",
])
# One separate, initially empty list per column name; the extraction loop
# appends one entry per parsed file to each of them.
(
    atom_id,
    cei_placeName,
    cei_lang_MOM,
    cei_tenor,
    cei_date,
    cei_date_ATTRIBUTE_value,
    cei_dateRange,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_abstract,
    cei_abstract_foreign,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
) = ([] for _ in deques)

def get_xpath_result(xpath_mapping):
    """Evaluate ``xpath_mapping`` against the module-level ``tree``.

    The query is run with the notebook's ``namespaces`` table and with
    ``smart_strings`` switched off. The function reads the global ``tree``,
    so it must only be called after a document has been parsed into it.

    Args:
        xpath_mapping: The XPath expression to evaluate.

    Returns:
        Whatever ``tree.xpath`` returns for the expression.
    """
    query_options = {"namespaces": namespaces, "smart_strings": False}
    return tree.xpath(xpath_mapping, **query_options)

# Parse every collected file once and append exactly one entry per column list,
# so that all lists stay aligned row by row.
for file in paths:
    # etree.parse is given the path and opens the file itself, so no separate
    # open() handle is needed (the previous one was opened but never read).
    # `tree` stays a module-level name because get_xpath_result reads it.
    tree = etree.parse(file)
    atom_id.append(get_xpath_result(f"{mapping['atom:id']}/text()"))
    # Abstract and tenor are flattened into a single string; the other columns
    # keep the raw list returned by the XPath query.
    # NOTE(review): `[not(self::cei:sup)]` is applied to text nodes, which can
    # never be a cei:sup element, so the predicate looks like a no-op.
    # Presumably text inside <cei:sup> was meant to be skipped
    # (e.g. `[not(parent::cei:sup)]`) - confirm before changing, as it alters
    # the exported text.
    cei_abstract.append("".join(get_xpath_result(f"{mapping['cei:abstract']}/descendant-or-self::text()[not(self::cei:sup)]")))
    cei_abstract_foreign.append(get_xpath_result(f"{mapping['cei:foreign']}/text()"))
    cei_tenor.append("".join(get_xpath_result(f"{mapping['cei:tenor']}/descendant-or-self::text()[not(self::cei:sup)]")))
    cei_placeName.append(get_xpath_result(f"{mapping['cei:issued/cei:placeName']}/text()"))
    cei_lang_MOM.append(get_xpath_result(f"{mapping['cei:lang_MOM']}/text()"))
    # Dates: element text plus the attributes (@value, @from, @to).
    cei_date.append(get_xpath_result(f"{mapping['cei:date']}/text()"))
    cei_dateRange.append(get_xpath_result(f"{mapping['cei:dateRange']}/text()"))
    cei_date_ATTRIBUTE_value.append(get_xpath_result(f"{mapping['cei:date']}/@value"))
    cei_dateRange_ATTRIBUTE_from.append(get_xpath_result(f"{mapping['cei:dateRange']}/@from"))
    cei_dateRange_ATTRIBUTE_to.append(get_xpath_result(f"{mapping['cei:dateRange']}/@to"))
    # Image references: the mapped expressions are used as-is, without a suffix.
    cei_graphic_ATTRIBUTE_url_orig.append(get_xpath_result(f"{mapping['@url']}"))
    cei_graphic_ATTRIBUTE_url_copy.append(get_xpath_result(f"{mapping['cei:graphic/@url']}"))

# Merge deques to dataframe

In [14]:
# Print one "index:'name'," line per column name; the output can be pasted
# into a rename dictionary for the dataframe columns.
# enumerate() already supplies the running index, so no manual counter is kept
# (the former `i =+ 1` only re-assigned i to +1 and had no effect on the output).
for i, item in enumerate(deques):
    print(f"{i}:'{item}',")

0:'atom_id',
1:'cei_placeName',
2:'cei_lang_MOM',
3:'cei_tenor',
4:'cei_date',
5:'cei_date_ATTRIBUTE_value',
6:'cei_dateRange',
7:'cei_dateRange_ATTRIBUTE_from',
8:'cei_dateRange_ATTRIBUTE_to',
9:'cei_abstract',
10:'cei_abstract_foreign',
11:'cei_graphic_ATTRIBUTE_url_orig',
12:'cei_graphic_ATTRIBUTE_url_copy',


In [15]:
# Combine the per-column lists row-wise: one tuple per parsed file.
contents = list(zip(
    atom_id,
    cei_placeName,
    cei_lang_MOM,
    cei_tenor,
    cei_date,
    cei_date_ATTRIBUTE_value,
    cei_dateRange,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_abstract,
    cei_abstract_foreign,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
))
# Take the column names directly from `deques`, which lists them in the same
# order as the zip() above, instead of a hand-pasted index -> name dictionary
# that could drift out of sync. Passing `columns=` also keeps the named
# columns when `contents` is empty.
contents_full = pd.DataFrame(contents, columns=list(deques))

In [68]:
# Display the assembled dataframe as the cell output (the saved output below
# shows only the first and last rows).
contents_full

Unnamed: 0,atom_id,cei_placeName,cei_lang_MOM,cei_tenor,cei_date,cei_date_ATTRIBUTE_value,cei_dateRange,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_abstract,cei_abstract_foreign,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy
0,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0632d68f-f907-4113-8...",[Napoli],[],,[],[],[1578 agosto 29],[15780829],[15780829],Fede del miracolo di S. Giovanni Battista nella Chiesa di S. Liguoro.,[],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
1,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0e3d1203-0e72-4b17-8...",[Napoli],[],+In nomine domini dei salvatoris nostri Iuesu Christi. Anno ab incarnationis eius millesimo duce...,[],[],[1268 ottobre 23],[12681023],[12681023],"Palermo de Caserta, figlio dei defunti Giovanni e Sarracina, abitante di Calvizzano, ottiene in ...",[],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
2,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0eb5100f-8132-48fd-a...",[Napoli],[],,[],[],[1127 febbraio 16],[11270216],[11270216],Privigelio di Sergio console e duca in favore del monastero di S. Liguoro.,[],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
3,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/16bfa2f5-9171-4d18-b...",[Napoli],[],,[],[],[1518 marzo 3],[15180303],[15180303],"Mattea Luca Caracciolo, vesc. di Lesina concede alcuni suoi beni al monastero di S. Liguoro.",[],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
4,"[tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/16c37112-ba89-4c4c-8...",[Napoli],[],,[],[],[1559 settembre 11],[15590911],[15590911],Il monastero di S. Liguoro concede a Giovan Battista Grisono che venda una masseria in tenimento...,[],[http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Baia...,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
659165,"[tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/faf66a3f-d54b-4439-a443-65cd...",[],[],"CCCXIV. In nomine sánete et individué trinitatis. Cunradus, divina favente clemencia Romanorum...",[99999999],[99999999],[],[],[],König Konrad III. bestätigt der Kirche in Speier alle von seinen Vorfahren und\n ...,[],[],"[00000042.png, 00000043.png]"
659166,"[tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/fb51a6c4-bde7-4df7-86c7-ee6d...",[],[],"CDXCV. Chr. I In nomine sánete et individué trinitatis. Heinricus sextus, divina faveutc\n ...",[99999999],[99999999],[],[],[],Kaiser Heinrich VI. nimmt das Kloster Herrenalb mit den durch Kauf von Eberhard\n ...,[],[],"[00000338.png, 00000339.png, 00000340.png]"
659167,"[tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/fc9ffa72-79b1-4029-8eb5-a535...",[],[],"CCCLXVII. 1159. ! In nominç sanctç et individuc trinitatis. j Gimtherus, dei gratia Spirensis...",[99999999],[99999999],[],[],[],"Bischof Günther von Speier erwirbt und übergiebt die Villa Eilfingen, unter\n ...",[],[],"[00000149.png, 00000150.png, 00000151.png]"
659168,"[tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/fe45e235-cb7f-4ee3-ae89-6e10...",[],[],"CCCXXXV. 1152. j In nomine sanctç et individué, trinitatis. l Ego Guntherus, dei gratia sanct...",[99999999],[99999999],[],[],[],"Bischof Günther von Speier verordnet, dass das Kloster Maulbronn aus seinem in\n ...",[],[],"[00000085.png, 00000086.png]"


# Export whole dataframe

In [17]:
# Timestamp (resolution: one minute) that is embedded in both export file names.
timemarker = f"{datetime.today():%Y-%m-%d-%H%M}"
# Write the complete table twice: once as JSON and once as Parquet.
contents_full.to_json(f"../data/output/charters_{timemarker}.json")
contents_full.to_parquet(f"../data/output/charters_{timemarker}.parquet")

---

# Import generated data as dataframe

In [None]:
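# Template for re-loading an earlier export: uncomment one line and replace `x`
# with the file name written by the export cell (charters_<timemarker>).
# contents_full_json = pd.read_json("../data/output/x.json")
# contents_full_parquet = pd.read_parquet('../data/output/x.parquet', engine='pyarrow')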