# Transformation of .xml (cei) [mom-dump] into tabular data
## info
- dependencies are included for each markdown section, respectively 

- ✔ means tested and working

- 👑 means most worthy and purposeful script in given context

# get and print all xml tags used in a file ✔

In [None]:
import xml.etree.ElementTree as ET 
from pprint import pprint

In [None]:
# Parse a single charter file and list every distinct tag that occurs in it.
tree = ET.parse(
    '../data/db/mom-data/metadata.charter.public/61ebd1f0-6e2a-4181-b3b2-ecc71ed5e013/0d2c98ff-afb5-4728-9f57-1623ad8240a3.cei.xml'
)
root = tree.getroot()

# The set comprehension removes duplicate tag names while walking the tree;
# the resulting list therefore has no guaranteed order.
elemList = list({elem.tag for elem in tree.iter()})
pprint(elemList)

# get all graphic urls from a file ✔

In [None]:
import xml.etree.ElementTree as ET

In [None]:
# Prefix -> namespace URI map handed to ElementTree's find/findall calls.
namespaces = {
    'atom': 'http://www.w3.org/2005/Atom',
    'cei': 'http://www.monasterium.net/NS/cei',
}

In [None]:
# Load one example charter and keep a handle on its root element.
tree = ET.parse(
    '../data/db/mom-data/metadata.charter.public/BischoefeSpeyer/0a4ea494-2b54-4495-a777-b214c57677f2.cei.xml'
)
root = tree.getroot()

In [None]:
# Print the attribute dict of every cei:graphic element below the root.
for tag in root.findall('.//cei:graphic', namespaces):
    print(tag.attrib)

# get atomID and graphicURL into pd df from a file ✔
- only charters with images

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd

In [None]:
# Prefix -> namespace URI map handed to ElementTree's find/findall calls.
namespaces = {
    'atom': 'http://www.w3.org/2005/Atom',
    'cei': 'http://www.monasterium.net/NS/cei',
}

In [None]:
# Load one example charter and keep a handle on its root element.
tree = ET.parse(
    '../data/db/mom-data/metadata.charter.public/BischoefeSpeyer/0a4ea494-2b54-4495-a777-b214c57677f2.cei.xml'
)
root = tree.getroot()

In [None]:
# Collect one (atomID, url) pair per cei:graphic element of the parsed file.
# The two lists are filled in lockstep so they can be zipped afterwards.
atomIDs = []
attributes = []

for tag in root.findall('.//cei:graphic', namespaces):
    # A graphic without a 'url' attribute used to raise KeyError; it is now
    # skipped, and skipped *before* the ID is stored so both lists stay aligned.
    url = tag.get('url')
    if url is None:
        continue
    # The ID is read from the text of the root's first child
    # (presumably the atom:id element -- verify against the CEI files).
    atomIDs.append(root[0].text)
    attributes.append(url)

In [None]:
# Pair each collected atomID with its URL; zip stops at the shorter of the two lists.
img_list = list(zip(atomIDs, attributes))

In [None]:
# Build the result table. Naming the columns at construction keeps 'atomID'
# and 'url' present even when no pairs were collected; renaming the positional
# columns 0/1 afterwards would leave an empty frame without any columns.
df = pd.DataFrame(img_list, columns=['atomID', 'url'])
df

# get atomID and graphicURL into .csv from a collection using scandir ✔
- without subdirectories
- only charters with images

In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from pathlib import PurePath

In [None]:
# Namespace prefixes for the ElementTree lookups, the collection folder to
# scan, and the filename suffix that identifies charter files.
namespaces = {
    'atom': 'http://www.w3.org/2005/Atom',
    'cei': 'http://www.monasterium.net/NS/cei',
}
directoryPath = '../data/db/mom-data/metadata.charter.public/BischoefeSpeyer/'
extension = '.cei.xml'

In [None]:
# Collect (atomID, url) pairs from every charter file directly inside
# directoryPath (os.scandir does not descend into subdirectories).
atomIDs = []
attributes = []

with os.scandir(directoryPath) as it:
    for entry in it:
        # Only regular files carrying the charter suffix are parsed.
        if not (entry.name.endswith(extension) and entry.is_file()):
            continue
        tree = ET.parse(entry.path)
        root = tree.getroot()
        for tag in root.findall('.//cei:graphic', namespaces):
            # A graphic without a 'url' attribute used to raise KeyError and
            # abort the whole scan; it is now skipped before the ID is stored,
            # so both lists stay aligned.
            url = tag.get('url')
            if url is None:
                continue
            # The ID is read from the text of the root's first child
            # (presumably the atom:id element -- verify against the CEI files).
            atomIDs.append(root[0].text)
            attributes.append(url)

In [None]:
# Pair each atomID with its image URL (zip stops at the shorter list).
img_list = list(zip(atomIDs, attributes))
# Naming the columns at construction keeps 'atomID' and 'url' present even
# when nothing was collected; renaming positional columns 0/1 afterwards would
# leave an empty frame without any columns (and a CSV without a header).
df = pd.DataFrame(img_list, columns=['atomID', 'url'])
df

In [None]:
# Take the final component of the collection path; the export cell uses it
# as the base name of the output file.
path = PurePath(directoryPath)
pathname = path.name

In [None]:
# Write the table to ../data/output/<pathname>.csv; index=False leaves out the row index.
df.to_csv(f'../data/output/{pathname}.csv', index=False)

# 👑 get atomID and graphicURL into .csv from multiple collections ✔
- includes subdirectories
- fast, still takes several minutes for whole mom-dump

In [None]:
from pathlib import Path
from pathlib import PurePath
import xml.etree.ElementTree as ET
import pandas as pd

In [None]:
# Namespace prefixes for the ElementTree lookups.
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
# The prefix is written as a raw string: the former literal '\\\?/' relied on
# the invalid escape sequence "\?", which newer Python versions warn about.
# The resulting value is unchanged. Per the original note, the prefix is
# needed for subdirectory paths longer than the system allows for.
directoryPath = r'\\?/' + 'C://Users/atzenhof/playground/GitHub/didip/data/db/mom-data/metadata.charter.public/'
# Glob pattern handed to Path.rglob to pick up charter files recursively.
fileExtension = '*.cei.xml'

In [None]:
# Collect (atomID, url) pairs from every charter file below directoryPath,
# including subdirectories. The two lists are zipped afterwards, so they must
# always grow together.
atomIDs = []
attributes = []

for file in Path(directoryPath).rglob(fileExtension):
    # Author's note: parsing does not consider utf-8 specific symbols
    # (such as long dash); needs postcorrection.
    tree = ET.parse(file)
    root = tree.getroot()
    for tag in root.findall('.//cei:graphic', namespaces):
        # Check for the URL *before* recording the ID. Previously the ID was
        # appended first and the missing-URL case then hit `continue`, leaving
        # atomIDs one entry longer than attributes and shifting every later
        # ID/URL pairing.
        url = tag.get('url')
        if url is None:
            continue
        # The ID is read from the text of the root's first child
        # (presumably the atom:id element -- verify against the CEI files).
        atomIDs.append(root[0].text)
        attributes.append(url)

In [None]:
# Pair each atomID with its image URL (zip stops at the shorter list).
img_list = list(zip(atomIDs, attributes))
# Naming the columns at construction keeps 'atomID' and 'url' present even
# when nothing was collected; renaming positional columns 0/1 afterwards would
# leave an empty frame without any columns (and a CSV without a header).
df = pd.DataFrame(img_list, columns=['atomID', 'url'])
df

In [None]:
# Name the export after the last component of the scanned directory and
# write it without the row index.
pathname = PurePath(directoryPath).name
output_file = f'../data/output/{pathname}.csv'
df.to_csv(output_file, index=False)
# Optional, currently disabled: export only the rows whose url contains "http://".
#df_links = df[df['url'].str.contains("http://")]
#df_links.to_csv(f'links_{pathname}.csv', index=False)

# 👑 transform referential img-.xml file(s) into csv using lxml ✔

In [None]:
from pathlib import PurePath
from pathlib import Path
from lxml import etree #lxml since xml.etree.ElementTree does not have full xpath support (no getparent() after using find/findall())
import pandas as pd

In [None]:
# Namespace prefixes passed to the lookups below.
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
# The prefix is written as a raw string: the former literal '\\\?/' relied on
# the invalid escape sequence "\?", which newer Python versions warn about.
# The resulting value is unchanged. Per the original note, the prefix is a
# path escape due to the long path.
directoryPath = r'\\?/' + 'C://Users/atzenhof/playground/GitHub/didip/data/images_xml'
# Glob pattern handed to Path.rglob to pick up the XML files recursively.
fileExtension = '*.xml'

In [None]:
# For every <img> element found below directoryPath, record the 'id' attribute
# of its parent element together with the image's own 'src' attribute.
atomIDs = []
image_links = []

for file in Path(directoryPath).rglob(fileExtension):
    # Per the original note, lxml is given a plain string because it does not
    # accept the WindowsPath object directly.
    tree = etree.parse(str(file))
    root = tree.getroot()
    for img in root.findall('.//img', namespaces):
        # getparent() is the lxml feature this section depends on.
        atomIDs.append(img.getparent().attrib['id'])
        image_links.append(img.attrib['src'])

In [None]:
# Pair each atomID with its image link (zip stops at the shorter list).
img_list = list(zip(atomIDs, image_links))
# Naming the columns at construction keeps 'atomID' and 'url' present even
# when nothing was collected; renaming positional columns 0/1 afterwards would
# leave an empty frame without any columns (and a CSV without a header).
df = pd.DataFrame(img_list, columns=['atomID', 'url'])
df

In [None]:
# Name the export after the last component of the scanned directory and
# write it without the row index.
pathname = PurePath(directoryPath).name
output_file = f'../data/output/{pathname}.csv'
df.to_csv(output_file, index=False)

# lists atomIDs of all charters in folder [mom-dump] including images
## to do
- add exception for image link (if it is empty)

In [7]:
from pathlib import Path
from pathlib import PurePath
import xml.etree.ElementTree as ET
import pandas as pd

In [8]:
# Namespace prefixes for the ElementTree lookups.
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
# The prefix is written as a raw string: the former literal '\\\?/' relied on
# the invalid escape sequence "\?", which newer Python versions warn about.
# The resulting value is unchanged. Per the original note, the prefix is
# needed for subdirectory paths longer than the system allows for.
directoryPath = r'\\?/' + 'C://Users/atzenhof/playground/GitHub/didip/data/db_subset_for_test'
# Glob pattern handed to Path.rglob to pick up charter files recursively.
fileExtension = '*.cei.xml'

In [9]:
# Print the configured root directory for a visual check.
print(directoryPath)

\\?/C://Users/atzenhof/playground/GitHub/didip/data/db_subset_for_test


In [15]:
# Collect one (atomID, url) pair per cei:graphic element in every charter
# file below directoryPath. A graphic without a 'url' attribute is kept and
# paired with an empty string.
atomIDs = []
attributes = []

for file in Path(directoryPath).rglob(fileExtension):
    # Author's note: parsing does not consider utf-8 specific symbols
    # (such as long dash); needs postcorrection.
    tree = ET.parse(file)
    root = tree.getroot()
    for tag in root.findall('.//cei:graphic', namespaces):
        # The ID is read from the text of the root's first child.
        atomIDs.append(root[0].text)
        attributes.append(tag.attrib.get('url', ""))

In [18]:
# Pair each atomID with its image URL (zip stops at the shorter list).
img_list = list(zip(atomIDs, attributes))
# Naming the columns at construction keeps 'atomID' and 'url' present even
# when nothing was collected; renaming positional columns 0/1 afterwards would
# leave an empty frame without any columns, so a later df['url'] lookup fails.
df = pd.DataFrame(img_list, columns=['atomID', 'url'])
df

Unnamed: 0,atomID,url
0,"tag:www.monasterium.net,2011:/charter/Bischoef...",00000289.png
1,"tag:www.monasterium.net,2011:/charter/Bischoef...",00000290.png
2,"tag:www.monasterium.net,2011:/charter/Bischoef...",00000543.png
3,"tag:www.monasterium.net,2011:/charter/Bischoef...",00000544.png
4,"tag:www.monasterium.net,2011:/charter/Bischoef...",00000294.png
...,...,...
12401,"tag:www.monasterium.net,2011:/charter/RhenoMos...",00000337.png
12402,"tag:www.monasterium.net,2011:/charter/RhenoMos...",00000843.png
12403,"tag:www.monasterium.net,2011:/charter/RhenoMos...",00000557.png
12404,"tag:www.monasterium.net,2011:/charter/RhenoMos...",00000558.png


with `else: continue` .. 12406 rows

with `else: attribute = ""` .. 12405 rows

In [24]:
# Export the full table, named after the last component of the scanned directory.
pathname = PurePath(directoryPath).name
df.to_csv(f'../data/output/{pathname}.csv', index=False)

# astype(bool) maps an empty-string url to False, so this mask keeps only the
# rows that do have a url.
# NOTE(review): the name df_noURLs reads like the opposite selection -- confirm
# which subset is actually wanted.
df_noURLs = df[df['url'].astype(bool)]

# The target used to be two adjacent f-string literals, which Python joins
# into one string: '../data/output/<pathname>_nourls.csv<pathname>_noURLs.csv'.
# A single file name is used instead.
df_noURLs.to_csv(f'../data/output/{pathname}_noURLs.csv', index=False)