# general

## imports

In [1]:
from lxml import etree
from pathlib import Path
from pathlib import PurePath
from pathlib import PurePosixPath
import datetime
import hashlib
import numpy as np
import os
import pandas as pd
import pickle
import pprint
import random
import requests

## parameters

In [3]:
pd.set_option('display.max_colwidth', 100)

# create sample + URL based on xquery result

In [3]:
df = pd.read_csv('../../data/output/images_xml.csv', encoding="utf-8")

In [4]:
df

Unnamed: 0,atom,url
0,"tag:www.monasterium.net,2011:/charter/AT-HHStA...",http://images.monasterium.net/img/AT-OeStA-HHS...
1,"tag:www.monasterium.net,2011:/charter/AT-HHStA...",http://images.monasterium.net/img/AT-OeStA-HHS...
2,"tag:www.monasterium.net,2011:/charter/AT-HHStA...",http://images.monasterium.net/img/AT-OeStA-HHS...
3,"tag:www.monasterium.net,2011:/charter/AT-HHStA...",http://images.monasterium.net/img/AT-OeStA-HHS...
4,"tag:www.monasterium.net,2011:/charter/AT-HHStA...",http://images.monasterium.net/img/AT-OeStA-HHS...
...,...,...
815787,"tag:www.monasterium.net,2011:/charter/BISANU/1...",https://images.monasterium.net/pics/858/132702...
815788,"tag:www.monasterium.net,2011:/charter/BISANU/1...",https://images.monasterium.net/pics/858/132702...
815789,"tag:www.monasterium.net,2011:/charter/BISANU/1...",https://images.monasterium.net/pics/858/132702...
815790,"tag:www.monasterium.net,2011:/charter/BISANU/1...",https://images.monasterium.net/pics/858/I.._IE...


In [None]:
atom_series = df['atom'].drop_duplicates(keep='first', inplace=False)
atom_series_clean = atom_series.reset_index(drop=True)
atom_series_clean_duplicate = atom_series_clean

In [None]:
df_atom_url_clean = pd.concat([atom_series_clean, atom_series_clean_duplicate], axis=1)
df_atom_url_clean.columns.values[0] = "atomID"
df_atom_url_clean.columns.values[1] = "URL"
df_atom_url_clean["parts"] = ""

In [None]:
df_atom_url_clean.URL = df_atom_url_clean.URL.str.split('/')

In [None]:
df_atom_url_clean['URL'].sample(n=10).head(10)

In [None]:
for cell in df_atom_url_clean['URL']:
    del cell[:2]

In [None]:
df_atom_url_parts = df_atom_url_clean
df_atom_url_parts

In [None]:
mom_base = "https://www.monasterium.net/mom"
charter_end = "charter"

In [None]:
df_atom_url_parts['URL'].str.len()

In [None]:
#legacy
#noHungaricana = df[~df.url.str.contains('hungaricana')]
#df[~df.url.str.startswith('/')]

In [None]:
charter_sample_list = []

In [None]:
for cell in df_atom_url_parts['URL']:
    if len(cell) == 3:
        charter_sample_list.append(mom_base + "/" + cell[0] + "/" + cell[1] + "/" + cell[2] + "/" + charter_end)
    else:
        charter_sample_list.append(mom_base + "/" + cell[0] + "/" + cell[1] + "/" + charter_end)

In [None]:
df = pd.Series(charter_sample_list)

In [None]:
sample = df.sample(n=1000)

In [None]:
sample

In [None]:
df.to_csv("../data/output/playground_sample_test.csv", encoding="utf-8", header=False, index=False)

# get img links from database

In [None]:
from pathlib import PurePath
from pathlib import Path
from lxml import etree
import pandas as pd

In [None]:
namespaces = {"atom": "http://www.w3.org/2005/Atom", "cei": "http://www.monasterium.net/NS/cei"}
fileExtension = "*.cei.xml"

In [None]:
charter_image_list_test = pd.read_csv("../data/output/charter_image_list_explodedmetadata.charter.public.csv", encoding="utf-8")

In [None]:
charter_image_list_test

# get id+origImageLink+copyImageLink+hasImage(Bool) from mom dump

In [4]:
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
directoryPath = "../data/db/mom-data/metadata.charter.public"
fileExtension = "*.cei.xml"

In [5]:
atom_id = []
origImageLink = []
copyImageLink = []

In [None]:
for file in Path(directoryPath).rglob(fileExtension):
    tree = etree.parse(str(file))
    atom_id.append(tree.xpath("//atom:id/text()", namespaces = namespaces))
    origImageLink.append(tree.xpath("//cei:witnessOrig/cei:figure/cei:graphic/@url", namespaces = namespaces))
    copyImageLink.append(tree.xpath("//cei:witness/cei:figure/cei:graphic/@url", namespaces = namespaces))

In [None]:
img_contents = list(zip(atom_id, origImageLink, copyImageLink))
img_contents_full = pd.DataFrame(img_contents).rename(columns={0: "atom_id", 1: "origImageLink", 2: "copyImageLink"})

In [None]:
img_contents_full

In [None]:
img_contents_full['hasImage'] = [False if oi == [] and ci == [] else True for oi, ci in zip(img_contents_full.origImageLink, img_contents_full.copyImageLink)]

In [None]:
img_contents_full.loc[img_contents_full["hasImage"] == False]

In [None]:
img_contents_full.to_csv("../data/output/id+origImageLink+copyImageLink+hasImage_full.csv", encoding="utf-8")

In [None]:
img_contents_full_exploded = img_contents_full.explode("atom_id").explode("origImageLink").explode("copyImageLink")
img_contents_full_exploded_reset = img_contents_full_exploded.reset_index(drop=True, inplace=True)
img_contents_full_exploded_reset.to_csv("../data/output/id+origImageLink+copyImageLink+hasImage_exploded_full.csv", encoding="utf-8")

# get charters with images and seals

In [None]:
namespaces = {'atom': 'http://www.w3.org/2005/Atom', 'cei': 'http://www.monasterium.net/NS/cei'}
directoryPath = "../data/db/mom-data/metadata.charter.public"
fileExtension = "*.cei.xml"

In [None]:
atom_id = []
origImageLink = []
copyImageLink = []
cei_orig_sealDesc = []
cei_copy_sealDesc = []

In [None]:
for file in Path(directoryPath).rglob(fileExtension):
    tree = etree.parse(str(file))
    atom_id.append(tree.xpath("//atom:id/text()", namespaces = namespaces))
    origImageLink.append(tree.xpath("//cei:witnessOrig/cei:figure/cei:graphic/@url", namespaces = namespaces))
    copyImageLink.append(tree.xpath("//cei:witness/cei:figure/cei:graphic/@url", namespaces = namespaces))
    cei_orig_sealDesc.append(tree.xpath("//cei:witnessOrig/cei:auth/cei:sealDesc/text()", namespaces = namespaces))
    cei_copy_sealDesc.append(tree.xpath("//cei:witness/cei:auth/cei:sealDesc/text()", namespaces = namespaces))

In [None]:
id_imgLink_sealDesc = list(zip(atom_id, origImageLink, copyImageLink, cei_orig_sealDesc, cei_copy_sealDesc))
id_imgLink_sealDesc = pd.DataFrame(id_imgLink_sealDesc).rename(columns={0: "atom_id", 1: "origImageLink", 2: "copyImageLink", 3: "cei_orig_sealDesc", 4: "cei_copy_sealDesc"})

In [None]:
id_imgLink_sealDesc['hasImage'] = [False if oi == [] and ci == [] else True for oi, ci in zip(id_imgLink_sealDesc.origImageLink, id_imgLink_sealDesc.copyImageLink)]
id_imgLink_sealDesc['hasSealDesc'] = [False if os == [] and cs == [] else True for os, cs in zip(id_imgLink_sealDesc.cei_orig_sealDesc, id_imgLink_sealDesc.cei_copy_sealDesc)]

In [None]:
id_imgLink_sealDesc.loc[id_imgLink_sealDesc["hasImage"] == True]

In [None]:
id_imgLink_sealDesc.to_csv("../data/output/seal_desc.csv", encoding="utf-8")

In [None]:
#datetime.datetime.now().strftime("%D,%T")

# get urls of all images

## get img paths for collection charters

In [None]:
charter_image_list_fresh = pd.read_csv("../data/output/charter_image_list_explodedmetadata.charter.public.csv", encoding="utf-8")

In [None]:
charter_image_list = charter_image_list_fresh

In [None]:
charter_image_list

In [None]:
charter_imageList_badLinks = (charter_image_list[~charter_image_list.cei_graphic_ATTRIBUTE_url.str.startswith("http", na=False)])[~charter_image_list.cei_graphic_ATTRIBUTE_url.isna()]
charter_imageList_goodLinks = charter_image_list[charter_image_list.cei_graphic_ATTRIBUTE_url.str.startswith("http", na=False)]
charter_imageList_noLinks = charter_image_list[charter_image_list.cei_graphic_ATTRIBUTE_url.isna()]

In [None]:
charter_imageList_badLinks.reset_index(inplace=True, drop=True)
charter_imageList_goodLinks.reset_index(inplace=True, drop=True)
charter_imageList_noLinks.reset_index(inplace=True, drop=True)

In [None]:
print(f"There are {len(charter_imageList_badLinks)} images with 'bad' image links.")
print(f"There are {len(charter_imageList_goodLinks)} images with 'good' image links.")
print(f"There are {len(charter_imageList_noLinks)} charters with 'no' image links.")

## get links for bad image links

In [None]:
charter_imageList_badLinks["atom_parts"] = charter_imageList_badLinks.atom_id.str.split('/')

In [None]:
charter_imageList_badLinks_corrected = charter_imageList_badLinks

In [None]:
for cell in charter_imageList_badLinks_corrected['atom_parts']:
    del cell[:2]

In [None]:
charter_imageList_badLinks_corrected_parted = charter_imageList_badLinks_corrected

In [None]:
charter_imageList_badLinks_corrected_parted

In [None]:
lists = ["atom_id", "archive", "fond", "collection"]
atom_id, archive, fond, collection = ([] for i in range(len(lists)))

In [None]:
for cell in charter_imageList_badLinks_corrected_parted["atom_parts"]:
    if len(cell) == 3:
        archive.append(cell[0])
        fond.append(cell[1])
        collection.append(None)
    else:
        archive.append(None)
        fond.append(None)
        collection.append(cell[0])

contents = list(zip(archive, fond, collection))
contents_df = pd.DataFrame(contents).rename(columns={0: "archive", 1: "fond", 2: "collection"})

In [None]:
contents_df["atom_id"] = charter_imageList_badLinks_corrected_parted["atom_id"]

In [None]:
contents_df

In [None]:
new_df = pd.concat([charter_imageList_badLinks_corrected_parted, contents_df], axis=1)

In [None]:
new_df

In [None]:
newer_df = new_df.iloc[: , :-1]

In [None]:
newer_df

In [None]:
newer_df.to_csv("../data/output/charters_badLinks_parted.csv", encoding="utf-8")

## get collection/archive reference from all charters

In [None]:
charter_image_list = charter_image_list_fresh

In [None]:
charter_image_list["atom_parts"] = charter_image_list.atom_id.str.split('/')

In [None]:
charter_image_list

## ...

In [None]:
charter_image_list_corrected = charter_image_list

In [None]:
for cell in charter_image_list_corrected['atom_parts']:
    del cell[:2]

In [None]:
charter_image_list_corrected

In [None]:
charter_image_list_corrected_parted = charter_image_list_corrected

In [None]:
charter_image_list_corrected_parted

In [None]:
lists = ["archive", "fond", "collection"]
archive, fond, collection = ([] for i in range(len(lists)))

In [None]:
for cell in charter_image_list_corrected_parted["atom_parts"]: #this query might include an error: resulting df yields one row less than there should be? probably one has no atom-id
    if len(cell) == 3:
        archive.append(cell[0])
        fond.append(cell[1])
        collection.append(None)
    elif len(cell) == 2:
        archive.append(None)
        fond.append(None)
        collection.append(cell[0])
    else:
        continue

contents = list(zip(archive, fond, collection))
contents_df = pd.DataFrame(contents).rename(columns={0: "archive", 1: "fond", 2: "collection"})

In [None]:
contents_df

In [None]:
contents_df["atom_id"] = charter_image_list["atom_id"] # da is wsl da Hund drin

In [None]:
contents_df["in_archive"] = [True if a != None else False for a in contents_df["archive"]]
contents_df["in_collection"] = [True if c != None else False for c in contents_df["collection"]]

In [None]:
contents_df.to_csv("../data/output/charters_parted.csv", encoding="utf-8")

## get collection img path and other info from collections folder 

In [None]:
namespaces = {"atom": "http://www.w3.org/2005/Atom", "cei": "http://www.monasterium.net/NS/cei", "xrx": "http://www.monasterium.net/NS/xrx"}
directoryPath = "../data/db/mom-data/metadata.collection.public"
fileExtension = "*.cei.xml"

In [None]:
lists = ["atom_id", "provenance", "country_ID", "country", "xrx_keyword", "sourceDesc", "pubPlace", "date", "image_server_address", "image_server_folder" ]
atom_id, provenance, country_ID, country, xrx_keyword, sourceDesc, pubPlace, date, image_server_address, image_server_folder = ([] for i in range(len(lists)))

In [None]:
for file in Path(directoryPath).rglob(fileExtension):
    tree = etree.parse(str(file))
    atom_id.append(tree.xpath("//atom:id/text()", namespaces = namespaces))
    provenance.append(tree.xpath("//cei:provenance/@abbr", namespaces = namespaces))
    country_ID.append(tree.xpath("//cei:provenance/cei:country/@id", namespaces = namespaces))
    country.append(tree.xpath("//cei:provenance/cei:country/text()", namespaces = namespaces))
    xrx_keyword.append(tree.xpath("///xrx:keywords/xrx:keyword/text()", namespaces = namespaces))
    sourceDesc.append(tree.xpath("//cei:sourceDesc/cei:p/text()", namespaces = namespaces))
    pubPlace.append(tree.xpath("//cei:imprint/cei:pubPlace/text()", namespaces = namespaces))
    date.append(tree.xpath("//cei:imprint/cei:date/text()", namespaces = namespaces))
    image_server_address.append(tree.xpath("//cei:image_server_address/text()", namespaces = namespaces))
    image_server_folder.append(tree.xpath("//cei:image_server_folder/text()", namespaces = namespaces))

In [None]:
collection_data = list(zip(atom_id, provenance, country_ID, country, xrx_keyword, sourceDesc, pubPlace, date, image_server_address, image_server_folder))
collection_data = pd.DataFrame(collection_data).rename(columns={0: "atom_id", 1: "provenance", 2: "country_ID", 3: "country", 4: "xrx_keyword", 5: "sourceDesc",  6: "pubPlace", 7: "date", 8: "image_server_address", 9: "image_server_folder"})

In [None]:
collection_data.to_parquet("../data/output/collection_data.parquet")

## something else?

In [None]:
collection_data_1 = collection_data

In [None]:
collection_data_1["dir"] = collection_data_1.explode("atom_id").atom_id.str.split("/")

In [None]:
collection_data_1.to_csv("../data/output/tests/collection_data_1_slashtest.csv", encoding="utf-8")

In [None]:
collection_data_pref = collection_data_1

In [None]:
dir = []

In [None]:
for cell in collection_data_pref["dir"]:
    dir.append(cell[-1])

In [None]:
collection_data_pref["dir"] = dir

In [None]:
collection_data_pref  = collection_data_pref.filter(["atom_id", "image_server_address", "image_server_folder", "dir"])

In [None]:
collection_data_imageLinks = collection_data_pref.explode("atom_id").explode("image_server_address").explode("image_server_folder").explode("dir") 

In [None]:
collection_data_imageLinks.rename(columns= {"atom_id": "c_atom_id"}, inplace=True)

In [None]:
collection_data_imageLinks.to_csv("../data/output/collection_data_imageLinks.csv", encoding="utf-8")

## get fond img path from archives folder

In [None]:
namespaces = {"atom": "http://www.w3.org/2005/Atom", "cei": "http://www.monasterium.net/NS/cei", "xrx": "http://www.monasterium.net/NS/xrx", "ead": "urn:isbn:1-931666-22-9"}
directoryPath = "../data/db/mom-data/metadata.fond.public"
fileExtensionPref = "*.preferences.xml"
fileExtensionEAD = "*.ead.xml"

In [None]:
lists = ["dir", "image_server_base_url", "atom_ID", "unit_ID"]
dir, image_server_base_url, atom_ID, unit_ID = ([] for i in range(len(lists))) # creates generator items instead of lists when only one is selected? how can this be used/exploited?

In [None]:
for file in Path(directoryPath).rglob(fileExtensionPref):
    dir.append(str(Path(file).parent.parent.name) + "/" + str(Path(file.parent.name)))
    tree = etree.parse(str(file))
    image_server_base_url.append(tree.xpath("//xrx:param[@name='image-server-base-url']/text()", namespaces = namespaces))

In [None]:
fond_data_pref = list(zip(dir, image_server_base_url))
fond_data_pref = pd.DataFrame(fond_data_pref).rename(columns={0: "dir", 1: "image_server_base_url"}).explode("image_server_base_url")

In [None]:
lists = ["dir", "image_server_base_url", "atom_ID", "unit_ID"]
dir, image_server_base_url, atom_ID, unit_ID = ([] for i in range(len(lists))) # creates generator items instead of lists when only one is selected? how can this be used/exploited?

In [None]:
for file in Path(directoryPath).rglob(fileExtensionEAD):
    dir.append(str(Path(file).parent.parent.name) + "/" + str(Path(file.parent.name)))
    tree = etree.parse(str(file))
    atom_ID.append(tree.xpath("//atom:id/text()", namespaces = namespaces))
    unit_ID.append(tree.xpath("//ead:unitid/@identifier", namespaces = namespaces))

In [None]:
fond_data_EAD = list(zip(dir, atom_ID, unit_ID))
fond_data_EAD = pd.DataFrame(fond_data_EAD).rename(columns={0: "dir", 1: "atom_ID", 2: "unit_ID"}).explode("atom_ID").explode("unit_ID")

In [None]:
fond_data_outer = fond_data_EAD.merge(fond_data_pref, on="dir", how="outer")
fond_data_inner = fond_data_EAD.merge(fond_data_pref, on="dir", how="inner")

In [None]:
fond_data_outer["dir"] = fond_data_outer["atom_ID"].str.split("/")

atom_ID = []

for cell in fond_data_outer["dir"]:
    atom_ID.append(f"{(cell)[-2]}/{(cell)[-1]}")

fond_data_outer["dir"] = atom_ID

In [None]:
fond_data_imgLinks = fond_data_outer.drop(["unit_ID"], axis=1)[["atom_ID", "dir", "image_server_base_url"]]

In [None]:
fond_data_imgLinks.rename(columns={"atom_ID": "f_atom_id"}, inplace=True)

In [None]:
fond_data_imgLinks.to_csv("../data/output/fond_data_imgLinks.csv", encoding="utf-8")

In [None]:
fonds_without_baseURL = pd.concat([fond_data_outer,fond_data_inner]).drop_duplicates(keep=False)

In [None]:
fonds_without_baseURL.to_csv("../data/output/fonds_without_baseURL.csv", encoding="utf-8", index=False)

## build image urls for charter files

In [None]:
#df_fresh = pd.read_csv('../data/output/id+origImageLink+copyImageLink+hasImage_exploded_full.csv', encoding="utf-8")

In [None]:
fond_data_imgLinks = pd.read_csv("../data/output/fond_data_imgLinks.csv", encoding="utf-8", index_col=[0])
collection_data_imageLinks = pd.read_csv("../data/output/collection_data_imageLinks.csv", encoding="utf-8", index_col=[0])

In [None]:
df_fresh = pd.read_csv("../data/output/charters_badLinks_parted.csv", encoding="utf-8", index_col=[0])

In [None]:
df_collection = ((df_fresh.drop(["archive", "fond"], axis=1))[df_fresh.collection.notnull()]).reset_index(drop=True)
df_archive = ((df_fresh.drop(["collection"], axis=1))[df_fresh.archive.notnull()]).reset_index(drop=True)

In [None]:
df_collection["dir"] = df_collection["collection"]
df_archive["dir"] = df_archive["archive"] + "/" + df_archive["fond"]

In [None]:
df_collection_links = pd.merge(df_collection, collection_data_imageLinks, on="dir", how="outer")
df_archive_links = pd.merge(df_archive, fond_data_imgLinks, on="dir", how="outer")

In [None]:
df_collection_links["charter_image_url"] = "http://" + df_collection_links["image_server_address"] + "/" + df_collection_links["image_server_folder"] + "/" + df_collection_links["cei_graphic_ATTRIBUTE_url"]
df_archive_links["charter_image_url"] = df_archive_links["image_server_base_url"] + "/" + df_archive_links["cei_graphic_ATTRIBUTE_url"]

In [None]:
df_collection_links_clean = df_collection_links.drop(["collection", "dir", "c_atom_id", "atom_parts", "image_server_address", "image_server_folder"], axis=1)
df_archive_links_clean = df_archive_links.drop(["archive", "archive", "fond", "dir", "f_atom_id", "atom_parts", "image_server_base_url"], axis=1)

In [None]:
df_collection_links_clean.dropna(inplace=True)
df_archive_links_clean.dropna(inplace=True)

In [None]:
df_collection_links_clean.to_csv("../data/output/df_collection_links_clean.csv", encoding="utf-8")
df_archive_links_clean.to_csv("../data/output/df_archive_links_clean.csv", encoding="utf-8")

## check img urls for up down

In [None]:
df_collection_links_clean = pd.read_csv("../data/output/df_collection_links_clean.csv", encoding="utf-8", index_col=[0])
df_archive_links_clean = pd.read_csv("../data/output/df_archive_links_clean.csv", encoding="utf-8", index_col=[0])

In [None]:
#df_archive_links_clean_test = df_archive_links_clean.sample(n=50)
df_archive_links_clean_test = final_df.sample(n=150)

In [None]:
df_archive_links_clean_test

In [None]:
df_archive_links_clean_test_snap = df_archive_links_clean_test

In [None]:
def get_url_status_try(url):
    request = requests.get(url)
    status_code = request.status_code
    return status_code

In [None]:
for url in df_archive_links_clean_test_snap["charter_image_url"]:
    print(url)
    print(get_url_status_try(url))
    df_archive_links_clean_test_snap["ping_code"] = get_url_status_try(url)
    df_archive_links_clean_test_snap["ping_when"] = datetime.datetime.now()

In [None]:
# def get_url_status_requests(url):
#     if type(url) == str:
#         url_clean = url.replace(" ", "%20")
#         request = requests.get(url_clean)
#         status_code = request.status_code
#         return status_code
#     else:
#         return "Failed."

In [None]:
df_archive_links_clean_test.to_csv("../data/output/tests/df_archive_links_clean_test.csv", encoding="utf-8")

# merge generated paths with existing ones (bad and good charters, unite!)

In [None]:
df_collection_links_clean = pd.read_csv("../data/output/df_collection_links_clean.csv", encoding="utf-8", index_col=[0])
df_archive_links_clean = pd.read_csv("../data/output/df_archive_links_clean.csv", encoding="utf-8", index_col=[0])

In [None]:
charter_image_list_fresh = pd.read_csv("../data/output/charter_image_list_explodedmetadata.charter.public.csv", encoding="utf-8")

In [None]:
charter_image_list = charter_image_list_fresh


In [None]:
charter_image_list

In [None]:
charter_imageList_badLinks = (charter_image_list[~charter_image_list.cei_graphic_ATTRIBUTE_url.str.startswith("http", na=False)])[~charter_image_list.cei_graphic_ATTRIBUTE_url.isna()]
charter_imageList_goodLinks = charter_image_list[charter_image_list.cei_graphic_ATTRIBUTE_url.str.startswith("http", na=False)]
charter_imageList_noLinks = charter_image_list[charter_image_list.cei_graphic_ATTRIBUTE_url.isna()]

In [None]:
print(f"There are {len(charter_imageList_badLinks)} images with 'bad' image links.")
print(f"There are {len(charter_imageList_goodLinks)} images with 'good' image links.")
print(f"There are {len(charter_imageList_noLinks)} charters with 'no' image links.")

In [None]:
df_collection_links_clean

In [None]:
charter_imageList_goodLinks

In [None]:
charter_imageList_goodLinks["charter_image_url"] = charter_imageList_goodLinks["cei_graphic_ATTRIBUTE_url"]

In [None]:
charter_imageList_goodLinks.drop(columns="cei_graphic_ATTRIBUTE_url", inplace=True)

In [None]:
charter_imageList_goodLinks

In [None]:
badCharterCorrectedMerged = pd.concat([df_collection_links_clean, df_archive_links_clean])

In [None]:
badCharterCorrectedMerged

In [None]:
badCharterCorrectedMergedClean = badCharterCorrectedMerged.drop(columns=["cei_graphic_ATTRIBUTE_url"])

In [None]:
chartersWithImages_merged_final = charter_imageList_goodLinks.merge(badCharterCorrectedMergedClean, on="atom_id", how="outer")

In [None]:
final_df = pd.concat([charter_imageList_goodLinks, badCharterCorrectedMergedClean])

In [None]:
final_df.reset_index().drop()

In [None]:
final_df.reset_index(inplace=True, drop=True)

In [None]:
len(final_df.loc[final_df["charter_image_url"].str.contains("monasterium")])

In [None]:
test = final_df.drop_duplicates(subset="charter_image_url")

In [None]:
test.reset_index(inplace=True, drop=True)

In [None]:
difference = pd.concat([test,final_df]).drop_duplicates(keep=False)

In [None]:
difference.reset_index(inplace=True, drop=True)

In [None]:
difference

In [None]:
len(test.loc[~test["charter_image_url"].str.contains("monasterium")])

In [None]:
len(test.loc[test["charter_image_url"].str.contains("monasterium")])

In [None]:
len(test.loc[~final_df["charter_image_url"].str.contains("monasterium")])

In [None]:
final_df.to_csv("../data/output/all_charters_from_dump.csv", encoding="utf-8")

# Mapping 

## mapping of xml schema to dictionary-like file

### to-do:
- automate xpath extraction either from files or schema instead of using third-party solution
- find solution for exceptions when mapping automatically (goal: hierarchies of tags for use cases should be expressed in the mapping as well)

In [None]:
#remove duplicate xpaths in file to map
lines_seen = set()

with open("../data/mapping/in.txt", "r") as infile:
    for line in infile:
        lines_seen.add(line)

with open("../data/mapping/out.txt", "w") as outfile:
    outfile.write("".join(lines_seen))

In [None]:
# exceptions = ["cei:figure", "cei:graphic", "cei:idno"]
dictionary = {}

with open("../data/mapping/out.txt", encoding="utf-8") as f:
    lines = f.read().splitlines()
    lines = sorted(lines, key=len)
    for line in lines:
        split = line.lstrip("/").split("/")
        key = split[-1]
        if key in dictionary:
            dictionary[f"{split[-2]}/{key}"] = line
        else:
            dictionary[key] = line

In [None]:
with open('../data/mapping/CEI2CSV.pkl', 'wb') as f:
    pickle.dump(dictionary, f)

## indexing files

In [None]:
directoryPath = "../data/db/mom-data/metadata.charter.public"
fileExtension = "*.cei.xml"

In [None]:
import hashlib

def createIndex(file_to_index):
    md5_hash = hashlib.md5()
    with open(file_to_index, "rb") as file:
        content = file.read()
        md5_hash.update(content)
        digest = md5_hash.hexdigest()
        return(digest)

In [None]:
charter_index = {f"{createIndex(file)}": f"{PurePosixPath(file)}" for file in Path(directoryPath).rglob(fileExtension)}

In [None]:
index_path = Path("../data/mapping/index.pkl")

with index_path.open("wb") as file:
    pickle.dump(charter_index, file, pickle.HIGHEST_PROTOCOL)

In [None]:
index_path = Path("../data/mapping/index.pkl")

with index_path.open('rb') as file:
    image = pickle.load(file)

### dynamic query functions to get xml data

in 1 function:
- open the file
- parse the file and make tree
- for each field that is desired, append to respective list in dictionary an xpath expression map[field]



In [None]:
def is_cei_attrib(entry, map):
    if "@" in map.get(entry):
        return True
    else:
        return False

In [None]:
result = (is_cei_attrib(field, map) for field in list_dict)
for item in result:
    print(item)

In [None]:
def append_element_to_list(element, list, map):
    list.append(tree.xpath(f"{map[element]}/text()", namespaces = namespaces, smart_strings = False))

In [None]:
def append_attribute_to_list(element, list, map):
    list.append(tree.xpath(f"{map[element]}", namespaces = namespaces, smart_strings = False))

In [None]:
def get_elements(file, dict):
    with open(file, "r", encoding="utf-8") as f:
        tree = etree.parse(f)
        for field in field_dict:
            field_dict[field].append(tree.xpath(f"{map[field]}", namespaces = namespaces, smart_strings = False))
            return field_dict[field]

In [None]:
field_dict = {field:[] for field in fields}
field_dict

In [None]:
newFrame = pd.DataFrame.from_dict(field_dict)

# Is a charter of a collection stemming from Google OCR?

In [None]:
# todo: get_parentType - check whether charter is collection or archive
# todo: get_parentID - 

In [None]:
collection_data = pd.read_parquet("../data/output/collection_data.parquet")

In [None]:
charter_data = pd.read_parquet("../data/output/charters_full_2022-06-15.parquet")

In [None]:
charter_data["atom_id_split"] = charter_data["atom_id"].explode().str.split('/')

In [None]:
for cell in charter_data["atom_id_split"]:
    del cell[:2]

In [None]:
lists = ["archive", "fond", "collection"]
archive, fond, collection = ([] for i in range(len(lists)))

for cell in charter_data["atom_id_split"]:
    if len(cell) == 3:
        archive.append(cell[0])
        fond.append(cell[1])
        collection.append(None)
    elif len(cell) == 2:
        archive.append(None)
        fond.append(None)
        collection.append(cell[0])
    else:
        continue

contents = list(zip(archive, fond, collection))
contents_df = pd.DataFrame(contents).rename(columns={0: "archive", 1: "fond", 2: "collection"})

In [None]:
contents_df["atom_id"] = charter_data["atom_id"]

In [None]:
contents_df["atom_id_split"] = charter_data["atom_id_split"]

In [None]:
contents_df.drop(columns=["archive", "fond"], inplace=True)

In [None]:
contents_df.dropna(inplace=True)

In [None]:
contents_df.reset_index(drop=True)

In [None]:
contents_df

In [None]:
collection_data["collection"] = collection_data.explode("atom_id")["atom_id"].str.split('/')

In [None]:
for cell in collection_data["collection"]:
    del cell[:2]

In [None]:
collectionsExploded = (collection_data.filter(["collection", "sourceDesc", "xrx_keyword"]).explode("sourceDesc").explode("collection").explode("xrx_keyword"))

In [None]:
collectionsFromGoogle = collectionsExploded.loc[collectionsExploded["sourceDesc"].str.contains("Google", na=False)].reset_index(drop=True)

In [None]:
collectionsFromGoogle["collection"].to_csv("../data/output/collectionsFromGoogle.csv", encoding="utf-8")

In [None]:
collectionChartersFromGoogle = pd.merge(contents_df, collectionsFromGoogle, how="inner", on="collection")

In [None]:
collectionChartersFromGoogle.explode("atom_id")["atom_id"].to_csv("../data/output/collectionChartersFromGoogle.csv", encoding="utf-8")

##    How many Czech charters are there?


In [None]:
charter_data = pd.read_parquet("../data/output/charters_full_2022-06-15.parquet")

In [None]:
x = charter_data.explode("atom_id")
x = x[x["atom_id"].str.contains("charter/CZ-", na=False)]

In [None]:
for row in charter_data["cei_tenor"].sample(n=10):
    print(row, type(row))

In [None]:
for row in charter_data.explode("cei_tenor")["cei_tenor"].sample(n=10):
    print(row, type(row))

## How many brackets in tenor tags?

In [None]:
charter_data_exploded = charter_data.explode("cei_tenor")
tenor_with_brackets = charter_data_exploded[charter_data_exploded["cei_tenor"].str.contains("\(.*\)", na=False, regex=True)]
test = tenor_with_brackets.sample(n=50).explode("atom_id")
for id in test["atom_id"]:
    print(get_url(id))

# dummies

In [None]:
# dummy test list
atom_id_list = charter_data.explode("atom_id")["atom_id"].to_list()

In [None]:
charter_data = pd.read_parquet("../../../ZimLab/didip/general/general/data/output/charters_full_2022-06-15.parquet")

atom_id_list = charter_data["atom_id"].explode().to_list()
#sample = random.sample(atom_id_list, 1000)

with open('../data/output/atomids.txt', 'w') as f:
    for line in atom_id_list:
        f.write(f"{line}\n")

In [None]:
atom_id_lst = []
url_lst = []

for item in atom_id_list:
    atom_id_lst.append(item)
    url_lst.append(get_url(item))

df = list(zip(atom_id_lst, url_lst))
df_full = pd.DataFrame(df).rename(columns={ 0:'atom_id', 1:'url'})

df_full.to_csv("atomID+url.csv", encoding="utf-8")

# get origs

In [100]:
df = pd.read_parquet('../data/output/tests/orig&copyImagePerCharter_test.parquet', engine='pyarrow')
#df = df.iloc[: , 1:]

FileNotFoundError: [Errno 2] No such file or directory: '../data/output/tests/orig&copyImagePerCharter_test.parquet'

In [85]:
df = df[df["hasImage"] == True].reset_index(drop=True).iloc[:, :3]

In [97]:
df.explode("origImageLink")

Unnamed: 0,atom_id,origImageLink,copyImageLink
0,"['tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0632d68f-f907-4113-...",['http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Bai...,[]
1,"['tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0e3d1203-0e72-4b17-...",['http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Bai...,[]
2,"['tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/0eb5100f-8132-48fd-...",['http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Bai...,[]
3,"['tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/16bfa2f5-9171-4d18-...",['http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Bai...,[]
4,"['tag:www.monasterium.net,2011:/charter/069622fc-5d55-4a51-8dd5-f23f30d8bf2f/16c37112-ba89-4c4c-...",['http://images.monasterium.net/img/IT-BSNSP/16-Napoli_monasteri_di_S_Gregorio_S_Arcangelo_a_Bai...,[]
...,...,...,...
478591,"['tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/faf66a3f-d54b-4439-a443-65c...",[],"['00000042.png', '00000043.png']"
478592,"['tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/fb51a6c4-bde7-4df7-86c7-ee6...",[],"['00000338.png', '00000339.png', '00000340.png']"
478593,"['tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/fc9ffa72-79b1-4029-8eb5-a53...",[],"['00000149.png', '00000150.png', '00000151.png']"
478594,"['tag:www.monasterium.net,2011:/charter/WirtembergischesUrkundenbuch/fe45e235-cb7f-4ee3-ae89-6e1...",[],"['00000085.png', '00000086.png']"


In [94]:
#df["hasOrig"] = np.where(df["origImageLink"].explode(origImageLink") == None, False)
df = df.explode("origImageLink")
df.loc[(df['origImageLink'] == []), "hasOrig"] = False
#df.loc[(df['Space'] == 'TRUE') & (df['Threshold'] <= 0.2), 'Space_Test'] = 'Pass'

ValueError: ('Lengths must match to compare', (478596,), (0,))

In [3]:
from ddp_util import chatomid_to_url
#chatomid_to_url("tag:www.monasterium.net,2011:/charter/QuellenKoelnII/26770f8e-ecfe-47de-a996-983dcf74f0e5")

chatomid_to_url("tag:www.monasterium.net,2011:/charter/CH-StaASG/Urkunden/Urkunden-Supplement..34")

'https://www.monasterium.net/mom/CH-StaASG/Urkunden/Urkunden-Supplement..34/charter'