# fsdb-compare
This notebook quickly compares the output of leech2fsdb and db2fsdb, based on the 1k cv charter sample.
Atomids were derived from bash scripts.

Main findings as per 2023-04-28: 
- some path conventions could be aligned better (e.g., initial characters due to root_variable)
- what is the convention/purpose of adding a namespace (e.g., ".img.png") to (some?) images?  
- charters and images have the same md5; fonds and collections do not (because the md5 is generated from different inputs)
Note: there were two reasons for hashing this level (fonds/collections).
One: fonds/collections contain computationally unpleasant characters that we should isolate as much as possible.
Two: fonds often form duplicates, e.g., "Urkunden", for which we would hash the fonds based on their atomids.
Charters and images are fine, and show that the image leeching has different results.

In [68]:
# Read the db listing of the 1k cv sample; every raw line (newline included) becomes one entry.
with open("../../../data/in/fsdb-compare/1kcv-db-raw.txt") as data:
    db_1kcv = list(data)

In [69]:
# Read the leech listing of the 1k cv sample; every raw line (newline included) becomes one entry.
with open("../../../data/in/fsdb-compare/1kcv-leech.txt") as data:
    leech_1kcv = list(data)

In [70]:
# Strip surrounding whitespace (including the trailing newline) from every leech entry.
leech_1kcv_prepared = list(map(str.strip, leech_1kcv))
leech_1kcv_prepared

['AT-ADG/29e55390c6fae100de8ec8677907460c/793854dfc42db8230321b0ba74b61359/8ee78b0e8596235262c768f2093432bb.img.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/207cd526e08396b4255b12fa19e8e4f8/4844ee9f686008891a44821c6133694d.img.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/207cd526e08396b4255b12fa19e8e4f8/c4db1e7133cbc99ad5f27dde73046f3c.img.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/818d9e5309729a2abb0262a8373f9a40/f5da25d3130ca7bb2f16687aa2dae644.img.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/818d9e5309729a2abb0262a8373f9a40/93e44a08895ea48a77734453a2478cf5.img.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/b9bd0a0f208d84a1b1b09e8d50fda773/28cb9061696fc6c53467ffe02f5ff7a5.img.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/b9bd0a0f208d84a1b1b09e8d50fda773/1d7b9cdc0e3bf183b5fd7663436799c8.img.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/df9864829a249bf0edb2f8fd68023a7f/f2803ffd1c6679fe0ab1fadf8fdae234.img.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/df9864829a249b

In [71]:
# Drop the ".img" namespace that leech paths carry before the file extension
# (e.g. "<md5>.img.jpg" -> "<md5>.jpg") so they can be compared with the db paths.
# Only the file name (last path component) is rewritten; an "img." that happens
# to occur in an archive or fond segment is left untouched.
leech_1kcv_prepared = [
    head + sep + name.replace(".img.", ".")
    for head, sep, name in (path.rpartition("/") for path in leech_1kcv_prepared)
]
leech_1kcv_prepared

['AT-ADG/29e55390c6fae100de8ec8677907460c/793854dfc42db8230321b0ba74b61359/8ee78b0e8596235262c768f2093432bb.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/207cd526e08396b4255b12fa19e8e4f8/4844ee9f686008891a44821c6133694d.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/207cd526e08396b4255b12fa19e8e4f8/c4db1e7133cbc99ad5f27dde73046f3c.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/818d9e5309729a2abb0262a8373f9a40/f5da25d3130ca7bb2f16687aa2dae644.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/818d9e5309729a2abb0262a8373f9a40/93e44a08895ea48a77734453a2478cf5.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/b9bd0a0f208d84a1b1b09e8d50fda773/28cb9061696fc6c53467ffe02f5ff7a5.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/b9bd0a0f208d84a1b1b09e8d50fda773/1d7b9cdc0e3bf183b5fd7663436799c8.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/df9864829a249bf0edb2f8fd68023a7f/f2803ffd1c6679fe0ab1fadf8fdae234.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/df9864829a249bf0edb2f8fd68023a7f/01c5414967257

In [84]:
# Strip surrounding whitespace (including the trailing newline) from every db entry.
db_1kcv_prepared = list(map(str.strip, db_1kcv))
db_1kcv_prepared

['./CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/734b442b9e7b98ee60194c7d98e64dc9/8a016c9ca6c65db97469be6bae50801a.jpg',
 './CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/734b442b9e7b98ee60194c7d98e64dc9/92f3d951eb2eafee56e3cba22a9800d2.jpg',
 './CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/d9f6b8a60fca867bab6075985433f94e/c09be37f3686e50328765a981f1f7df5.jpg',
 './CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/d9f6b8a60fca867bab6075985433f94e/2b315ee635dae77f42553af3be1c0c26.jpg',
 './CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/bece238e44cdc3dcd09498ca66a38029/67faa393d153b67f132b3b111e082e8b.jpg',
 './CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/bece238e44cdc3dcd09498ca66a38029/28495f3f1bc71b7a1da167e571acc7ac.jpg',
 './DE-StaAMainz/2f1a1cbb4a8eff9877d0368dde00ea0f/6ce9aa052ec782b1ddb2e7f382bd2ede/45857fca4f4a803753f3fedc423d64d5.jpg',
 './DE-StaAMainz/2f1a1cbb4a8eff9877d0368dde00ea0f/6ce9aa052ec782b1ddb2e7f382bd2ede/b3691899cd4ff3d399d4e6c1464a0ae0.jpg',
 './COLLECTIONS/580d3f8aac8372af6cf3052f0ef2ed9a/84400a67f9e

In [85]:
# Remove the leading "./" of the db paths so they line up with the leech paths.
# The prefix is only cut when it is actually present: this cell reassigns its own
# input, and an unconditional [2:] would eat two more characters on every re-run.
db_1kcv_prepared = [
    path[2:] if path.startswith("./") else path
    for path in db_1kcv_prepared
]
db_1kcv_prepared

['CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/734b442b9e7b98ee60194c7d98e64dc9/8a016c9ca6c65db97469be6bae50801a.jpg',
 'CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/734b442b9e7b98ee60194c7d98e64dc9/92f3d951eb2eafee56e3cba22a9800d2.jpg',
 'CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/d9f6b8a60fca867bab6075985433f94e/c09be37f3686e50328765a981f1f7df5.jpg',
 'CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/d9f6b8a60fca867bab6075985433f94e/2b315ee635dae77f42553af3be1c0c26.jpg',
 'CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/bece238e44cdc3dcd09498ca66a38029/67faa393d153b67f132b3b111e082e8b.jpg',
 'CZ-APH/485891d8d51c342a9f2e3dd4fe3469ea/bece238e44cdc3dcd09498ca66a38029/28495f3f1bc71b7a1da167e571acc7ac.jpg',
 'DE-StaAMainz/2f1a1cbb4a8eff9877d0368dde00ea0f/6ce9aa052ec782b1ddb2e7f382bd2ede/45857fca4f4a803753f3fedc423d64d5.jpg',
 'DE-StaAMainz/2f1a1cbb4a8eff9877d0368dde00ea0f/6ce9aa052ec782b1ddb2e7f382bd2ede/b3691899cd4ff3d399d4e6c1464a0ae0.jpg',
 'COLLECTIONS/580d3f8aac8372af6cf3052f0ef2ed9a/84400a67f9e6ad652c7a0c24e772a

# check

summary: leech produces different fond hashes than db; the charter hashes are the same

In [88]:
# Sorted view of the prepared leech paths, for comparison with the sorted db paths.
sorted(leech_1kcv_prepared)

['AT-ADG/29e55390c6fae100de8ec8677907460c/793854dfc42db8230321b0ba74b61359/8ee78b0e8596235262c768f2093432bb.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/207cd526e08396b4255b12fa19e8e4f8/4844ee9f686008891a44821c6133694d.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/207cd526e08396b4255b12fa19e8e4f8/c4db1e7133cbc99ad5f27dde73046f3c.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/818d9e5309729a2abb0262a8373f9a40/93e44a08895ea48a77734453a2478cf5.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/818d9e5309729a2abb0262a8373f9a40/f5da25d3130ca7bb2f16687aa2dae644.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/b9bd0a0f208d84a1b1b09e8d50fda773/1d7b9cdc0e3bf183b5fd7663436799c8.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/b9bd0a0f208d84a1b1b09e8d50fda773/28cb9061696fc6c53467ffe02f5ff7a5.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/df9864829a249bf0edb2f8fd68023a7f/01c54149672570d1d6e86308fb8125f5.jpg',
 'AT-AES/d3a416ef7813f88859c305fb83b20b5b/df9864829a249bf0edb2f8fd68023a7f/f2803ffd1c667

In [89]:
# Sorted view of the prepared db paths, for comparison with the sorted leech paths.
sorted(db_1kcv_prepared)

['AT-ADG/1fa8326a93b73419e186f8cf32cc86de/440903d445b953b6ac1d057b5d3a5fb3/e3bc52fa6e97b581abbc9d2fee260fda.jpg',
 'AT-ADG/1fa8326a93b73419e186f8cf32cc86de/516cade94d3b1b7c06da5b90bcc8aafc/9b533f4327f8809228d84793ef5ce372.jpg',
 'AT-ADG/1fa8326a93b73419e186f8cf32cc86de/793854dfc42db8230321b0ba74b61359/8ee78b0e8596235262c768f2093432bb.jpg',
 'AT-AES/fd5d0808707281748845a6bf394bc9cc/207cd526e08396b4255b12fa19e8e4f8/4844ee9f686008891a44821c6133694d.jpg',
 'AT-AES/fd5d0808707281748845a6bf394bc9cc/207cd526e08396b4255b12fa19e8e4f8/c4db1e7133cbc99ad5f27dde73046f3c.jpg',
 'AT-AES/fd5d0808707281748845a6bf394bc9cc/818d9e5309729a2abb0262a8373f9a40/93e44a08895ea48a77734453a2478cf5.jpg',
 'AT-AES/fd5d0808707281748845a6bf394bc9cc/818d9e5309729a2abb0262a8373f9a40/f5da25d3130ca7bb2f16687aa2dae644.jpg',
 'AT-AES/fd5d0808707281748845a6bf394bc9cc/b9bd0a0f208d84a1b1b09e8d50fda773/1d7b9cdc0e3bf183b5fd7663436799c8.jpg',
 'AT-AES/fd5d0808707281748845a6bf394bc9cc/b9bd0a0f208d84a1b1b09e8d50fda773/28cb9061696fc

In [38]:
# Unique paths of each listing, used for the set comparisons.
db_set, leech_set = map(set, (db_1kcv_prepared, leech_1kcv_prepared))

In [37]:
# Number of distinct db paths that do not occur in the leech listing.
len(set(db_1kcv_prepared).difference(leech_1kcv_prepared))

1848

In [33]:
# Distinct leech paths that do not occur in the db listing.
# A single set difference replaces the original membership test, which rebuilt
# set(db_1kcv_prepared) once per leech item.
temp = list(set(leech_1kcv_prepared) - set(db_1kcv_prepared))
len(temp)

1482

In [41]:
# Same count as above, computed from the prebuilt sets.
len(db_set - leech_set)

1848

# check collections only

In [90]:
# Leech paths that mention "COLLECTIONS", in sorted order.
sorted(path for path in leech_1kcv_prepared if "COLLECTIONS" in path)

['COLLECTIONS/21ee1643feb18bbd7c93133b82ba98f0/b14e8b6a40f10ab9f0a9608ba592c531/5e8f354c33274f89ad0cb978e98110d5.jpg',
 'COLLECTIONS/21ee1643feb18bbd7c93133b82ba98f0/b14e8b6a40f10ab9f0a9608ba592c531/bea9b143e6868f5aeb7e75f993653799.jpg',
 'COLLECTIONS/49912d7662a3b6fcd89f8904b2ce0d25/1454ffa7d6b71da37f8a038d98c9a3af/ab2e9336b5fb944a26ee0fbff48ee144.jpg',
 'COLLECTIONS/6ee249de58b529b09027aa429b4a2598/05b4fbbcb6ea05008a6b6fa92870a147/3072677387a76dbd985162af485da25c.png',
 'COLLECTIONS/6ee249de58b529b09027aa429b4a2598/05b4fbbcb6ea05008a6b6fa92870a147/87f0dad0b44818c95e70ccf8337a88b6.png',
 'COLLECTIONS/76ec9067eafaa10725cf905eda721dde/0d0e11d33bc0e4f35f29280e49952e31/58a76612d7f927b92c7547043a5acb39.png',
 'COLLECTIONS/76ec9067eafaa10725cf905eda721dde/0d0e11d33bc0e4f35f29280e49952e31/c39b53c4f3fa9baecb3064784af33b1f.png',
 'COLLECTIONS/76ec9067eafaa10725cf905eda721dde/3892c3d676f3be6b6f1870e19a28f4ea/2d15dcd9797d5121c4f75b71cac04ffc.png',
 'COLLECTIONS/76ec9067eafaa10725cf905eda721dde/8

In [92]:
# Db paths that mention "COLLECTIONS", in sorted order.
sorted(path for path in db_1kcv_prepared if "COLLECTIONS" in path)

['COLLECTIONS/032bf56eaffba58c224ea4ea4543a3c8/0d0e11d33bc0e4f35f29280e49952e31/58a76612d7f927b92c7547043a5acb39.png',
 'COLLECTIONS/032bf56eaffba58c224ea4ea4543a3c8/0d0e11d33bc0e4f35f29280e49952e31/c39b53c4f3fa9baecb3064784af33b1f.png',
 'COLLECTIONS/032bf56eaffba58c224ea4ea4543a3c8/3892c3d676f3be6b6f1870e19a28f4ea/2d15dcd9797d5121c4f75b71cac04ffc.png',
 'COLLECTIONS/032bf56eaffba58c224ea4ea4543a3c8/8f3d362c0e0b1f890a41e40a2f42d00d/32ea9346e779a867f73231bf0f2e55c4.png',
 'COLLECTIONS/032bf56eaffba58c224ea4ea4543a3c8/a31106151c492002b41c6693bae046f8/6be9afdd11bc3c6cba2e358b302faa1d.png',
 'COLLECTIONS/032bf56eaffba58c224ea4ea4543a3c8/a31106151c492002b41c6693bae046f8/ec19c886e9d69e6b014ccf5c051b603b.png',
 'COLLECTIONS/032bf56eaffba58c224ea4ea4543a3c8/b8bb3c464955f46f47ac35d7e3432fcc/0f78bc6560be9496fdaf6774976c07e6.png',
 'COLLECTIONS/032bf56eaffba58c224ea4ea4543a3c8/f79fd41aa80ef8a3bc9ab286a2dd0338/2e2959ef8d6be4bdfd5750883d3c406b.png',
 'COLLECTIONS/580d3f8aac8372af6cf3052f0ef2ed9a/8

# Why different fonds?

In [1]:
from ddp_util import decompose_chatomid

In [2]:
# Decompose a sample charter atomid; the output below shows the split parts
# followed by the corresponding archive atomid and fond atomid.
decompose_chatomid("tag:www.monasterium.net,2011:/charter/DE-StadtA-BOR-W/Urkunden/423")

(['tag:www.monasterium.net,2011:',
  'charter',
  'DE-StadtA-BOR-W',
  'Urkunden',
  '423'],
 'tag:www.monasterium.net,2011:/archive/DE-StadtA-BOR-W',
 'tag:www.monasterium.net,2011:/fond/DE-StadtA-BOR-W/Urkunden')

In [3]:
from ddp_util import fo_atomid_to_url, ar_atomid_to_url

In [4]:
# Turn the fond atomid from above into its monasterium.net URL.
fo_atomid_to_url("tag:www.monasterium.net,2011:/fond/DE-StadtA-BOR-W/Urkunden")

'https://www.monasterium.net/mom/DE-StadtA-BOR-W/Urkunden/fond'

In [5]:
# Turn the archive atomid from above into its monasterium.net URL.
ar_atomid_to_url("tag:www.monasterium.net,2011:/archive/DE-StadtA-BOR-W")

'https://www.monasterium.net/mom/DE-StadtA-BOR-W/archive'

# inspect leech_charter

In [26]:
from ddp_util import get_charter_path_elements

In [27]:
import urllib

In [44]:
from bs4 import BeautifulSoup
import re

In [47]:
from urllib.parse import unquote

In [55]:
def get_names_from_charter_html(html: str) -> tuple:
    """Extract archive name, fond name and charter atomid from a charter HTML page.

    The ``<a>`` links of the page are classified by their ``href``:

    * At least one ``mom/<name>/collection`` link and no archive link: the
      charter belongs to a collection. ``archive_name`` is the literal
      ``"COLLECTIONS"`` and ``fond_name`` is taken from the first collection link.
    * No collection link, exactly one distinct archive link and exactly one
      distinct fond link: both names are taken from those links.

    The charter atomid is read from the ``id`` query parameter of the single
    link with ``target="blank"``, which must be labelled "PDF-Export".

    Args:
        html: HTML source of a charter page.

    Returns:
        Tuple ``(archive_name, fond_name, charter_atomid)``.

    Raises:
        ValueError: If the page is an error page (it contains
            ``<div id="error-content">``) or its links match neither layout.
        AssertionError: If the PDF-export link cannot be isolated.
    """
    # Parse once and reuse the tree for the links and for the error-page check.
    soup = BeautifulSoup(html, features="lxml")
    href_list = list(soup.find_all("a"))

    def hrefs_matching(pattern):
        # hrefs of all links whose href contains a match for `pattern`.
        return [a.attrs["href"] for a in href_list
                if pattern.search(a.attrs.get("href", ""))]

    archive_hrefs = hrefs_matching(re.compile(r"mom/[0-9A-Za-z\-]+/archive"))
    fond_hrefs = hrefs_matching(re.compile(r"mom/[0-9A-Za-z\-]+/.*/fond"))
    collection_hrefs = hrefs_matching(re.compile(r"mom/[0-9A-Za-z\-]+/collection"))

    n_archives = len(set(archive_hrefs))
    n_fonds = len(set(fond_hrefs))
    n_collections = len(set(collection_hrefs))

    if n_collections >= 1 and n_archives == 0:
        # Several collections are possible, e.g.
        # https://www.monasterium.net/mom/IlluminierteUrkunden/1257-01-99_Auxerre/charter has 2;
        # the first one is used.
        # Fond links are not checked here because this charter breaks the no_fond case:
        # https://www.monasterium.net/mom/IlluminierteUrkunden/1216-04-15_Michaelbeuern/charter
        # TODO (anguelos) name or whole atomid
        fond_name = collection_hrefs[0].replace(
            "/mom/", "").replace("/collection", "")
        archive_name = "COLLECTIONS"
    elif n_collections == 0 and n_archives == 1 and n_fonds == 1:
        # TODO (anguelos) name or whole atomid
        fond_name = fond_hrefs[0].split("/fond")[0].split("/")[-1]
        # TODO (anguelos) name or whole atomid
        archive_name = archive_hrefs[0].replace(
            "/mom/", "").replace("/archive", "")
    elif soup.find_all("div", attrs={"id": "error-content"}):
        # <div id="error-content">: assuming this restricts a few charters
        raise ValueError("charter page is an error page")
    else:
        # html page not a parsable charter; report what was found instead of
        # printing it and terminating the interpreter.
        raise ValueError(
            "html page is not a parsable charter: "
            f"collection_hrefs={collection_hrefs!r}, "
            f"archive_hrefs={archive_hrefs!r}, "
            f"fond_hrefs={fond_hrefs!r}"
        )

    pdf_export_href_list = [
        a for a in href_list if a.attrs.get("target", "") == "blank"]
    # hopefully we isolated a single href
    assert len(pdf_export_href_list) == 1, \
        "expected exactly one link with target=\"blank\""
    pdf_export_link = pdf_export_href_list[0]
    assert pdf_export_link.text.replace(" ", "").lower() == "pdf-export", \
        "the target=\"blank\" link is not the PDF export link"

    # The atomid is the URL-encoded value of the "id" query parameter.
    charter_atomid = pdf_export_link.attrs["href"]
    charter_atomid = charter_atomid.split("?id=")[1].split("&")[0]
    charter_atomid = unquote(charter_atomid)

    return archive_name, fond_name, charter_atomid

In [59]:
def leech_charter(charter_url):
    """Download one charter page and derive its path elements.

    Args:
        charter_url: URL of the charter page to fetch.

    Returns:
        Tuple ``(archive_name, fond_name, charter_name, charter_atomid,
        charter_full_path)``. The first three are the values returned by
        ``get_charter_path_elements``; ``charter_full_path`` joins them below
        the notebook-level ``root``.

    Raises:
        ValueError: Propagated from ``get_names_from_charter_html`` when the
            page cannot be parsed as a charter.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded.
    import urllib.request

    # The context manager closes the HTTP response once the body has been read.
    with urllib.request.urlopen(charter_url, timeout=10) as response:
        charter_html = response.read().decode("utf8")

    archive_name, fond_name, charter_atomid = get_names_from_charter_html(charter_html)
    # Show the fond name as scraped from the page, before it is replaced below.
    print(fond_name)
    archive_name, fond_name, charter_name = get_charter_path_elements(archive_name, fond_name, charter_atomid)
    charter_full_path = f"{root}/{archive_name}/{fond_name}/{charter_name}"
    return archive_name, fond_name, charter_name, charter_atomid, charter_full_path

In [32]:
# Prefix that leech_charter puts in front of every charter path; left empty here.
root = ""

In [60]:
# Leech one sample charter; the result is
# (archive_name, fond_name, charter_name, charter_atomid, charter_full_path).
leech_charter("https://www.monasterium.net/mom/DE-StadtA-BOR-W/Urkunden/423/charter")

Urkunden


('DE-StadtA-BOR-W',
 'd3a416ef7813f88859c305fb83b20b5b',
 'd05077768c3c451f87e6d7564a350b31',
 'tag:www.monasterium.net,2011:/charter/DE-StadtA-BOR-W/Urkunden/423',
 '/DE-StadtA-BOR-W/d3a416ef7813f88859c305fb83b20b5b/d05077768c3c451f87e6d7564a350b31')

# --> the fond id is not transformed; only the fond name is used (same for collections)
TODO:
- fond_name --> fond_atomid to md5
- collection_name --> collection_atomid to md5