# imports

In [115]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import csv 
import pandas as pd
import numpy as np
import pickle
import re
from pprint import pprint
import random
from itertools import chain

from ddp_util import chatomid_to_url, url_to_chatomid

# load datasets, make clean dfs and lists

In [57]:
# load json file generated on the respective date by parsing a Monasterium db dump ("full20220125-1710") using /notebooks/XML2PD/XML2PD.ipynyb
all_df_full = pd.read_json("../../data/in/sampling/charters_2022-09-18-2211.json")
all_df = all_df_full.explode("atom_id")["atom_id"].rename("atomid")
print(all_df)
all_list = all_df.to_list()

0         tag:www.monasterium.net,2011:/charter/069622fc...
1         tag:www.monasterium.net,2011:/charter/069622fc...
2         tag:www.monasterium.net,2011:/charter/069622fc...
3         tag:www.monasterium.net,2011:/charter/069622fc...
4         tag:www.monasterium.net,2011:/charter/069622fc...
                                ...                        
659165    tag:www.monasterium.net,2011:/charter/Wirtembe...
659166    tag:www.monasterium.net,2011:/charter/Wirtembe...
659167    tag:www.monasterium.net,2011:/charter/Wirtembe...
659168    tag:www.monasterium.net,2011:/charter/Wirtembe...
659169    tag:www.monasterium.net,2011:/charter/Wirtembe...
Name: atomid, Length: 659170, dtype: object


In [58]:
# load 1000 charter project sample, downloaded from GSheets
m_cv_df = pd.read_csv("../../data/in/sampling/1000charters_gsheet.csv", encoding="utf-8")
m_cv_df["atomid"] = m_cv_df["URL "].apply(lambda x : url_to_chatomid(x))
m_cv_df = m_cv_df["atomid"]
print(m_cv_df)
m_cv_list = m_cv_df.to_list()

0      tag:www.monasterium.net,2011:/charter/HU-MNL-D...
1      tag:www.monasterium.net,2011:/charter/DE-BayHS...
2      tag:www.monasterium.net,2011:/charter/AT-StiAS...
3      tag:www.monasterium.net,2011:/charter/HU-MNL-D...
4      tag:www.monasterium.net,2011:/charter/DE-HStAM...
                             ...                        
995    tag:www.monasterium.net,2011:/charter/AT-DOZA/...
996    tag:www.monasterium.net,2011:/charter/HU-MNL-D...
997    tag:www.monasterium.net,2011:/charter/DE-HStAM...
998    tag:www.monasterium.net,2011:/charter/DE-BayHS...
999    tag:www.monasterium.net,2011:/charter/AT-StiAG...
Name: atomid, Length: 1000, dtype: object


In [59]:
# load 1000 charter project sample from leech ?
#WSL ~/data/didip/tmp/data/leech/1000_Charters ... once copied, unclear where from
m_cv_leech_df = pd.read_fwf("../../data/in/sampling/atomid_list.txt", header=None)
m_cv_leech_df = m_cv_leech_df.squeeze().rename("atomid")
print(m_cv_leech_df)
m_cv_leech_list = m_cv_leech_df.to_list()

0      tag:www.monasterium.net,2011:/charter/AT-ADG/A...
1      tag:www.monasterium.net,2011:/charter/AT-ADG/A...
2      tag:www.monasterium.net,2011:/charter/AT-ADG/A...
3      tag:www.monasterium.net,2011:/charter/AT-ADG/A...
4      tag:www.monasterium.net,2011:/charter/AT-AES/U...
                             ...                        
995    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
996    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
997    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
998    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
999    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
Name: atomid, Length: 1000, dtype: object


In [60]:
# load 1000 charter project sample from other leechdump
#atzenhofer@21-PX010:/data/anguelos/monasterium/tmp/data/leech$ 
m_cv_leech_2nd_df = pd.read_fwf("../../data/in/sampling/atomid_list_second.txt", header=None)
m_cv_leech_2nd_df = m_cv_leech_2nd_df.squeeze().rename("atomid")
print(m_cv_leech_2nd_df)
m_cv_leech_2nd_list = m_cv_leech_2nd_df.to_list()

0      tag:www.monasterium.net,2011:/charter/AT-ADG/A...
1      tag:www.monasterium.net,2011:/charter/AT-ADG/A...
2      tag:www.monasterium.net,2011:/charter/AT-ADG/A...
3      tag:www.monasterium.net,2011:/charter/AT-ADG/A...
4      tag:www.monasterium.net,2011:/charter/AT-AES/U...
                             ...                        
995    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
996    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
997    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
998    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
999    tag:www.monasterium.net,2011:/charter/SK-SNA/4...
Name: atomid, Length: 1000, dtype: object


In [61]:
# load 1000 charter NLP sample, downloaded from Unicloud
m_nlp_df = pd.read_parquet("../../data/in/sampling/1000charters_nlp.parquet", engine="pyarrow")
m_nlp_df = m_nlp_df.explode("atom_id")["atom_id"].reset_index(drop=True)
print(m_nlp_df)
m_nlp_list = m_nlp_df.to_list()

0      tag:www.monasterium.net,2011:/charter/AFM/1.2.140
1      tag:www.monasterium.net,2011:/charter/StPCanRe...
2      tag:www.monasterium.net,2011:/charter/DEEDS/00...
3      tag:www.monasterium.net,2011:/charter/Bischoef...
4      tag:www.monasterium.net,2011:/charter/AFM/1.4....
                             ...                        
995    tag:www.monasterium.net,2011:/charter/Benedict...
996    tag:www.monasterium.net,2011:/charter/AFM/1.4....
997    tag:www.monasterium.net,2011:/charter/KlosterJ...
998    tag:www.monasterium.net,2011:/charter/Merkwuer...
999    tag:www.monasterium.net,2011:/charter/Lauresha...
Name: atom_id, Length: 1000, dtype: object


# make list of lists for contingency

In [62]:
lists_list = [m_nlp_list, m_cv_list, m_cv_leech_list, m_cv_leech_2nd_list]

# check differences

In [63]:
def get_var_name(variable):
    globals_dict = globals()
    return [var_name for var_name in globals_dict if globals_dict[var_name] is variable]


def isect(list1, list2):
    isect = list(set(list1).intersection(set(list2)))
    return isect

def diff(list1, list2):
    diff = list(set(list1).difference(set(list2)))
    return diff

In [64]:
tab = "\t"
nl = "\n"
sorted_list = sorted(lists_list)

for i in sorted_list:
    print(f"{(get_var_name(i))[0]} has a length of {len(i)}") #question to self: why is get-var-name inconsistent as to the order

print(nl)

for i in sorted_list:
    for j in sorted_list:
        intersection = isect(i, j)
        difference = diff(i, j)
        if i == j:
            break
        else:
          print(f"{(get_var_name(i))[0]} {tab} and {tab} {(get_var_name(j)[-2])} {tab} -> intersection of {len(intersection)} and a difference of {len(difference)}.")

m_nlp_list has a length of 1000
m_cv_leech_list has a length of 1000
m_cv_leech_2nd_list has a length of 1000
m_cv_list has a length of 1000


m_cv_leech_list 	 and 	 m_nlp_list 	 -> intersection of 0 and a difference of 989.
m_cv_leech_2nd_list 	 and 	 m_nlp_list 	 -> intersection of 0 and a difference of 989.
m_cv_leech_2nd_list 	 and 	 m_cv_leech_list 	 -> intersection of 989 and a difference of 0.
m_cv_list 	 and 	 m_nlp_list 	 -> intersection of 0 and a difference of 1000.
m_cv_list 	 and 	 m_cv_leech_list 	 -> intersection of 953 and a difference of 47.
m_cv_list 	 and 	 m_cv_leech_2nd_list 	 -> intersection of 953 and a difference of 47.


In [65]:
print(chatomid_to_url("tag:www.monasterium.net,2011:/charter/AT-StaAR/UrkDominikanerkloster/StA_Retz%7CD%7CU1%7C1303"))
print(chatomid_to_url("tag:www.monasterium.net,2011:/charter/AT-StaAR/UrkDominikanerkloster/StA_Retz%7CD"))

https://www.monasterium.net/mom/AT-StaAR/UrkDominikanerkloster/StA_Retz%7CD%7CU1%7C1303/charter
https://www.monasterium.net/mom/AT-StaAR/UrkDominikanerkloster/StA_Retz%7CD/charter


In [66]:
"tag:www.monasterium.net,2011:/charter/SK-SNA/4156-SukromnyArchivBratislavskejKapituly/636"
"tag:www.monasterium.net,2011:/charter/SK-SNA/4156-SukromnyArchivBratislavskejKapi"

'tag:www.monasterium.net,2011:/charter/SK-SNA/4156-SukromnyArchivBratislavskejKapi'

In [67]:
#80 lengths cutoff?
"tag:www.monasterium.net,2011:/charter/AT-NOeLA/HA_Seefeld-HardeggerUrk/Hardegger_"
"tag:www.monasterium.net,2011:/charter/SK-SNA/4156-SukromnyArchivBratislavskejKapi"

'tag:www.monasterium.net,2011:/charter/SK-SNA/4156-SukromnyArchivBratislavskejKapi'

## check set of sample atomids
conclusion: buggy urls -> buggy atomids -> uneven number

In [68]:
union_list = m_nlp_list + m_cv_list + m_cv_leech_list + m_cv_leech_2nd_list

In [69]:
len(union_list)
len(set(union_list))

2036

# reduce by previous nlp and cv sample

In [70]:
all_less_nlp = all_df[~all_df.isin(m_nlp_list)]
all_less_cv = all_df[~all_df.isin(m_cv_list)]
all_less_nlp_cv = all_df[~all_df.isin(m_nlp_list)][~all_df.isin(m_cv_list)]

# get charters with tenor

In [71]:
all_w_tenor = (all_df_full[all_df_full["cei_tenor_joined"].astype(str) != ""])["atom_id"].explode("atom_id")
all_w_tenor

0        tag:www.monasterium.net,2011:/charter/069622fc...
1        tag:www.monasterium.net,2011:/charter/672f41e1...
2        tag:www.monasterium.net,2011:/charter/672f41e1...
3        tag:www.monasterium.net,2011:/charter/672f41e1...
4        tag:www.monasterium.net,2011:/charter/672f41e1...
                               ...                        
72297    tag:www.monasterium.net,2011:/charter/Wirtembe...
72298    tag:www.monasterium.net,2011:/charter/Wirtembe...
72299    tag:www.monasterium.net,2011:/charter/Wirtembe...
72300    tag:www.monasterium.net,2011:/charter/Wirtembe...
72301    tag:www.monasterium.net,2011:/charter/Wirtembe...
Name: atom_id, Length: 72302, dtype: object

# deduce previous nlp sample from all tenors

In [72]:
all_w_tenor_reduced = all_w_tenor[~all_w_tenor.isin(m_nlp_list)][~all_w_tenor.isin(m_cv_list)]
all_w_tenor_reduced

0        tag:www.monasterium.net,2011:/charter/069622fc...
1        tag:www.monasterium.net,2011:/charter/672f41e1...
2        tag:www.monasterium.net,2011:/charter/672f41e1...
3        tag:www.monasterium.net,2011:/charter/672f41e1...
4        tag:www.monasterium.net,2011:/charter/672f41e1...
                               ...                        
72297    tag:www.monasterium.net,2011:/charter/Wirtembe...
72298    tag:www.monasterium.net,2011:/charter/Wirtembe...
72299    tag:www.monasterium.net,2011:/charter/Wirtembe...
72300    tag:www.monasterium.net,2011:/charter/Wirtembe...
72301    tag:www.monasterium.net,2011:/charter/Wirtembe...
Name: atom_id, Length: 71265, dtype: object

# generate new 1k nlp sample

In [73]:
new_nlp_sample = all_w_tenor_reduced.sample(n=1000, random_state=50)
new_nlp_sample

48351    tag:www.monasterium.net,2011:/charter/KlosterJ...
29133    tag:www.monasterium.net,2011:/charter/CSGVI/13...
6102     tag:www.monasterium.net,2011:/charter/AFM/1.7.647
11901    tag:www.monasterium.net,2011:/charter/AT-StiAK...
71673    tag:www.monasterium.net,2011:/charter/WhalleyA...
                               ...                        
23919    tag:www.monasterium.net,2011:/charter/CodexDip...
21294    tag:www.monasterium.net,2011:/charter/Chronico...
921      tag:www.monasterium.net,2011:/charter/AbteiSan...
70296    tag:www.monasterium.net,2011:/charter/UrkVonZo...
35172    tag:www.monasterium.net,2011:/charter/DEEDS/00...
Name: atom_id, Length: 1000, dtype: object

# deduce new nlp sample from all

In [74]:
all_less_nlp_cv_nlp2 = all_less_nlp_cv[~all_less_nlp_cv.isin(new_nlp_sample)].reset_index(drop=True)
all_less_nlp_cv_nlp2

0         tag:www.monasterium.net,2011:/charter/069622fc...
1         tag:www.monasterium.net,2011:/charter/069622fc...
2         tag:www.monasterium.net,2011:/charter/069622fc...
3         tag:www.monasterium.net,2011:/charter/069622fc...
4         tag:www.monasterium.net,2011:/charter/069622fc...
                                ...                        
656165    tag:www.monasterium.net,2011:/charter/Wirtembe...
656166    tag:www.monasterium.net,2011:/charter/Wirtembe...
656167    tag:www.monasterium.net,2011:/charter/Wirtembe...
656168    tag:www.monasterium.net,2011:/charter/Wirtembe...
656169    tag:www.monasterium.net,2011:/charter/Wirtembe...
Name: atomid, Length: 656170, dtype: object

# generate new 3k sample

In [75]:
sample_3k = all_less_nlp_cv_nlp2.sample(n=3000, random_state=50)
sample_3k

626651    tag:www.monasterium.net,2011:/charter/RIXIii/1...
652177    tag:www.monasterium.net,2011:/charter/Urkunden...
233008    tag:www.monasterium.net,2011:/charter/DE-BayHS...
501344    tag:www.monasterium.net,2011:/charter/Illumini...
349079    tag:www.monasterium.net,2011:/charter/HU-MNL-D...
                                ...                        
470024    tag:www.monasterium.net,2011:/charter/HU-MNL-D...
234764    tag:www.monasterium.net,2011:/charter/DE-BayHS...
23541     tag:www.monasterium.net,2011:/charter/AT-DOZA/...
253939    tag:www.monasterium.net,2011:/charter/DE-BayHS...
409778    tag:www.monasterium.net,2011:/charter/HU-MNL-D...
Name: atomid, Length: 3000, dtype: object

# deduce 3k sample

In [76]:
all_less_samples = all_less_nlp_cv_nlp2[~all_less_nlp_cv_nlp2.isin(sample_3k)].reset_index(drop=True)
all_less_samples

0         tag:www.monasterium.net,2011:/charter/069622fc...
1         tag:www.monasterium.net,2011:/charter/069622fc...
2         tag:www.monasterium.net,2011:/charter/069622fc...
3         tag:www.monasterium.net,2011:/charter/069622fc...
4         tag:www.monasterium.net,2011:/charter/069622fc...
                                ...                        
653165    tag:www.monasterium.net,2011:/charter/Wirtembe...
653166    tag:www.monasterium.net,2011:/charter/Wirtembe...
653167    tag:www.monasterium.net,2011:/charter/Wirtembe...
653168    tag:www.monasterium.net,2011:/charter/Wirtembe...
653169    tag:www.monasterium.net,2011:/charter/Wirtembe...
Name: atomid, Length: 653170, dtype: object

# check lengths

In [77]:
print(len(all_df))
print(len(all_less_nlp))
print(len(all_less_cv))
print(len(all_less_nlp_cv))
print(len(all_less_nlp_cv_nlp2))
print(len(all_less_samples))

659170
658170
658170
657170
656170
653170


# extend atomids of 1k charters by 3k charters; +list of new nlp

In [106]:
atomids_3k = sample_3k.to_list()
atomids_4k = list(chain(m_cv_list, sample_3k.to_list()))
atomids_1k_nlp_new = new_nlp_sample.to_list()

# urls

In [107]:
urls_3k = [chatomid_to_url(i) for i in atomids_3k]
urls_4k = [chatomid_to_url(i) for i in atomids_4k]
urls_1k_nlp_new = [chatomid_to_url(i) for i in atomids_1k_nlp_new]

In [125]:
# random.seed(a=50)
# for i in random.sample(urls_4k, 10):
#     print(i)
# random.getstate()

# export 

In [None]:
with open(f"../../data/out/samples/urls_5k.csv", "w", newline="") as file:
    writer = csv.writer(file, delimiter = "\n")
    writer.writerow(urls_5k)

with open(f"../../data/out/samples/urls_4k.csv", "w", newline="") as file:
    writer = csv.writer(file, delimiter = "\n")
    writer.writerow(urls_4k)

with open(f"../../data/out/samples/atomids_5k.csv", "w", newline="") as file:
    writer = csv.writer(file, delimiter = "\n")
    writer.writerow(atomids_5k)

# Misc

In [167]:
# check amount of /HU- in samples

exploded = all_df_full.explode("atom_id")
test = exploded[exploded["atom_id"].isin(m_cv_list)]
m_cv_HU = test[test["atom_id"].astype(str).str.contains("/HU-")]

exploded = all_df_full.explode("atom_id")
test = exploded[exploded["atom_id"].isin(atomids_3k)]
sample_3k_HU = test[test["atom_id"].astype(str).str.contains("/HU-")]

#[all_df_full.atom_id.isin(m_cv_df)]
#m_cv_HU = m_cv_df[m_cv_df["atomid"].astype(str).str.contains("/HU-")]
#sample_3k_HU = sample_3k[sample_3k["atomid"].astype(str).str.contains("/HU-")]

# difference of HU % in samples
print(f"In the 1k charters project, the amount of /HU- charters was {round((len(m_cv_HU))/1000*100,2)}%; in the new 3k one, it is {round((len(sample_3k_HU))/3000*100,2)}%.")

In the 1k charters project, the amount of /HU- charters was 38.5%; in the new 3k one, it is 25.33%.


In [149]:
m_cv_HU

Unnamed: 0,atom_id,cei_abstract_joined,cei_abstract_foreign,cei_tenor_joined,cei_pTenor,cei_placeName,cei_lang_MOM,cei_date,cei_dateRange,cei_date_ATTRIBUTE_value,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy
335652,"tag:www.monasterium.net,2011:/charter/HU-MNL-D...",Mátyás király a leleszi konventnek. A szepesi ...,[],,[],[],[Latin],[1467-03-11],[],[14670311],[],[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
335893,"tag:www.monasterium.net,2011:/charter/HU-MNL-D...",Beatrix királyné Pál presbitert (natus Thome l...,[],,[],[],[Latin],[1491-02-20],[],[14910220],[],[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
336545,"tag:www.monasterium.net,2011:/charter/HU-MNL-D...",Visegrád. 12. d. oct. Nat. Joh. Bapt. Szécsi M...,[],,[],[],[],[1381-07-12],[],[13810712],[],[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
336631,"tag:www.monasterium.net,2011:/charter/HU-MNL-D...",JÁSZÓI KONVENT,[],,[],[],[],[1393-01-02],[],[13930102],[],[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
336795,"tag:www.monasterium.net,2011:/charter/HU-MNL-D...",JÁSZÓI KONVENT,[],,[],[],[],[1413-03-10],[],[14130310],[],[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496200,"tag:www.monasterium.net,2011:/charter/HU-MNL-D...",A pannonhalmi konvent jelenti László királynak...,[],,[],[],[Latin],[1454-04-20],[],[14540420],[],[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
496218,"tag:www.monasterium.net,2011:/charter/HU-MNL-D...",ÚJLAKI MIKLÓS ERDÉLYI VAJDAÚJLAKI MIKLÓS MACSÓ...,[],,[],[],[],[1454-05-20],[],[14540520],[],[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
496514,"tag:www.monasterium.net,2011:/charter/HU-MNL-D...",A leleszi konvent (Balázs prépost és a konvent...,[],,[],[],[Latin],[1461-12-20],[],[14611220],[],[],[http://archives.hungaricana.hu/tile/thumb/cha...,[]
499542,"tag:www.monasterium.net,2011:/charter/HU-VFL/K...",Aussteller: ISTVÁN IFJABB KIRÁLY,[],,[],[],[],[],[],[12599999],[],[],"[VeL_Cap_1259_200657.jpg, VeL_Cap_1259_200657v...",[]


In [132]:
m_cv_HU

Unnamed: 0,atom_id,cei_abstract_joined,cei_abstract_foreign,cei_tenor_joined,cei_pTenor,cei_placeName,cei_lang_MOM,cei_date,cei_dateRange,cei_date_ATTRIBUTE_value,cei_dateRange_ATTRIBUTE_from,cei_dateRange_ATTRIBUTE_to,cei_graphic_ATTRIBUTE_url_orig,cei_graphic_ATTRIBUTE_url_copy
