# imports

In [None]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import csv 
import pandas as pd
import numpy as np
import pickle
import re
from pprint import pprint
import random
from itertools import chain

from ddp_util import chatomid_to_url, url_to_chatomid

import glob
import ddp_util

# load datasets, make clean dfs and lists

In [None]:
# Charter table generated on the respective date by parsing a Monasterium db dump
# ("full20220125-1710") with /notebooks/XML2PD/XML2PD.ipynb.
all_df_full = pd.read_json(
    "../../data/in/sampling/charters_2022-09-18-2211.json"
)
# Explode list-valued "atom_id" entries and keep only that column, renamed to "atomid".
all_df = (
    all_df_full
    .explode("atom_id")["atom_id"]
    .rename("atomid")
)
print(all_df)
all_list = all_df.to_list()

In [None]:
# 1000 charter project sample, downloaded from GSheets.
m_cv_df = pd.read_csv(
    "../../data/in/sampling/1000charters_gsheet.csv",
    encoding="utf-8",
)
# The column header used here is "URL " (with a trailing space); every URL is
# converted to its atom id and the resulting Series is named "atomid".
m_cv_df = m_cv_df["URL "].apply(url_to_chatomid).rename("atomid")
print(m_cv_df)
m_cv_list = m_cv_df.to_list()

In [None]:
# load 1000 charter project sample from leech ?
#WSL ~/data/didip/tmp/data/leech/1000_Charters ... once copied, unclear where from
# Read the list line by line instead of with pd.read_fwf: read_fwf infers a
# fixed column width from the first rows of the file and cuts off every later
# line that is longer than that width.
# NOTE(review): this is presumably the cause of the truncated atom ids seen in
# this sample -- confirm against the source file.
with open("../../data/in/sampling/atomid_list.txt", encoding="utf-8") as atomid_file:
    m_cv_leech_df = pd.Series(
        # one atom id per line; surrounding whitespace and blank lines are dropped
        [line.strip() for line in atomid_file if line.strip()],
        name="atomid",
    )
print(m_cv_leech_df)
m_cv_leech_list = m_cv_leech_df.to_list()

In [None]:
# load 1000 charter project sample from other leechdump
#atzenhofer@21-PX010:/data/anguelos/monasterium/tmp/data/leech$ 
# Read the list line by line instead of with pd.read_fwf: read_fwf infers a
# fixed column width from the first rows of the file and cuts off every later
# line that is longer than that width.
# NOTE(review): this is presumably the cause of the truncated atom ids seen in
# this sample -- confirm against the source file.
with open("../../data/in/sampling/atomid_list_second.txt", encoding="utf-8") as atomid_file:
    m_cv_leech_2nd_df = pd.Series(
        # one atom id per line; surrounding whitespace and blank lines are dropped
        [line.strip() for line in atomid_file if line.strip()],
        name="atomid",
    )
print(m_cv_leech_2nd_df)
m_cv_leech_2nd_list = m_cv_leech_2nd_df.to_list()

In [None]:
# 1000 charter NLP sample, downloaded from Unicloud.
m_nlp_df = pd.read_parquet(
    "../../data/in/sampling/1000charters_nlp.parquet",
    engine="pyarrow",
)
# Explode list-valued "atom_id" entries, keep that column and renumber the rows.
m_nlp_df = (
    m_nlp_df
    .explode("atom_id")["atom_id"]
    .reset_index(drop=True)
)
print(m_nlp_df)
m_nlp_list = m_nlp_df.to_list()

# make list of lists for contingency

In [None]:
# Atom-id lists of all previously drawn samples, collected in one list.
lists_list = [
    m_nlp_list,
    m_cv_list,
    m_cv_leech_list,
    m_cv_leech_2nd_list,
]

# check differences

In [None]:
def get_var_name(variable):
    """Return the names of all module-level variables bound to this exact object.

    The match is by identity (``is``), not equality, so every global name that
    currently points at ``variable`` is reported, in the order the names appear
    in ``globals()``.
    """
    return [name for name, value in globals().items() if value is variable]

def isect(list1, list2):
    """Return a list of the distinct elements found in both ``list1`` and ``list2``.

    Both inputs are converted to sets, so duplicates collapse and the order of
    the result is not defined.
    """
    return list(set(list1) & set(list2))

def diff(list1, list2):
    """Return a list of the distinct elements of ``list1`` that are not in ``list2``.

    Both inputs are converted to sets, so duplicates collapse and the order of
    the result is not defined. The operation is not symmetric.
    """
    return list(set(list1) - set(list2))

In [None]:
tab = "\t"
nl = "\n"

# Keep each list together with its name explicitly. Looking the name up through
# globals() by identity is unreliable here: the loop variables are globals too
# and point at the same list objects, so the lookup also returns their names,
# in an order that depends on which cells ran before.
named_lists = {
    "m_nlp_list": m_nlp_list,
    "m_cv_list": m_cv_list,
    "m_cv_leech_list": m_cv_leech_list,
    "m_cv_leech_2nd_list": m_cv_leech_2nd_list,
}
# Same ordering as before: the lists are compared by their contents.
sorted_named = sorted(named_lists.items(), key=lambda item: item[1])
sorted_list = [atomids for _, atomids in sorted_named]

for name, atomids in sorted_named:
    print(f"{name} has a length of {len(atomids)}")

print(nl)

# Compare every list with each list that precedes it in the sorted order.
# Pairing by position (instead of stopping at the first *equal* list) keeps all
# pairs even when two samples happen to have identical contents.
for position, (name_i, list_i) in enumerate(sorted_named):
    for name_j, list_j in sorted_named[:position]:
        intersection = isect(list_i, list_j)
        # one-directional: elements of list_i that are missing from list_j
        difference = diff(list_i, list_j)
        print(f"{name_i} {tab} and {tab} {name_j} {tab} -> intersection of {len(intersection)} and a difference of {len(difference)}.")

In [None]:
# from co-programming w anguelos: check 1000 charter dirs
#print(glob.glob("../../misc/1000_Charters/*/*/*/*json"))
# daniel_dirs = sorted(set(["/".join(f.split("/")[:-1]) for f in glob.glob("../../misc/1000_Charters/*/*/*/*.*.json")]))
# daniel_atoms = [open(f"{f}/url.txt").read() for f in daniel_dirs]
# daniel_atoms

#print(len(daniel_atoms))
# print(len(daniel_dirs))
# print(set(daniel_dirs)-set())

In [None]:
# print(chatomid_to_url("tag:www.monasterium.net,2011:/charter/AT-StaAR/UrkDominikanerkloster/StA_Retz%7CD%7CU1%7C1303"))
# print(chatomid_to_url("tag:www.monasterium.net,2011:/charter/AT-StaAR/UrkDominikanerkloster/StA_Retz%7CD"))

# "tag:www.monasterium.net,2011:/charter/SK-SNA/4156-SukromnyArchivBratislavskejKapituly/636"
# "tag:www.monasterium.net,2011:/charter/SK-SNA/4156-SukromnyArchivBratislavskejKapi"

#80 lengths cutoff?
# "tag:www.monasterium.net,2011:/charter/AT-NOeLA/HA_Seefeld-HardeggerUrk/Hardegger_"
# "tag:www.monasterium.net,2011:/charter/SK-SNA/4156-SukromnyArchivBratislavskejKapi"

## check set of sample atomids
conclusion: buggy urls -> buggy atomids -> uneven number

In [None]:
# Concatenation of all four sample lists; duplicates are kept.
union_list = [
    *m_nlp_list,
    *m_cv_list,
    *m_cv_leech_list,
    *m_cv_leech_2nd_list,
]

In [None]:
# A notebook cell only displays its last bare expression, so print both counts
# explicitly: total entries vs. distinct atom ids across all samples.
print(len(union_list))
print(len(set(union_list)))

# reduce by previous nlp and cv sample

In [None]:
# Membership masks over all atom ids, computed once and reused below.
in_old_nlp = all_df.isin(m_nlp_list)
in_old_cv = all_df.isin(m_cv_list)

all_less_nlp = all_df[~in_old_nlp]
all_less_cv = all_df[~in_old_cv]
# Combine both masks in a single selection. Chaining two selections would apply
# a full-length mask to an already shortened Series, which depends on implicit
# index re-alignment and breaks if the exploded index holds repeated labels.
all_less_nlp_cv = all_df[~in_old_nlp & ~in_old_cv]

# get charters with tenor

In [None]:
# Rows whose joined tenor text is not the empty string.
# NOTE(review): astype(str) turns missing values into non-empty strings such as
# "nan"/"None", so rows with a missing tenor would pass this filter -- confirm
# that "cei_tenor_joined" never contains missing values.
has_tenor = all_df_full["cei_tenor_joined"].astype(str) != ""
# Series.explode takes no column name; its only parameter is `ignore_index`.
# The former call passed "atom_id" there, which acted as a truthy flag, so
# ignore_index=True spells out the same behaviour (fresh 0..n-1 index).
all_w_tenor = all_df_full.loc[has_tenor, "atom_id"].explode(ignore_index=True)
all_w_tenor

# remove previous nlp and cv samples from all charters with tenor

In [None]:
# Drop every charter that is already part of the old nlp or the cv sample.
# Both conditions go into one mask so the selection never has to re-align a
# full-length mask against an already filtered Series.
in_previous_samples = all_w_tenor.isin(m_nlp_list) | all_w_tenor.isin(m_cv_list)
all_w_tenor_reduced = all_w_tenor[~in_previous_samples]
all_w_tenor_reduced

# generate new 1k nlp sample

In [None]:
# Draw 1000 charters with tenor; the fixed random_state makes the draw repeatable.
new_nlp_sample = all_w_tenor_reduced.sample(
    n=1000,
    random_state=50,
)
new_nlp_sample

# remove new nlp sample from all remaining charters

In [None]:
# Everything left after the old samples, minus the freshly drawn nlp sample;
# rows are renumbered from 0.
all_less_nlp_cv_nlp2 = (
    all_less_nlp_cv
    .loc[~all_less_nlp_cv.isin(new_nlp_sample)]
    .reset_index(drop=True)
)
all_less_nlp_cv_nlp2

# generate new 3k sample

In [None]:
# Draw 3000 charters from the remaining pool; the fixed random_state makes the
# draw repeatable.
sample_3k = all_less_nlp_cv_nlp2.sample(
    n=3000,
    random_state=50,
)
sample_3k

# remove 3k sample from the remaining charters

In [None]:
# Remaining pool without the 3k sample; rows are renumbered from 0.
all_less_samples = (
    all_less_nlp_cv_nlp2
    .loc[~all_less_nlp_cv_nlp2.isin(sample_3k)]
    .reset_index(drop=True)
)
all_less_samples

# check lengths

In [None]:
# Print the size of every reduction step, from the full set down to what is
# left after all samples were removed.
for remaining in (
    all_df,
    all_less_nlp,
    all_less_cv,
    all_less_nlp_cv,
    all_less_nlp_cv_nlp2,
    all_less_samples,
):
    print(len(remaining))

# extend atomids of 1k charters by 3k charters; +list of new nlp

In [None]:
# Atom ids of the new 3k sample.
atomids_3k = sample_3k.to_list()
# cv sample ids followed by the 3k sample ids.
atomids_4k = [*m_cv_list, *atomids_3k]
# Atom ids of the new nlp sample.
atomids_1k_nlp_new = new_nlp_sample.to_list()

# urls

In [None]:
# Convert every atom id of each sample to its URL, keeping the list order.
urls_3k = list(map(chatomid_to_url, atomids_3k))
urls_4k = list(map(chatomid_to_url, atomids_4k))
urls_1k_nlp_new = list(map(chatomid_to_url, atomids_1k_nlp_new))

In [None]:
# random.seed(a=50)
# for i in random.sample(urls_4k, 10):
#     print(i)
# random.getstate()

# export 

## urls

In [None]:
# Write one URL per line. The lines are written directly rather than through
# csv.writer with "\n" as delimiter: a newline is not a real field delimiter
# (newer csv versions may reject it -- see the csv docs for your Python
# version), and that approach ended the file with "\r\n" while separating the
# entries with "\n".
for stem, urls in (
    ("urls_3k", urls_3k),
    ("urls_4k", urls_4k),
    ("urls_1k_nlp_new", urls_1k_nlp_new),
):
    # newline="" keeps "\n" as written on every platform; explicit utf-8
    # matches the encoding used when reading the input sheet.
    with open(f"../../data/out/samples/{stem}.csv", "w", newline="", encoding="utf-8") as file:
        file.writelines(f"{url}\n" for url in urls)

## atomids

In [None]:
# Write one atom id per line. The lines are written directly rather than
# through csv.writer with "\n" as delimiter: a newline is not a real field
# delimiter (newer csv versions may reject it -- see the csv docs for your
# Python version), and that approach ended the file with "\r\n" while
# separating the entries with "\n".
for stem, atomids in (
    ("atomids_3k", atomids_3k),
    ("atomids_4k", atomids_4k),
    ("atomids_1k_nlp_new", atomids_1k_nlp_new),
):
    # newline="" keeps "\n" as written on every platform; explicit utf-8
    # matches the encoding used when reading the input sheet.
    with open(f"../../data/out/samples/{stem}.csv", "w", newline="", encoding="utf-8") as file:
        file.writelines(f"{atomid}\n" for atomid in atomids)

# HU in samples
says:
In the cv 1k charters project, the amount of /HU- charters was 38.5%.

In the old nlp sample, it was 0.63%.

In the new nlp sample, it is 0.6%.

In the new 3k one, it is 25.33%. 

In [None]:
# check amount of /HU- in samples
n = "\n"
exploded = all_df_full.explode("atom_id")


def _hu_rows(sample_ids):
    """Return the exploded charter rows of a sample whose atom id contains "/HU-"."""
    in_sample = exploded[exploded["atom_id"].isin(sample_ids)]
    return in_sample[in_sample["atom_id"].astype(str).str.contains("/HU-")]


def _percent(part, whole):
    """Return len(part) as a percentage of len(whole), rounded to 2 decimals.

    Returns 0.0 for an empty ``whole`` instead of dividing by zero.
    """
    if not len(whole):
        return 0.0
    return round(len(part) / len(whole) * 100, 2)


m_cv_HU = _hu_rows(m_cv_list)
m_nlp_HU_old = _hu_rows(m_nlp_list)
m_nlp_HU_new = _hu_rows(new_nlp_sample.to_list())
sample_3k_HU = _hu_rows(atomids_3k)

# difference of HU % in samples
# Every share is taken relative to the real size of its own sample. Dividing
# both nlp samples by a fixed 3000 understated them, since the new nlp sample
# is drawn with n=1000 and the old one is the 1000 charter NLP sample.
print(f"""In the cv 1k charters project, the amount of /HU- charters was {_percent(m_cv_HU, m_cv_list)}%.
{n}In the old nlp sample, it was {_percent(m_nlp_HU_old, m_nlp_list)}%.
{n}In the new nlp sample, it is {_percent(m_nlp_HU_new, new_nlp_sample)}%.
{n}In the new 3k one, it is {_percent(sample_3k_HU, atomids_3k)}%. """)