In [None]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import csv 
import pandas as pd
import numpy as np
import pickle
import re
from pprint import pprint
from random import sample

from ddp_util import chatomid_to_url, url_to_chatomid

# load data

In [None]:
# Load the charter table written by /notebooks/XML2PD/XML2PD.ipynb. That notebook parsed a
# Monasterium db dump ("full20220125-1710"); the timestamp of the run is part of the file name.
contents_full = pd.read_parquet(
    '../../data/output/charters_2022-09-18-2211.parquet',
    engine='pyarrow',
)

In [None]:
# Read the "1000 charters" project sample (a CSV export downloaded from GSheets).
thousand_charter_sample = pd.read_csv(
    '../../data/in/1000charters_gsheet/1000charters_gsheet.csv',
    encoding='utf-8',
)

# build the list of atom IDs from the sample URLs ("m" = 1000, i.e. the 1000-charter sample)

In [None]:
# Third column of the sheet (positional index 2); its values are converted from URLs to
# atom IDs further down, so it is expected to hold the charter URLs.
m_charters = thousand_charter_sample.iloc[:, 2]
m_charters

In [None]:
# Materialise the column as a plain Python list.
m_charters_list = [entry for entry in m_charters]

In [None]:
# Convert every entry of the 1000-charter list with url_to_chatomid (URL -> atom ID).
m_charters_ids = list(map(url_to_chatomid, m_charters_list))

In [None]:
# Display the atom IDs derived from the 1000-charter sample for visual inspection.
m_charters_ids

# check difference of dataframes

In [None]:
# Explode the "atom_id" column so that list-like entries become one row per atom ID.
contents_full_exploded = contents_full.explode(column="atom_id")

In [None]:
# Keep only the rows whose atom ID is NOT part of the 1000-charter sample.
in_m_sample = contents_full_exploded["atom_id"].isin(m_charters_ids)
charters_reduced = contents_full_exploded[~in_m_sample]

In [None]:
# Row counts before and after removing the 1000-charter sample, one per line.
print(len(contents_full_exploded), len(charters_reduced), sep="\n")

In [None]:
# Stack the atom IDs of the reduced and the full frame on top of each other ...
df_diff = pd.concat(
    [charters_reduced["atom_id"], contents_full_exploded["atom_id"]]
)
# ... and drop every ID that occurs more than once (keep=False removes all copies),
# which leaves the IDs that are present in only one of the two frames.
df_diff_unique = df_diff.drop_duplicates(keep=False)
df_diff_clean = df_diff_unique.tolist()

# check if diff_frame == m_charters

In [None]:
# Compare the removed atom IDs with the IDs of the 1000-charter sample, ignoring order.
# list.sort() sorts in place and returns None, so comparing the results of two .sort()
# calls is always True; sorted() returns new lists that can actually be compared and
# leaves both original lists untouched.
sorted(m_charters_ids) == sorted(df_diff_clean)

# generate new sample with fixed seed

In [None]:
# Draw 4000 rows from the charters that are not in the 1000-charter sample.
new_sample = charters_reduced.sample(
    n=4000,
    random_state=50,  # fixed seed so that a re-run draws the same rows
)
new_sample

# extend atomids of 1000 charters by 4000 charters

In [None]:
# Atom IDs of the newly drawn rows.
new_sample_atomids = list(new_sample.atom_id)
# 1000-charter IDs first, followed by the new sample. Both operands are plain lists, so
# simple concatenation is enough and no itertools import is required.
atomids_5k = m_charters_ids + new_sample_atomids
print(len(atomids_5k))

# make urls

In [None]:
# Turn the atom IDs back into URLs with chatomid_to_url: once for the combined list and
# once for the newly drawn sample alone.
urls_5k = list(map(chatomid_to_url, atomids_5k))
urls_4k = list(map(chatomid_to_url, new_sample_atomids))

# export 

In [None]:
# Write each list to its own file, one value per line.
# Using "\n" as the csv delimiter makes a single writerow() call put every value on a
# separate line. The encoding is set explicitly (UTF-8, like the input CSV) so the output
# does not depend on the platform default.
sample_exports = {
    "../../data/out/samples/urls_5k.csv": urls_5k,
    "../../data/out/samples/urls_4k.csv": urls_4k,
    "../../data/out/samples/atomids_5k.csv": atomids_5k,
}

for export_path, export_values in sample_exports.items():
    with open(export_path, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file, delimiter="\n")
        writer.writerow(export_values)

# Misc

In [None]:
# Select the rows whose URL / atom ID contains "/HU-" in both samples.
# NOTE(review): the column label "URL " carries a trailing space - confirm it matches the sheet header.
m_is_HU = thousand_charter_sample["URL "].astype(str).str.contains("/HU-")
m_charters_HU = thousand_charter_sample[m_is_HU]

new_is_HU = new_sample["atom_id"].astype(str).str.contains("/HU-")
new_sample_HU = new_sample[new_is_HU]

# Share of /HU- charters in percent, rounded to whole numbers; the denominators are the
# nominal sample sizes (1000 and 4000).
m_HU_percent = round((len(m_charters_HU))/1000*100)
new_HU_percent = round((len(new_sample_HU))/4000*100)
print(f"In the 1k charters project, the amount of /HU- charters was {m_HU_percent}%; in the new one, it is {new_HU_percent}%.")