# General

TODO
- add double unnest of xpath result
- fix order

## Imports

In [None]:
from lxml import etree
from pathlib import Path
from pathlib import PurePosixPath
from datetime import datetime
import os
import pandas as pd
import numpy as np
import pickle
import re
from pprint import pprint
from random import sample
from tqdm import tqdm

## Parameters

In [None]:
# Show up to 100 characters per cell when DataFrames are displayed.
pd.options.display.max_colwidth = 100

In [None]:
# Namespace prefixes that the XPath queries may use.
namespaces = {
    "atom": "http://www.w3.org/2005/Atom",
    "cei": "http://www.monasterium.net/NS/cei",
}
# Root directory that is searched for charter files.
directoryPath = "../../data/db/mom-data/metadata.charter.public"
#directoryPath = "../../data/in/selected_charters/"
# Only files whose names end with this suffix are picked up.
fileExtension = ".cei.xml"

# Load the pickled CEI-to-CSV mapping

In [None]:
# Deserialize the pickled mapping and print it for inspection.
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source.
mapping = pickle.loads(Path("../../data/mapping/CEI2CSV.pkl").read_bytes())

pprint(mapping)

# Scan directory and create paths

In [None]:
def get_file_paths(directory, extension=None):
    """Recursively yield the paths of all matching files below ``directory``.

    Parameters
    ----------
    directory : str or os.PathLike
        Directory to scan; subdirectories are searched as well.
    extension : str, optional
        Only files whose names end with this suffix are yielded. Defaults to
        the module-level ``fileExtension`` setting.

    Yields
    ------
    pathlib.Path
        One path per matching file, in name order within each directory.
    """
    suffix = fileExtension if extension is None else extension
    # Close the scandir iterator immediately (it holds an OS directory handle)
    # and sort the entries so the result order is reproducible across runs.
    with os.scandir(directory) as scan:
        entries = sorted(scan, key=lambda entry: entry.name)
    for entry in entries:
        if entry.is_file() and entry.name.endswith(suffix):
            yield Path(entry.path)
        elif entry.is_dir():
            yield from get_file_paths(entry.path, suffix)

In [None]:
# Render every discovered file path as a string via PurePosixPath.
paths = [str(PurePosixPath(found)) for found in get_file_paths(directoryPath)]
#paths = sample(paths, 1000)

# Create lists, run the XPath queries, and fill the lists

In [None]:
def get_xpath_result(xpath_mapping):
    """Evaluate ``xpath_mapping`` against the module-level ``tree`` and return the result.

    The query runs with the module-level ``namespaces`` prefix map and with
    ``smart_strings`` switched off.
    """
    query_options = {"namespaces": namespaces, "smart_strings": False}
    return tree.xpath(xpath_mapping, **query_options)

In [None]:
# Names of the result columns; each one gets its own result list below.
lists = [
    "atom_id",
    "cei_abstract_joined",
    "cei_abstract_foreign",
    "cei_tenor",
    "cei_pTenor",
    "cei_placeName",
    "cei_lang_MOM",
    "cei_date",
    "cei_dateRange",
    "cei_date_ATTRIBUTE_value",
    "cei_dateRange_ATTRIBUTE_from",
    "cei_dateRange_ATTRIBUTE_to",
    "cei_graphic_ATTRIBUTE_url_orig",
    "cei_graphic_ATTRIBUTE_url_copy",
]
# One independent, initially empty list per column name.
(
    atom_id,
    cei_abstract_joined,
    cei_abstract_foreign,
    cei_tenor,
    cei_pTenor,
    cei_placeName,
    cei_lang_MOM,
    cei_date,
    cei_dateRange,
    cei_date_ATTRIBUTE_value,
    cei_dateRange_ATTRIBUTE_from,
    cei_dateRange_ATTRIBUTE_to,
    cei_graphic_ATTRIBUTE_url_orig,
    cei_graphic_ATTRIBUTE_url_copy,
) = ([] for _ in lists)

In [None]:
for file in tqdm(paths):
    with open(file, "r", encoding="utf-8") as f:
        # Parse one file; get_xpath_result() queries the module-level ``tree``.
        tree = etree.parse(f)

        # --- Identifier ---
        atom_id.append(
            get_xpath_result(f"{mapping['atom:id']}/text()")
        )

        # --- Abstract ---
        # Text children of the abstract and of its descendant nodes, except for
        # nodes named cei:sup, concatenated into a single string.
        cei_abstract_joined.append(
            "".join(
                get_xpath_result(
                    f"{mapping['cei:abstract']}/descendant-or-self::node()[not(name()='cei:sup')]/text()"
                )
            )
        )
        cei_abstract_foreign.append(
            get_xpath_result(f"{mapping['cei:foreign']}/text()")
        )

        # --- Tenor ---
        #cei_tenor_joined.append("".join(get_xpath_result(f"{mapping['cei:tenor']}/descendant-or-self::node()[not(name()='cei:sup')]/text()")))
        # Text children of the tenor and of all its descendant elements, concatenated.
        # Original note: "TODO: change to ::*" (the query below already uses ::*).
        cei_tenor.append(
            "".join(
                get_xpath_result(
                    f"{mapping['cei:tenor']}/descendant-or-self::*/text()"
                )
            )
        )
        # NOTE(review): the predicate is applied to text nodes, which are never
        # cei:sup elements, so it probably filters nothing - confirm the intent.
        cei_pTenor.append(
            get_xpath_result(f"{mapping['cei:pTenor']}/text()[not(self::cei:sup)]")
        )

        # --- Place and language ---
        cei_placeName.append(
            get_xpath_result(f"{mapping['cei:issued/cei:placeName']}/text()")
        )
        cei_lang_MOM.append(
            get_xpath_result(f"{mapping['cei:lang_MOM']}/text()")
        )

        # --- Dates: element text first, then the attribute values ---
        cei_date.append(
            get_xpath_result(f"{mapping['cei:date']}/text()")
        )
        cei_dateRange.append(
            get_xpath_result(f"{mapping['cei:dateRange']}/text()")
        )
        cei_date_ATTRIBUTE_value.append(
            get_xpath_result(f"{mapping['cei:date']}/@value")
        )
        cei_dateRange_ATTRIBUTE_from.append(
            get_xpath_result(f"{mapping['cei:dateRange']}/@from")
        )
        cei_dateRange_ATTRIBUTE_to.append(
            get_xpath_result(f"{mapping['cei:dateRange']}/@to")
        )

        # --- Image URLs ---
        cei_graphic_ATTRIBUTE_url_orig.append(
            get_xpath_result(f"{mapping['@url']}")
        )
        cei_graphic_ATTRIBUTE_url_copy.append(
            get_xpath_result(f"{mapping['cei:graphic/@url']}")
        )

### Parse only the Atom IDs

In [None]:
# Start from an empty list so that the loop in the next cell collects the Atom IDs afresh.
atom_id = []

In [None]:
for file in tqdm(paths):
    with open(file, "r", encoding="utf-8") as f:
        # Parse the file; get_xpath_result() queries the module-level ``tree``.
        tree = etree.parse(f)
        # Keep the complete query result for this file as one entry.
        id_texts = get_xpath_result(f"{mapping['atom:id']}/text()")
        atom_id.append(id_texts)

In [None]:
# Display the collected results (one entry per parsed file).
atom_id

In [None]:
from itertools import chain

# Flatten the per-file results into a single list and display it.
atom_id_list = list(chain(*atom_id))
atom_id_list

In [None]:
# Write every collected Atom ID on its own line.
# Each entry of ``atom_id`` is the XPath result list for one file, so the
# entries are flattened here; writing an entry directly would emit its list
# representation (e.g. "['...']") instead of the bare ID.
# The encoding is set explicitly so the output does not depend on the platform default.
with open("../../data_windows/out/atomids.txt", mode="w", encoding="utf-8") as f:
    f.writelines(f"{single_id}\n" for ids in atom_id for single_id in ids)

In [None]:
# NOTE(review): this cell looks like an unfinished scratch snippet - `python_obj` is not
# defined anywhere in the visible notebook and `json` is only imported in a later cell,
# so running it as-is would presumably raise a NameError. Confirm what should be dumped.
with open('data.json', 'w') as file:
    json.dump(python_obj, file)

In [None]:
import json
# NOTE(review): the next line is an incomplete `with` statement (no `as` target, colon or
# body) and is a syntax error as written; the intended file is unclear - TODO complete or remove.
with open("../../data_windows/out/atomid")
# NOTE(review): json.dumps() here serialises the path string itself, not the contents of a file.
json.dumps("../../data/test.json")

# Merge the lists into a DataFrame

In [None]:
# Map each positional column index to its column name.
column_dict = dict(enumerate(lists))

# Combine the per-file results row by row, then label the columns.
contents = list(
    zip(
        atom_id,
        cei_abstract_joined,
        cei_abstract_foreign,
        cei_tenor,
        cei_pTenor,
        cei_placeName,
        cei_lang_MOM,
        cei_date,
        cei_dateRange,
        cei_date_ATTRIBUTE_value,
        cei_dateRange_ATTRIBUTE_from,
        cei_dateRange_ATTRIBUTE_to,
        cei_graphic_ATTRIBUTE_url_orig,
        cei_graphic_ATTRIBUTE_url_copy,
    )
)
contents_full = pd.DataFrame(contents).rename(columns=column_dict)

In [None]:
# Display the assembled DataFrame.
contents_full

In [None]:
def save_df_to_nljson(df, location, timemarker, chunk_size=30000):
    """Write ``df`` to ``location`` as newline-delimited JSON files in row chunks.

    Files are named ``df_name_{timemarker}_{i}.json``, where ``i`` counts the
    chunks from 0. An empty ``df`` produces no files.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export.
    location : str or os.PathLike
        Output directory; it is created if it does not exist yet.
    timemarker : str
        Label that is embedded in every file name.
    chunk_size : int, default 30000
        Maximum number of rows per file.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    out_dir = Path(location)
    # Create the target directory up front so the export does not fail when it is missing.
    out_dir.mkdir(parents=True, exist_ok=True)
    for i, start in enumerate(range(0, len(df), chunk_size)):
        # .iloc slices rows by position, independent of the index type or labels.
        chunk = df.iloc[start:start + chunk_size]
        chunk.to_json(out_dir / f"df_name_{timemarker}_{i}.json", orient="records", lines=True)

# Export whole dataframe

In [None]:
# Target directory for the chunked export.
location = "../../data/out/chunked_full/"
# Timestamp of the current local time, down to the minute.
timemarker = datetime.now().strftime("%Y-%m-%d-%H%M")

In [None]:
#save_df_to_csv(contents_full, location=location, timemarker=timemarker)

In [None]:
# Export the full table as chunked newline-delimited JSON files.
save_df_to_nljson(contents_full, location, timemarker)

In [None]:
#contents_full.to_json(f"../../data/output/charters_full_{timemarker}.json")

---

# Import generated data as dataframe

In [None]:
#df = pd.read_json("../../data/output/charters_full_2022-11-22-1044.json")

In [None]:
# contents_full_json.sample(n=10)

In [None]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     print(contents_full_parquet.iloc[1].to_string)

# Misc