**TL;DR** In this notebook we use the `ajmc` library to export one test commentary from the canonical format to Alto/XML. The export currently works for ground-truth (GT) pages but fails for non-GT pages (run the code below to see the raised exception).

**NB**: [This PR](https://github.com/AjaxMultiCommentary/ajmc/pull/10) needs to be merged in order for this notebook to work. 

## Mount Drive and set PATHS 


In [2]:
import ipdb
import ajmc
from ajmc.commons import variables

In [None]:
# Mount Google Drive only when running on Google Colab. Outside Colab the
# `google.colab` package is not installed, so the mount is skipped instead of
# aborting a "Run All" with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/gdrive')

In [24]:
# ⚠️ change this according to your local settings
# (this cell only displays the current value of `variables.DRIVE_BASE_DIR`; it does not set it)
variables.DRIVE_BASE_DIR

PosixPath('/Users/matteo/Google Drive/My Drive/AjaxMultiCommentary')

In [4]:
# variables.PATHS['drive_base_dir'] = "/content/gdrive/MyDrive/AjaxMultiCommentary/data/commentaries/commentaries_data"

## Custom functions

In [18]:
import urllib.request, json, os

def extract_commentary_runs_from_config(config_file_url):
    """Map commentary IDs to run IDs as listed in a JSON config file.

    The config is fetched from ``config_file_url`` and must contain the lists
    ``data.eval`` and ``data.train``, whose entries carry an ``id`` and a
    ``run`` key.

    Args:
        config_file_url: URL of the JSON config file (anything accepted by
            ``urllib.request.urlopen``).

    Returns:
        A dict ``{commentary_id: run_id}``. Entries are visited in file order
        (``eval`` first, then ``train``), so the result is identical on every
        execution; if a commentary ID is listed with several runs, the last
        one wins.

    Raises:
        KeyError: If one of the expected keys is missing from the config.
        urllib.error.URLError: If the config cannot be fetched.
    """
    with urllib.request.urlopen(config_file_url) as response:
        config_dict = json.load(response)

    entries = config_dict['data']['eval'] + config_dict['data']['train']

    # Build the mapping directly rather than through a set of tuples: set
    # iteration order for strings changes between interpreter runs, which made
    # both the key order and the winning run of a duplicated ID unpredictable.
    return {entry['id']: entry['run'] for entry in entries}

In [19]:
import os
import pathlib
import shutil
from typing import List, Dict
from ajmc.commons.file_management import read_google_sheet
from ajmc.commons.variables import ALL_COMM_IDS, EXTERNAL_COMM_IDS, PD_COMM_IDS
from ajmc.commons.variables import REGION_TYPES_TO_SEGMONTO, SEGMONTO_TO_VALUE_IDS
from ajmc.text_processing.canonical_classes import CanonicalCommentary

def export_segmonto_alto(canonical_json_path : str, output_base_path : str, groundtruth_only : bool = False):
    """Export one canonical commentary to Alto/XML files plus page images.

    Under ``<output_base_path>/<commentary id>/`` this creates an ``alto``
    folder (filled by ``pages_to_alto``) and an ``images`` folder (filled by
    ``copy_page_images``), then zips them to ``alto_files.zip`` and
    ``images.zip`` in the same per-commentary folder.

    Args:
        canonical_json_path: Path to the canonical JSON of the commentary.
        output_base_path: Directory under which the per-commentary export
            folder is created (missing folders are created).
        groundtruth_only: If True, export only ``comm.ocr_gt_pages``;
            otherwise export every page in ``comm.children.pages``.
    """
    # Get the commentary
    comm = CanonicalCommentary.from_json(canonical_json_path)

    if groundtruth_only:
        # The ground-truth spreadsheet used to be downloaded here, but its
        # content never took part in the page selection, so that network
        # round-trip has been removed.
        # NOTE(review): the removed lookup read the `olr_gt` sheet while the
        # selection relies on `ocr_gt_pages` — confirm which ground-truth set
        # is the intended one for this export.
        selected_pages = comm.ocr_gt_pages
        print(f"There are {len(selected_pages)} GT pages in {comm.id}")
    else:
        selected_pages = list(comm.children.pages)

    # Create the per-commentary folder and its two sub-folders if missing
    commentary_export_path = pathlib.Path(output_base_path) / comm.id
    alto_export_path = commentary_export_path / "alto"
    images_export_path = commentary_export_path / "images"
    for directory in (commentary_export_path, alto_export_path, images_export_path):
        directory.mkdir(exist_ok=True, parents=True)

    # Dump the Alto/XML and the images of the selected pages, zipping each
    # folder once it is complete. `make_archive` documents its base name as a
    # string, hence the explicit conversion.
    pages_to_alto(selected_pages, alto_export_path)
    shutil.make_archive(str(commentary_export_path / "alto_files"), 'zip', alto_export_path)
    copy_page_images(selected_pages, images_export_path)
    shutil.make_archive(str(commentary_export_path / "images"), 'zip', images_export_path)

def copy_page_images(pages, target_base_dir):
    """Copy the PNG image of every page in `pages` into `target_base_dir`.

    For each page the source image is looked up as `<page id>.png` in the
    image folder of the page's commentary, i.e.
    `variables.COMMS_DATA_DIR / <commentary id> / variables.COMM_IMG_REL_DIR`.
    The copy keeps the same file name.
    """
    for page in pages:
        image_name = f"{page.id}.png"
        source_image = (
            variables.COMMS_DATA_DIR
            / page.parents.commentary.id
            / variables.COMM_IMG_REL_DIR
            / image_name
        )
        shutil.copy(source_image, os.path.join(target_base_dir, image_name))

def pages_to_alto(pages, output_path : str):
    """
    Write one Alto/XML file per page into the `output_path` directory.

    This custom function is needed because right now the `CanonicalCommentary.to_alto()`
    method does not allow for specifying a subset of the pages to export.

    Args:
        pages: Iterable of page objects exposing an `id` and a `to_alto()` method.
        output_path: Target directory, given as a string or a `pathlib.Path`.
            Each page is written to `<output_path>/<page id>.xml`.
    """
    # Coerce to a Path first: the `/` join below raises a TypeError on the
    # plain `str` that the signature advertises.
    output_dir = pathlib.Path(output_path)

    for page in pages:
        page.to_alto(
            children_types=['regions', 'lines', 'words'],
            region_types_mapping=REGION_TYPES_TO_SEGMONTO,
            region_types_ids=SEGMONTO_TO_VALUE_IDS,
            output_path=output_dir / (page.id + '.xml')
        )


def export_dataset(commentary_run_dict : Dict, dataset_path : pathlib.Path, groundtruth_only : bool = False):
    """Export every commentary listed in `commentary_run_dict`.

    `commentary_run_dict` maps a commentary ID to the run whose canonical JSON
    should be exported. For each entry the JSON path is rebuilt as
    `COMMS_DATA_DIR/<commentary id>/COMM_CANONICAL_V1_REL_DIR/<run>.json` and
    handed to `export_segmonto_alto` together with `dataset_path` and
    `groundtruth_only`.
    """
    for comm_id, run_id in list(commentary_run_dict.items()):
        # Location of the canonical JSON for this run of the commentary
        canonical_json_path = os.path.join(
            variables.COMMS_DATA_DIR,
            comm_id,
            variables.COMM_CANONICAL_V1_REL_DIR,
            f"{run_id}.json",
        )
        export_segmonto_alto(canonical_json_path, dataset_path, groundtruth_only=groundtruth_only)

## Fetch commentary runs

In [6]:
# Take the layoutlm config files for the omnibus experiments
# since these should/will contain all commentary and run IDs.
# Both values are URLs of raw JSON files on GitHub (`main` branch), not local paths.
omnibus_config_path = "https://raw.githubusercontent.com/AjaxMultiCommentary/ajmc/main/data/layoutlm/configs/4A_omnibus_base.json"
omnibus_external_config_path = "https://raw.githubusercontent.com/AjaxMultiCommentary/ajmc/main/data/layoutlm/configs/4B_omnibus_external.json"

In [8]:
# Map every commentary ID listed in the omnibus base config to its run ID
internal_commentary_runs = extract_commentary_runs_from_config(omnibus_config_path)

In [9]:
# Map every commentary ID listed in the omnibus external config to its run ID
external_commentary_runs = extract_commentary_runs_from_config(omnibus_external_config_path)

In [10]:
#internal_commentary_runs

In [63]:
#external_commentary_runs

## Export to Alto/XML

In [20]:
# ⚠️ change this according to your local settings
# Defaults to ~/Downloads/segmonto_dataset_export of whoever runs the notebook,
# rather than to one user's hard-coded home directory.
output_dir = pathlib.Path.home() / "Downloads" / "segmonto_dataset_export"

In [13]:
# Keep only the internal runs whose commentary ID is listed in PD_COMM_IDS
pd_internal_commentary_runs = dict(
    item
    for item in internal_commentary_runs.items()
    if item[0] in PD_COMM_IDS
)

Pick one sample commentary to test the Alto/XML export:

In [21]:
# Select the sample by commentary ID rather than by position: the order of
# `internal_commentary_runs` is not guaranteed to be the same on every run, so
# a positional pick could silently test a different commentary each time.
sample_comm_id = 'Untersteiner1934'
debugging_sample = {sample_comm_id: internal_commentary_runs[sample_comm_id]}

In [22]:
debugging_sample

{'Untersteiner1934': '28r0XU_tess_base'}

🟢 Export only GT pages (works):

In [17]:
# Export only the ground-truth pages of the sample commentary into `output_dir`
export_dataset(debugging_sample, output_dir, groundtruth_only=True)



There are 5 GT pages in Untersteiner1934


🔴 Export all pages, also non-GT ones (does not work):

In [23]:
import traceback

try:
    export_dataset(debugging_sample, output_dir, groundtruth_only=False)
except ValueError:
    # Known failure when non-GT pages are included
    # (`ValueError: min() arg is an empty sequence`). The traceback is still
    # printed so the problem stays visible, but a "Run All" no longer stops here.
    traceback.print_exc()



ValueError: min() arg is an empty sequence