# Retrieving XMLs from eScriptorium (eSc)
We retrieve the XML files of document parts from eScriptorium
in order to prepare them for processing with Passim.

# Notebook initialization

In [3]:
from functions import *
from packages import *

Get the XML files of the document parts from eScriptorium

In [4]:
# Primary key (pk) of the eScriptorium document whose parts will be exported
doc_pk = 4381
# Print the document's basic information; the returned value is displayed as the
# cell output (per the output below: the part count and the transcription levels)
get_basic_info(doc_pk)

get document segmentation ontology for document:  4381
https://msia.escriptorium.fr/api/documents/4381/
Document: 4381  with  64  parts
region types: [{'pk': 6912, 'name': 'Catchword'}, {'pk': 3, 'name': 'Commentary'}, {'pk': 6910, 'name': 'FooterCentral'}, {'pk': 6914, 'name': 'Handwritten'}, {'pk': 6908, 'name': 'Header'}, {'pk': 4, 'name': 'Illustration'}, {'pk': 2, 'name': 'Main'}, {'pk': 6909, 'name': 'MainCentral'}, {'pk': 6911, 'name': 'Margin'}, {'pk': 6913, 'name': 'RunningHeader'}, {'pk': 1, 'name': 'Title'}]
line types: [{'pk': 15289, 'name': 'Handwritten'}, {'pk': 15288, 'name': 'MainCentral'}, {'pk': 15287, 'name': 'NotMain'}]
transcription_level_list: [{'pk': 11159, 'name': 'Siddur_Ashkenaz_novoc_no_lbs_Daniel_pip_test_regexp_24/05/2024-09_04_53', 'archived': False, 'avg_confidence': 0.9847869869290219}, {'pk': 11160, 'name': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated_pip_test_regexp_24/05/2024-09_04_53', 'archived': False, 'avg_confidence': 0.9847869869290219}, {'pk

(64,
 [{'pk': 11159,
   'name': 'Siddur_Ashkenaz_novoc_no_lbs_Daniel_pip_test_regexp_24/05/2024-09_04_53',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 11160,
   'name': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated_pip_test_regexp_24/05/2024-09_04_53',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 11156,
   'name': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated_pip_test_regexp_24/05/2024-09_04_53',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 11158,
   'name': 'Siddur_Ashkenaz_clean_concatenated_pip_test_regexp_24/05/2024-09_04_53',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 11157,
   'name': 'MT_NoVoc_concatenated_pip_test_regexp_24/05/2024-09_04_53',
   'archived': False,
   'avg_confidence': 0.986672220664065},
  {'pk': 11133,
   'name': 'Siddur_Ashkenaz_novoc_no_lbs_Daniel_pip_test_regexp_22/05/2024-16_05_27',
   'archived': False,
   'avg_confidence': 0.984786986

In [6]:
# Get the region types and line types defined for the document
# (print_status=False is passed; the region types are printed explicitly below)
region_type_list, line_type_list = get_document_segmentation_ontology(doc_pk,print_status=False)
print(region_type_list)


# Choose the pk(s) of the region types to export.
# We want the regions {'pk': 6909, 'name': 'MainCentral'}
region_type_pk_list = [6909]
print(region_type_pk_list)

# Choose the transcription level (pk) to export.
# NOTE(review): pk 10754 is not among the transcription levels visible in the
# (truncated) get_basic_info output above — confirm it exists for this document.
tr_level_pk = 10754

[{'pk': 6912, 'name': 'Catchword'}, {'pk': 3, 'name': 'Commentary'}, {'pk': 6910, 'name': 'FooterCentral'}, {'pk': 6914, 'name': 'Handwritten'}, {'pk': 6908, 'name': 'Header'}, {'pk': 4, 'name': 'Illustration'}, {'pk': 2, 'name': 'Main'}, {'pk': 6909, 'name': 'MainCentral'}, {'pk': 6911, 'name': 'Margin'}, {'pk': 6913, 'name': 'RunningHeader'}, {'pk': 1, 'name': 'Title'}]
[6909]


In [7]:
def get_all_parts_infos(doc_pk, timeout=60):
    """
    Get all parts information from the eScriptorium API, handling pagination to retrieve all pages.

    The collected records are also saved as JSON to
    ``eSc_parts_infos/all_parts_infos.json`` (relative to the current working
    directory); the directory is created if needed and an existing file is
    overwritten.

    Parameters
    ----------
    doc_pk
        Primary key of the eScriptorium document (e.g. ``4381``); it is
        interpolated into the API URL.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get`` for every page, so a
        stalled connection raises instead of blocking the notebook forever.
        Pass ``None`` to wait indefinitely.

    Returns
    -------
    list
        The concatenation of the ``results`` entries of every page, in the
        order the pages were returned.

    Raises
    ------
    requests.HTTPError
        If any page request answers with an error status
        (via ``response.raise_for_status()``).
    """
    url = f"https://msia.escriptorium.fr/api/documents/{doc_pk}/parts/"
    all_parts_infos = []

    # Follow the 'next' links until the API reports no further page.
    # `headers` is a module-level name that is not defined in this cell.
    while url:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        all_parts_infos.extend(data['results'])

        # Update the URL to the next page, or set it to None if there are no more pages
        url = data.get('next')

    # Save the parts information in a JSON file.
    # exist_ok=True creates the directory only when it is missing, without the
    # separate exists() check that could race with another process.
    all_parts_infos_path = 'eSc_parts_infos'
    os.makedirs(all_parts_infos_path, exist_ok=True)
    output_file = os.path.join(all_parts_infos_path, 'all_parts_infos.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_parts_infos, f)

    return all_parts_infos

In [8]:
# Fetch the information records of every part of the document
all_parts = get_all_parts_infos(doc_pk)

# Keep only the primary key of each part
part_pk_list = [part_info['pk'] for part_info in all_parts]

print(f"{len(part_pk_list)} parts found")

64 parts found


In [9]:
# Request an ALTO XML export of all the document's parts from eScriptorium for the
# chosen transcription level and region types, passing include_undefined,
# include_orphan and include_images as False; print_status=True prints the
# HTTP status code and response body shown below.
export_xml(doc_pk,part_pk_list,tr_level_pk,region_type_pk_list,include_undefined = False, include_orphan = False, file_format = 'alto',include_images = False, print_status = True)


200
b'{"status":"ok"}'


<Response [200]>

In [10]:
def export_xml_2(doc_pk, part_pk_list, tr_level_pk, region_type_pk_list, include_undefined=True, include_orphan=True, file_format='alto', include_images=False, print_status=True):
    """
    Send an export request for document parts to the eScriptorium API.

    POSTs to ``{root_url}/api/documents/{doc_pk}/export/`` with the selected
    parts, transcription level and region types.

    NOTE(review): this looks like a notebook-local variant of the ``export_xml``
    helper called earlier in the notebook — confirm which one should be kept.

    Parameters
    ----------
    doc_pk
        Primary key of the document; interpolated into the export URL.
    part_pk_list : list
        Primary keys of the parts to export (sent as ``parts``).
    tr_level_pk
        Primary key of the transcription level (sent as ``transcription``).
    region_type_pk_list : list or None
        Primary keys of the region types to export. The list is never
        modified; ``None`` is treated as an empty list.
    include_undefined : bool
        When true, the entry ``'Undefined'`` is added to the region types.
    include_orphan : bool
        When true, the entry ``'Orphan'`` is added to the region types.
    file_format : str
        Sent as ``file_format`` (default ``'alto'``).
    include_images : bool
        Sent as ``include_images``.
    print_status : bool
        When true, print the response status code and raw content.

    Returns
    -------
    The response object returned by ``requests.post``.
    """
    # e.g. https://escriptorium.openiti.org/api/documents/3221/export/
    export_url = f"{root_url}/api/documents/{doc_pk}/export/"

    # Work on a copy so the caller's list is left untouched, and use append():
    # `list += 'Undefined'` would extend the list with the single characters
    # 'U', 'n', 'd', ... instead of adding one 'Undefined' entry.
    region_types = list(region_type_pk_list) if region_type_pk_list is not None else []
    if include_undefined:
        region_types.append('Undefined')
    if include_orphan:
        region_types.append('Orphan')

    data = {'parts': part_pk_list, 'transcription': tr_level_pk, 'task': 'export',
            'region_types': region_types, 'include_images': include_images, 'file_format': file_format}
    # e.g. {"parts": [755434], "transcription": 5631, "task": "export", "region_types": [2,'Undefined','Orphan'], "include_images" : False, "file_format": "alto"}
    # `headersbrief` is a module-level name that is not defined in this cell.
    res = requests.post(export_url, data=data, headers=headersbrief)
    if print_status:
        print(res.status_code)
        print(res.content)
    return res

In [None]:
# Send the same export request through the notebook-local export_xml_2,
# with include_undefined and include_orphan both False
export_xml_2(doc_pk,part_pk_list,tr_level_pk,region_type_pk_list,include_undefined = False, include_orphan = False, file_format = 'alto',include_images = False, print_status = True)

--------
