# Import Passim results into eScriptorium
These scripts creates xml altos from Passim alignment results, for import into eScriptorium.


### Importing Required Libraries

In [20]:
import os
import json
import re
from bs4 import BeautifulSoup
from pprint import pprint

### Loading JSON data into a list of dictionaries from a JSONLines file

In [21]:
# path to the directory containing the passim output
path_passim_output = 'json_from_passim/out_n7_docwise/out.json'

In [22]:
# List to store data from all JSON files
out_passim_list = []

# Loop through each file in the directory
for file in os.listdir(path_passim_output):
    if file.endswith(".json"):
        file_path = os.path.join(path_passim_output, file)
        # Open the JSON file and load its content as a list of dictionaries
        with open(file_path, 'r', encoding="utf-8") as json_file:
            data = [json.loads(line) for line in json_file]
            out_passim_list.extend(data)


In [5]:
# Checking some content

print(out_passim_list[0]['lines'][0]['wits'][0]['alg'])
print('len', len(out_passim_list[0]['lines'][0]['wits'][0]['alg']))
print(out_passim_list[0]['lines'][0]['wits'][0]['alg2'])
print('len', len(out_passim_list[0]['lines'][0]['wits'][0]['alg2']))

KeyError: 'wits'

In [26]:
print(f"Number of blocks to be processed:{len(out_passim_list)}")

Number of blocks to be processed:61


### Splitting the aligned GT text portions into lines corresponding to OCR lines
Passim found alignments (between OCR and GT) at the textblock level.
Now we have to split the selected GT, in sections that correspond to the OCR lines.
This can be done by using the 'alg' and 'alg2' fields in passim output, containing the OCR and GT alignments, respectively.
Those fiels contain the exact same character numbers.
The special caracters '🍺' inserted in the OCR text, during the concatenation of the OCR lines
will be used to split the GT text in sections, as they give the position where the GT text should be split.

The splitted GT alignments will be stored in dictionaries in the lines_dict_with_alg_GT

In [24]:
pprint(out_passim_list)

[{'id': 'eSc_textblock_bae3dd88_IE87234800_00004',
  'lines': [{'begin': 0, 'text': 'ישר) נגף בגשת בני ישרא אל הקדש: ג5\n'},
            {'begin': 35,
             'text': 'אתה הוא יי אלהינו שהקטירו אבותינו את קטרת\n',
             'wits': [{'alg': 'אתה הוא יהוה אלהינו שהקטירו אבותינו לפניך את '
                              'קטרת ',
                       'alg2': 'אתה הוא י--י אלהינו שהקטירו אבותינו ------את '
                               'קטרת\n',
                       'begin': 35761,
                       'id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt',
                       'matches': 41,
                       'ref': '1',
                       'series': 'GT',
                       'text': 'אתה הוא יהוה אלהינו שהקטירו אבותינו לפניך את '
                               'קטרת '},
                      {'alg': 'אתה הוא יהוה אלהינו שהקטירו אבותינו לפניך את '
                              'קטרת ',
                       'alg2': 'אתה הוא י--י אלהינו שהקטירו אבותינו---

In [25]:
# list of GTs found in alignments:

GT_ids = list(set([wit['id']
          for textblock in out_passim_list
          for line in textblock['lines']
          for wit in line.get('wits', [])]))
print(GT_ids)


['Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt', 'Siddur_Ashkenaz_clean_concatenated.txt', 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt', 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt', 'MT_NoVoc_concatenated.txt']


In [32]:
# list of GTs found in alignments:
GT_ids = list(set([wit['id']
          for textblock in out_passim_list
          for line in textblock['lines']
          for wit in line.get('wits', [])]))
print('list of GTs found in alignments:', GT_ids)

# Iterate on GT_ids
for GT_id in GT_ids:
    print(f"--- Processing of GT {GT_id} ---")

    # Load dictionary containing OCR line-splitting information
    with open('ocr_lines_dict/ocr_lines_dict.json', 'r', encoding="utf-8") as json_file:
        ocr_lines_dict = json.load(json_file)
    
    # Iterate over out_passim_list dictionaries
    for textblock in out_passim_list:

        # Extract the filename and textblock_id
        textblock_id = re.sub(r'.*(eSc_textblock_[a-f0-9]+).*', r'\1', textblock['id'])
        filename = re.sub(r'.*' + textblock_id + '_(.*)', r'\1', textblock['id'])

        for line in textblock['lines']:
            begin_index = line['begin']
            # Check if the current GT_id is present in the wits of the line
            for wit in line.get('wits', []):
                if wit['id'] == GT_id:
                    alg_text = wit['alg']
                    # clean the alignment text
                    # Replace '-' (45) with '', but avoid empty lines
                    alg_text = alg_text.replace('-', '') if alg_text.replace('-', '') else alg_text
                    # Replace '-' (8208) with '-' (45)
                    alg_text = alg_text.replace(chr(8208), '-')
                    # Remove leading and trailing spaces, but avoid empty lines
                    alg_text = alg_text.strip() if alg_text.strip() else alg_text        


                    # Find the corresponding line in the OCR dictionary
                    for part in ocr_lines_dict:
                        if part['filename'] == filename:
                            for block in part['ocr_blocks']:
                                if block['text_block_id'] == textblock_id:
                                    for ocr_line in block['ocr_lines']:
                                        if ocr_line['start'] == begin_index:
                                            # Update the OCR line with the GT alignment text
                                            ocr_line['alg_GT'] = alg_text
                                            ocr_line['GT_id'] = GT_id
                                            break  # No need to continue searching once found

    # Save a JSON file by GT_id
    directory = 'lines_dict_with_alg_GT'
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f'lines_dict_with_alg_{GT_id}.json')
    with open(file_path, 'w', encoding="utf-8") as json_file:
        json.dump(ocr_lines_dict, json_file, ensure_ascii=False, indent=4)


list of GTs found in alignments: ['Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt', 'Siddur_Ashkenaz_clean_concatenated.txt', 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt', 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt', 'MT_NoVoc_concatenated.txt']
--- Processing of GT Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt ---
--- Processing of GT Siddur_Ashkenaz_clean_concatenated.txt ---
--- Processing of GT Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt ---
--- Processing of GT Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt ---
--- Processing of GT MT_NoVoc_concatenated.txt ---


### Parsing XML files and updating text content with GT alignments

In [33]:
# Path to the directory containing the alto files from eScriptorium
path_xmls = 'xmls_from_eSc'

# Path to the output directory, where the new alto files (and zip) will be saved
path_alto = 'altos_for_eSc_linewise'

# Path to directory containing the dictionaries with the alignment of the GT for each OCR line
path_alg_dicts = 'lines_dict_with_alg_GT'

In [34]:
import zipfile
from bs4 import BeautifulSoup
import Levenshtein

def process_alignment(path_alg_dicts, path_alto, path_xmls, levenshtein_threshold):
    ''' Process the alignment of the GT on the OCR text lines and save the modified XML files in a ZIP archive
    The ZIP archive will be sent to eScriptorium.
    Each alignment candidate with a Levenshtein similarity ratio above the threshold will be considered as a validated alignment,
    and inserted in the XML file.

    Parameters:
    path_alg_dicts (str): Path to the directory containing the dictionaries with the alignment of the different GT, for each OCR line
    path_alto (str): Path to the output directory, where the new alto files (and zip) will be saved
    path_xmls (str): Path to the directory containing the alto files from eScriptorium
    '''
    alignment_register = []  # list of XML files containing an alignment, number of aligned lines and the id of the GT  in this file.

    # Load each JSON file in the lines_dict_with_alg_GT directory
    for file in os.listdir(path_alg_dicts):
        if file.endswith('.json'):
            with open(os.path.join(path_alg_dicts, file), 'r', encoding="utf-8") as json_file:
                lines_dict = json.load(json_file)
            # retrieve the id2 (GT name) from the filename
            id2 = re.sub(r'lines_dict_with_alg_(.*).json', r'\1', file)
            print(f"--- File {file} loaded with id2 = {id2}")

            # Create a folder to store XML files processed for this id2 value
            # These altos files will be compressed into a ZIP file and sent to eScriptorium
            output_folder = os.path.join(path_alto, id2)
            os.makedirs(output_folder, exist_ok=True)

            # List the xmls files containing GT alignments for the current GT
            xml_files_with_alg = [part['filename'] for part in lines_dict
                      for block in part['ocr_blocks']
                      for line in block['ocr_lines']
                      if line.get('alg_GT') and line['GT_id'] == id2]

            print(f"List of XML files containing alignments for {id2}: {xml_files_with_alg}")

            # Loop through all the raw XML files imported from eScriptorium
            for filename in os.listdir(path_xmls):
                if filename.endswith('.xml'):
                    if os.path.splitext(filename)[0] not in xml_files_with_alg:
                        # print(f"Skipping {filename} as it does not contain any alignment for {id2}")
                        continue
                    print(f"Processing {filename} as it contains alignments for {id2}")

                    line_count = 0 # count the number of lines with validated alignment
                    with open(os.path.join(path_xmls, filename), encoding="utf-8") as xml_file:
                        xml = xml_file.read()
                    soup = BeautifulSoup(xml, 'xml')

                    for text_line in soup.find_all('TextLine'):
                        text_line_id = text_line.get('ID')

                        for part in lines_dict:
                            for block in part['ocr_blocks']:
                                for line in block['ocr_lines']:
                                    if line.get('line_id') == text_line_id:
                                        alg_GT_value = line.get('alg_GT', None)  # get GT alignment if available
                                        if alg_GT_value is not None:
                                            string_elements = text_line.find_all('String')
                                            if string_elements:
                                                levenshtein_ratio = Levenshtein.ratio(alg_GT_value, string_elements[0]['CONTENT'])
                                                print(f"Levenshtein ratio: {levenshtein_ratio}")
                                                if levenshtein_ratio >= levenshtein_threshold:
                                                    string_elements[0]['CONTENT'] = alg_GT_value
                                                    line_count += 1

                                                else:
                                                    string_elements[0]['CONTENT'] = ''
                                        else:
                                            string_elements = text_line.find_all('String')
                                            if string_elements:
                                                string_elements[0]['CONTENT'] = ''

                    # Add alignment data to register
                    alignment_register.append({
                        'filename': filename,
                        'aligned_lines_count': line_count,
                        'GT_id': id2
                    })

                    # Write the modified XML file to the output folder corresponding to the id2 value (GT)
                    output_file_path = os.path.join(output_folder, filename)
                    with open(output_file_path, 'w', encoding="utf-8") as output_file:
                        output_file.write(str(soup))
                        print(f"{filename} processed and recorded in {output_folder}")

        # Create a specific name for the ZIP archive based on the id2 value
        zip_file_name = f"{id2}_alignment.zip"
        zip_file_path = os.path.join(path_alto, zip_file_name)

        # Create a zip file of the XML files in the output folder
        with zipfile.ZipFile(zip_file_path, 'w') as zipf:
            for root, _, files in os.walk(output_folder):
                for file in files:
                    zipf.write(os.path.join(root, file), arcname=file)

        print(f"XML files in {output_folder} compressed in {zip_file_path}")

    # Save alignment register in JSON format
    output_folder = 'alignment_register/'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    register_file_path = os.path.join(output_folder, 'alignment_register.json')
    with open(register_file_path, 'w', encoding="utf-8") as json_file:
        json.dump(alignment_register, json_file, ensure_ascii=False, indent=4)

    return alignment_register

In [11]:
# # Run the process_alignment function in parallel

# from multiprocessing import Pool

# # Define a function to process alignment for a single set of arguments
# def process_alignment_single(args):
#     path_alg_dicts, path_alto, path_xmls = args
#     return process_alignment(path_alg_dicts, path_alto, path_xmls)

# # Define the arguments for each process
# arguments = [(path_alg_dicts, path_alto, path_xmls) for _ in range(4)]

# # Use multiprocessing.Pool to process alignments in parallel
# with Pool(4) as p:
#     alignment_register = p.map(process_alignment_single, arguments)


In [35]:
# Run the process_alignment function
process_alignment(path_alg_dicts, path_alto, path_xmls, levenshtein_threshold=0.8)


--- File lines_dict_with_alg_Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt.json loaded with id2 = Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt
List of XML files containing alignments for Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt: ['IE34120895_00033', 'IE34120895_00033', 'IE34120895_00033', 'IE34120895_00033', 'IE34120895_00033', 'IE35481905_00027', 'IE35481905_00027', 'IE35481905_00027', 'IE35481905_00027', 'IE35481905_00027', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 'IE87234800_00004', 

[{'filename': 'IE34120895_00033.xml',
  'aligned_lines_count': 0,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE35481905_00027.xml',
  'aligned_lines_count': 0,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE61220167_00084.xml',
  'aligned_lines_count': 9,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87234800_00004.xml',
  'aligned_lines_count': 17,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87234800_00005.xml',
  'aligned_lines_count': 22,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87297122_00014.xml',
  'aligned_lines_count': 2,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87363222_00008.xml',
  'aligned_lines_count': 13,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87447950_00011.xml',
  'aligned_lin

### Importing altos in eScriptorium

In [36]:
# Importing the necessary functions and packages for the eScriptorium API
from functions import *
from packages import *

switching to  msIA


In [37]:
# eScriptorium document where the alignment will be imported
doc_pk = 4381

In [38]:
# ZIP and import each file in the folder
for zip_filename in os.listdir(path_alto):
    if zip_filename.endswith('.zip'):
        # Build the full path to the ZIP file
        zip_file_path = os.path.join(path_alto, zip_filename)
        
        # name of the alignment file
        name = zip_filename.split(".")[0]+"pipeline_lev_r_08"
        

        # Import in eScriptorium
        import_xml(doc_pk, path_alto, zip_filename, name)
        
        print(f"{zip_filename} has been successfully imported into eScriptorium.")

print(f"Link do the document in eScriptorium: https://msia.escriptorium.fr/document/{doc_pk}/images/")

200 b'{"status":"ok"}'
Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
MT_NoVoc_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Siddur_Ashkenaz_clean_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt_alignment.zip has been successfully imported into eScriptorium.
Link do the document in eScriptorium: https://msia.escriptorium.fr/document/4381/images/


In [39]:
# delete transcriptions layers
doc_pk = 4381
get_basic_info(doc_pk)

get document segmentation ontology for document:  4381
https://msia.escriptorium.fr/api/documents/4381/
Document: 4381  with  64  parts
region types: [{'pk': 6912, 'name': 'Catchword'}, {'pk': 3, 'name': 'Commentary'}, {'pk': 6910, 'name': 'FooterCentral'}, {'pk': 6914, 'name': 'Handwritten'}, {'pk': 6908, 'name': 'Header'}, {'pk': 4, 'name': 'Illustration'}, {'pk': 2, 'name': 'Main'}, {'pk': 6909, 'name': 'MainCentral'}, {'pk': 6911, 'name': 'Margin'}, {'pk': 6913, 'name': 'RunningHeader'}, {'pk': 1, 'name': 'Title'}]
line types: [{'pk': 15289, 'name': 'Handwritten'}, {'pk': 15288, 'name': 'MainCentral'}, {'pk': 15287, 'name': 'NotMain'}]
transcription_level_list: [{'pk': 10801, 'name': 'Sid_Ashk_Daniel_08_7_600', 'archived': False, 'avg_confidence': None}, {'pk': 10800, 'name': 'Siddur_Ashkenaz_novoc_no_lbs_Danielpipeline_lev_r_08', 'archived': False, 'avg_confidence': 0.9847869869290219}, {'pk': 10799, 'name': 'Siddur_Ashkenaz_clean_concatenatedpipeline_lev_r_08', 'archived': False,

(64,
 [{'pk': 10801,
   'name': 'Sid_Ashk_Daniel_08_7_600',
   'archived': False,
   'avg_confidence': None},
  {'pk': 10800,
   'name': 'Siddur_Ashkenaz_novoc_no_lbs_Danielpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10799,
   'name': 'Siddur_Ashkenaz_clean_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10797,
   'name': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10798,
   'name': 'MT_NoVoc_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.986672220664065},
  {'pk': 10796,
   'name': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10765,
   'name': 'MT_NoVoc_pipeline_lev_ratio_08',
   'archived': True,
   'avg_confidence': 0.9790699318765446},
  {'pk': 10766,
   'na

In [40]:
part_pk_list = get_part_pk_list(doc_pk)

In [18]:
for part_pk in part_pk_list:
    # delete_part_transcription(doc_pk,part_pk,tr_level=10740)

# Exploring the alignment register with pandas

In [41]:
import pandas as pd
df = pd.read_json('alignment_register/alignment_register.json')

In [42]:
df.head()

Unnamed: 0,filename,aligned_lines_count,GT_id
0,IE34120895_00033.xml,0,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
1,IE35481905_00027.xml,0,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
2,IE61220167_00084.xml,9,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
3,IE87234800_00004.xml,17,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
4,IE87234800_00005.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...


In [43]:
# display le lines where GT_ID is 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated'
df[df['GT_id'] == 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated']

# sort by number of aligned lines
df.sort_values(by='aligned_lines_count', ascending=False)

# display files with more than 5 lines aligned
df[df['aligned_lines_count'] > 5]

Unnamed: 0,filename,aligned_lines_count,GT_id
2,IE61220167_00084.xml,9,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
3,IE87234800_00004.xml,17,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
4,IE87234800_00005.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
6,IE87363222_00008.xml,13,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
7,IE87447950_00011.xml,12,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
...,...,...,...
200,IE87752740_00022.xml,13,Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt
201,IE87752740_00038.xml,10,Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt
202,IE87755510_00024.xml,16,Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt
204,IE87555665_00021.xml,6,Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt


In [44]:
# display GT_id values
df['GT_id'].unique()

array(['Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt',
       'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt',
       'MT_NoVoc_concatenated.txt',
       'Siddur_Ashkenaz_clean_concatenated.txt',
       'Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt'], dtype=object)

In [45]:

all_parts = get_all_parts(doc_pk)
print(all_parts)

[{'pk': 734735, 'name': '', 'filename': 'IE103402206_00010.jpg', 'title': 'Element 1', 'typology': None, 'image': {'uri': '/media/documents/4381/IE103402206_00010.jpg', 'size': [1702, 2511], 'thumbnails': {'card': '/media/documents/4381/IE103402206_00010.jpg.180x180_q85_crop-smart.jpg', 'large': '/media/documents/4381/IE103402206_00010.jpg.1000x1000_q85.jpg'}}, 'image_file_size': 285936, 'original_filename': 'IE103402206_00010.jpg', 'bw_image': None, 'workflow': {'convert': 'done', 'segment': 'done', 'align': 'done'}, 'order': 0, 'recoverable': False, 'transcription_progress': 100, 'source': 'mets//export_doc4367_test4matthieu_alto_202404261220.zip/IE103402206_00010.jpg', 'max_avg_confidence': 0.9849466164969554, 'comments': 'tanach'}, {'pk': 734736, 'name': '', 'filename': 'IE103402206_00014.jpg', 'title': 'Element 2', 'typology': None, 'image': {'uri': '/media/documents/4381/IE103402206_00014.jpg', 'size': [1709, 2509], 'thumbnails': {'card': '/media/documents/4381/IE103402206_00014.

In [46]:
part_pk = next((item['pk'] for item in all_parts if item['filename'] == 'IE87532920_00033.jpg'), None)
print(part_pk)
print(f"https://msia.escriptorium.fr/document/{doc_pk}/part/{part_pk}/edit")

None
https://msia.escriptorium.fr/document/4381/part/None/edit


In [48]:
# display files where GT_ID is 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated' and aligned_lines_count > 5
df[(df['GT_id'] == 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt') & (df['aligned_lines_count'] > 1)].sort_values(by='aligned_lines_count', ascending=False)


Unnamed: 0,filename,aligned_lines_count,GT_id
11,IE87474895_00044.xml,27,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
19,IE87582245_00015.xml,23,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
8,IE87447950_00015.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
35,IE87744435_00039.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
26,IE87708411_00021.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
4,IE87234800_00005.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
34,IE87744435_00015.xml,21,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
16,IE87580382_00041.xml,20,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
13,IE87502633_00044.xml,20,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
9,IE87447950_00018.xml,20,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...


In [49]:
list = ["IE87755510_00024.jpg", "IE87752740_00038.jpg", "IE87752740_00022.jpg", "IE87752740_00019.jpg", "IE87744435_00039.jpg", "IE87744435_00015.jpg", "IE87739615_00009.jpg", "IE87733114_00008.jpg", "IE87733114_00007.jpg", "IE87733114_00006.jpg", "IE87726132_00023.jpg", "IE87719995_00053.jpg", "IE87719995_00046.jpg", "IE87717323_00012.jpg", "IE87716931_00014.jpg", "IE87708411_00021.jpg", "IE87708411_00019.jpg", "IE87705976_00036.jpg", "IE87705976_00034.jpg", "IE87705971_00006.jpg", "IE87700963_00014.jpg", "IE87694978_00020.jpg", "IE87690674_00007.jpg", "IE87675634_00010.jpg", "IE87675634_00007.jpg", "IE87610546_00015.jpg", "IE87582245_00015.jpg", "IE87581919_00018.jpg", "IE87581919_00016.jpg", "IE87581919_00011.jpg", "IE87580382_00041.jpg", "IE87580382_00008.jpg", "IE87555665_00021.jpg", "IE87532920_00026.jpg", "IE87519524_00013.jpg", "IE87508468_00031.jpg", "IE87502633_00044.jpg", "IE87502633_00018.jpg", "IE87476216_00051.jpg", "IE87474895_00044.jpg", "IE87474895_00014.jpg", "IE87447950_00018.jpg", "IE87447950_00015.jpg", "IE87447950_00011.jpg", "IE87363222_00008.jpg", "IE87297122_00014.jpg", "IE87234800_00005.jpg", "IE87234800_00004.jpg", "IE87234800_00003.jpg"]
doc_pk = 4381

In [50]:
all_parts = get_all_parts(doc_pk)

part_pk_list = []
for picture in list:
    part_pk = next((item['pk'] for item in all_parts if item['filename'] == picture), None)
    part_pk_list.append(part_pk)
print(part_pk_list)


[734798, 734797, 734796, 734795, 734794, 734793, 734792, 734791, 734790, 734789, 734788, 734787, 734786, 734785, 734784, 734783, 734782, 734781, 734780, 734779, 734778, 734777, 734776, 734775, 734774, 734773, 734772, 734771, 734770, 734769, 734768, 734767, 734766, 734765, 734764, 734763, 734762, 734761, 734760, 734759, 734758, 734757, 734756, 734755, 734754, 734753, 734752, 734751, 734750]


In [51]:
print(len(part_pk_list))

49


In [52]:
get_basic_info(doc_pk)

get document segmentation ontology for document:  4381
https://msia.escriptorium.fr/api/documents/4381/
Document: 4381  with  64  parts
region types: [{'pk': 6912, 'name': 'Catchword'}, {'pk': 3, 'name': 'Commentary'}, {'pk': 6910, 'name': 'FooterCentral'}, {'pk': 6914, 'name': 'Handwritten'}, {'pk': 6908, 'name': 'Header'}, {'pk': 4, 'name': 'Illustration'}, {'pk': 2, 'name': 'Main'}, {'pk': 6909, 'name': 'MainCentral'}, {'pk': 6911, 'name': 'Margin'}, {'pk': 6913, 'name': 'RunningHeader'}, {'pk': 1, 'name': 'Title'}]
line types: [{'pk': 15289, 'name': 'Handwritten'}, {'pk': 15288, 'name': 'MainCentral'}, {'pk': 15287, 'name': 'NotMain'}]
transcription_level_list: [{'pk': 10801, 'name': 'Sid_Ashk_Daniel_08_7_600', 'archived': False, 'avg_confidence': None}, {'pk': 10800, 'name': 'Siddur_Ashkenaz_novoc_no_lbs_Danielpipeline_lev_r_08', 'archived': False, 'avg_confidence': 0.9847869869290219}, {'pk': 10799, 'name': 'Siddur_Ashkenaz_clean_concatenatedpipeline_lev_r_08', 'archived': False,

(64,
 [{'pk': 10801,
   'name': 'Sid_Ashk_Daniel_08_7_600',
   'archived': False,
   'avg_confidence': None},
  {'pk': 10800,
   'name': 'Siddur_Ashkenaz_novoc_no_lbs_Danielpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10799,
   'name': 'Siddur_Ashkenaz_clean_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10797,
   'name': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10798,
   'name': 'MT_NoVoc_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.986672220664065},
  {'pk': 10796,
   'name': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10765,
   'name': 'MT_NoVoc_pipeline_lev_ratio_08',
   'archived': True,
   'avg_confidence': 0.9790699318765446},
  {'pk': 10766,
   'na

In [35]:
# Choose the transcription level
tr_level_pk = 10678
# Choose the region type
region_type_pk_list = [6798]


# get the xmls of the parts from eScriptorium
export_xml(doc_pk,part_pk_list,tr_level_pk,region_type_pk_list,include_undefined = False, include_orphan = False, file_format = 'alto',include_images = True, print_status = True)


200
b'{"status":"ok"}'


<Response [200]>