# Import Passim results into eScriptorium
These scripts creates xml altos from Passim alignment results, for import into eScriptorium.


### Importing Required Libraries

In [1]:
import os
import json
import re
from bs4 import BeautifulSoup
from pprint import pprint

### Loading JSON data into a list of dictionaries from a JSONLines file

In [2]:
# path to the directory containing the passim output
path_passim_output = 'json_from_passim/out_n7_docwise_all-pairs/out.json'

In [3]:
# List to store data from all JSON files
out_passim_list = []

# Loop through each file in the directory
for file in os.listdir(path_passim_output):
    if file.endswith(".json"):
        file_path = os.path.join(path_passim_output, file)
        # Open the JSON file and load its content as a list of dictionaries
        with open(file_path, 'r', encoding="utf-8") as json_file:
            data = [json.loads(line) for line in json_file]
            out_passim_list.extend(data)


In [4]:
# Checking some content
print(out_passim_list[0]['lines'][0]['wits'][0]['alg'])
print('len', len(out_passim_list[0]['lines'][0]['wits'][0]['alg']))
print(out_passim_list[0]['lines'][0]['wits'][0]['alg2'])
print('len', len(out_passim_list[0]['lines'][0]['wits'][0]['alg2']))

וידבר יהוה אל משה במדבר סיני 
len 29
וידבר יהוה אל‐משה במדבר סיני

len 29


In [5]:
print(f"Number of blocks to be processed:{len(out_passim_list)}")

Number of blocks to be processed:8


### Splitting the aligned GT text portions into lines corresponding to OCR lines
Passim found alignments (between OCR and GT) at the textblock level.
Now we have to split the selected GT, in sections that correspond to the OCR lines.
This can be done by using the 'alg' and 'alg2' fields in passim output, containing the OCR and GT alignments, respectively.
Those fiels contain the exact same character numbers.
The special caracters '🍺' inserted in the OCR text, during the concatenation of the OCR lines
will be used to split the GT text in sections, as they give the position where the GT text should be split.

The splitted GT alignments will be stored in dictionaries in the lines_dict_with_alg_GT

In [6]:
pprint(out_passim_list)

[{'id': 'eSc_textblock_f8f9ef30_IE103402206_00010',
  'lines': [{'begin': 0,
             'text': 'וידבר יהוה אל-משה במדבר סיני\n',
             'wits': [{'alg': 'וידבר יהוה אל משה במדבר סיני ',
                       'alg2': 'וידבר יהוה אל‐משה במדבר סיני\n',
                       'begin': 242880,
                       'id': 'MT_NoVoc_concatenated.txt',
                       'matches': 28,
                       'ref': '1',
                       'series': 'GT',
                       'text': 'וידבר יהוה אל משה במדבר סיני '}]},
            {'begin': 29,
             'text': 'באהל מועד באחד לחדש השני בשנה\n',
             'wits': [{'alg': 'באהל מועד באחד לחדש השני בשנה ',
                       'alg2': 'באהל מועד באחד לחדש השני בשנה\n',
                       'begin': 242909,
                       'id': 'MT_NoVoc_concatenated.txt',
                       'matches': 30,
                       'ref': '1',
                       'series': 'GT',
                       'text': 'באהל מועד

In [7]:
# list of GTs found in alignments:
# GT_ids = list(set([textblock['id2'] for textblock in out_passim_list]))
# list of GTs found in alignments:

GT_ids = list(set([wit['id']
          for textblock in out_passim_list
          for line in textblock['lines']
          for wit in line.get('wits', [])]))
print(GT_ids)


['Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt', 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt', 'Siddur_Ashkenaz_clean_concatenated.txt', 'MT_NoVoc_concatenated.txt', 'Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt']


In [8]:
'''
This add the GT text to the OCR lines in the dictionnary of ocr lines (ocr_lines_dict.json)
As there are several GTs, the script iterates over ocr_lines_dict.json for each GT.
When and alignment is found, the GT text is added to the corresponding in the dictionary.

'''

# list of GTs found in alignments:
GT_ids = list(set([wit['id']
          for textblock in out_passim_list
          for line in textblock['lines']
          for wit in line.get('wits', [])]))
print(GT_ids)

print('list of GTs found in alignments:', GT_ids)

# Iterate on GT_ids
for GT_id in GT_ids:
    print(f"--- Processing of GT {GT_id} ---")

    # Load dictionary containing OCR line-splitting information
    with open('ocr_lines_dict/ocr_lines_dict.json', 'r', encoding="utf-8") as json_file:
        ocr_lines_dict = json.load(json_file)
    
    # Iterate over out_passim_list dictionaries
    # (out_passim_list: passim alignment results, by textblock.)
    for textblock in out_passim_list:

        # Initialize textblock_alg_lines_list
        textblock_alg_lines_list = []
        # retrieve textblock and file identifiers
        textblock_id = re.sub(r'.*(eSc_textblock_[a-f0-9]+).*', r'\1', textblock['id'])
        filename = re.sub(r'.*' + textblock_id + '_(.*)', r'\1', textblock['id'])
        
        # Check if GT_id is in at least one wit['id']
        GT_in_textblock = any(GT_id in wit['id'] for line in textblock['lines'] for wit in line.get('wits', []))
        if GT_in_textblock:
            print(f'{GT_id} found in textblock {textblock_id} in file {filename}')

            # Extract wit['alg'] for the current GT_id
            for line in textblock['lines']:
                for wit in line.get('wits', []):
                    if GT_id in wit['id']:
                        textblock_alg_lines_list.append(wit['alg'])
                    else:
                        textblock_alg_lines_list.append('')            

            print('textblock_alg_lines_list (not clean):', textblock_alg_lines_list)
            
            # Cleaning of GT lines
            # Replace '-' (45) with '', but avoid empty lines
            textblock_alg_lines_list = [GT_line.replace('-', '') if GT_line.replace('-', '') else GT_line for GT_line in textblock_alg_lines_list]
            # Replace '-' (8208) with '-' (45)
            textblock_alg_lines_list = [GT_line.replace(chr(8208), '-') for GT_line in textblock_alg_lines_list]
            # Remove leading and trailing spaces, but avoid empty lines
            textblock_alg_lines_list = [GT_line.strip() if GT_line.strip() else GT_line for GT_line in textblock_alg_lines_list]

            print ('textblock_alg_lines_list (cleaned):', textblock_alg_lines_list)

            # Find the textblock in ocr_lines_dict to insert the aligned GT lines
            for part in ocr_lines_dict: # ocr_lines_dict: OCR line infos, grouped by part and textblocks
                if part['filename'] == filename:
                    for block in part['ocr_blocks']:
                        if block['text_block_id'] == textblock_id:
                            for line in block['ocr_lines']:
                                if textblock_alg_lines_list:
                                    line['alg_GT'] = textblock_alg_lines_list.pop(0)
                                    line['GT_id'] = GT_id
                                else:
                                    # Handle the case where textblock_alg_lines_list is empty
                                    # For example, you may want to break out of the loop or take another action
                                    break
                        
    # Save a JSON file by GT_id
    directory = 'lines_dict_with_alg_GT'
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f'lines_dict_with_alg_{GT_id}.json')
    with open(file_path, 'w', encoding="utf-8") as json_file:
        json.dump(ocr_lines_dict, json_file, ensure_ascii=False, indent=4)


['Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt', 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt', 'Siddur_Ashkenaz_clean_concatenated.txt', 'MT_NoVoc_concatenated.txt', 'Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt']
list of GTs found in alignments: ['Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt', 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt', 'Siddur_Ashkenaz_clean_concatenated.txt', 'MT_NoVoc_concatenated.txt', 'Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt']
--- Processing of GT Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt ---
Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt found in textblock eSc_textblock_ab76e7e2 in file IE61220167_00084
textblock_alg_lines_list (not clean): ['הגדול הגבור והנורא. אל עליון קונה -------', '', '', '', 'שמים וארץ:-מגן אבות בדברו. מחיה מתים ', '', '', '', 'במאמרו. המלך הקדוש שאין כמוהו. המניח ', '', '', '', 'לעמו ביום שבת קדשו. כי בם רצה להניח להם. ', '', '', '', 'לפניו נעבוד ביראה ופחד. ונודה לשמו בכל ', '', '', '', '

### Parsing XML files and updating text content with GT alignments

In [9]:
# Path to the directory containing the alto files from eScriptorium
path_xmls = 'xmls_from_eSc'

# Path to the output directory, where the new alto files (and zip) will be saved
path_alto = 'altos_for_eSc_linewise'

# Path to directory containing the dictionaries with the alignment of the GT for each OCR line
path_alg_dicts = 'lines_dict_with_alg_GT'

In [10]:
import zipfile
from bs4 import BeautifulSoup
import Levenshtein

def process_alignment(path_alg_dicts, path_alto, path_xmls, levenshtein_threshold):
    ''' Process the alignment of the GT on the OCR text lines and save the modified XML files in a ZIP archive
    The ZIP archive will be sent to eScriptorium.
    Each alignment candidate with a Levenshtein similarity ratio above the threshold will be considered as a validated alignment,
    and inserted in the XML file.

    Parameters:
    path_alg_dicts (str): Path to the directory containing the dictionaries with the alignment of the different GT, for each OCR line
    path_alto (str): Path to the output directory, where the new alto files (and zip) will be saved
    path_xmls (str): Path to the directory containing the alto files from eScriptorium
    '''
    alignment_register = []  # list of XML files containing an alignment, number of aligned lines and the id of the GT  in this file.

    # Load each JSON file in the lines_dict_with_alg_GT directory
    for file in os.listdir(path_alg_dicts):
        if file.endswith('.json'):
            with open(os.path.join(path_alg_dicts, file), 'r', encoding="utf-8") as json_file:
                lines_dict = json.load(json_file)
            # retrieve the id2 (GT name) from the filename
            id2 = re.sub(r'lines_dict_with_alg_(.*).json', r'\1', file)
            print(f"--- File {file} loaded with id2 = {id2}")

            # Create a folder to store XML files processed for this id2 value
            # These altos files will be compressed into a ZIP file and sent to eScriptorium
            output_folder = os.path.join(path_alto, id2)
            os.makedirs(output_folder, exist_ok=True)

            # List the xmls files containing GT alignments for the current GT
            xml_files_with_alg = [part['filename'] for part in lines_dict
                      for block in part['ocr_blocks']
                      for line in block['ocr_lines']
                      if line.get('alg_GT') and line['GT_id'] == id2]

            print(f"List of XML files containing alignments for {id2}: {xml_files_with_alg}")

            # Loop through all the raw XML files imported from eScriptorium
            for filename in os.listdir(path_xmls):
                if filename.endswith('.xml'):
                    if os.path.splitext(filename)[0] not in xml_files_with_alg:
                        # print(f"Skipping {filename} as it does not contain any alignment for {id2}")
                        continue
                    print(f"Processing {filename} as it contains alignments for {id2}")

                    line_count = 0 # count the number of lines with validated alignment
                    with open(os.path.join(path_xmls, filename), encoding="utf-8") as xml_file:
                        xml = xml_file.read()
                    soup = BeautifulSoup(xml, 'xml')

                    for text_line in soup.find_all('TextLine'):
                        text_line_id = text_line.get('ID')

                        for part in lines_dict:
                            for block in part['ocr_blocks']:
                                for line in block['ocr_lines']:
                                    if line.get('line_id') == text_line_id:
                                        alg_GT_value = line.get('alg_GT', None)  # get GT alignment if available
                                        if alg_GT_value is not None:
                                            string_elements = text_line.find_all('String')
                                            if string_elements:
                                                levenshtein_ratio = Levenshtein.ratio(alg_GT_value, string_elements[0]['CONTENT'])
                                                print(f"Levenshtein ratio: {levenshtein_ratio}")
                                                if levenshtein_ratio >= levenshtein_threshold:
                                                    string_elements[0]['CONTENT'] = alg_GT_value
                                                    line_count += 1

                                                else:
                                                    string_elements[0]['CONTENT'] = ''
                                        else:
                                            string_elements = text_line.find_all('String')
                                            if string_elements:
                                                string_elements[0]['CONTENT'] = ''

                    # Add alignment data to register
                    alignment_register.append({
                        'filename': filename,
                        'aligned_lines_count': line_count,
                        'GT_id': id2
                    })

                    # Write the modified XML file to the output folder corresponding to the id2 value (GT)
                    output_file_path = os.path.join(output_folder, filename)
                    with open(output_file_path, 'w', encoding="utf-8") as output_file:
                        output_file.write(str(soup))
                        print(f"{filename} processed and recorded in {output_folder}")

        # Create a specific name for the ZIP archive based on the id2 value
        zip_file_name = f"{id2}_alignment.zip"
        zip_file_path = os.path.join(path_alto, zip_file_name)

        # Create a zip file of the XML files in the output folder
        with zipfile.ZipFile(zip_file_path, 'w') as zipf:
            for root, _, files in os.walk(output_folder):
                for file in files:
                    zipf.write(os.path.join(root, file), arcname=file)

        print(f"XML files in {output_folder} compressed in {zip_file_path}")

    # Save alignment register in JSON format
    output_folder = 'alignment_register/'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    register_file_path = os.path.join(output_folder, 'alignment_register.json')
    with open(register_file_path, 'w', encoding="utf-8") as json_file:
        json.dump(alignment_register, json_file, ensure_ascii=False, indent=4)

    return alignment_register

In [11]:
# # Run the process_alignment function in parallel

# from multiprocessing import Pool

# # Define a function to process alignment for a single set of arguments
# def process_alignment_single(args):
#     path_alg_dicts, path_alto, path_xmls = args
#     return process_alignment(path_alg_dicts, path_alto, path_xmls)

# # Define the arguments for each process
# arguments = [(path_alg_dicts, path_alto, path_xmls) for _ in range(4)]

# # Use multiprocessing.Pool to process alignments in parallel
# with Pool(4) as p:
#     alignment_register = p.map(process_alignment_single, arguments)


In [12]:
# Run the process_alignment function
process_alignment(path_alg_dicts, path_alto, path_xmls, levenshtein_threshold=0)


--- File lines_dict_with_alg_Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt.json loaded with id2 = Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt
List of XML files containing alignments for Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt: ['IE34120895_00033', 'IE34120895_00033', 'IE34120895_00033', 'IE34120895_00033', 'IE34120895_00033', 'IE35481905_00027', 'IE35481905_00027', 'IE35481905_00027', 'IE35481905_00027', 'IE35481905_00027', 'IE35481905_00027', 'IE35481905_00027', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084', 'IE61220167_00084']
Processing IE34120895_00033.xml as it contains alignments for Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt
Levenshtein ratio: 0.339622641509434
Levenshtein ratio: 0.4158415841584159
Levenshtein ratio: 0.33644859813084116
Levenshtein ratio: 0.36190476190476195
Levenshtein ratio: 0.20266666666666666
IE34120895_00033.xml processed and recorded in altos_for_eSc_linewise/Machzor_Rosh_Hashanah_

[{'filename': 'IE34120895_00033.xml',
  'aligned_lines_count': 5,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE35481905_00027.xml',
  'aligned_lines_count': 14,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE61220167_00084.xml',
  'aligned_lines_count': 19,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE35481905_00027.xml',
  'aligned_lines_count': 14,
  'GT_id': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE61220167_00083.xml',
  'aligned_lines_count': 19,
  'GT_id': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE61220167_00084.xml',
  'aligned_lines_count': 19,
  'GT_id': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE61220167_00097.xml',
  'aligned_lines_count': 19,
  'GT_id': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE103402206_00010.xml',
  'aligned_lines_count

### Importing altos in eScriptorium

In [13]:
# Importing the necessary functions and packages for the eScriptorium API
from functions import *
from packages import *

switching to  msIA


In [14]:
# eScriptorium document where the alignment will be imported
doc_pk = 4368

In [15]:
# ZIP and import each file in the folder
for zip_filename in os.listdir(path_alto):
    if zip_filename.endswith('.zip'):
        # Build the full path to the ZIP file
        zip_file_path = os.path.join(path_alto, zip_filename)
        
        # name of the alignment file
        name = zip_filename.split(".")[0] + "3"
        

        # Import in eScriptorium
        import_xml(doc_pk, path_alto, zip_filename, name)
        
        print(f"{zip_filename} has been successfully imported into eScriptorium.")

print(f"Link do the document in eScriptorium: https://msia.escriptorium.fr/document/{doc_pk}/images/")

200 b'{"status":"ok"}'
Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
MT_NoVoc_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Siddur_Ashkenaz_clean_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt_alignment.zip has been successfully imported into eScriptorium.
Link do the document in eScriptorium: https://msia.escriptorium.fr/document/4368/images/


In [16]:
# delete transcriptions layers
doc_pk = 4368
get_basic_info(doc_pk)

get document segmentation ontology for document:  4368
https://msia.escriptorium.fr/api/documents/4368/
Document: 4368  with  15  parts
region types: [{'pk': 6834, 'name': 'Catchword'}, {'pk': 3, 'name': 'Commentary'}, {'pk': 6832, 'name': 'FooterCentral'}, {'pk': 6836, 'name': 'Handwritten'}, {'pk': 6830, 'name': 'Header'}, {'pk': 4, 'name': 'Illustration'}, {'pk': 2, 'name': 'Main'}, {'pk': 6831, 'name': 'MainCentral'}, {'pk': 6833, 'name': 'Margin'}, {'pk': 6835, 'name': 'RunningHeader'}, {'pk': 1, 'name': 'Title'}]
line types: [{'pk': 15217, 'name': 'Handwritten'}, {'pk': 15216, 'name': 'MainCentral'}, {'pk': 15215, 'name': 'NotMain'}]
transcription_level_list: [{'pk': 10748, 'name': 'MT_NoVoc_concatenated3', 'archived': False, 'avg_confidence': 0.9863512041097893}, {'pk': 10746, 'name': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated3', 'archived': False, 'avg_confidence': 0.9801566943712431}, {'pk': 10747, 'name': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated3', 'archived': 

(15,
 [{'pk': 10748,
   'name': 'MT_NoVoc_concatenated3',
   'archived': False,
   'avg_confidence': 0.9863512041097893},
  {'pk': 10746,
   'name': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated3',
   'archived': False,
   'avg_confidence': 0.9801566943712431},
  {'pk': 10747,
   'name': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated3',
   'archived': False,
   'avg_confidence': 0.9809075333973954},
  {'pk': 10744,
   'name': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated2',
   'archived': True,
   'avg_confidence': 0.9907498145714785},
  {'pk': 10745,
   'name': 'Siddur_Ashkenaz_clean_concatenated2',
   'archived': True,
   'avg_confidence': 0.9918216731074374},
  {'pk': 10743,
   'name': 'MT_NoVoc_concatenated2',
   'archived': True,
   'avg_confidence': 0.9875341580465107},
  {'pk': 10740,
   'name': 'MT_NoVoc_concatenated',
   'archived': True,
   'avg_confidence': 0.9875341580465107},
  {'pk': 10742,
   'name': 'Siddur_Ashkenaz_clean_concatenated',
   'archived': True,
 

In [17]:
part_pk_list = get_part_pk_list(doc_pk)

In [18]:
for part_pk in part_pk_list:
    delete_part_transcription(doc_pk,part_pk,tr_level=10740)

# Exploring the alignment register with pandas

In [19]:
import pandas as pd
df = pd.read_json('alignment_register/alignment_register.json')

In [20]:
df.head()

Unnamed: 0,filename,aligned_lines_count,GT_id
0,IE34120895_00033.xml,5,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
1,IE35481905_00027.xml,14,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
2,IE61220167_00084.xml,19,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
3,IE35481905_00027.xml,14,Machzor_Yom_Kippur_Ashkenaz_clean_concatenated...
4,IE61220167_00083.xml,19,Machzor_Yom_Kippur_Ashkenaz_clean_concatenated...


In [21]:
# display le lines where GT_ID is 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated'
df[df['GT_id'] == 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated']

# sort by number of aligned lines
df.sort_values(by='aligned_lines_count', ascending=False)

# display files with more than 5 lines aligned
df[df['aligned_lines_count'] > 5]

Unnamed: 0,filename,aligned_lines_count,GT_id
1,IE35481905_00027.xml,14,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
2,IE61220167_00084.xml,19,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
3,IE35481905_00027.xml,14,Machzor_Yom_Kippur_Ashkenaz_clean_concatenated...
4,IE61220167_00083.xml,19,Machzor_Yom_Kippur_Ashkenaz_clean_concatenated...
5,IE61220167_00084.xml,19,Machzor_Yom_Kippur_Ashkenaz_clean_concatenated...
6,IE61220167_00097.xml,19,Machzor_Yom_Kippur_Ashkenaz_clean_concatenated...
7,IE103402206_00010.xml,11,MT_NoVoc_concatenated.txt
8,IE103402206_00014.xml,9,MT_NoVoc_concatenated.txt
9,IE103409244_00025.xml,28,MT_NoVoc_concatenated.txt
10,IE61220167_00083.xml,19,MT_NoVoc_concatenated.txt


In [22]:
# display GT_id values
df['GT_id'].unique()

array(['Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt',
       'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt',
       'MT_NoVoc_concatenated.txt',
       'Siddur_Ashkenaz_clean_concatenated.txt',
       'Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt'], dtype=object)

In [23]:
all_parts = get_all_parts(doc_pk)
print(all_parts)

[{'pk': 734556, 'name': '', 'filename': 'IE103402206_00010.jpg', 'title': 'Element 1', 'typology': None, 'image': {'uri': '/media/documents/4368/IE103402206_00010.jpg', 'size': [1702, 2511], 'thumbnails': {'card': '/media/documents/4368/IE103402206_00010.jpg.180x180_q85_crop-smart.jpg', 'large': '/media/documents/4368/IE103402206_00010.jpg.1000x1000_q85.jpg'}}, 'image_file_size': 285936, 'original_filename': 'IE103402206_00010.jpg', 'bw_image': None, 'workflow': {'convert': 'done', 'segment': 'done', 'align': 'done'}, 'order': 0, 'recoverable': False, 'transcription_progress': 0, 'source': 'mets//export_doc4367_test4matthieu_alto_202404241147.zip/IE103402206_00010.jpg', 'max_avg_confidence': None, 'comments': 'tanach'}, {'pk': 734557, 'name': '', 'filename': 'IE103402206_00014.jpg', 'title': 'Element 2', 'typology': None, 'image': {'uri': '/media/documents/4368/IE103402206_00014.jpg', 'size': [1709, 2509], 'thumbnails': {'card': '/media/documents/4368/IE103402206_00014.jpg.180x180_q85_

In [24]:
part_pk = next((item['pk'] for item in all_parts if item['filename'] == 'IE87532920_00033.jpg'), None)
print(part_pk)
print(f"https://msia.escriptorium.fr/document/{doc_pk}/part/{part_pk}/edit")

None
https://msia.escriptorium.fr/document/4368/part/None/edit


In [25]:
# display files where GT_ID is 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated' and aligned_lines_count > 5
df[(df['GT_id'] == 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated') & (df['aligned_lines_count'] > 1)].sort_values(by='aligned_lines_count', ascending=False)


Unnamed: 0,filename,aligned_lines_count,GT_id
