# Import Passim results into eScriptorium
These scripts creates xml altos from Passim alignment results, for import into eScriptorium.


### Importing Required Libraries

In [10]:
import os
import json
import re
import Levenshtein
import zipfile  
from pprint import pprint


### Extracting Passim's alignment results

In [11]:
# Define the document pk for interactions with the eSc API
doc_pk = 4381

In [12]:
# path to the directory containing the passim output
path_passim_output = 'json_from_passim/out_n7_docwise/out.json'

In [13]:
def build_list_from_passim_outputs(path_passim_output):
    '''
    Gather the Passim outputs from jsons into a list of dictionaries.
    '''
    # List to store data from all JSON files
    out_passim_list = []

    # Loop through each file in the directory
    for file in os.listdir(path_passim_output):
        if file.endswith(".json"):
            file_path = os.path.join(path_passim_output, file)
            # Open the JSON file and load its content as a list of dictionaries
            with open(file_path, 'r', encoding="utf-8") as json_file:
                data = [json.loads(line) for line in json_file]
                out_passim_list.extend(data)
    print(f"Number of blocks to be processed:{len(out_passim_list)}")
    return out_passim_list

In [14]:
# list of GTs found in alignments:
def list_GT_from_passim_output(out_passim_list):
    '''
    Get the GT present in Passim alignment results.
    '''
    GT_ids = list(set([wit['id']
              for textblock in out_passim_list
              for line in textblock['lines']
              for wit in line.get('wits', [])]))
    return GT_ids


In [15]:
def extract_passim_results(path_passim_output):
    '''
    Extracts the Passim alignments results and build a dictionary for each GT.
    The dictionaries are updates of the ocr_lines_dict.json file.
    For each ocr line where Passim found a GT, the GT is added with (among others) the following informations:
    - the cleaned text of the GT (leading and trailing spaces removed, '-' (45) replaced with '', '-' (8208) replaced with '-' (45))
    - the position of the first aligned character in the GT text
    '''

    # Gather Passim's results in a list
    out_passim_list = build_list_from_passim_outputs(path_passim_output)

    # list of GTs found in alignments:
    GT_ids = list_GT_from_passim_output(out_passim_list)
    print(f'list of GTs found in alignments: {GT_ids}')

    # Iterate over GT_ids, and update the ocr_lines_dict with the GT alignment text
    for GT_id in GT_ids:
        print(f"--- Processing of GT {GT_id} ---")

        # Load dictionary containing OCR line-splitting information
        with open('ocr_lines_dict/ocr_lines_dict.json', 'r', encoding="utf-8") as json_file:
            ocr_lines_dict = json.load(json_file)
        
        # Iterate over out_passim_list dictionaries
        for textblock in out_passim_list:

            # Extract the filename and textblock_id
            textblock_id = re.sub(r'.*(eSc_textblock_[a-f0-9]+).*', r'\1', textblock['id'])
            filename = re.sub(r'.*' + textblock_id + '_(.*)', r'\1', textblock['id'])

            for line in textblock['lines']:
                begin_index = line['begin']
                # Check if the current GT_id is present in the wits of the line
                for wit in line.get('wits', []):
                    if wit['id'] == GT_id:
                        alg_text = wit['alg']
                        # clean the alignment text
                        # Replace '-' (45) with '', but avoid empty lines
                        alg_text = alg_text.replace('-', '') if alg_text.replace('-', '') else alg_text
                        # Replace '-' (8208) with '-' (45)
                        alg_text = alg_text.replace(chr(8208), '-')
                        # Remove leading and trailing spaces, but avoid empty lines
                        alg_text = alg_text.strip() if alg_text.strip() else alg_text
                        GT_start = wit['begin']
                        GT_length = len(wit['text'])     


                        # Find the corresponding line in the OCR dictionary
                        for part in ocr_lines_dict:
                            if part['filename'] == filename:
                                for block in part['ocr_blocks']:
                                    if block['text_block_id'] == textblock_id:
                                        for ocr_line in block['ocr_lines']:
                                            if ocr_line['start'] == begin_index:
                                                # Update the OCR line with the GT alignment text
                                                ocr_line['alg_GT'] = alg_text
                                                ocr_line['GT_id'] = GT_id
                                                ocr_line['GT_start'] = GT_start
                                                ocr_line['GT_len'] = GT_length
                                                break  # No need to continue searching once found

        # Save a JSON file for each GT_id
        directory = 'lines_dict_with_alg_GT'
        os.makedirs(directory, exist_ok=True)
        file_path = os.path.join(directory, f'lines_dict_with_alg_{GT_id}.json')
        with open(file_path, 'w', encoding="utf-8") as json_file:
            json.dump(ocr_lines_dict, json_file, ensure_ascii=False, indent=4)
        print(f"    File {file_path} saved.")
        


In [16]:
extract_passim_results(path_passim_output)

Number of blocks to be processed:61
list of GTs found in alignments: ['Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt', 'Siddur_Ashkenaz_clean_concatenated.txt', 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt', 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt', 'MT_NoVoc_concatenated.txt']
--- Processing of GT Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt ---
    File lines_dict_with_alg_GT/lines_dict_with_alg_Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt.json saved.
--- Processing of GT Siddur_Ashkenaz_clean_concatenated.txt ---
    File lines_dict_with_alg_GT/lines_dict_with_alg_Siddur_Ashkenaz_clean_concatenated.txt.json saved.
--- Processing of GT Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt ---
    File lines_dict_with_alg_GT/lines_dict_with_alg_Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt.json saved.
--- Processing of GT Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt ---
    File lines_dict_with_alg_GT/lines_dict_with_alg_Machzor_Yom_Kippur_Ashkenaz_clean_concatenat

### Parsing XML files and updating text content with GT alignments

In [17]:
# Path to the directory containing the alto files from eScriptorium
path_xmls_from_eSc = 'xmls_from_eSc'

# Path to the output directory, where the new alto files (and zip) will be saved
path_xmls_for_eSc = 'xmls_for_eSc'

# Path to directory containing the dictionaries with the alignment of the GT for each OCR line
path_alg_dicts = 'lines_dict_with_alg_GT'

In [18]:
def save_alignment_register_to_json(alignment_register):
    '''
    Save the alignment register in a JSON file.
    This register contains the list of XML files containing an alignment, the number of aligned lines and the id of the GT in this file.
    '''
    output_folder = 'alignment_register/'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    register_file_path = os.path.join(output_folder, 'alignment_register.json')
    with open(register_file_path, 'w', encoding="utf-8") as json_file:
        json.dump(alignment_register, json_file, ensure_ascii=False, indent=4)
    return alignment_register

def get_xml_files_with_alignment(lines_dict, id2):
    ''' Get XML files containing alignments for a given ID '''
    xml_files_with_alg = []
    for part in lines_dict:
        for block in part['ocr_blocks']:
            for line in block['ocr_lines']:
                if line.get('alg_GT') and line['GT_id'] == id2:
                    xml_files_with_alg.append(part['filename'])
    return list(set(xml_files_with_alg))

def process_alignment_xml_as_txt(path_alg_dicts, path_alto, path_xmls_from_eSc, levenshtein_threshold):
    ''' Process the alignment of the GT on the OCR text lines and save the modified XML files in a ZIP archive
    The ZIP archive will be sent to eScriptorium.
    Each alignment candidate with a Levenshtein similarity ratio above the threshold will be considered as a validated alignment,
    and inserted in the XML file.

    Parameters:
    path_alg_dicts (str): Path to the directory containing the dictionaries with the alignment of the different GT, for each OCR line
    path_xmls_for_eSc (str): Path to the output directory, where the new alto files (and zip) will be saved
    path_xmls_from_eSc (str): Path to the directory containing the alto files from eScriptorium
    '''
    alignment_register = []  # list of XML files containing an alignment, number of aligned lines and the id of the GT in this file.

    # Load each JSON file in the lines_dict_with_alg_GT directory
    for json_file in os.listdir(path_alg_dicts):
        if json_file.endswith('.json'):
            with open(os.path.join(path_alg_dicts, json_file), 'r', encoding="utf-8") as json_file_handler:
                lines_dict = json.load(json_file_handler)
            # retrieve the id2 (GT name) from the filename
            id2 = re.sub(r'lines_dict_with_alg_(.*).json', r'\1', json_file)
            # print(f"--- File {json_file} loaded with id2 = {id2}")

            # Create a folder to store XML files processed for this id2 value
            # These altos files will be compressed into a ZIP file and sent to eScriptorium
            output_folder = os.path.join(path_alto, id2)
            os.makedirs(output_folder, exist_ok=True)

            # List the xmls files containing GT alignments for the current GT
            xml_files_with_alg = get_xml_files_with_alignment(lines_dict, id2)
            # print(f"List of XML files containing alignments for {id2}: {xml_files_with_alg}")

            # Loop through all the raw XML files imported from eScriptorium
            for xml_file in os.listdir(path_xmls_from_eSc):
                if xml_file.endswith('.xml'):
                    if os.path.splitext(xml_file)[0] not in xml_files_with_alg:
                        # print(f"Skipping {xml_file} as it does not contain any alignment for {id2}")
                        continue
                    # print(f"Processing {xml_file} as it contains alignments for {id2}")

                    # List the line ids with GT alignment for the current GT, for the current part
                    # as a tuple (line_id, alg_GT)
                    line_ids_with_GT = []
                    for part in lines_dict:
                        if part['filename'] == os.path.splitext(xml_file)[0]:
                            for block in part['ocr_blocks']:
                                for line in block['ocr_lines']:
                                    if line.get('alg_GT') and line['GT_id'] == id2:
                                        line_ids_with_GT.append((line['line_id'], line['alg_GT']))
                    # print(f"Line ids with GT alignment for {id2} in {xml_file}: {line_ids_with_GT}")
                    
                    # open the XML file as a text file
                    with open(os.path.join(path_xmls_from_eSc, xml_file), encoding="utf-8") as xml_file_handler:
                        xml_as_txt = xml_file_handler.read()
                    
                    line_count = 0 # count the number of lines with validated alignment                  

                    # Search the XML file for character groups corresponding to <TextLine ID=".*?".*?</TextLine>.
                    xml_text_lines = re.findall(r'<TextLine ID=".*?".*?</TextLine>', xml_as_txt, re.DOTALL)
                    # For each group of characters, check if the ID is in line_ids_with_GT:
                    for xml_text_line in xml_text_lines:
                        xml_text_line_id = re.search(r'<TextLine ID="(.*?)"', xml_text_line).group(1)
                        # If the id is not in line_ids_with_GT, replace the line content by ""
                        if xml_text_line_id not in [line_id for line_id, _ in line_ids_with_GT]:
                            updated_content = re.sub(r'<String CONTENT=".*?"', '<String CONTENT=""', xml_text_line)
                            xml_as_txt = xml_as_txt.replace(xml_text_line, updated_content)

                        else:
                            # If the id is in line_ids_with_GT, replace the content by the GT alignment
                            for text_line_id, alg_GT_value in line_ids_with_GT:
                                if text_line_id == xml_text_line_id:
                                    string_content_match = re.search(r'<String CONTENT="(.*?)"', xml_text_line)
                                    if string_content_match:
                                        # Extract value from current content
                                        current_content = string_content_match.group(1)
                                        # Calculate the Levenshtein ratio between current content and GT alignment
                                        levenshtein_ratio = Levenshtein.ratio(alg_GT_value, current_content)
                                        # Determine new content according to Levenshtein threshold
                                        new_content = f'CONTENT="{alg_GT_value}"' if levenshtein_ratio >= levenshtein_threshold else 'CONTENT=""'
                                        # Replace the content of the <String element with the new content
                                        xml_as_txt = xml_as_txt.replace(string_content_match.group(0), f'<String {new_content}')
                                        # Increment the counter of aligned lines if the content has been modified
                                        if levenshtein_ratio >= levenshtein_threshold:
                                            line_count += 1
                                        else:
                                            new_content = 'CONTENT=""'
                                        # Replace the content of the String element with the updated value
                                        updated_string_element = re.sub(r'CONTENT=".*?"', new_content, current_content)
                                        # Replace the original String element with the updated one in the XML
                                        xml_as_txt = xml_as_txt.replace(current_content, updated_string_element)                  
                    
                    # Add alignment data to register if at least one line has been aligned
                    if line_count > 0:
                        alignment_register.append({
                            'filename': xml_file,
                            'aligned_lines_count': line_count,
                            'GT_id': id2
                        })

                    # Write the modified XML file to the output folder corresponding to the id2 value (GT)
                    output_file_path = os.path.join(output_folder, xml_file)
                    with open(output_file_path, 'w', encoding="utf-8") as output_file:
                        output_file.write(xml_as_txt)
                        # print(f"{xml_file} processed and recorded in {output_folder}")
                        # print(f"Number of lines with validated alignment: {line_count}")

                              
            # Create a specific name for the ZIP archive based on the id2 value
            zip_file_name = f"{id2}_alignment.zip"
            zip_file_path = os.path.join(path_alto, zip_file_name)

            # Create a zip file of the XML files in the output folder
            with zipfile.ZipFile(zip_file_path, 'w') as zipf:
                for root, _, files in os.walk(output_folder):
                    for file in files:
                        zipf.write(os.path.join(root, file), arcname=file)

            print(f"XML files in {output_folder} compressed in {zip_file_path}")

    # Save alignment register in JSON format
    save_alignment_register_to_json(alignment_register)

    

    return alignment_register


In [19]:
# Run the process_alignment function
process_alignment_xml_as_txt(path_alg_dicts, path_xmls_for_eSc, path_xmls_from_eSc, levenshtein_threshold=0.8)

XML files in xmls_for_eSc/Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt compressed in xmls_for_eSc/Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt_alignment.zip
XML files in xmls_for_eSc/Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt compressed in xmls_for_eSc/Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt_alignment.zip
XML files in xmls_for_eSc/MT_NoVoc_concatenated.txt compressed in xmls_for_eSc/MT_NoVoc_concatenated.txt_alignment.zip
XML files in xmls_for_eSc/Siddur_Ashkenaz_clean_concatenated.txt compressed in xmls_for_eSc/Siddur_Ashkenaz_clean_concatenated.txt_alignment.zip
XML files in xmls_for_eSc/Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt compressed in xmls_for_eSc/Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt_alignment.zip


[{'filename': 'IE61220167_00084.xml',
  'aligned_lines_count': 9,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87234800_00004.xml',
  'aligned_lines_count': 17,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87234800_00005.xml',
  'aligned_lines_count': 22,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87297122_00014.xml',
  'aligned_lines_count': 2,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87363222_00008.xml',
  'aligned_lines_count': 13,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87447950_00011.xml',
  'aligned_lines_count': 12,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87447950_00015.xml',
  'aligned_lines_count': 22,
  'GT_id': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt'},
 {'filename': 'IE87447950_00018.xml',
  'aligned_l

In [1]:
def process_passim_results(path_passim_output, path_alg_dicts, path_xmls_from_eSc, path_xmls_for_eSc, levenshtein_threshold=0.8):
    '''
    This global function processes the Passim results and updates the XML alto files with the GT alignment.
    - Extracts the Passim alignment results
    - Parse the XML files from eScriptorium, and update the OCR lines with the GT alignment
    if the levenstein ratio between the OCR and GT textblocks containing the lines is above a given threshold.
    - Save the updated XML files in a ZIP archive, ready to be sent to eScriptorium.
    Parameters:
    path_passim_output (str): Path to the directory containing the Passim output JSON files
    path_alg_dicts (str): Path to the directory containing the dictionaries with the alignment of the different GT, for each OCR line
    path_xmls_from_eSc (str): Path to the directory containing the alto files from eScriptorium
    path_xmls_for_eSc (str): Path to the output directory, where the new alto files (and zip) will be saved
    '''

    # Extract the Passim alignment results
    extract_passim_results(path_passim_output)
    # Parse the XML files from eScriptorium, and update the OCR lines with the GT alignment if the levenstein ratio is above a given threshold
    process_alignment_xml_as_txt(path_alg_dicts, path_xmls_for_eSc, path_xmls_from_eSc, levenshtein_threshold)   

### Build tsv files for results analysis

In [20]:
from functions import *

def load_alignment_register(alignment_register_path="alignment_register/alignment_register.json"):
    """
    Load the alignment register from the JSON file.
    """
    with open(alignment_register_path, 'r', encoding="utf-8") as json_file_handler:
        return json.load(json_file_handler)

def create_aligned_counts_by_image(alignment_register):
    """
    Create a dictionary with aligned lines counts for each image and GT.
    """
    # Collect all unique GT_ids
    gt_ids = sorted(set(entry["GT_id"] for entry in alignment_register))
    
    # Initialize a dictionary to hold aligned lines counts for each image
    aligned_counts_by_image = {entry["filename"]: {gt_id: 0 for gt_id in gt_ids} for entry in alignment_register}
    
    # Fill the dictionary
    for entry in alignment_register:
        filename = entry["filename"]
        gt_id = entry["GT_id"]
        aligned_lines_count = entry["aligned_lines_count"]
        aligned_counts_by_image[filename][gt_id] = aligned_lines_count
    
    return aligned_counts_by_image

def identify_top_n_best_gt(aligned_counts_by_image, n_best_gt):
    """ 
    Identify the n best GTs for each image based on the number of aligned lines.
    Parameters:
    - aligned_counts_by_image: dictionary with aligned lines counts for each image and GT
    - n_best_gt: number of best GTs to identify
    """
    top_n_best_gt = {}
    for filename, gt_counts in aligned_counts_by_image.items():
        # Sort GTs by aligned lines count
        sorted_gt_counts = sorted(gt_counts.items(), key=lambda x: x[1], reverse=True)
        # Take top n GTs
        top_n_gt_counts = sorted_gt_counts[:n_best_gt]
        top_n_best_gt[filename] = [(gt_id, aligned_lines_count) for gt_id, aligned_lines_count in top_n_gt_counts]
    return top_n_best_gt

def get_pk_from_filename(all_parts_infos,filename):
    """
    Function to get the pk and the title in eScriptorium of a part from its filename.
    Parameters:
    - all_parts_infos: list of dictionaries containing informations about the parts.
    requested from the eScriptorium API. This dictionnary is requested (all_parts_infos = get_all_parts(doc_pk))
    from the eScriptorium API outisde of this function to avoid multiple requests.
    - filename of the image. Extension should be .jpg, but the function
    handles the case where the extension is missing or different.
    """
    filename, extension = os.path.splitext(filename)
    for item in all_parts_infos:
        if item['filename'] == filename + '.jpg':
            return (item['pk'], item['title'])
    return None

def create_tsv(aligned_counts_by_image, doc_pk, top_n_best_gt, display_n_best_gt, n_best_gt):
    """
    Create a TSV file with aligned lines counts for each GT and image.
    If display_n_best_gt is True, include columns for the n best GTs with most aligned lines.
    The TSV is based on the aligned_counts_by_image dictionary.        
    """
    # Collect all unique GT_ids
    gt_ids = sorted(set(gt_id for gt_counts in aligned_counts_by_image.values() for gt_id in gt_counts.keys()))

    # Update TSV header
    tsv_header = "filename\tpk\ttitle\t"  # Added 'title' column header
    if display_n_best_gt:
        tsv_header += "\t".join(f"best_GT_{i}_id\tbest_GT_{i}_aligned_lines_count" for i in range(1, n_best_gt + 1)) + "\t"
    tsv_header += "\t".join(gt_ids) + "\n"
    
    # Create TSV rows
    tsv_rows = ""
    all_parts_infos = get_all_parts(doc_pk)
    for filename, gt_counts in aligned_counts_by_image.items():
        row = filename + "\t"
        
        # Retrieve PK and title based on filename
        pk, title = get_pk_from_filename(all_parts_infos, filename)
        row += f"{pk}\t{title}\t"  # Add PK and title columns
        
        if display_n_best_gt:
            for i in range(1, n_best_gt + 1):
                gt_id, aligned_lines_count = top_n_best_gt[filename][i-1] if filename in top_n_best_gt else ('', '')
                row += f"{gt_id}\t{aligned_lines_count}\t"
        for gt_id in gt_ids:
            row += str(gt_counts.get(gt_id, '')) + "\t"
        tsv_rows += row.strip() + "\n"
    
    return tsv_header + tsv_rows

def create_tsv_from_alignment_register(alignment_register,doc_pk, display_n_best_gt=True, n_best_gt=1):
    """
    Create a tsv file with the number of aligned lines for each GT and image.
    This function cab give the n best GT for each image.
    Rows: one row per image (filename from the XML file)
    Columns:
    - one column per Ground Truth (GT_ids)
    - one column with the number of aligned lines for the best GT
    - additional columns for n best GTs with most aligned lines (optional)
    """
    aligned_counts_by_image = create_aligned_counts_by_image(alignment_register)
    top_n_best_gt = identify_top_n_best_gt(aligned_counts_by_image, n_best_gt)
    tsv_content = create_tsv(aligned_counts_by_image, doc_pk, top_n_best_gt, display_n_best_gt, n_best_gt)

    # Write to file
    with open("alignment_data.tsv", "w") as tsv_file:
        tsv_file.write(tsv_content)

switching to  msIA


In [21]:
alignment_register = load_alignment_register(alignment_register_path="alignment_register/alignment_register.json")
create_tsv_from_alignment_register(alignment_register,doc_pk=4381, display_n_best_gt=True, n_best_gt=3)

### Find the best GTs for a single image

In [23]:
def find_image_n_best_gt(image_name, n_best_gt):
    """
    Function to get the n best GT for a specific image.
    
    Parameters:
    - image_name: name of the image (filename without extension)
    - n_best_gt: number of best GTs to retrieve (default is 3)
    
    Returns:
    - List of tuples containing the n best GTs for the specified image, where each tuple
      contains the GT id and the number of aligned lines.
    - If no data is found for the image, returns a string indicating so.
    """
    # keep the image name as entered by the user for the output message
    user_image_name = image_name

    # Remove the file extension
    image_name, extension = os.path.splitext(image_name)

    # Remove the ".xml" extension if present
    image_name = image_name + ".xml" if not image_name.endswith(".xml") else image_name
    
    alignment_register = load_alignment_register('alignment_register/alignment_register.json')
    aligned_counts_by_image = create_aligned_counts_by_image(alignment_register)
    top_n_best_gt = identify_top_n_best_gt(aligned_counts_by_image, n_best_gt)
    
    # Trouver les n meilleurs GT pour l'image spécifiée
    if image_name in top_n_best_gt:
        top_gt_list = top_n_best_gt[image_name]
        print(f"Top {n_best_gt} GTs for image {image_name}:")
        for i, (gt_id, aligned_lines_count) in enumerate(top_gt_list, start=1):
            print(f"\tTop {i} GT: {gt_id} with {aligned_lines_count} alignments")
        return top_gt_list
    else:
        return f"No data found for image {user_image_name}"


In [24]:
find_image_n_best_gt('IE87726132_00023.png', n_best_gt=3)

Top 3 GTs for image IE87726132_00023.xml:
	Top 1 GT: MT_NoVoc_concatenated.txt with 22 alignments
	Top 2 GT: Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt with 14 alignments
	Top 3 GT: Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt with 14 alignments


[('MT_NoVoc_concatenated.txt', 22),
 ('Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt', 14),
 ('Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt', 14)]

### Importing altos in eScriptorium

In [25]:
# Importing the necessary functions and packages for the eScriptorium API
from functions import *
from packages import *

In [26]:
# eScriptorium document where the alignment will be imported
doc_pk = 4381

In [27]:
# Insert date + time
from datetime import datetime
now = datetime.now()
dt_string = now.strftime("%d/%m/%Y-%H_%M_%S")

# ZIP and import each file in the folder
for zip_filename in os.listdir(path_xmls_for_eSc):
    if zip_filename.endswith('.zip'):
        # Build the full path to the ZIP file
        zip_file_path = os.path.join(path_xmls_for_eSc, zip_filename)
        
        # name of the alignment file
        name = zip_filename.split(".")[0]+"_pip_test_regexp_" + dt_string
        

        # Import in eScriptorium
        import_xml(doc_pk, path_xmls_for_eSc, zip_filename, name)
        
        print(f"{zip_filename} has been successfully imported into eScriptorium.")

print(f"Link do the document in eScriptorium: https://msia.escriptorium.fr/document/{doc_pk}/images/")

200 b'{"status":"ok"}'
Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
MT_NoVoc_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Siddur_Ashkenaz_clean_concatenated.txt_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt_alignment.zip has been successfully imported into eScriptorium.
Link do the document in eScriptorium: https://msia.escriptorium.fr/document/4381/images/


### Cleaning up the transcription levels in eSc document

In [28]:
def display_active_transcription_levels(doc_pk):
    """
    Display the active transcription levels in a document.
    In order to be able to delete them, if necessary.
    """
    active_transcription_level_list = []
    # Display the basic information of the document
    nu_parts,transcription_level_list,region_type_list,line_type_list = get_basic_info(doc_pk=doc_pk)
    print('---')
    # Display the pk and the names of the transcriptions in the document, that are not archived
    print(f"Active transcription levels in doc {doc_pk}:")
    for transcription in transcription_level_list:    
        if transcription['archived'] == False:
            print(f"\t{transcription['pk']} {transcription['name']}")
            active_transcription_level_list.append(transcription['pk'])
    print(f'list of active transcription levels: {active_transcription_level_list}')

In [29]:
display_active_transcription_levels(doc_pk=4381)

get document segmentation ontology for document:  4381
https://msia.escriptorium.fr/api/documents/4381/
Document: 4381  with  64  parts
region types: [{'pk': 6912, 'name': 'Catchword'}, {'pk': 3, 'name': 'Commentary'}, {'pk': 6910, 'name': 'FooterCentral'}, {'pk': 6914, 'name': 'Handwritten'}, {'pk': 6908, 'name': 'Header'}, {'pk': 4, 'name': 'Illustration'}, {'pk': 2, 'name': 'Main'}, {'pk': 6909, 'name': 'MainCentral'}, {'pk': 6911, 'name': 'Margin'}, {'pk': 6913, 'name': 'RunningHeader'}, {'pk': 1, 'name': 'Title'}]
line types: [{'pk': 15289, 'name': 'Handwritten'}, {'pk': 15288, 'name': 'MainCentral'}, {'pk': 15287, 'name': 'NotMain'}]
transcription_level_list: [{'pk': 11039, 'name': 'Siddur_Ashkenaz_novoc_no_lbs_Daniel_pip_test_regexp_15/05/2024-15_38_32', 'archived': False, 'avg_confidence': 0.9847869869290219}, {'pk': 11037, 'name': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated_pip_test_regexp_15/05/2024-15_38_32', 'archived': False, 'avg_confidence': 0.9847869869290219}, {'pk

In [30]:
tr_level_list_to_delete = [11039, 11037, 11038, 11035, 11036, 11034, 11032, 11030, 11033, 11031]

In [31]:
def delete_tr_levels(doc_pk,tr_level_list_to_delete):
    """
    Function to delete a list of transcription layers
    from a specific document.
    """
    for tr_level in tr_level_list_to_delete:
        delete_url = f"https://msia.escriptorium.fr/api/documents/{doc_pk}/transcriptions/{tr_level}/"
        r = requests.delete(delete_url, headers=headers)
        if r.status_code == 204:
            print(f"Transcription level {tr_level} has been deleted from document {doc_pk}.")
        else:
            print(f"Error: Transcription level {tr_level} could not be deleted from document {doc_pk}.")
            print(r.text)
    

In [32]:
delete_tr_levels(doc_pk,tr_level_list_to_delete)

Transcription level 11039 has been deleted from document 4381.
Transcription level 11037 has been deleted from document 4381.
Transcription level 11038 has been deleted from document 4381.
Transcription level 11035 has been deleted from document 4381.
Transcription level 11036 has been deleted from document 4381.
Transcription level 11034 has been deleted from document 4381.
Transcription level 11032 has been deleted from document 4381.
Transcription level 11030 has been deleted from document 4381.
Transcription level 11033 has been deleted from document 4381.
Transcription level 11031 has been deleted from document 4381.


# Exploring the alignment register with pandas

In [41]:
import pandas as pd
df = pd.read_json('alignment_register/alignment_register.json')

In [42]:
df.head()

Unnamed: 0,filename,aligned_lines_count,GT_id
0,IE34120895_00033.xml,0,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
1,IE35481905_00027.xml,0,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
2,IE61220167_00084.xml,9,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
3,IE87234800_00004.xml,17,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
4,IE87234800_00005.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...


In [43]:
# display le lines where GT_ID is 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated'
df[df['GT_id'] == 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated']

# sort by number of aligned lines
df.sort_values(by='aligned_lines_count', ascending=False)

# display files with more than 5 lines aligned
df[df['aligned_lines_count'] > 5]

Unnamed: 0,filename,aligned_lines_count,GT_id
2,IE61220167_00084.xml,9,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
3,IE87234800_00004.xml,17,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
4,IE87234800_00005.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
6,IE87363222_00008.xml,13,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
7,IE87447950_00011.xml,12,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
...,...,...,...
200,IE87752740_00022.xml,13,Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt
201,IE87752740_00038.xml,10,Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt
202,IE87755510_00024.xml,16,Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt
204,IE87555665_00021.xml,6,Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt


In [44]:
# display GT_id values
df['GT_id'].unique()

array(['Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt',
       'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated.txt',
       'MT_NoVoc_concatenated.txt',
       'Siddur_Ashkenaz_clean_concatenated.txt',
       'Siddur_Ashkenaz_novoc_no_lbs_Daniel.txt'], dtype=object)

In [45]:

all_parts = get_all_parts(doc_pk)
print(all_parts)

[{'pk': 734735, 'name': '', 'filename': 'IE103402206_00010.jpg', 'title': 'Element 1', 'typology': None, 'image': {'uri': '/media/documents/4381/IE103402206_00010.jpg', 'size': [1702, 2511], 'thumbnails': {'card': '/media/documents/4381/IE103402206_00010.jpg.180x180_q85_crop-smart.jpg', 'large': '/media/documents/4381/IE103402206_00010.jpg.1000x1000_q85.jpg'}}, 'image_file_size': 285936, 'original_filename': 'IE103402206_00010.jpg', 'bw_image': None, 'workflow': {'convert': 'done', 'segment': 'done', 'align': 'done'}, 'order': 0, 'recoverable': False, 'transcription_progress': 100, 'source': 'mets//export_doc4367_test4matthieu_alto_202404261220.zip/IE103402206_00010.jpg', 'max_avg_confidence': 0.9849466164969554, 'comments': 'tanach'}, {'pk': 734736, 'name': '', 'filename': 'IE103402206_00014.jpg', 'title': 'Element 2', 'typology': None, 'image': {'uri': '/media/documents/4381/IE103402206_00014.jpg', 'size': [1709, 2509], 'thumbnails': {'card': '/media/documents/4381/IE103402206_00014.

In [46]:
part_pk = next((item['pk'] for item in all_parts if item['filename'] == 'IE87532920_00033.jpg'), None)
print(part_pk)
print(f"https://msia.escriptorium.fr/document/{doc_pk}/part/{part_pk}/edit")

None
https://msia.escriptorium.fr/document/4381/part/None/edit


In [48]:
# display files where GT_ID is 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated' and aligned_lines_count > 5
df[(df['GT_id'] == 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated.txt') & (df['aligned_lines_count'] > 1)].sort_values(by='aligned_lines_count', ascending=False)


Unnamed: 0,filename,aligned_lines_count,GT_id
11,IE87474895_00044.xml,27,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
19,IE87582245_00015.xml,23,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
8,IE87447950_00015.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
35,IE87744435_00039.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
26,IE87708411_00021.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
4,IE87234800_00005.xml,22,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
34,IE87744435_00015.xml,21,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
16,IE87580382_00041.xml,20,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
13,IE87502633_00044.xml,20,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...
9,IE87447950_00018.xml,20,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatena...


In [49]:
list = ["IE87755510_00024.jpg", "IE87752740_00038.jpg", "IE87752740_00022.jpg", "IE87752740_00019.jpg", "IE87744435_00039.jpg", "IE87744435_00015.jpg", "IE87739615_00009.jpg", "IE87733114_00008.jpg", "IE87733114_00007.jpg", "IE87733114_00006.jpg", "IE87726132_00023.jpg", "IE87719995_00053.jpg", "IE87719995_00046.jpg", "IE87717323_00012.jpg", "IE87716931_00014.jpg", "IE87708411_00021.jpg", "IE87708411_00019.jpg", "IE87705976_00036.jpg", "IE87705976_00034.jpg", "IE87705971_00006.jpg", "IE87700963_00014.jpg", "IE87694978_00020.jpg", "IE87690674_00007.jpg", "IE87675634_00010.jpg", "IE87675634_00007.jpg", "IE87610546_00015.jpg", "IE87582245_00015.jpg", "IE87581919_00018.jpg", "IE87581919_00016.jpg", "IE87581919_00011.jpg", "IE87580382_00041.jpg", "IE87580382_00008.jpg", "IE87555665_00021.jpg", "IE87532920_00026.jpg", "IE87519524_00013.jpg", "IE87508468_00031.jpg", "IE87502633_00044.jpg", "IE87502633_00018.jpg", "IE87476216_00051.jpg", "IE87474895_00044.jpg", "IE87474895_00014.jpg", "IE87447950_00018.jpg", "IE87447950_00015.jpg", "IE87447950_00011.jpg", "IE87363222_00008.jpg", "IE87297122_00014.jpg", "IE87234800_00005.jpg", "IE87234800_00004.jpg", "IE87234800_00003.jpg"]
doc_pk = 4381

In [50]:
all_parts = get_all_parts(doc_pk)

part_pk_list = []
for picture in list:
    part_pk = next((item['pk'] for item in all_parts if item['filename'] == picture), None)
    part_pk_list.append(part_pk)
print(part_pk_list)


[734798, 734797, 734796, 734795, 734794, 734793, 734792, 734791, 734790, 734789, 734788, 734787, 734786, 734785, 734784, 734783, 734782, 734781, 734780, 734779, 734778, 734777, 734776, 734775, 734774, 734773, 734772, 734771, 734770, 734769, 734768, 734767, 734766, 734765, 734764, 734763, 734762, 734761, 734760, 734759, 734758, 734757, 734756, 734755, 734754, 734753, 734752, 734751, 734750]


In [51]:
print(len(part_pk_list))

49


In [52]:
get_basic_info(doc_pk)

get document segmentation ontology for document:  4381
https://msia.escriptorium.fr/api/documents/4381/
Document: 4381  with  64  parts
region types: [{'pk': 6912, 'name': 'Catchword'}, {'pk': 3, 'name': 'Commentary'}, {'pk': 6910, 'name': 'FooterCentral'}, {'pk': 6914, 'name': 'Handwritten'}, {'pk': 6908, 'name': 'Header'}, {'pk': 4, 'name': 'Illustration'}, {'pk': 2, 'name': 'Main'}, {'pk': 6909, 'name': 'MainCentral'}, {'pk': 6911, 'name': 'Margin'}, {'pk': 6913, 'name': 'RunningHeader'}, {'pk': 1, 'name': 'Title'}]
line types: [{'pk': 15289, 'name': 'Handwritten'}, {'pk': 15288, 'name': 'MainCentral'}, {'pk': 15287, 'name': 'NotMain'}]
transcription_level_list: [{'pk': 10801, 'name': 'Sid_Ashk_Daniel_08_7_600', 'archived': False, 'avg_confidence': None}, {'pk': 10800, 'name': 'Siddur_Ashkenaz_novoc_no_lbs_Danielpipeline_lev_r_08', 'archived': False, 'avg_confidence': 0.9847869869290219}, {'pk': 10799, 'name': 'Siddur_Ashkenaz_clean_concatenatedpipeline_lev_r_08', 'archived': False,

(64,
 [{'pk': 10801,
   'name': 'Sid_Ashk_Daniel_08_7_600',
   'archived': False,
   'avg_confidence': None},
  {'pk': 10800,
   'name': 'Siddur_Ashkenaz_novoc_no_lbs_Danielpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10799,
   'name': 'Siddur_Ashkenaz_clean_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10797,
   'name': 'Machzor_Yom_Kippur_Ashkenaz_clean_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10798,
   'name': 'MT_NoVoc_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.986672220664065},
  {'pk': 10796,
   'name': 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenatedpipeline_lev_r_08',
   'archived': False,
   'avg_confidence': 0.9847869869290219},
  {'pk': 10765,
   'name': 'MT_NoVoc_pipeline_lev_ratio_08',
   'archived': True,
   'avg_confidence': 0.9790699318765446},
  {'pk': 10766,
   'na

In [35]:
# Choose the transcription level
tr_level_pk = 10678
# Choose the region type
region_type_pk_list = [6798]


# get the xmls of the parts from eScriptorium
export_xml(doc_pk,part_pk_list,tr_level_pk,region_type_pk_list,include_undefined = False, include_orphan = False, file_format = 'alto',include_images = True, print_status = True)


200
b'{"status":"ok"}'


<Response [200]>