# Import Passim results into eScriptorium
These scripts creates xml altos from Passim alignment results, for import into eScriptorium.


### Importing Required Libraries

In [12]:
import os
import json
import re
from bs4 import BeautifulSoup
from pprint import pprint

### Loading JSON data into a list of dictionaries from a JSONLines file

In [13]:
# path to the directory containing the passim output
path_passim_output = 'json_from_passim/out_n7_m5_a7_g25_linewise/out.json'

In [14]:
# List to store data from all JSON files
out_passim_list = []

# Loop through each file in the directory
for file in os.listdir(path_passim_output):
    if file.endswith(".json"):
        file_path = os.path.join(path_passim_output, file)
        # Open the JSON file and load its content as a list of dictionaries
        with open(file_path, 'r', encoding="utf-8") as json_file:
            data = [json.loads(line) for line in json_file]
            out_passim_list.extend(data)


In [15]:
# Checking some content
print(out_passim_list[-1]['alg'])
print(out_passim_list[-1]['alg2'])

 של עולם כבו שכבש רחמיו לעשות🍺רצונך בלבב שלם כן יכבשו רחמיך🍺את בעסך -----‧ ויגולו רחמיך על מדותיך- ותתגהג🍺עמנו 
שחט אותו כדי לעשות רצונך --------------------כן יכבשו רחמיך את כעסך מעלינו ויג-לו רחמיך על מדותיך, ו-תכנס אתנו 


In [16]:
print(f"Number of blocks to be processed:{len(out_passim_list)}")

Number of blocks to be processed:175


### Splitting the aligned GT text portions into lines corresponding to OCR lines
Passim found alignments (between OCR and GT) at the textblock level.
Now we have to split the selected GT, in sections that correspond to the OCR lines.
This can be done by using the 'alg' and 'alg2' fields in passim output, containing the OCR and GT alignments, respectively.
Those fiels contain the exact same character numbers.
The special caracters '🍺' inserted in the OCR text, during the concatenation of the OCR lines
will be used to split the GT text in sections, as they give the position where the GT text should be split.

The splitted GT alignments will be stored in dictionaries in the lines_dict_with_alg_GT

In [17]:
# list of GTs found in alignments:
GT_ids = list(set([textblock['id2'] for textblock in out_passim_list]))
print('list of GTs found in alignments:', GT_ids)

# Iterate on GT_ids
for GT_id in GT_ids:

    print(f"--- Processing of GT {GT_id} ---")

    # Load dictionary containing OCR line-splitting information
    with open('ocr_lines_dict/ocr_lines_dict.json', 'r', encoding="utf-8") as json_file:
        ocr_lines_dict = json.load(json_file)
    
    # iterate over out_passim_list dictionaries
    # (out_passim_list: passim alignment results, by textblock.)
    for textblock in out_passim_list:
        if textblock['id2'] == GT_id:
            
            # retrieve textblock and file identifiers
            textblock_id = re.sub(r'.*(eSc_textblock_[a-f0-9]+).*', r'\1', textblock['id'])
            filename = re.sub(r'.*' + textblock_id + '_(.*)', r'\1', textblock['id'])

            # retrieve GT text aligned with OCR
            # and divide it into lines
            OCR_block = textblock['alg']
            GT_block = textblock['alg2']
            OCR_block_begin = textblock['begin']
            GT_id = textblock['id2'] # digital witness id

            # find the separators in the OCR_block
            separator = r'🍺'
            matches = re.finditer(separator, OCR_block)
            indexes = [match.start() for match in matches]

            # Split GT alignments into lines, according to separator index in OCR_block
            GT_lines = []
            word_break_char = '\u2011' # used to materialize word breaks in the GT
            
            if len(indexes) == 0:
                GT_lines.append(GT_block)
            else:
                for i, index in enumerate(indexes):
                    if i == 0:
                        # check if the GT_block splitting leads to a word break
                        if GT_block[index] not in (' ', '-'):
                            GT_lines.append(GT_block[:index] + word_break_char)
                        else:
                            GT_lines.append(GT_block[:index])
                        
                    else:
                        # check if the GT_block splitting leads to a word break
                        if GT_block[index] not in (' ', '-'):
                            GT_lines.append(GT_block[indexes[i-1]:index] + word_break_char)
                        else:                
                            GT_lines.append(GT_block[indexes[i-1]:index])                
                GT_lines.append(GT_block[indexes[-1]:])

                # cleaning of splitted GT lines
                # replace '-' (45) with '', but avoid empty lines
                GT_lines = [line.replace('-', '') if line.replace('-', '') else line for line in GT_lines]
                # replace '-' (8208) with '-' (45)
                GT_lines = [line.replace(chr(8208), '-') for line in GT_lines]
                # remove leading and trailing spaces, but avoid empty lines
                GT_lines = [line.strip() if line.strip() else line for line in GT_lines]

            # find the textblock in ocr_lines_dict, to retrieve lines from OCR
            for part in ocr_lines_dict: # ocr_lines_dict: ocr line infos, grouped by part and textblocks
                if part['filename'] == filename:
                    for block in part['ocr_blocks']:
                        if block['text_block_id'] == textblock_id:
                        
                            # find the OCR line corresponding to the start of the GT_block
                            for index, line in enumerate(block['ocr_lines']):
                                if OCR_block_begin >= line['start'] and OCR_block_begin <= line['end']:
                                    line['alg_GT'] = GT_lines.pop(0)
                                    line['GT_id'] = GT_id
                                    
                                    for i in range(index + 1, len(block['ocr_lines'])):
                                        if GT_lines:
                                            block['ocr_lines'][i]['alg_GT'] = GT_lines.pop(0)
                                            block['ocr_lines'][i]['GT_id'] = GT_id

                                        else:
                                            break

                        
    # Save a JSON file by GT_id
    directory = 'lines_dict_with_alg_GT'
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f'lines_dict_with_alg_{GT_id}.json')
    with open(file_path, 'w', encoding="utf-8") as json_file:
        json.dump(ocr_lines_dict, json_file, ensure_ascii=False, indent=4)



list of GTs found in alignments: ['Machzor_Yom_Kippur_Ashkenaz_clean_concatenated', '01MT_NoVoc', 'Siddur_Ashkenaz_clean_concatenated', 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated']
--- Processing of GT Machzor_Yom_Kippur_Ashkenaz_clean_concatenated ---
--- Processing of GT 01MT_NoVoc ---
--- Processing of GT Siddur_Ashkenaz_clean_concatenated ---
--- Processing of GT Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated ---


### Parsing XML files and updating text content with GT alignments

In [18]:
# Path to the directory containing the alto files from eScriptorium
path_xmls = 'xmls_from_eSc'

# Path to the output directory, where the new alto files (and zip) will be saved
path_alto = 'altos_for_eSc_linewise'

# Path to directory containing the dictionaries with the alignment of the GT for each OCR line
path_alg_dicts = 'lines_dict_with_alg_GT'

In [19]:
import zipfile
from bs4 import BeautifulSoup
import Levenshtein

def process_alignment(path_alg_dicts, path_alto, path_xmls, levenshtein_threshold=0.8):
    ''' Process the alignment of the GT on the OCR text lines and save the modified XML files in a ZIP archive
    The ZIP archive will be sent to eScriptorium.
    Each alignment candidate with a Levenshtein similarity ratio above the threshold will be considered as a validated alignment,
    and inserted in the XML file.

    Parameters:
    path_alg_dicts (str): Path to the directory containing the dictionaries with the alignment of the different GT, for each OCR line
    path_alto (str): Path to the output directory, where the new alto files (and zip) will be saved
    path_xmls (str): Path to the directory containing the alto files from eScriptorium
    '''
    alignment_register = []  # list of XML files containing an alignment, number of aligned lines and the id of the GT  in this file.

    # Load each JSON file in the lines_dict_with_alg_GT directory
    for file in os.listdir(path_alg_dicts):
        if file.endswith('.json'):
            with open(os.path.join(path_alg_dicts, file), 'r', encoding="utf-8") as json_file:
                lines_dict = json.load(json_file)
            # retrieve the id2 (GT name) from the filename
            id2 = re.sub(r'lines_dict_with_alg_(.*).json', r'\1', file)
            print(f"--- File {file} loaded with id2 = {id2}")

            # Create a folder to store XML files processed for this id2 value
            # These altos files will be compressed into a ZIP file and sent to eScriptorium
            output_folder = os.path.join(path_alto, id2)
            os.makedirs(output_folder, exist_ok=True)

            # List the xmls files containing GT alignments for the current GT
            xml_files_with_alg = [part['filename'] for part in lines_dict
                      for block in part['ocr_blocks']
                      for line in block['ocr_lines']
                      if line.get('alg_GT') and line['GT_id'] == id2]

            print(f"List of XML files containing alignments for {id2}: {xml_files_with_alg}")

            # Loop through all the raw XML files imported from eScriptorium
            for filename in os.listdir(path_xmls):
                if filename.endswith('.xml'):
                    if os.path.splitext(filename)[0] not in xml_files_with_alg:
                        # print(f"Skipping {filename} as it does not contain any alignment for {id2}")
                        continue
                    print(f"Processing {filename} as it contains alignments for {id2}")

                    line_count = 0 # count the number of lines with validated alignment
                    with open(os.path.join(path_xmls, filename), encoding="utf-8") as xml_file:
                        xml = xml_file.read()
                    soup = BeautifulSoup(xml, 'xml')

                    for text_line in soup.find_all('TextLine'):
                        text_line_id = text_line.get('ID')

                        for part in lines_dict:
                            for block in part['ocr_blocks']:
                                for line in block['ocr_lines']:
                                    if line.get('line_id') == text_line_id:
                                        alg_GT_value = line.get('alg_GT', None)  # get GT alignment if available
                                        if alg_GT_value is not None:
                                            string_elements = text_line.find_all('String')
                                            if string_elements:
                                                levenshtein_ratio = Levenshtein.ratio(alg_GT_value, string_elements[0]['CONTENT'])
                                                print(f"Levenshtein ratio: {levenshtein_ratio}")
                                                if levenshtein_ratio >= levenshtein_threshold:
                                                    string_elements[0]['CONTENT'] = alg_GT_value
                                                    line_count += 1

                                                else:
                                                    string_elements[0]['CONTENT'] = ''
                                        else:
                                            string_elements = text_line.find_all('String')
                                            if string_elements:
                                                string_elements[0]['CONTENT'] = ''

                    # Add alignment data to register
                    alignment_register.append({
                        'filename': filename,
                        'aligned_lines_count': line_count,
                        'GT_id': id2
                    })

                    # Write the modified XML file to the output folder corresponding to the id2 value (GT)
                    output_file_path = os.path.join(output_folder, filename)
                    with open(output_file_path, 'w', encoding="utf-8") as output_file:
                        output_file.write(str(soup))
                        print(f"{filename} processed and recorded in {output_folder}")

        # Create a specific name for the ZIP archive based on the id2 value
        zip_file_name = f"{id2}_alignment.zip"
        zip_file_path = os.path.join(path_alto, zip_file_name)

        # Create a zip file of the XML files in the output folder
        with zipfile.ZipFile(zip_file_path, 'w') as zipf:
            for root, _, files in os.walk(output_folder):
                for file in files:
                    zipf.write(os.path.join(root, file), arcname=file)

        print(f"XML files in {output_folder} compressed in {zip_file_path}")

    # Save alignment register in JSON format
    output_folder = 'alignment_register/'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    register_file_path = os.path.join(output_folder, 'alignment_register.json')
    with open(register_file_path, 'w', encoding="utf-8") as json_file:
        json.dump(alignment_register, json_file, ensure_ascii=False, indent=4)

    return alignment_register

In [None]:
# Run the process_alignment function in parallel

from multiprocessing import Pool

# Define a function to process alignment for a single set of arguments
def process_alignment_single(args):
    path_alg_dicts, path_alto, path_xmls = args
    return process_alignment(path_alg_dicts, path_alto, path_xmls)

# Define the arguments for each process
arguments = [(path_alg_dicts, path_alto, path_xmls) for _ in range(4)]

# Use multiprocessing.Pool to process alignments in parallel
with Pool(4) as p:
    alignment_register = p.map(process_alignment_single, arguments)


In [20]:
# Run the process_alignment function
process_alignment(path_alg_dicts, path_alto, path_xmls, levenshtein_threshold=0.8)


--- File lines_dict_with_alg_01MT_NoVoc.json loaded with id2 = 01MT_NoVoc
List of XML files containing alignments for 01MT_NoVoc: ['IE103402206_00010', 'IE103402206_00010', 'IE103402206_00010', 'IE103402206_00010', 'IE103402206_00010', 'IE103402206_00010', 'IE103402206_00010', 'IE103402206_00010', 'IE103402206_00010', 'IE103402206_00010', 'IE103402206_00010', 'IE103402206_00014', 'IE103402206_00014', 'IE103402206_00014', 'IE103402206_00014', 'IE103402206_00014', 'IE103402206_00014', 'IE103402206_00014', 'IE103402206_00014', 'IE103402206_00014', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE103409244_00025', 'IE10340

[{'filename': 'IE103402206_00010.xml',
  'aligned_lines_count': 9,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE103402206_00014.xml',
  'aligned_lines_count': 7,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE103409244_00025.xml',
  'aligned_lines_count': 27,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE34120895_00033.xml',
  'aligned_lines_count': 1,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE34120895_00046.xml',
  'aligned_lines_count': 0,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE34120895_00054.xml',
  'aligned_lines_count': 0,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE35481905_00011.xml',
  'aligned_lines_count': 0,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE36149273_00006.xml',
  'aligned_lines_count': 0,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE36149273_00009.xml',
  'aligned_lines_count': 0,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE36149273_00015.xml',
  'aligned_lines_count': 0,
  'GT_id': '01MT_NoVoc'},
 {'filename': 'IE61220167_00083.xml',
  'aligned_lines_count': 5,
  'GT_id':

### Importing altos in eScriptorium

In [21]:
# Importing the necessary functions and packages for the eScriptorium API
from functions import *
from packages import *

switching to  msIA


In [22]:
# eScriptorium document where the alignment will be imported
doc_pk = 4368

In [23]:
# ZIP and import each file in the folder
for zip_filename in os.listdir(path_alto):
    if zip_filename.endswith('.zip'):
        # Build the full path to the ZIP file
        zip_file_path = os.path.join(path_alto, zip_filename)
        
        # name of the alignment file
        name = zip_filename.split(".")[0]

        # Import in eScriptorium
        import_xml(doc_pk, path_alto, zip_filename, name)
        
        print(f"{zip_filename} has been successfully imported into eScriptorium.")

print(f"Link do the document in eScriptorium: https://msia.escriptorium.fr/document/{doc_pk}/images/")

200 b'{"status":"ok"}'
01MT_NoVoc_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Machzor_Yom_Kippur_Ashkenaz_clean_concatenated_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Siddur_Ashkenaz_clean_concatenated_alignment.zip has been successfully imported into eScriptorium.
Link do the document in eScriptorium: https://msia.escriptorium.fr/document/4368/images/


# Exploring the alignment register with pandas

In [24]:
import pandas as pd
df = pd.read_json('alignment_register/alignment_register.json')

In [25]:
df.head()

Unnamed: 0,filename,aligned_lines_count,GT_id
0,IE103402206_00010.xml,9,01MT_NoVoc
1,IE103402206_00014.xml,7,01MT_NoVoc
2,IE103409244_00025.xml,27,01MT_NoVoc
3,IE34120895_00033.xml,1,01MT_NoVoc
4,IE34120895_00046.xml,0,01MT_NoVoc


In [26]:
# display le lines where GT_ID is 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated'
df[df['GT_id'] == 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated']

# sort by number of aligned lines
df.sort_values(by='aligned_lines_count', ascending=False)

# display files with more than 5 lines aligned
df[df['aligned_lines_count'] > 5]

Unnamed: 0,filename,aligned_lines_count,GT_id
0,IE103402206_00010.xml,9,01MT_NoVoc
1,IE103402206_00014.xml,7,01MT_NoVoc
2,IE103409244_00025.xml,27,01MT_NoVoc
11,IE61220167_00097.xml,7,01MT_NoVoc
13,IE61220167_00084.xml,11,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
19,IE61220167_00084.xml,10,Machzor_Yom_Kippur_Ashkenaz_clean_concatenated
23,IE61220167_00084.xml,8,Siddur_Ashkenaz_clean_concatenated


In [27]:
# display GT_id values
df['GT_id'].unique()

array(['01MT_NoVoc', 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated',
       'Machzor_Yom_Kippur_Ashkenaz_clean_concatenated',
       'Siddur_Ashkenaz_clean_concatenated'], dtype=object)

In [28]:
all_parts = get_all_parts(doc_pk)
print(all_parts)

[{'pk': 734556, 'name': '', 'filename': 'IE103402206_00010.jpg', 'title': 'Element 1', 'typology': None, 'image': {'uri': '/media/documents/4368/IE103402206_00010.jpg', 'size': [1702, 2511], 'thumbnails': {'card': '/media/documents/4368/IE103402206_00010.jpg.180x180_q85_crop-smart.jpg', 'large': '/media/documents/4368/IE103402206_00010.jpg.1000x1000_q85.jpg'}}, 'image_file_size': 285936, 'original_filename': 'IE103402206_00010.jpg', 'bw_image': None, 'workflow': {'convert': 'done'}, 'order': 0, 'recoverable': False, 'transcription_progress': 0, 'source': 'mets//export_doc4367_test4matthieu_alto_202404241147.zip/IE103402206_00010.jpg', 'max_avg_confidence': None, 'comments': None}, {'pk': 734557, 'name': '', 'filename': 'IE103402206_00014.jpg', 'title': 'Element 2', 'typology': None, 'image': {'uri': '/media/documents/4368/IE103402206_00014.jpg', 'size': [1709, 2509], 'thumbnails': {'card': '/media/documents/4368/IE103402206_00014.jpg.180x180_q85_crop-smart.jpg', 'large': '/media/docume

In [37]:
part_pk = next((item['pk'] for item in all_parts if item['filename'] == 'IE87532920_00033.jpg'), None)
print(part_pk)
print(f"https://msia.escriptorium.fr/document/{doc_pk}/part/{part_pk}/edit")

734238
https://msia.escriptorium.fr/document/4366/part/734238/edit


In [21]:
# display files where GT_ID is 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated' and aligned_lines_count > 5
df[(df['GT_id'] == 'Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated') & (df['aligned_lines_count'] > 1)].sort_values(by='aligned_lines_count', ascending=False)


Unnamed: 0,filename,aligned_lines_count,GT_id
518,IE87744435_00039.xml,21,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
451,IE87502633_00044.xml,21,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
445,IE87474895_00044.xml,21,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
496,IE87708411_00021.xml,20,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
440,IE87447950_00015.xml,19,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
...,...,...,...
436,IE87379183_00005.xml,2,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
455,IE87521073_00008.xml,2,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
458,IE87532920_00033.xml,2,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
469,IE87582245_00010.xml,2,Machzor_Rosh_Hashanah_Ashkenaz_clean_concatenated
