# Preparing data for passim

These scripts are used to prepare the data for the passim alignment process.
  
The input data are the ALTO XML files from eScriptorium, which contain the OCR text,
and the digital editions from Sefaria (cleaned and concatenated with this pipeline:
https://github.com/Freymat/from_Sefaria_to_Passim).

The output file, which will be processed with passim, is a JSON Lines (jsonl) file. Each line is a dictionary containing
either:
- the content of an OCR text block. A text block is built by concatenating the text of the OCR lines of one region of a part.
  The content of the OCR lines is retrieved from the ALTO XML files exported from eScriptorium.
- the text of a digital edition (Ground Truth), concatenated in one line. These texts are retrieved from Sefaria, cleaned and concatenated.

Passim will then align the OCR textblocks with the Ground Truth texts, and will output a jsonl file containing the alignment.

### Importing Required Libraries

In [1]:
import json # To work with JSON data
import jsonlines # To write data in JSON Lines format
import os # To interact with the operating system
import glob # To search for files in a directory
from xml.etree import ElementTree # To parse XML files
import subprocess # To run Passim
from pprint import pprint

# Import functions for eScriptorium's API
from functions import *

switching to  msIA


### Initializing

In [3]:
# Directory holding the ground-truth (GT) digital editions; it is walked recursively for .txt files.
GT_texts_path = "digital_editions"

# Directory holding the ALTO XML files exported from eScriptorium (OCR results).
xmls_directory_path = "xmls_from_eSc"

# JSON file that will store the text blocks and lines extracted from the OCR XML files.
ocr_lines_dict_path = "ocr_lines_dict/ocr_lines_dict.json"

# JSON Lines file written by this notebook and used as input for Passim.
input_passim_path = "json_for_passim/passim_input.json"




In [4]:
# Initialize the list where output datas for Passim will be stored
def initialize_passim_input():
    """Reset the module-level ``passim_input`` list and return it.

    The list collects the records (OCR text blocks and ground-truth texts)
    that are later written to the Passim input file. Each call rebinds the
    global name to a brand-new empty list.
    """
    global passim_input
    passim_input = list()
    return passim_input

In [5]:
# initialize_passim_input()

### Build the content of OCR textblocks and prepare them for Passim

#### Loop through XML files from eSC, and extract TextLine elements text and ID

In [6]:
import glob
import os
from xml.etree import ElementTree
from pprint import pprint

def extract_ocr_textblocks(xmls_directory_path, ocr_lines_dict_path):
    """Extract the OCR text blocks from eScriptorium ALTO files and save them as JSON.

    Every ``*.xml`` file found directly in ``xmls_directory_path`` is parsed. For each
    ``TextBlock`` element, the text of its ``TextLine`` elements (the ``CONTENT``
    attribute of the line's first ``String`` child, stripped) is concatenated, with a
    newline between lines.

    The result written to ``ocr_lines_dict_path`` is a list of dictionaries, one per
    ALTO file, each containing:
    - "filename": the file name without its extension,
    - "ocr_lines_in_part": the total number of OCR lines in the file,
    - "ocr_blocks": one dictionary per TextBlock with
        - "ocr_block_text": the concatenated text of the block,
        - "text_block_id": the ID of the TextBlock element,
        - "ocr_lines_in_block": the number of lines in the block,
        - "ocr_lines": for each line, its ID, text, length, and its start / end
          (inclusive) position in the concatenated text,
        - "series": always 'OCR'.

    Parameters:
    - xmls_directory_path: directory containing the ALTO XML files.
    - ocr_lines_dict_path: path of the JSON file to write; its parent directory is
      created when it does not exist.

    Returns None.
    """
    alto_ns = "{http://www.loc.gov/standards/alto/ns-v4#}"
    separator = "\n"

    parts = []

    # Sorted so that the output does not depend on the order in which glob lists the files.
    for filename in sorted(glob.glob(os.path.join(xmls_directory_path, "*.xml"))):
        basename = os.path.splitext(os.path.basename(filename))[0]
        root = ElementTree.parse(filename).getroot()

        blocks = []
        for text_block in root.iter(alto_ns + "TextBlock"):
            lines = []
            char_count = 0  # Start position of the current line in the concatenated text

            for text_line in text_block.iter(alto_ns + "TextLine"):
                # A line without a String child or without CONTENT is treated as empty
                # instead of raising AttributeError.
                string_element = text_line.find(alto_ns + "String")
                content = string_element.get("CONTENT") if string_element is not None else None
                text = (content or "").strip()

                lines.append({
                    "line_id": text_line.get("ID"),
                    "start": char_count,
                    "end": char_count + len(text) - 1,  # inclusive; start - 1 for an empty line
                    "length": len(text),
                    "text": text,
                })
                # The next line starts after this line's text and the separator.
                char_count += len(text) + len(separator)

            # NOTE(review): "start"/"end" are computed on the unstripped concatenation.
            # If a block begins with empty lines, strip() removes the leading newlines
            # and the offsets no longer match "ocr_block_text".
            blocks.append({
                "ocr_block_text": separator.join(line["text"] for line in lines).strip(),
                "text_block_id": text_block.get("ID"),
                "ocr_lines_in_block": len(lines),
                "ocr_lines": lines,
                "series": 'OCR',  # Distinguishes OCR records from ground-truth records
            })

        parts.append({
            "filename": basename,
            # Total number of OCR lines in this file, over all its text blocks
            "ocr_lines_in_part": sum(block["ocr_lines_in_block"] for block in blocks),
            "ocr_blocks": blocks,
        })

    # Create the directory that will actually receive the file (not a hard-coded one).
    output_directory = os.path.dirname(ocr_lines_dict_path)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    with open(ocr_lines_dict_path, "w", encoding="utf-8") as file_handler:
        json.dump(parts, file_handler, ensure_ascii=False, indent=4)


In [7]:
# extract_ocr_textblocks(xmls_directory_path, ocr_lines_dict_path)

#### Build the input data for passim, from the OCR line dictionary

In [8]:
def add_OCR_textblocks_to_passim_input(ocr_lines_dict_path):
    """
    Append one Passim record per OCR text block to the global ``passim_input`` list.

    The JSON file at ``ocr_lines_dict_path`` is loaded; each block of each part
    becomes a record whose id is "<text_block_id>_<filename>", with series 'OCR'
    and ref '0'. The ground-truth texts are not added here.

    Returns the global ``passim_input`` list.
    """
    with open(ocr_lines_dict_path, "r", encoding="utf-8") as json_file:
        extracted_parts = json.load(json_file)

    for extracted_part in extracted_parts:
        for ocr_block in extracted_part["ocr_blocks"]:
            record = {
                "id": ocr_block["text_block_id"] + '_' + extracted_part["filename"],
                "series": 'OCR',
                "ref": '0',
                "text": ocr_block["ocr_block_text"],
            }
            passim_input.append(record)

    return passim_input


In [9]:
# add_OCR_textblocks_to_passim_input(ocr_lines_dict_path)

### Build the GT text data for Passim
Add every txt file in the GT directory to the output data

In [10]:
def add_GT_texts_to_passim_input(GT_texts_path):
    """
    Append every ground-truth (digital witness) text to the global ``passim_input`` list.

    ``GT_texts_path`` is walked recursively; each file whose name ends with ".txt"
    is read as UTF-8 and added as a record whose id is the file name, with
    series 'GT' and ref '1'.

    Returns the global ``passim_input`` list.
    """
    for directory, _subdirectories, names in os.walk(GT_texts_path):
        for name in names:
            if not name.endswith(".txt"):
                continue
            with open(os.path.join(directory, name), "r", encoding="utf-8") as text_handle:
                content = text_handle.read()
            passim_input.append({"id": name, "series": 'GT', "ref": '1', "text": content})
    return passim_input

In [11]:
# print(passim_input)

In [12]:
# add_GT_texts_to_passim_input(GT_texts_path)

### Writing Data to JSONLines File in Compact Format without ASCII Encoding, for Passim

In [13]:
def write_passim_input_to_json(input_passim_path, passim_input):
    """
    Write the Passim records to a JSON Lines file.

    Each item of ``passim_input`` is serialized as one JSON object per line.
    Non-ASCII characters are written as-is (not escaped), in UTF-8.

    Parameters:
    - input_passim_path: path of the file to write; its parent directory is
      created when it does not exist.
    - passim_input: iterable of JSON-serializable records.

    Returns None. Prints the path of the created file.
    """
    # Make sure the destination directory exists, otherwise open() fails on a fresh checkout.
    output_directory = os.path.dirname(input_passim_path)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    with open(input_passim_path, "w", encoding="utf-8") as file_handler:
        for item in passim_input:
            # ensure_ascii=False keeps the original characters readable in the file.
            file_handler.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"input file for passim created: {input_passim_path}")

In [14]:
# write_passim_input_to_json(input_passim_path, passim_input)

### Build the input for Passim - global function

In [15]:
def build_passim_input(xmls_directory_path, ocr_lines_dict_path, GT_texts_path, input_passim_path):
    """
    Run the whole preparation pipeline and write the input file for Passim.

    Parameters:
    - xmls_directory_path: directory containing the ALTO XML files imported from eScriptorium (OCR results).
    - ocr_lines_dict_path: path of the JSON file that will contain the text blocks extracted from the OCR.
    - GT_texts_path: directory containing the ground truth texts.
    - input_passim_path: path of the output file, used as input for Passim.
    """
    # 1. Start from an empty global record list.
    initialize_passim_input()
    # 2. Dump the OCR text blocks of the ALTO files to the intermediate JSON file.
    extract_ocr_textblocks(xmls_directory_path, ocr_lines_dict_path)
    # 3. Turn those text blocks into Passim records.
    add_OCR_textblocks_to_passim_input(ocr_lines_dict_path)
    # 4. Add the ground-truth records; the returned list is the global passim_input.
    records = add_GT_texts_to_passim_input(GT_texts_path)
    # 5. Write all the records to the Passim input file.
    write_passim_input_to_json(input_passim_path, records)

In [16]:
# Run the full pipeline with the paths defined in the "Initializing" section.
build_passim_input(xmls_directory_path, ocr_lines_dict_path, GT_texts_path, input_passim_path)

input file for passim created: json_for_passim/passim_input.json


# Composing the command to run Passim

In [2]:
# --- Interactive session on HTC ---------------------------------------------
# example: % srun -t 0-08:00 -n 4 --mem 2G --pty bash -i
t = "12:00:00"    # session duration - hours:minutes:seconds
n_cores = 60      # number of cpu cores
mem = 128         # memory per node, in GB
driver_mem = 40   # memory for the driver, in GB

command_srun = " ".join(
    ["srun", "-t", t, "-n", str(n_cores), "--mem", f"{mem}G", "--pty", "bash", "-i"]
)

# --- Passim command line ----------------------------------------------------
n = 15  # n-gram order (default: 25)
# Other options, not set here:
#   m = 5   minimum number of n-gram matches between documents (default: 5)
#   a = 7   minimum length of alignment (default: 50)
#   g = 25  minimum size of gap that separates passages (default: 600)
align_mode = 'docwise'  # alignment mode

input_file = "passim_input.json"  # input file for Passim
output_folder = "out_n{}_{}".format(n, align_mode)

# The command is assembled piece by piece; adjacent literals are concatenated.
command_passim = (
    f"SPARK_SUBMIT_ARGS='--master local[{n_cores}] --executor-memory {mem}G --driver-memory {driver_mem}G' "
    f"seriatim --{align_mode} --floating-ngrams --fields ref "
    "--filterpairs 'ref = 1 AND ref2 = 0' --all-pairs --complete-lines "
    f"-n {n} {input_file} {output_folder}"
)

# Display the commands to copy into the terminal.
print("Session interactive HTC:", command_srun, sep="\n")
print("Lauch Singularity container")
print("singularity shell --bind /sps:/sps --bind /pbs:/pbs --nv /sps/humanum/eScriptorium/share/acdc.sif")
print("Passim command line:", command_passim, sep="\n")

Session interactive HTC:
srun -t 12:00:00 -n 60 --mem 128G --pty bash -i
Lauch Singularity container
singularity shell --bind /sps:/sps --bind /pbs:/pbs --nv /sps/humanum/eScriptorium/share/acdc.sif
Passim command line:
SPARK_SUBMIT_ARGS='--master local[60] --executor-memory 128G --driver-memory 40G' seriatim --docwise --floating-ngrams --fields ref --filterpairs 'ref = 1 AND ref2 = 0' --all-pairs --complete-lines -n 15 passim_input.json out_n15_docwise


In [14]:
# Executing passim locally.
# shell=True is used because command_passim is a single string that relies on shell
# syntax: it starts with an environment-variable assignment (SPARK_SUBMIT_ARGS='...')
# and contains quoted arguments.
subprocess.run(command_passim, shell=True)

https://repos.spark-packages.org/ added as a remote repository with the name: repo-1
Ivy Default Cache set to: /home/matthieu/.ivy2/cache
The jars for the packages stored in: /home/matthieu/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e70355d0-3756-4e46-947a-16d0cd92321e;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/matthieu/anaconda3/envs/acdc/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found graphframes#graphframes;0.8.0-spark3.0-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 274ms :: artifacts dl 10ms
	:: modules in use:
	graphframes#graphframes;0.8.0-spark3.0-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-e70355d0-3756-4e46-947a-16d0cd92321e
	confs: [default]
	0 artifacts copied, 2 already retrieved (0kB/10ms)


Namespace(id='id', text='text', locs='locs', pages='pages', minDF=2, maxDF=100, min_match=5, n=7, floating_ngrams=True, complete_lines=True, gap=600, max_offset=20, beam=20, pcopy=0.8, min_align=50, src_overlap=0.9, dst_overlap=0.5, fields=['ref'], filterpairs='ref = 1 AND ref2 = 0', all_pairs=True, pairwise=False, docwise=True, linewise=False, to_pairs=False, to_extents=False, link_model=None, link_features=None, log_level='WARN', input_format='json', output_format='json', inputPath='json_for_passim/passim_input.json', outputPath='json_from_passim/out_n7_docwise_complete-lines')
15:02:27.011 [Thread-5] WARN  org.apache.spark.sql.catalyst.util.SparkStringUtils - Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


CompletedProcess(args="SPARK_SUBMIT_ARGS='--master local[4] --executor-memory 6G --driver-memory 4G' seriatim --docwise --floating-ngrams --fields ref --filterpairs 'ref = 1 AND ref2 = 0' --all-pairs --complete-lines -n 7 json_for_passim/passim_input.json json_from_passim/out_n7_docwise_complete-lines", returncode=0)