# Import Passim results into eScriptorium
This notebook creates ALTO XML files from Passim alignment results, ready to be imported into eScriptorium.


### Importing Required Libraries

In [10]:
import os
import json
from bs4 import BeautifulSoup

### Loading JSON data into a list of dictionaries from a JSONLines file

In [11]:
# Path to the Passim output file (JSON Lines: it is read below one JSON object per line)
path_passim_output = 'json_from_passim/part-00000-fea080fa-623d-4af9-8684-2cc8257adde1-c000_1v3.json'

In [12]:
# Parse the JSON Lines file: every line is decoded as one JSON document
# and the decoded objects are collected, in file order, in the list `data`
with open(path_passim_output, encoding="utf-8") as json_file:
    data = list(map(json.loads, json_file))

### Parsing XML files and updating text content from JSON data, then saving the modified XML files to a different directory



In [13]:
# Path to the directory containing the ALTO XML files exported from eScriptorium
path_xmls = 'xmls_from_eSc/export_doc3585_1col_05_alto_202403260856'

# Path to the output directory, where the new ALTO files (and the ZIP archives) will be saved
path_alto = 'altos_for_eSc_2'


In [7]:
import os
import json
import zipfile
from bs4 import BeautifulSoup

# Passim output (JSON Lines) providing the aligned text to write into the XML files
path_passim_output = 'json_from_passim/part-00000-fea080fa-623d-4af9-8684-2cc8257adde1-c000_1v3.json'

# Folder holding the original XML files
path_xmls = 'xmls_from_eSc/export_doc3585_1col_05_alto_202403260856'

# Alignment records keyed by their id2 value: {id2: [record, record, ...]}
grouped_data = {}

# Decode the file one JSON object per line and file each record under its id2
# (records without an 'id2' key end up under the key None)
with open(path_passim_output, encoding="utf-8") as json_file:
    for line in json_file:
        item = json.loads(line)
        id2 = item.get('id2')
        grouped_data.setdefault(id2, []).append(item)

# Process each data group separately: every id2 value gets its own folder of
# rewritten XML files and its own ZIP archive inside path_alto.
for id2, data in grouped_data.items():
    # Create a folder to store XML files processed for this id2 value
    output_folder = os.path.join(path_alto, id2)
    os.makedirs(output_folder, exist_ok=True)

    # Index the records of this group by the TextLine ID they refer to, i.e. the
    # first three '_'-separated parts of their 'id'. Building the index once per
    # group avoids rescanning the whole record list for every TextLine.
    # setdefault keeps the first record seen for a given ID, so when several
    # records target the same line the earliest one in the file is used.
    alignment_by_line_id = {}
    for item in data:
        item_id = item.get('id')
        if item_id is None:
            # A record without an 'id' cannot be matched to any line: skip it
            continue
        item_id_prefix = '_'.join(item_id.split('_')[:3])
        alignment_by_line_id.setdefault(item_id_prefix, item)

    # Loop through all XML files in the folder
    for filename in os.listdir(path_xmls):
        if not filename.endswith('.xml'):
            continue

        # The file is parsed again for every group because the tree is modified below
        with open(os.path.join(path_xmls, filename), encoding="utf-8") as xml_file:
            xml = xml_file.read()
        soup = BeautifulSoup(xml, 'xml')

        for text_line in soup.find_all('TextLine'):
            text_line_id = text_line.get('ID')
            string_elements = text_line.find_all('String')
            if not string_elements:
                # Nothing to rewrite in a TextLine that has no String element
                continue

            item = alignment_by_line_id.get(text_line_id)
            if item is None:
                # No alignment for this line in this group: blank its text
                string_elements[0]['CONTENT'] = ''
            else:
                # Replace the text of the first String element with the aligned text (s2)
                s2_value = item.get('s2')
                print(f"Match found between {filename} - {text_line_id} / {id2}:\n{string_elements[0]['CONTENT']} / {s2_value}")
                string_elements[0]['CONTENT'] = s2_value

        # Write the modified XML file to the output folder corresponding to the id2 value
        output_file_path = os.path.join(output_folder, filename)
        with open(output_file_path, 'w', encoding="utf-8") as output_file:
            output_file.write(str(soup))
            print(f"{filename} processed and recorded in {output_folder}")

    # Create a specific name for the ZIP archive based on the id2 value
    zip_file_name = f"{id2}_alignment.zip"
    zip_file_path = os.path.join(path_alto, zip_file_name)

    # Create a zip file of the XML files in the output folder.
    # ZIP_DEFLATED is requested explicitly: the zipfile default (ZIP_STORED)
    # would store the files without compressing them.
    with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(output_folder):
            for file in files:
                # arcname=file stores each file at the root of the archive
                zipf.write(os.path.join(root, file), arcname=file)

    print(f"XML files in {output_folder} compressed in {zip_file_path}")

IE30411733_00064.xml processed and recorded in altos_for_eSc_2/01MT_NoVoc
Match found between IE35427954_00020.xml - eSc_line_65dbfccb / 01MT_NoVoc:
הנחמה החיה באור חראה . הן כל אלה יפעל אל פעמים שלשעם גבר כדי להשיב נפשו / וחיתי באור תראה : הן כל אלה יפעל אל פעמים שלוש עם גבר --: להשיב נפשו
Match found between IE35427954_00020.xml - eSc_line_465c49d1 / 01MT_NoVoc:
הנה אשרי אנוש יוכיתנו אלוה אם מוסר שדי אל ימאס כי הוא יכאיב ויחבש ג"כ בשובו / דל תקוה ועלתה קפצה פיה : הנה אשרי אנוש יוכ-חנו אלוה --ומוסר שדי אל תמאס : כי הוא יכאיב ויחבש
IE35427954_00020.xml processed and recorded in altos_for_eSc_2/01MT_NoVoc
Match found between IE21323667_00006.xml - eSc_line_611d813c / 01MT_NoVoc:
שהבטה' בין הבתרים עמדה ליעקב להצילו מעשו בעת הברכות כמו שאמ' יקרבו ימי אבל אבי ואהרגה את / י ידעתי את מכאביו : וארד ----להצילו מיד
Match found between IE21323667_00006.xml - eSc_line_578a3d11 / 01MT_NoVoc:
וכל אשר לו ויבא בארה שבע ויזבה זבחים לאלהי אביו יצחק ויאמ' אלהים לישראל וגו' אל תירא מרדה / מר אנכי- האל אל

### Importing the ALTO files into eScriptorium

In [14]:
# Importing the necessary functions and packages for the eScriptorium API
from functions import *
from packages import *

switching to  msIA


In [15]:
# Identifier of the eScriptorium document into which the alignment will be imported
# (it is passed to import_xml and used to build the document URL printed after the import)
doc_pk = 3585

In [16]:
# Import every ZIP archive found in the output folder into eScriptorium
# (the listing is sorted so that the import order is the same on every run)
for zip_filename in sorted(os.listdir(path_alto)):
    if not zip_filename.endswith('.zip'):
        continue

    # Name given to the import: the archive name without its ".zip" extension.
    # splitext strips only the final extension, so a name that contains
    # other dots is kept whole instead of being cut at the first dot.
    name = os.path.splitext(zip_filename)[0]

    # Import in eScriptorium (import_xml is expected to come from the star imports above)
    import_xml(doc_pk, path_alto, zip_filename, name)

    # NOTE(review): the result of import_xml is not checked here, so this
    # message is printed whether or not the server accepted the archive.
    print(f"{zip_filename} has been successfully imported into eScriptorium.")

print(f"Link do the document in eScriptorium: https://msia.escriptorium.fr/document/{doc_pk}/images/")

200 b'{"status":"ok"}'
01MT_NoVoc_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Pesach_Haggadah_alignment.zip has been successfully imported into eScriptorium.
200 b'{"status":"ok"}'
Siddur_Ashkenaz_alignment.zip has been successfully imported into eScriptorium.
Link do the document in eScriptorium: https://msia.escriptorium.fr/document/3585/images/
