In [22]:
import os
import shutil
import xml.etree.ElementTree as ET
import openpyxl

# Function to replace commas with spaces in all coordinate attributes
def modify_coordinates_in_xml(xml_file, output_folder, jpg_filename):
    # Define the namespace
    namespace = {'ns': 'http://www.loc.gov/standards/alto/ns-v4#'}

    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Add the missing xmlns attribute to the root element (if not already present)
    if 'xmlns' not in root.attrib:
        root.set('xmlns', 'http://www.loc.gov/standards/alto/ns-v4#')

    # Iterate through all elements in the XML
    for elem in root.iter():
        # Iterate through all attributes of each element
        for key, value in elem.attrib.items():
            # Check if the attribute value contains coordinates (e.g., points or coords or baseline)
            if 'POINTS' in key or 'COORDS' in key or 'BASELINE' in key:
                # Replace commas with spaces in the coordinates
                new_value = value.replace(',', ' ')
                # Update the attribute with the modified value
                elem.attrib[key] = new_value

            # Remove MASK if empty
            if key == 'MASK' and not value:
                del elem.attrib[key]

            # Ensure proper 'BASELINE' data (add placeholder if missing)
#             if key == 'BASELINE' and not value:
#                 # Add a default baseline if missing (This is a placeholder, adjust as needed)
#                 elem.attrib[key] = "0 0 0 0 0 0 0 0"  # Example placeholder, you can adjust this

        # Update 'fileName' attribute without the namespace prefix
        if elem.tag == '{http://www.loc.gov/standards/alto/ns-v4#}fileName':  # Adjust based on actual tag with namespace
            elem.tag = 'fileName'  # Remove the namespace prefix by changing the tag name
            elem.text = jpg_filename  # Set the filename without the path

    # Write the modified XML to the new folder
    modified_xml_file = os.path.join(output_folder, os.path.basename(xml_file))
    tree.write(modified_xml_file)

    return modified_xml_file

# Function to copy files and create metadata
def process_folder_structure(root_folder, destination_folder, metadata_filename):
    # Create a new Excel file for storing metadata
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.append(["File Name", "Path"])  # Header for the metadata

    # Traverse the folder structure
    for subdir, dirs, files in os.walk(root_folder):
        # Look for JPG files and Alto XML files in the appropriate subfolders
        if subdir.endswith("alto"):  # If we're in the 'alto' folder
            # Find all JPGs in the parent folder (above 'alto')
            parent_folder = os.path.dirname(subdir)
            for file in os.listdir(parent_folder):
                if file.lower().endswith(('.jpg', '.jpeg', '.tif', '.tiff')):  # Handle jpg, jpeg, tif, and tiff files
                    # Copy the JPG file to the destination folder
                    image_file_path = os.path.join(parent_folder, file)
                    destination_image = os.path.join(destination_folder, file)
                    shutil.copy(image_file_path, destination_image)

                    # Log metadata for the image
                    sheet.append([file, image_file_path])

            # Now look for the XML files in the 'alto' folder
            for file in files:
                if file.lower().endswith(".xml"):
                    # Process the XML file by fixing its coordinates and updating its filename
                    xml_file_path = os.path.join(subdir, file)
                    image_filename = os.path.splitext(file)[0] + (".tif" if file.lower().endswith(".tif") else ".jpg")  # Update extension accordingly
                    modified_xml_path = modify_coordinates_in_xml(xml_file_path, destination_folder, image_filename)

                    # Log metadata for the XML
                    sheet.append([file, xml_file_path])

    # Save the metadata Excel file
    metadata_file_path = os.path.join(destination_folder, metadata_filename)
    wb.save(metadata_file_path)

    print(f"Processing complete. Metadata saved in: {metadata_file_path}")


# Example usage
root_folder = 'HebHTR_Transkribus'  # Replace with the path to your root folder
destination_folder = 'HTR_pages_Transkribus_NEW'  # Replace with the path to your destination folder
metadata_filename = 'page_metadata.xlsx'  # Name for the metadata file

# Make sure the destination folder exists
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Call the function to process the folder structure
process_folder_structure(root_folder, destination_folder, metadata_filename)


Processing complete. Metadata saved in: HTR_pages_Transkribus\page_metadata.xlsx


In [2]:
# adapting coordinates (no commas and flipping LTR)
import os
import xml.etree.ElementTree as ET
import shutil
import openpyxl

def flip_ltr_baseline_if_needed(baseline_str):
    coords = baseline_str.replace(',', ' ').split()
    if len(coords) % 2 != 0:
        return baseline_str  # Invalid, skip flipping

    points = [(float(coords[i]), float(coords[i+1])) for i in range(0, len(coords), 2)]
    x_values = [x for x, y in points]

    # Detect if LTR: Xs mostly decreasing
    if all(x_values[i] > x_values[i + 1] for i in range(len(x_values) - 1)):
        flipped = list(reversed(points))
        return ' '.join(f"{x:.2f} {y:.2f}" for x, y in flipped)
    else:
        return baseline_str.replace(',', ' ')  # Just replace commas with spaces if not LTR


def modify_coordinates_in_xml(xml_file, output_folder, jpg_filename):
    namespace = {'ns': 'http://www.loc.gov/standards/alto/ns-v4#'}
    tree = ET.parse(xml_file)
    root = tree.getroot()

    if 'xmlns' not in root.attrib:
        root.set('xmlns', 'http://www.loc.gov/standards/alto/ns-v4#')

    for elem in root.iter():
        for key, value in elem.attrib.items():
            if 'BASELINE' in key:
                new_value = flip_ltr_baseline_if_needed(value)
                elem.attrib[key] = new_value

            elif 'POINTS' in key or 'COORDS' in key:
                elem.attrib[key] = value.replace(',', ' ')

            if key == 'MASK' and not value:
                del elem.attrib[key]

        # Fix <fileName> element tag and value
        if elem.tag == '{http://www.loc.gov/standards/alto/ns-v4#}fileName':
            elem.tag = 'fileName'
            elem.text = jpg_filename

    modified_xml_file = os.path.join(output_folder, os.path.basename(xml_file))
    tree.write(modified_xml_file)
    return modified_xml_file

# Function to copy files and create metadata
def process_folder_structure(root_folder, destination_folder, metadata_filename):
    # Create a new Excel file for storing metadata
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.append(["File Name", "Path"])  # Header for the metadata

    # Traverse the folder structure
    for subdir, dirs, files in os.walk(root_folder):
        # Look for JPG files and Alto XML files in the appropriate subfolders
        if subdir.endswith("alto"):  # If we're in the 'alto' folder
            # Find all JPGs in the parent folder (above 'alto')
            parent_folder = os.path.dirname(subdir)
            for file in os.listdir(parent_folder):
                if file.lower().endswith(('.jpg', '.jpeg', '.tif', '.tiff')):  # Handle jpg, jpeg, tif, and tiff files
                    # Copy the JPG file to the destination folder
                    image_file_path = os.path.join(parent_folder, file)
                    destination_image = os.path.join(destination_folder, file)
                    shutil.copy(image_file_path, destination_image)

                    # Log metadata for the image
                    sheet.append([file, image_file_path])

            # Now look for the XML files in the 'alto' folder
            for file in files:
                if file.lower().endswith(".xml"):
                    # Process the XML file by fixing its coordinates and updating its filename
                    xml_file_path = os.path.join(subdir, file)
                    image_filename = os.path.splitext(file)[0] + (".tif" if file.lower().endswith(".tif") else ".jpg")  # Update extension accordingly
                    modified_xml_path = modify_coordinates_in_xml(xml_file_path, destination_folder, image_filename)

                    # Log metadata for the XML
                    sheet.append([file, xml_file_path])

    # Save the metadata Excel file
    metadata_file_path = os.path.join(destination_folder, metadata_filename)
    wb.save(metadata_file_path)

    print(f"Processing complete. Metadata saved in: {metadata_file_path}")


# Example usage
root_folder = 'HebHTR_Transkribus'  # Replace with the path to your root folder
destination_folder = 'HTR_pages_Transkribus_NEW - Copy'  # Replace with the path to your destination folder
metadata_filename = 'page_metadata.xlsx'  # Name for the metadata file

# Make sure the destination folder exists
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Call the function to process the folder structure
process_folder_structure(root_folder, destination_folder, metadata_filename)

Processing complete. Metadata saved in: HTR_pages_Transkribus_NEW - Copy\page_metadata.xlsx


In [4]:
import os
import xml.etree.ElementTree as ET

# Function to modify XML to point to the corresponding image file
def update_xml_filename(xml_file, image_files, output_folder):
    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    # Add the missing xmlns attribute to the root element (if not already present)
    if 'xmlns' not in root.attrib:
        root.set('xmlns', 'http://www.loc.gov/standards/alto/ns-v4#')
        
    # Get the base filename without extension
    base_filename = os.path.splitext(os.path.basename(xml_file))[0]

    # Find the corresponding image file by matching base filenames
    corresponding_image = None
    for image in image_files:
        if os.path.splitext(image)[0] == base_filename:
            corresponding_image = image
            break

            
    
    # If a corresponding image is found, update the XML
    if corresponding_image:
        # Look for the fileName tag (or its equivalent) in the XML
        for elem in root.iter():
            if 'fileName' in elem.tag:  # Assuming 'fileName' is the tag you want to modify
                elem.text = corresponding_image  # Set the exact filename (including extension)

        # Write the modified XML to the output folder
        modified_xml_file = os.path.join(output_folder, os.path.basename(xml_file))
        tree.write(modified_xml_file)
        print(f"Updated XML: {modified_xml_file}")
    else:
        print(f"Warning: No corresponding image found for {xml_file}")

# Function to process the folder and update all XML files
def process_folder(folder_path, output_folder):
    # Get a list of all XML and image files in the folder
    xml_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.xml')]
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.jpg', '.jpeg', '.tif', '.tiff'))]

#     # Ensure there are exactly 330 XML and 330 image files
#     if len(xml_files) != 330 or len(image_files) != 330:
#         print("Error: The folder should contain exactly 330 XML files and 330 image files.")
#         return

    # Process each XML file
    for xml_file in xml_files:
        xml_file_path = os.path.join(folder_path, xml_file)
        update_xml_filename(xml_file_path, image_files, output_folder)

# Example usage
folder_path = 'HTR_pages_Transkribus_NEW - Copy'  # Replace with your folder path
output_folder = 'HTR_pages_Transkribus_NEW - Copy'  # Replace with your output folder path

# Make sure the output folder exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Process the folder and update the XML files
process_folder(folder_path, output_folder)


Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_11-3_312_5_0002-Mattathias-0003.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_11-3_312_5_0003-Mattathias-0001.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_26.4.48 - P1.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_CamScanner 11-05-2020 13.22_page-0023.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_CamScanner 11-05-2020 13.22_page-0034.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_CamScanner 11-05-2020 13.22_page-0039.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_CamScanner 11-05-2020 13.22_page-0043.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_CamScanner 11-05-2020 13.22_page-0050.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_CamScanner 11-05-2020 13.22_page-0052.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_CamScanner 11-05-2020 13.22_page-0055.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0001_CamScanner 11-05-2020 13.22_page-0057.xml
Updated XML: H

Updated XML: HTR_pages_Transkribus_NEW - Copy\0009_11-3_312_5_0002-Mattathias-0055.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0009_11-3_312_5_0003-Mattathias-0014.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0009_ea.en.letters.01.01.0008.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0009_ea.en.letters.01.01.03.0009.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0009_IMG_20190731_123729.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0009_IMG_20190801_164646.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0009_אחי145-3_Page_09.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0010_11-3_312_5_0002-Mattathias-0057.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0010_11-3_312_5_0003-Mattathias-0015.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0010_ea.en.letters.01.01.0009.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0010_ea.en.letters.01.01.03.0011.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0010_IMG_20190731_123743.xml
Updated XML: HTR_pages_Transk

Updated XML: HTR_pages_Transkribus_NEW - Copy\0037_11-3_312_5_0002-Mattathias-0196.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0037_IMG_20190731_124135.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0037_IMG_20190801_165030.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0038_11-3_312_5_0002-Mattathias-0198.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0038_IMG_20190731_124147.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0038_IMG_20190801_165034.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0039_11-3_312_5_0002-Mattathias-0200.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0039_IMG_20190731_124153.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0039_IMG_20190801_165042.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0040_11-3_312_5_0002-Mattathias-0201.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0040_IMG_20190731_124202.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0040_IMG_20190801_165047.xml
Updated XML: HTR_pages_Transkribus_NEW - Copy\0041_1

In [16]:
# update flipped coordinates in xmls in a single folder
import os
import xml.etree.ElementTree as ET

# Function to flip coordinates if they are RTL
def flip_coordinates_if_needed(baseline):
    # Split the baseline into pairs of (X, Y)
    coordinates = baseline.split()
    
    # Extract the X-coordinates
    x_coords = coordinates[::2]  # X-coordinates are at even indices
    
    # Check if the X-coordinates are decreasing (RTL)
    if all(x_coords[i] > x_coords[i+1] for i in range(len(x_coords)-1)):
        flipped_coordinates = []
        
        # Reverse the X-coordinates while keeping the Y-coordinates in the same order
        for i in range(0, len(coordinates), 2):
            flipped_coordinates.insert(0, coordinates[i])  # X-coordinate
            flipped_coordinates.insert(0, coordinates[i+1])  # Y-coordinate
        
        # Return the flipped coordinates
        return ' '.join(flipped_coordinates)
    
    # Handle cases where X-coordinates might be too close (e.g., vertical lines)
    # We only flip if there's a significant difference between X-coordinates
    x_differences = [abs(int(x_coords[i]) - int(x_coords[i+1])) for i in range(len(x_coords)-1)]
    
    # If X-coordinates are too close (indicating vertical), we won't flip
    if all(diff < 10 for diff in x_differences):  # Threshold of 10, adjust as needed
        return baseline
    
    # Otherwise, return as is (LTR)
    return baseline

# Function to modify XML to point to the corresponding image file and flip coordinates
def update_xml_filename(xml_file, image_files, output_folder):
    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    # Add the missing xmlns attribute to the root element (if not already present)
    if 'xmlns' not in root.attrib:
        root.set('xmlns', 'http://www.loc.gov/standards/alto/ns-v4#')
        
    # Get the base filename without extension
    base_filename = os.path.splitext(os.path.basename(xml_file))[0]

    # Find the corresponding image file by matching base filenames
    corresponding_image = None
    for image in image_files:
        if os.path.splitext(image)[0] == base_filename:
            corresponding_image = image
            break
    
    # If a corresponding image is found, update the XML
    if corresponding_image:
        # Look for the fileName tag (or its equivalent) in the XML
        for elem in root.iter():
            if 'fileName' in elem.tag:  # Assuming 'fileName' is the tag you want to modify
                elem.text = corresponding_image  # Set the exact filename (including extension)
        
        # Check and flip the coordinates if needed
        for elem in root.iter():
            if 'BASELINE' in elem.attrib:  # Look for the BASELINE attribute
                flipped_value = flip_coordinates_if_needed(elem.attrib['BASELINE'])  # Flip if needed
                elem.attrib['BASELINE'] = flipped_value  # Set the flipped coordinates
        
        # Write the modified XML to the output folder
        modified_xml_file = os.path.join(output_folder, os.path.basename(xml_file))
        tree.write(modified_xml_file)
        print(f"Updated XML: {modified_xml_file}")
    else:
        print(f"Warning: No corresponding image found for {xml_file}")

# Function to process the folder and update all XML files
def process_folder(folder_path, output_folder):
    # Get a list of all XML and image files in the folder
    xml_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.xml')]
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.jpg', '.jpeg', '.tif', '.tiff'))]

    # Process each XML file
    for xml_file in xml_files:
        xml_file_path = os.path.join(folder_path, xml_file)
        update_xml_filename(xml_file_path, image_files, output_folder)

# Example usage
folder_path = 'HTR_pages_Transkribus'  # Replace with your folder path
output_folder = 'HTR_pages_Transkribus'  # Replace with your output folder path

# Make sure the output folder exists
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Process the folder and update the XML files
process_folder(folder_path, output_folder)


Updated XML: HTR_pages_Transkribus\0001_11-3_312_5_0002-Mattathias-0003.xml
Updated XML: HTR_pages_Transkribus\0001_11-3_312_5_0003-Mattathias-0001.xml
Updated XML: HTR_pages_Transkribus\0001_26.4.48 - P1.xml
Updated XML: HTR_pages_Transkribus\0001_CamScanner 11-05-2020 13.22_page-0023.xml
Updated XML: HTR_pages_Transkribus\0001_CamScanner 11-05-2020 13.22_page-0034.xml
Updated XML: HTR_pages_Transkribus\0001_CamScanner 11-05-2020 13.22_page-0039.xml
Updated XML: HTR_pages_Transkribus\0001_CamScanner 11-05-2020 13.22_page-0043.xml
Updated XML: HTR_pages_Transkribus\0001_CamScanner 11-05-2020 13.22_page-0050.xml
Updated XML: HTR_pages_Transkribus\0001_CamScanner 11-05-2020 13.22_page-0052.xml
Updated XML: HTR_pages_Transkribus\0001_CamScanner 11-05-2020 13.22_page-0055.xml
Updated XML: HTR_pages_Transkribus\0001_CamScanner 11-05-2020 13.22_page-0057.xml
Updated XML: HTR_pages_Transkribus\0001_ea.en.letters.01.01.01.0010.xml
Updated XML: HTR_pages_Transkribus\0001_ea.en.letters.01.01.02.

Updated XML: HTR_pages_Transkribus\0009_11-3_312_5_0002-Mattathias-0055.xml
Updated XML: HTR_pages_Transkribus\0009_11-3_312_5_0003-Mattathias-0014.xml
Updated XML: HTR_pages_Transkribus\0009_ea.en.letters.01.01.0008.xml
Updated XML: HTR_pages_Transkribus\0009_ea.en.letters.01.01.03.0009.xml
Updated XML: HTR_pages_Transkribus\0009_ea.en.letters.01.01.03.0012.xml
Updated XML: HTR_pages_Transkribus\0009_IMG_20190731_123729.xml
Updated XML: HTR_pages_Transkribus\0009_IMG_20190801_164646.xml
Updated XML: HTR_pages_Transkribus\0009_אחי145-3_Page_09.xml
Updated XML: HTR_pages_Transkribus\0010_11-3_312_5_0002-Mattathias-0057.xml
Updated XML: HTR_pages_Transkribus\0010_11-3_312_5_0003-Mattathias-0015.xml
Updated XML: HTR_pages_Transkribus\0010_11.xml
Updated XML: HTR_pages_Transkribus\0010_ea.en.letters.01.01.0009.xml
Updated XML: HTR_pages_Transkribus\0010_ea.en.letters.01.01.03.0011.xml
Updated XML: HTR_pages_Transkribus\0010_IMG_20190731_123743.xml
Updated XML: HTR_pages_Transkribus\0010_IM

ValueError: invalid literal for int() with base 10: '301.99999999999966'

In [None]:
######## OLD stuff

In [3]:
import os
import xml.etree.ElementTree as ET

def update_xml_file(xml_path, tiff_to_jpeg_map):
    """
    Update the XML file to point to the corresponding JPEG image.
    """
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
        namespace = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}

        page_element = root.find('ns:Page', namespace)

        if page_element is not None:
            tiff_filename = page_element.get('imageFilename')
            if tiff_filename in tiff_to_jpeg_map:
                jpeg_filename = tiff_to_jpeg_map[tiff_filename]
                page_element.set('imageFilename', jpeg_filename)
                tree.write(xml_path)
                print(f'Updated {xml_path} to point to {jpeg_filename}')
            else:
                print(f'No JPEG mapping found for {tiff_filename} in {xml_path}')
        else:
            print(f'No Page element found in {xml_path}')
    except Exception as e:
        print(f'Error updating XML file {xml_path}: {e}')

def main():
    directory = r'C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\uri_gt'

    # Collect TIFF to JPEG mappings
    tiff_to_jpeg_map = {}
    for file in os.listdir(directory):
        if file.lower().endswith('.tif'):
            tiff_filename = file
            jpeg_filename = file.replace('.tif', '.jpeg')
            tiff_to_jpeg_map[tiff_filename] = jpeg_filename

    # Update XML files in the directory
    for file in os.listdir(directory):
        if file.lower().endswith('.xml'):
            xml_path = os.path.join(directory, file)
            update_xml_file(xml_path, tiff_to_jpeg_map)

if __name__ == '__main__':
    main()


No JPEG mapping found for IDGNAZIM0001.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM0001.xml
No JPEG mapping found for IDGNAZIM00010.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00010.xml
No JPEG mapping found for IDGNAZIM000100.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000100.xml
No JPEG mapping found for IDGNAZIM000101.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000101.xml
No JPEG mapping found for IDGNAZIM000102.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000102.xml
No JPEG mapping found for IDGNAZIM000103.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000103.xml
No JPEG mapping found for IDGNAZIM000104.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000104.xml
No JPEG mapping found for IDGNAZIM000105.tif in

In [3]:
import os
import xml.etree.ElementTree as ET

def update_xml_filenames(directory):
    """
    Update the imageFilename attribute in XML files from .tif to .jpeg.
    """
    namespace = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}
    for file in os.listdir(directory):
        if file.lower().endswith('.xml'):
            xml_path = os.path.join(directory, file)
            try:
                tree = ET.parse(xml_path)
                root = tree.getroot()
                page_element = root.find('ns:Page', namespace)

                if page_element is not None:
                    image_filename = page_element.get('imageFilename')
                    if image_filename.lower().endswith('.tif'):
                        new_image_filename = image_filename.replace('.tif', '.jpeg')
                        page_element.set('imageFilename', new_image_filename)
                        tree.write(xml_path)
                        print(f'Updated {xml_path}: {image_filename} to {new_image_filename}')
                    else:
                        print(f'No .tif found in {xml_path}')
                else:
                    print(f'No Page element found in {xml_path}')
            except Exception as e:
                print(f'Error updating XML file {xml_path}: {e}')

def main():
    directory = r'C:\Users\User\PycharmProjects\HebHTR\HebHTR_Transkribus\895769\lubrani-9\alto'
    update_xml_filenames(directory)

if __name__ == '__main__':
    main()


No Page element found in C:\Users\User\PycharmProjects\HebHTR\HebHTR_Transkribus\895769\lubrani-9\alto\0001_לבוא במגע עם ילדי העולים לעמוד על דרכי חנוכם_1.xml
No Page element found in C:\Users\User\PycharmProjects\HebHTR\HebHTR_Transkribus\895769\lubrani-9\alto\0002_לבוא במגע עם ילדי העולים לעמוד על דרכי חנוכם_2.xml


In [1]:
import os
import xml.etree.ElementTree as ET

def check_well_formed_xml(directory):
    """
    Check if XML files in the specified directory are well-formed.
    """
    for file in os.listdir(directory):
        if file.lower().endswith('.xml'):
            xml_path = os.path.join(directory, file)
            try:
                tree = ET.parse(xml_path)
                #print(f'{xml_path}: Well-formed')
            except ET.ParseError as e:
                print(f'{xml_path}: XML parsing error - {e}')
            except Exception as e:
                print(f'{xml_path}: Error - {e}')

def main():
    directory = r'C:/Users/User/PycharmProjects/HebHTR/training_data'
    check_well_formed_xml(directory)

if __name__ == '__main__':
    main()


In [7]:
import os
import xml.etree.ElementTree as ET

def fix_ns0_to_default_namespace(file_path, output_path=None):
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Get the actual URI from ns0:alto
    ns_uri = root.tag.split('}')[0].strip('{')

    # Strip 'ns0:' from all tags
    for elem in root.iter():
        if isinstance(elem.tag, str) and '}' in elem.tag:
            elem.tag = elem.tag.split('}', 1)[1]

    # Replace root tag and reset attributes
    root.attrib.clear()
    root.set('xmlns', ns_uri)
#     root.set('xmlns:xsi', "http://www.w3.org/2001/XMLSchema-instance")
    root.set('{http://www.w3.org/2001/XMLSchema-instance}schemaLocation',
             f"{ns_uri} http://www.loc.gov/standards/alto/v4/alto-4-2.xsd")

    output_path = output_path or file_path
    tree.write(output_path, encoding='utf-8', xml_declaration=True)
    print(f"✅ Fixed namespace in: {output_path}")

def fix_all_xmls_in_folder(folder_path):
    for fname in os.listdir(folder_path):
        if fname.lower().endswith('.xml'):
            in_path = os.path.join(folder_path, fname)
            out_path = os.path.join(folder_path, fname.replace(".xml", "_fixed.xml"))
            try:
                fix_ns0_to_default_namespace(in_path, out_path)
            except Exception as e:
                print(f"❌ Failed to fix {fname}: {e}")

# Example usage
# === Example usage ===
folder = r'C:/Users/User/PycharmProjects/HebHTR/GT_copy'

fix_all_xmls_in_folder("C:/Users/User/PycharmProjects/HebHTR/GT_copy")


✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\1_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\3_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 10_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 2_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 4_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 5_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 6_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 7_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 8_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 9_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT001_fixed_fixed.xml
✅ Fixed namespace in: C:

✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT045_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT045_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT046_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT046_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT047_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT047_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT048_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT048_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT049_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT049_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/G

✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT094_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT094_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT095_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT095_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT096_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT096_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT097_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT097_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT098_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT098_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/G

✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT142_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT142_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT143_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT143_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT144_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT144_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT145_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT145_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT146_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT146_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/G

✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT188_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT188_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT189_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT189_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT190_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT190_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT191_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT191_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT192_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT192_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/G

✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM00037_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM00037_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM0003_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM00042_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM00042_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM00046_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM00046_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM00049_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM00049_fixed_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM00054_fixed.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT

In [9]:
import os
import xml.etree.ElementTree as ET

def fix_ns0_to_default_namespace(file_path, output_path=None):
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Get the actual URI from ns0:alto
    ns_uri = root.tag.split('}')[0].strip('{')

    # Strip 'ns0:' from all tags
    for elem in root.iter():
        if isinstance(elem.tag, str) and '}' in elem.tag:
            elem.tag = elem.tag.split('}', 1)[1]

    # Replace root tag and reset attributes
    root.attrib.clear()
    root.set('xmlns', ns_uri)
    root.set('{http://www.w3.org/2001/XMLSchema-instance}schemaLocation',
             f"{ns_uri} http://www.loc.gov/standards/alto/v4/alto-4-2.xsd")

    output_path = output_path or file_path  # 👈 this now defaults to overwriting
    tree.write(output_path, encoding='utf-8', xml_declaration=True)
    print(f"✅ Fixed namespace in: {output_path}")

def fix_all_xmls_in_folder(folder_path):
    for fname in os.listdir(folder_path):
        if fname.lower().endswith('.xml'):
            file_path = os.path.join(folder_path, fname)
            try:
                fix_ns0_to_default_namespace(file_path)
            except Exception as e:
                print(f"❌ Failed to fix {fname}: {e}")

# Run the script on your folder
folder = r'C:/Users/User/PycharmProjects/HebHTR/GT_copy'
fix_all_xmls_in_folder(folder)


✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\1.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\3.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 10.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 2.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 4.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 5.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 6.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 7.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 8.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\gnazim HTR 9.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT001.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT002.xml
✅ Fixed nam

✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT091.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT092.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT093.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT094.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT095.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT096.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT097.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT098.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT099.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT100.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT101.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_cop

✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT191.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT192.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT193.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT194.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT195.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT196.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT197.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT198.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT199.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\HEB_HTR_GT200.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy\IDGNAZIM0001.xml
✅ Fixed namespace in: C:/Users/User/PycharmProjects/HebHTR/GT_copy