In [7]:
from PIL import Image, ImageOps
import os
import xml.etree.ElementTree as ET

def pad_tiff_image(image_path, target_size, save_path):
    """
    Pads a TIFF image with white so it is centered on a canvas of exactly
    target_size, and saves the result as TIFF.

    The left/top borders use floor division, so when the size difference is
    odd the extra pixel goes to the right/bottom border. The left/top values
    are the same offsets adjust_alto_coordinates_for_padding computes from
    (original_size, target_size).

    Parameters:
    - image_path: Path to the TIFF image file.
    - target_size: Target size (width, height) for padding.
    - save_path: Path to save the padded image.

    Returns:
    - original_size: The original size (width, height) of the image before padding.
    """
    target_width, target_height = target_size

    # The context manager releases the source file handle once we are done.
    with Image.open(image_path) as img:
        original_size = img.size  # (original_width, original_height)
        original_width, original_height = original_size

        # Compute each side separately so the output is exactly target_size
        # even when the difference is odd (a symmetric border would come out
        # one pixel short).
        left_padding = (target_width - original_width) // 2
        top_padding = (target_height - original_height) // 2
        right_padding = target_width - original_width - left_padding
        bottom_padding = target_height - original_height - top_padding

        # NOTE(review): if the image is larger than target_size these values
        # are negative and the outcome is whatever ImageOps.expand does with
        # a negative border -- confirm, or reject such images upstream.
        padded_img = ImageOps.expand(
            img,
            border=(left_padding, top_padding, right_padding, bottom_padding),
            fill="white",
        )

        # Save the padded image
        padded_img.save(save_path, format='TIFF')

    print(f"Padded image saved to {save_path}")

    return original_size

def _shift_coordinate(text, delta):
    """
    Returns the numeric string `text` shifted by `delta`, as a string.

    Integer strings stay integers; anything else is parsed as a float.
    Raises ValueError when `text` is not a number.
    """
    try:
        return str(int(text) + delta)
    except ValueError:
        # round() hides binary floating-point noise in the sum.
        return str(round(float(text) + delta, 6))


def _shift_point_list(value, x_offset, y_offset):
    """
    Shifts a flat "x y x y ..." (or "x,y x,y ...") coordinate list.

    Returns the shifted list as space-separated numbers, or None when the
    list is malformed (odd number of values or a non-numeric token).
    """
    tokens = value.replace(',', ' ').split()
    if len(tokens) % 2:
        return None
    try:
        # Even positions are x values, odd positions are y values.
        return ' '.join(
            _shift_coordinate(token, y_offset if index % 2 else x_offset)
            for index, token in enumerate(tokens)
        )
    except ValueError:
        return None


def adjust_alto_coordinates_for_padding(xml_file_path, output_file_path, original_size, target_size):
    """
    Adjusts the coordinates in the ALTO XML file by applying the x and y offsets for padding.

    The Page WIDTH/HEIGHT are set to target_size. Every element below Page is
    then visited exactly once: HPOS is shifted by the x offset, VPOS by the
    y offset, and BASELINE / Polygon POINTS lists are shifted pair by pair.
    Attributes that are absent are skipped; coordinate lists that cannot be
    parsed are left unchanged and reported on stdout instead of being
    truncated.

    Parameters:
    - xml_file_path: Path to the original ALTO XML file.
    - output_file_path: Path to save the modified XML file.
    - original_size: Tuple (original_width, original_height) of the image.
    - target_size: Tuple (target_width, target_height) after padding.

    Raises:
    - ValueError: if the file has no Page element, or an HPOS/VPOS value is
      not numeric.
    """
    # Calculate padding offsets (floor division, i.e. the left/top border)
    original_width, original_height = original_size
    target_width, target_height = target_size
    x_offset = (target_width - original_width) // 2
    y_offset = (target_height - original_height) // 2

    # Parse the XML file
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Take the namespace from the root tag ("{uri}alto") so files in other
    # ALTO schema versions are handled too; fall back to the v4 URI.
    namespace_uri = root.tag[1:].partition('}')[0] if root.tag.startswith('{') else ''
    # Register it as the default namespace so the output carries no prefix.
    ET.register_namespace('', namespace_uri or "http://www.loc.gov/standards/alto/ns-v4#")
    qualifier = f"{{{namespace_uri}}}" if namespace_uri else ''

    # Update the Page width and height to the new size
    page = next(root.iter(f"{qualifier}Page"), None)
    if page is None:
        raise ValueError(f"No Page element found in {xml_file_path}")
    page.set('WIDTH', str(target_width))
    page.set('HEIGHT', str(target_height))

    polygon_tag = f"{qualifier}Polygon"

    # A single pass over the subtree guarantees each coordinate is shifted
    # once, however the Shape/Polygon elements are nested.
    for element in page.iter():
        if element.tag == polygon_tag:
            points = element.get('POINTS')
            if points is not None:
                shifted = _shift_point_list(points, x_offset, y_offset)
                if shifted is None:
                    print(f"Error parsing points (left unchanged): {points}")
                else:
                    element.set('POINTS', shifted)
            continue

        # Positioned elements: blocks, lines, strings, ...
        for attribute, delta in (('HPOS', x_offset), ('VPOS', y_offset)):
            value = element.get(attribute)
            if value is not None:
                element.set(attribute, _shift_coordinate(value, delta))

        baseline = element.get('BASELINE', '')
        if baseline:
            baseline_tokens = baseline.replace(',', ' ').split()
            if len(baseline_tokens) == 1:
                # A lone number is treated as a y position only.
                try:
                    shifted = _shift_coordinate(baseline_tokens[0], y_offset)
                except ValueError:
                    shifted = None
            else:
                shifted = _shift_point_list(baseline, x_offset, y_offset)
            if shifted is None:
                print(f"Error parsing baseline (left unchanged): {baseline}")
            else:
                element.set('BASELINE', shifted)

    # Save the updated XML with correct namespaces
    tree.write(output_file_path, xml_declaration=True, encoding="UTF-8", method="xml")
    print(f"XML coordinates adjusted and saved to {output_file_path}")

def update_all_xmls_and_images_in_folder(input_folder, output_folder, image_folder, padded_image_folder, target_size):
    """
    Loops through all XML files in the input folder, pads their corresponding TIFF images, adjusts XML coordinates,
    and saves the padded images and XML files to the respective output folders.

    An image corresponds to an XML file when it has the same base name and a
    .tif/.tiff extension (lower or upper case). XML files without such an
    image are reported on stdout and skipped. Files are processed in sorted
    name order.

    Parameters:
    - input_folder: Path to the folder containing the original XML files.
    - output_folder: Path to the folder where the updated XML files will be saved.
    - image_folder: Path to the folder containing the corresponding TIFF images.
    - padded_image_folder: Path to save the padded TIFF images.
    - target_size: Tuple (target_width, target_height) after padding.
    """
    # Ensure the output folders exist (no error if they already do)
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(padded_image_folder, exist_ok=True)

    # '.tif' is tried first, so folders that worked before behave the same.
    tiff_extensions = ('.tif', '.tiff', '.TIF', '.TIFF')

    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith('.xml'):
            continue

        xml_file_path = os.path.join(input_folder, filename)
        output_file_path = os.path.join(output_folder, filename)

        # Find the corresponding TIFF image
        base_filename = os.path.splitext(filename)[0]  # Get base name without extension
        candidates = (os.path.join(image_folder, base_filename + extension) for extension in tiff_extensions)
        tiff_file_path = next((path for path in candidates if os.path.exists(path)), None)

        if tiff_file_path is None:
            print(f"Warning: No corresponding TIFF image found for {filename}")
            continue

        # Keep the source image's file name for the padded copy.
        padded_image_path = os.path.join(padded_image_folder, os.path.basename(tiff_file_path))

        # Pad the TIFF image and get its original size
        original_size = pad_tiff_image(tiff_file_path, target_size=target_size, save_path=padded_image_path)

        # Adjust the XML coordinates based on the original image size
        adjust_alto_coordinates_for_padding(xml_file_path, output_file_path, original_size, target_size)

# Example usage: pad every image that has an ALTO XML file and shift the XML
# coordinates to match. Here the XML files and the images share one source
# folder, and both outputs go to one destination folder.

input_folder = "training_data/updated_uri_gt_alto"  # Folder containing the original ALTO XML files
output_folder = "training_data/padded_uri_gt_alto" # Replace with the folder to save updated XML files
image_folder = "training_data/updated_uri_gt_alto"   # Folder containing the TIFF images
padded_image_folder = "training_data/padded_uri_gt_alto"  # Folder to save the padded TIFF images
target_size = (1800, 1800)  # New image size (width, height) after padding

update_all_xmls_and_images_in_folder(input_folder, output_folder, image_folder, padded_image_folder, target_size)

Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM0001.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM0001.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM00010.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM00010.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000100.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000100.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000101.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000101.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000102.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000102.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000103.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000103.xml
Padded image saved to 

XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000162.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000163.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000163.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000164.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000164.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000165.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000165.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000166.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000166.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000167.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000167.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000168.tif
XML coordinates 

XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000222.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000223.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000223.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000224.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000224.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000225.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000225.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000226.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000226.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000227.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM000227.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM000228.tif
XML coordinates 

XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM00054.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM00055.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM00055.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM00056.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM00056.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM00057.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM00057.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM00058.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM00058.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM00059.tif
XML coordinates adjusted and saved to training_data/padded_uri_gt_alto\IDGNAZIM00059.xml
Padded image saved to training_data/padded_uri_gt_alto\IDGNAZIM0006.tif
XML coordinates adjusted and 

In [6]:
from PIL import Image, ImageOps
import os
import xml.etree.ElementTree as ET

def pad_tiff_image(image_path, target_size, save_path):
    """
    Pads a specific TIFF image with white so it is centered on a canvas of
    exactly target_size, and saves the result as TIFF.

    The left/top borders use floor division, so when the size difference is
    odd the extra pixel goes to the right/bottom border. The left/top values
    are the same offsets adjust_alto_coordinates_for_padding computes from
    (original_size, target_size).

    Parameters:
    - image_path: Path to the TIFF image file.
    - target_size: Target size (width, height) for padding.
    - save_path: Path to save the padded image.

    Returns:
    - original_size: The original size (width, height) of the image before padding.
    """
    target_width, target_height = target_size

    # The context manager releases the source file handle once we are done.
    with Image.open(image_path) as img:
        original_size = img.size  # (original_width, original_height)
        original_width, original_height = original_size

        # Compute each side separately so the output is exactly target_size
        # even when the difference is odd (a symmetric border would come out
        # one pixel short).
        left_padding = (target_width - original_width) // 2
        top_padding = (target_height - original_height) // 2
        right_padding = target_width - original_width - left_padding
        bottom_padding = target_height - original_height - top_padding

        # NOTE(review): if the image is larger than target_size these values
        # are negative and the outcome is whatever ImageOps.expand does with
        # a negative border -- confirm, or reject such images upstream.
        padded_img = ImageOps.expand(
            img,
            border=(left_padding, top_padding, right_padding, bottom_padding),
            fill="white",
        )

        # Save the padded image
        padded_img.save(save_path, format='TIFF')

    print(f"Padded image saved to {save_path}")

    return original_size

def _shift_coordinate(text, delta):
    """
    Returns the numeric string `text` shifted by `delta`, as a string.

    Integer strings stay integers; anything else is parsed as a float.
    Raises ValueError when `text` is not a number.
    """
    try:
        return str(int(text) + delta)
    except ValueError:
        # round() hides binary floating-point noise in the sum.
        return str(round(float(text) + delta, 6))


def _shift_point_list(value, x_offset, y_offset):
    """
    Shifts a flat "x y x y ..." (or "x,y x,y ...") coordinate list.

    Returns the shifted list as space-separated numbers, or None when the
    list is malformed (odd number of values or a non-numeric token).
    """
    tokens = value.replace(',', ' ').split()
    if len(tokens) % 2:
        return None
    try:
        # Even positions are x values, odd positions are y values.
        return ' '.join(
            _shift_coordinate(token, y_offset if index % 2 else x_offset)
            for index, token in enumerate(tokens)
        )
    except ValueError:
        return None


def adjust_alto_coordinates_for_padding(xml_file_path, output_file_path, original_size, target_size):
    """
    Adjusts the coordinates in the ALTO XML file by applying the x and y offsets for padding.

    The Page WIDTH/HEIGHT are set to target_size. Every element below Page is
    then visited exactly once: HPOS is shifted by the x offset, VPOS by the
    y offset, and BASELINE / Polygon POINTS lists are shifted pair by pair.
    Attributes that are absent are skipped; coordinate lists that cannot be
    parsed are left unchanged and reported on stdout instead of being
    truncated.

    Parameters:
    - xml_file_path: Path to the original ALTO XML file.
    - output_file_path: Path to save the modified XML file.
    - original_size: Tuple (original_width, original_height) of the image.
    - target_size: Tuple (target_width, target_height) after padding.

    Raises:
    - ValueError: if the file has no Page element, or an HPOS/VPOS value is
      not numeric.
    """
    # Calculate padding offsets (floor division, i.e. the left/top border)
    original_width, original_height = original_size
    target_width, target_height = target_size
    x_offset = (target_width - original_width) // 2
    y_offset = (target_height - original_height) // 2

    # Parse the XML file
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Take the namespace from the root tag ("{uri}alto") so files in other
    # ALTO schema versions are handled too; fall back to the v4 URI.
    namespace_uri = root.tag[1:].partition('}')[0] if root.tag.startswith('{') else ''
    # Register it as the default namespace so the output carries no prefix.
    ET.register_namespace('', namespace_uri or "http://www.loc.gov/standards/alto/ns-v4#")
    qualifier = f"{{{namespace_uri}}}" if namespace_uri else ''

    # Update the Page width and height to the new size
    page = next(root.iter(f"{qualifier}Page"), None)
    if page is None:
        raise ValueError(f"No Page element found in {xml_file_path}")
    page.set('WIDTH', str(target_width))
    page.set('HEIGHT', str(target_height))

    polygon_tag = f"{qualifier}Polygon"

    # A single pass over the subtree guarantees each coordinate is shifted
    # once, however the Shape/Polygon elements are nested.
    for element in page.iter():
        if element.tag == polygon_tag:
            points = element.get('POINTS')
            if points is not None:
                shifted = _shift_point_list(points, x_offset, y_offset)
                if shifted is None:
                    print(f"Error parsing points (left unchanged): {points}")
                else:
                    element.set('POINTS', shifted)
            continue

        # Positioned elements: blocks, lines, strings, ...
        for attribute, delta in (('HPOS', x_offset), ('VPOS', y_offset)):
            value = element.get(attribute)
            if value is not None:
                element.set(attribute, _shift_coordinate(value, delta))

        baseline = element.get('BASELINE', '')
        if baseline:
            baseline_tokens = baseline.replace(',', ' ').split()
            if len(baseline_tokens) == 1:
                # A lone number is treated as a y position only.
                try:
                    shifted = _shift_coordinate(baseline_tokens[0], y_offset)
                except ValueError:
                    shifted = None
            else:
                shifted = _shift_point_list(baseline, x_offset, y_offset)
            if shifted is None:
                print(f"Error parsing baseline (left unchanged): {baseline}")
            else:
                element.set('BASELINE', shifted)

    # Save the updated XML with correct namespaces
    tree.write(output_file_path, xml_declaration=True, encoding="UTF-8", method="xml")
    print(f"XML coordinates adjusted and saved to {output_file_path}")

# Example usage: pad one image only. The XML adjustment step is left
# commented out below.
image_path = "Images/IDGNAZIM000710.tif"  # Path to the TIFF image file
# xml_file_path = "path/to/your/input.xml"  # Path to the corresponding XML file
padded_image_path = "trial_padded_image.tif"  # Path to save the padded TIFF image
# output_xml_path = "path/to/save/adjusted_xml.xml"  # Path to save the adjusted XML file
target_size = (1700, 1700)  # New image size (width, height) after padding

# Pad the image; the returned original size is what the XML step would need
original_size = pad_tiff_image(image_path, target_size=target_size, save_path=padded_image_path)

# # Adjust the XML coordinates based on padding
# adjust_alto_coordinates_for_padding(xml_file_path, output_xml_path, original_size, target_size)


KeyError: 'TIF'

In [1]:
from PIL import Image, ImageOps
import os

def pad_images_in_folder(input_folder, output_folder, target_size=(1700, 1700)):
    """
    Pads all TIFF images in the input folder to the specified size and saves them in the output folder.

    Only regular files directly inside input_folder whose name ends in
    .tif or .tiff (any case) are processed, in sorted name order. Each
    padded image keeps its original file name.

    Parameters:
    - input_folder: Path to the folder containing the original TIFF images.
    - output_folder: Path to the folder where the padded images will be saved.
    - target_size: Tuple specifying the target (width, height) after padding.
    """
    # Ensure the output folder exists (no error if it already does)
    os.makedirs(output_folder, exist_ok=True)

    # Iterate over all TIFF images in the input folder
    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith(('.tif', '.tiff')):
            continue

        input_path = os.path.join(input_folder, filename)
        # Skip directories (the output folder may live inside input_folder).
        if not os.path.isfile(input_path):
            continue

        output_path = os.path.join(output_folder, filename)

        # Pad and save the image
        pad_tiff_image(input_path, target_size, output_path)

def pad_tiff_image(image_path, target_size, save_path):
    """
    Pads a single TIFF image with white so it is centered on a canvas of
    exactly the specified size, and saves it as TIFF.

    The left/top borders use floor division, so when the size difference is
    odd the extra pixel goes to the right/bottom border.

    Parameters:
    - image_path: Path to the original TIFF image.
    - target_size: Tuple specifying the target (width, height) after padding.
    - save_path: Path to save the padded image.

    Returns:
    - original_size: The original size (width, height) of the image before padding.
    """
    target_width, target_height = target_size

    # The context manager releases the source file handle once we are done.
    with Image.open(image_path) as img:
        original_size = img.size  # (original_width, original_height)
        original_width, original_height = original_size

        # Compute each side separately so the output is exactly target_size
        # even when the difference is odd (a symmetric border would come out
        # one pixel short).
        left_padding = (target_width - original_width) // 2
        top_padding = (target_height - original_height) // 2
        right_padding = target_width - original_width - left_padding
        bottom_padding = target_height - original_height - top_padding

        # NOTE(review): if the image is larger than target_size these values
        # are negative and the outcome is whatever ImageOps.expand does with
        # a negative border -- confirm, or reject such images upstream.
        padded_img = ImageOps.expand(
            img,
            border=(left_padding, top_padding, right_padding, bottom_padding),
            fill="white",
        )

        # Save the padded image
        padded_img.save(save_path, format='TIFF')

    print(f"Padded image saved: {save_path}")

    return original_size

# Example usage: pad every TIFF image found directly in one folder.
input_folder = "Data/זלדה"  # Replace with the path to your folder with TIFF images
output_folder = "Data/זלדה/Padded Images"  # Replace with the path to the target output folder (here a subfolder of input_folder)
target_size = (1800, 1800)  # Specify the desired target (width, height) after padding

# Run the function to pad all images in the folder
pad_images_in_folder(input_folder, output_folder, target_size)


Padded image saved: Data/זלדה/Padded Images\2785.tif
Padded image saved: Data/זלדה/Padded Images\2786.tif
Padded image saved: Data/זלדה/Padded Images\2787.tif
Padded image saved: Data/זלדה/Padded Images\2788.tif
Padded image saved: Data/זלדה/Padded Images\2789.tif
Padded image saved: Data/זלדה/Padded Images\2790.tif
Padded image saved: Data/זלדה/Padded Images\2791.tif
Padded image saved: Data/זלדה/Padded Images\2792.tif
Padded image saved: Data/זלדה/Padded Images\2793.tif
Padded image saved: Data/זלדה/Padded Images\2794.tif
Padded image saved: Data/זלדה/Padded Images\2795.tif
Padded image saved: Data/זלדה/Padded Images\2796.tif
Padded image saved: Data/זלדה/Padded Images\2797.tif
Padded image saved: Data/זלדה/Padded Images\2798.tif
Padded image saved: Data/זלדה/Padded Images\2799.tif
Padded image saved: Data/זלדה/Padded Images\2800.tif
Padded image saved: Data/זלדה/Padded Images\2801.tif
Padded image saved: Data/זלדה/Padded Images\2802.tif
Padded image saved: Data/זלדה/Padded Images\28

Padded image saved: Data/זלדה/Padded Images\2945.tif
Padded image saved: Data/זלדה/Padded Images\2946.tif
Padded image saved: Data/זלדה/Padded Images\2947.tif
Padded image saved: Data/זלדה/Padded Images\2948.tif
Padded image saved: Data/זלדה/Padded Images\2949.tif
Padded image saved: Data/זלדה/Padded Images\2950.tif
Padded image saved: Data/זלדה/Padded Images\2951.tif
Padded image saved: Data/זלדה/Padded Images\2952.tif
Padded image saved: Data/זלדה/Padded Images\2953.tif
Padded image saved: Data/זלדה/Padded Images\2954.tif
Padded image saved: Data/זלדה/Padded Images\2955.tif
Padded image saved: Data/זלדה/Padded Images\2956.tif
Padded image saved: Data/זלדה/Padded Images\2957.tif
Padded image saved: Data/זלדה/Padded Images\2958.tif
Padded image saved: Data/זלדה/Padded Images\2959.tif
Padded image saved: Data/זלדה/Padded Images\2960.tif
Padded image saved: Data/זלדה/Padded Images\2961.tif
Padded image saved: Data/זלדה/Padded Images\2962.tif
Padded image saved: Data/זלדה/Padded Images\29