In [3]:
from lxml import etree
import os
import shutil

In [3]:
import os
import xml.etree.ElementTree as ET

def update_xml_file(xml_path, tiff_to_jpeg_map):
    """
    Update the XML file to point to the corresponding JPEG image.
    """
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
        namespace = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}

        page_element = root.find('ns:Page', namespace)

        if page_element is not None:
            tiff_filename = page_element.get('imageFilename')
            if tiff_filename in tiff_to_jpeg_map:
                jpeg_filename = tiff_to_jpeg_map[tiff_filename]
                page_element.set('imageFilename', jpeg_filename)
                tree.write(xml_path)
                print(f'Updated {xml_path} to point to {jpeg_filename}')
            else:
                print(f'No JPEG mapping found for {tiff_filename} in {xml_path}')
        else:
            print(f'No Page element found in {xml_path}')
    except Exception as e:
        print(f'Error updating XML file {xml_path}: {e}')

def main():
    directory = r'C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\uri_gt'

    # Collect TIFF to JPEG mappings
    tiff_to_jpeg_map = {}
    for file in os.listdir(directory):
        if file.lower().endswith('.tif'):
            tiff_filename = file
            jpeg_filename = file.replace('.tif', '.jpeg')
            tiff_to_jpeg_map[tiff_filename] = jpeg_filename

    # Update XML files in the directory
    for file in os.listdir(directory):
        if file.lower().endswith('.xml'):
            xml_path = os.path.join(directory, file)
            update_xml_file(xml_path, tiff_to_jpeg_map)

if __name__ == '__main__':
    main()


No JPEG mapping found for IDGNAZIM0001.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM0001.xml
No JPEG mapping found for IDGNAZIM00010.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00010.xml
No JPEG mapping found for IDGNAZIM000100.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000100.xml
No JPEG mapping found for IDGNAZIM000101.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000101.xml
No JPEG mapping found for IDGNAZIM000102.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000102.xml
No JPEG mapping found for IDGNAZIM000103.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000103.xml
No JPEG mapping found for IDGNAZIM000104.tif in C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000104.xml
No JPEG mapping found for IDGNAZIM000105.tif in

In [4]:
import os
import xml.etree.ElementTree as ET

def update_xml_filenames(directory):
    """
    Update the imageFilename attribute in XML files from .tif to .jpeg.
    """
    namespace = {'ns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15'}
    for file in os.listdir(directory):
        if file.lower().endswith('.xml'):
            xml_path = os.path.join(directory, file)
            try:
                tree = ET.parse(xml_path)
                root = tree.getroot()
                page_element = root.find('ns:Page', namespace)

                if page_element is not None:
                    image_filename = page_element.get('imageFilename')
                    if image_filename.lower().endswith('.tif'):
                        new_image_filename = image_filename.replace('.tif', '.jpeg')
                        page_element.set('imageFilename', new_image_filename)
                        tree.write(xml_path)
                        print(f'Updated {xml_path}: {image_filename} to {new_image_filename}')
                    else:
                        print(f'No .tif found in {xml_path}')
                else:
                    print(f'No Page element found in {xml_path}')
            except Exception as e:
                print(f'Error updating XML file {xml_path}: {e}')

def main():
    directory = r'C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml'
    update_xml_filenames(directory)

if __name__ == '__main__':
    main()


Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM0001.xml: IDGNAZIM0001.tif to IDGNAZIM0001.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00010.xml: IDGNAZIM00010.tif to IDGNAZIM00010.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000100.xml: IDGNAZIM000100.tif to IDGNAZIM000100.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000101.xml: IDGNAZIM000101.tif to IDGNAZIM000101.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000102.xml: IDGNAZIM000102.tif to IDGNAZIM000102.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000103.xml: IDGNAZIM000103.tif to IDGNAZIM000103.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM000104.xml: IDGNAZIM000104.tif to IDGNAZIM000104.jpeg
Updated C:\Users\User\Pycharm

Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00062.xml: IDGNAZIM00062.tif to IDGNAZIM00062.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00063.xml: IDGNAZIM00063.tif to IDGNAZIM00063.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00064.xml: IDGNAZIM00064.tif to IDGNAZIM00064.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00065.xml: IDGNAZIM00065.tif to IDGNAZIM00065.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00066.xml: IDGNAZIM00066.tif to IDGNAZIM00066.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00067.xml: IDGNAZIM00067.tif to IDGNAZIM00067.jpeg
Updated C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\pagexml\IDGNAZIM00068.xml: IDGNAZIM00068.tif to IDGNAZIM00068.jpeg
Updated C:\Users\User\PycharmProjects\Gna

In [6]:
import os
import xml.etree.ElementTree as ET

def check_well_formed_xml(directory):
    """
    Check if XML files in the specified directory are well-formed.
    """
    for file in os.listdir(directory):
        if file.lower().endswith('.xml'):
            xml_path = os.path.join(directory, file)
            try:
                tree = ET.parse(xml_path)
                #print(f'{xml_path}: Well-formed')
            except ET.ParseError as e:
                print(f'{xml_path}: XML parsing error - {e}')
            except Exception as e:
                print(f'{xml_path}: Error - {e}')

def main():
    directory = r'C:\Users\User\PycharmProjects\Gnazim_New\gnazim\training_data\uri_gt'
    check_well_formed_xml(directory)

if __name__ == '__main__':
    main()


In [9]:
import pyarrow.parquet as pq

def read_arrow_file(file_path):
    """
    Read and display the contents of a Parquet file (dataset.arrow).
    """
    try:
        table = pq.read_table(file_path)
        df = table.to_pandas()
        print(df)
    except Exception as e:
        print(f'Error reading the Arrow file: {e}')

def main():
    file_path = 'dataset.arrow'
    read_arrow_file(file_path)

if __name__ == '__main__':
    main()


Error reading the Arrow file: Could not open Parquet input source 'dataset.arrow': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.


In [11]:
def read_file_header(file_path, num_bytes=256):
    """
    Read and display the first few bytes of a file.
    """
    try:
        with open(file_path, 'rb') as file:
            header = file.read(num_bytes)
            print(header)
    except Exception as e:
        print(f'Error reading the file: {e}')

def main():
    file_path = 'dataset.arrow'
    read_file_header(file_path)

if __name__ == '__main__':
    main()


b'ARROW1\x00\x00\xff\xff\xff\xff\x88\x05\x00\x00\x10\x00\x00\x00\x00\x00\n\x00\x0e\x00\x06\x00\x05\x00\x08\x00\n\x00\x00\x00\x00\x01\x04\x00\x10\x00\x00\x00\x00\x00\n\x00\x0c\x00\x00\x00\x04\x00\x08\x00\n\x00\x00\x008\x04\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x0c\x00\x00\x00\x08\x00\x0c\x00\x04\x00\x08\x00\x08\x00\x00\x00\x08\x00\x00\x00\x10\x00\x00\x00\x05\x00\x00\x00lines\x00\x00\x00\x00\x04\x00\x00{"type": "kraken_recognition_baseline", "alphabet": {"\\u05d2": 455, "\\u05d5": 1187, "\\u05e8": 1044, "\\u05d9": 1660, ",": 806, " ": 2537, "\\u05d7'
