In [2]:
import os

In [25]:
folder = 'training_data'

# Number the directory entries one by one. Sorting makes the numbering
# deterministic, because os.listdir() returns entries in arbitrary order.
# Note: every entry gets its own number, so an image and its XML file do NOT
# end up sharing a number here.
for i, file_name in enumerate(sorted(os.listdir(folder)), start=1):
    # Keep the original extension so the file type stays recognisable.
    _, extension = os.path.splitext(file_name)
    new_name = f'HebHTR_GT_{i}{extension}'

    source_path = os.path.join(folder, file_name)
    target_path = os.path.join(folder, new_name)

    if source_path == target_path:
        continue  # already carries its final name

    # os.rename() silently replaces an existing target on POSIX systems,
    # which would destroy data; refuse instead of overwriting.
    if os.path.exists(target_path):
        raise FileExistsError(f'Refusing to overwrite existing file: {target_path}')

    # Renaming the file
    os.rename(source_path, target_path)

print('All Files Renamed')


All Files Renamed


In [18]:
# Inspect the value currently bound to `file_name` in the kernel namespace.
# NOTE(review): this relies on leftover state from a previously executed cell
# (presumably a loop over the folder contents) - it fails on a fresh kernel.
file_name

'Ahtar_Nummer 9 (28. Ḏū l-ḥiǧǧa 1292 -- 13. Kânûn-ı Sânî 1291 -- 25. Januar 1876)_36.jpg'

In [11]:
import os
import xml.etree.ElementTree as ET

def rename_files_and_update_xml(folder_path, file_prefix="HEB_HTR_GT_", start_number=1):
    """
    Renames all images and their corresponding XML files in a folder with a sequential naming pattern.
    Also updates the 'fileName' tag inside each XML file.

    Images (.jpg, .jpeg, .tif, .tiff) and XML files are paired by base name
    (everything before the last dot), so 'page.jpg' goes with 'page.xml'.
    The whole folder is validated before anything is touched: if the files do
    not pair up one-to-one, or a new name would overwrite an existing file,
    an error is printed and no file is renamed.

    Parameters:
    - folder_path: Path to the folder containing both the image and XML files.
    - file_prefix: The prefix for the new file names (default is 'HEB_HTR_GT_').
    - start_number: The number to start the sequence from (default is 1).

    Returns:
    - None. Problems are reported with print() and the function returns early.
    """
    image_suffixes = ('.jpg', '.jpeg', '.tif', '.tiff')

    # Sorted listing keeps the numbering deterministic across runs
    files = sorted(os.listdir(folder_path))

    xml_files = [f for f in files if f.lower().endswith('.xml')]
    img_files = [f for f in files if f.lower().endswith(image_suffixes)]

    # Make sure there are equal numbers of images and XMLs
    if len(img_files) != len(xml_files):
        print("Error: Number of img files does not match the number of XML files!")
        return

    # Pair by base name instead of by position: equal counts alone do not
    # guarantee that the n-th image belongs to the n-th XML file.
    xml_by_stem = {f.rpartition('.')[0]: f for f in xml_files}
    img_stems = [f.rpartition('.')[0] for f in img_files]
    if len(set(img_stems)) != len(img_stems) or set(img_stems) != set(xml_by_stem):
        print("Error: Image files and XML files do not pair up one-to-one by base name!")
        return

    # Build the complete rename plan first so a problem found half-way
    # through cannot leave the folder partially renamed.
    existing_names = set(files)
    rename_plan = []
    for i, img_file in enumerate(img_files, start=start_number):
        stem, _, image_suffix = img_file.rpartition('.')
        xml_file = xml_by_stem[stem]

        new_name = f"{file_prefix}{i:03d}"  # Format the new name with leading zeros
        new_img_name = f"{new_name}.{image_suffix}"
        new_xml_name = f"{new_name}.xml"

        # os.rename() silently replaces an existing target on POSIX systems,
        # so refuse any target that is already present under another name.
        for old_name, target_name in ((img_file, new_img_name), (xml_file, new_xml_name)):
            if target_name != old_name and target_name in existing_names:
                print(f"Error: Target name already exists, refusing to overwrite: {target_name}")
                return

        rename_plan.append((img_file, new_img_name, xml_file, new_xml_name))

    # Validation passed: carry out the renames
    for img_file, new_img_name, xml_file, new_xml_name in rename_plan:
        new_xml_file_path = os.path.join(folder_path, new_xml_name)

        # Rename the image (skipped when it already has its final name)
        if img_file != new_img_name:
            os.rename(os.path.join(folder_path, img_file),
                      os.path.join(folder_path, new_img_name))
            print(f"Renamed image: {img_file} -> {new_img_name}")

        # Rename the XML file (skipped when it already has its final name)
        if xml_file != new_xml_name:
            os.rename(os.path.join(folder_path, xml_file), new_xml_file_path)
            print(f"Renamed XML: {xml_file} -> {new_xml_name}")

        # Update the 'fileName' tag in the XML
        update_xml_file_name(new_xml_file_path, new_img_name)

def update_xml_file_name(xml_file_path, new_img_name):
    """
    Updates the 'fileName' tag inside the XML file to match the new image file name.

    The tag is looked up as sourceImageInformation/fileName, first in the
    namespace of the document's root element and then in the ALTO v4
    namespace. The file is rewritten in place only when the tag is found;
    otherwise a warning is printed and the file is left untouched.

    Parameters:
    - xml_file_path: Path to the XML file.
    - new_img_name: The new image file name to set in the 'fileName' tag.
    """
    alto_v4_uri = 'http://www.loc.gov/standards/alto/ns-v4#'

    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # ElementTree stores a namespaced tag as '{uri}local'; pull the uri out
    # of the root tag so files in other ALTO namespace versions work too.
    root_uri = root.tag[1:].partition('}')[0] if root.tag.startswith('{') else None

    if root_uri:
        # Without this, ElementTree writes every element back with an
        # auto-generated 'ns0:' prefix instead of the default namespace.
        ET.register_namespace('', root_uri)

    # Find the fileName tag (assuming the fileName is inside the sourceImageInformation tag)
    file_name_element = None
    for uri in dict.fromkeys((root_uri, alto_v4_uri)):
        qualifier = f'{{{uri}}}' if uri else ''
        file_name_element = root.find(
            f'.//{qualifier}sourceImageInformation/{qualifier}fileName'
        )
        if file_name_element is not None:
            break

    if file_name_element is not None:
        file_name_element.text = new_img_name  # Set the new image name
        tree.write(xml_file_path, xml_declaration=True, encoding="UTF-8", method="xml")
        print(f"Updated fileName in XML: {new_img_name}")
    else:
        print(f"Warning: fileName tag not found in XML: {xml_file_path}")

# Example usage: point `folder_path` at the folder that holds the images and
# their XML files. The prefix below has no trailing underscore, so the
# resulting names look like HEB_HTR_GT001.jpg / HEB_HTR_GT001.xml.
folder_path = r'training_data'
rename_files_and_update_xml(
    folder_path,
    file_prefix="HEB_HTR_GT",
    start_number=1,
)


Renamed image: 0001_11-3_312_5_0002-Mattathias-0003.jpg -> HEB_HTR_GT001.jpg
Renamed XML: 0001_11-3_312_5_0002-Mattathias-0003.xml -> HEB_HTR_GT001.xml
Updated fileName in XML: HEB_HTR_GT001.jpg
Renamed image: 0001_11-3_312_5_0003-Mattathias-0001.jpg -> HEB_HTR_GT002.jpg
Renamed XML: 0001_11-3_312_5_0003-Mattathias-0001.xml -> HEB_HTR_GT002.xml
Updated fileName in XML: HEB_HTR_GT002.jpg
Renamed image: 0001_26.4.48 - P1.jpeg -> HEB_HTR_GT003.jpeg
Renamed XML: 0001_26.4.48 - P1.xml -> HEB_HTR_GT003.xml
Updated fileName in XML: HEB_HTR_GT003.jpeg
Renamed image: 0001_CamScanner 11-05-2020 13.22_page-0023.jpg -> HEB_HTR_GT004.jpg
Renamed XML: 0001_CamScanner 11-05-2020 13.22_page-0023.xml -> HEB_HTR_GT004.xml
Updated fileName in XML: HEB_HTR_GT004.jpg
Renamed image: 0001_CamScanner 11-05-2020 13.22_page-0034.jpg -> HEB_HTR_GT005.jpg
Renamed XML: 0001_CamScanner 11-05-2020 13.22_page-0034.xml -> HEB_HTR_GT005.xml
Updated fileName in XML: HEB_HTR_GT005.jpg
Renamed image: 0001_CamScanner 11-05

Renamed image: 0002_יצחק מולדבי - השתתפותו בהגנת תל חי_2.jpeg -> HEB_HTR_GT049.jpeg
Renamed XML: 0002_יצחק מולדבי - השתתפותו בהגנת תל חי_2.xml -> HEB_HTR_GT049.xml
Updated fileName in XML: HEB_HTR_GT049.jpeg
Renamed image: 0002_ישבתי למעלה משעה עם רחל (ינאית) בבית הנשיא_2.jpeg -> HEB_HTR_GT050.jpeg
Renamed XML: 0002_ישבתי למעלה משעה עם רחל (ינאית) בבית הנשיא_2.xml -> HEB_HTR_GT050.xml
Updated fileName in XML: HEB_HTR_GT050.jpeg
Renamed image: 0002_לבוא במגע עם ילדי העולים לעמוד על דרכי חנוכם_2.jpeg -> HEB_HTR_GT051.jpeg
Renamed XML: 0002_לבוא במגע עם ילדי העולים לעמוד על דרכי חנוכם_2.xml -> HEB_HTR_GT051.xml
Updated fileName in XML: HEB_HTR_GT051.jpeg
Renamed image: 0002_משה שרת חלש... אני חושד בזקן כי בכך רצה_2.jpeg -> HEB_HTR_GT052.jpeg
Renamed XML: 0002_משה שרת חלש... אני חושד בזקן כי בכך רצה_2.xml -> HEB_HTR_GT052.xml
Updated fileName in XML: HEB_HTR_GT052.jpeg
Renamed image: 0002_עמ2  - מידב - 05.05.1952.jpg -> HEB_HTR_GT053.jpg
Renamed XML: 0002_עמ2  - מידב - 05.05.1952.xml -> HE

Renamed image: 0008_ea.en.letters.01.01.03.0008.jpg -> HEB_HTR_GT095.jpg
Renamed XML: 0008_ea.en.letters.01.01.03.0008.xml -> HEB_HTR_GT095.xml
Updated fileName in XML: HEB_HTR_GT095.jpg
Renamed image: 0009_11-3_312_5_0002-Mattathias-0055.jpg -> HEB_HTR_GT096.jpg
Renamed XML: 0009_11-3_312_5_0002-Mattathias-0055.xml -> HEB_HTR_GT096.xml
Updated fileName in XML: HEB_HTR_GT096.jpg
Renamed image: 0009_11-3_312_5_0003-Mattathias-0014.jpg -> HEB_HTR_GT097.jpg
Renamed XML: 0009_11-3_312_5_0003-Mattathias-0014.xml -> HEB_HTR_GT097.xml
Updated fileName in XML: HEB_HTR_GT097.jpg
Renamed image: 0009_IMG_20190731_123729.jpg -> HEB_HTR_GT098.jpg
Renamed XML: 0009_IMG_20190731_123729.xml -> HEB_HTR_GT098.xml
Updated fileName in XML: HEB_HTR_GT098.jpg
Renamed image: 0009_IMG_20190801_164646.jpg -> HEB_HTR_GT099.jpg
Renamed XML: 0009_IMG_20190801_164646.xml -> HEB_HTR_GT099.xml
Updated fileName in XML: HEB_HTR_GT099.jpg
Renamed image: 0009_ea.en.letters.01.01.0008.jpg -> HEB_HTR_GT100.jpg
Renamed XML

Renamed image: 0023_11-3_312_5_0002-Mattathias-0156.jpg -> HEB_HTR_GT149.jpg
Renamed XML: 0023_11-3_312_5_0002-Mattathias-0156.xml -> HEB_HTR_GT149.xml
Updated fileName in XML: HEB_HTR_GT149.jpg
Renamed image: 0023_IMG_20190731_123934.jpg -> HEB_HTR_GT150.jpg
Renamed XML: 0023_IMG_20190731_123934.xml -> HEB_HTR_GT150.xml
Updated fileName in XML: HEB_HTR_GT150.jpg
Renamed image: 0023_IMG_20190801_164828.jpg -> HEB_HTR_GT151.jpg
Renamed XML: 0023_IMG_20190801_164828.xml -> HEB_HTR_GT151.xml
Updated fileName in XML: HEB_HTR_GT151.jpg
Renamed image: 0024_11-3_312_5_0002-Mattathias-0158.jpg -> HEB_HTR_GT152.jpg
Renamed XML: 0024_11-3_312_5_0002-Mattathias-0158.xml -> HEB_HTR_GT152.xml
Updated fileName in XML: HEB_HTR_GT152.jpg
Renamed image: 0024_IMG_20190731_123944.jpg -> HEB_HTR_GT153.jpg
Renamed XML: 0024_IMG_20190731_123944.xml -> HEB_HTR_GT153.xml
Updated fileName in XML: HEB_HTR_GT153.jpg
Renamed image: 0024_IMG_20190801_164837.jpg -> HEB_HTR_GT154.jpg
Renamed XML: 0024_IMG_20190801_1

Renamed image: 0038_IMG_20190801_165034.jpg -> HEB_HTR_GT196.jpg
Renamed XML: 0038_IMG_20190801_165034.xml -> HEB_HTR_GT196.xml
Updated fileName in XML: HEB_HTR_GT196.jpg
Renamed image: 0039_11-3_312_5_0002-Mattathias-0200.jpg -> HEB_HTR_GT197.jpg
Renamed XML: 0039_11-3_312_5_0002-Mattathias-0200.xml -> HEB_HTR_GT197.xml
Updated fileName in XML: HEB_HTR_GT197.jpg
Renamed image: 0039_IMG_20190731_124153.jpg -> HEB_HTR_GT198.jpg
Renamed XML: 0039_IMG_20190731_124153.xml -> HEB_HTR_GT198.xml
Updated fileName in XML: HEB_HTR_GT198.jpg
Renamed image: 0039_IMG_20190801_165042.jpg -> HEB_HTR_GT199.jpg
Renamed XML: 0039_IMG_20190801_165042.xml -> HEB_HTR_GT199.xml
Updated fileName in XML: HEB_HTR_GT199.jpg
Renamed image: 0040_11-3_312_5_0002-Mattathias-0201.jpg -> HEB_HTR_GT200.jpg
Renamed XML: 0040_11-3_312_5_0002-Mattathias-0201.xml -> HEB_HTR_GT200.xml
Updated fileName in XML: HEB_HTR_GT200.jpg


In [6]:
img_file = 'blablalbla.jpeg'
# Take the segment after the LAST dot: index 1 would return the wrong piece
# for names that contain several dots (e.g. '0001_26.4.48 - P1.jpeg' -> '4').
img_file.split(sep='.')[-1]


'jpeg'