In [2]:
import xml.etree.ElementTree as ET
import os
import glob

def clean_cvat_xml(xml_file_path, tif_image_folder_path, output_xml_file_path, existing_tif_files):
    """
    Processes a single CVAT XML file to:
    1. Remove <image> tags for TIF files not present in tif_image_folder_path.
    2. Remove all <attribute name="is_crowd"></attribute> tags.
    3. Replace "unlabelled" with "unlabeled" in relevant text and attributes.
    """
    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
    except ET.ParseError as e:
        print(f"Error parsing {xml_file_path}: {e}")
        return

    # 1. Remove <image> tags for TIF files not present in a user-specified folder
    images_to_remove = []
    for image_element in root.findall('image'):
        image_name = image_element.get('name')
        if image_name:
            # We are only concerned with .tif files as per the request
            if image_name.lower().endswith(('.tif', '.tiff')):
                # Check if the actual file (not just the name) exists in the provided folder
                if image_name not in existing_tif_files:
                    images_to_remove.append(image_element)
                    print(f"  Marking image '{image_name}' for removal (not found in image folder).")

    for img_el in images_to_remove:
        root.remove(img_el)
    if images_to_remove:
        print(f"  Removed {len(images_to_remove)} image entries.")


    # 2. Remove all <attribute name="is_crowd"></attribute> attribute tags
    # This needs to be done throughout the document, as it can appear under <image><polygon/box>
    # or under <meta><task><labels><label><attributes>
    elements_with_attributes_to_check = root.findall(".//*") # Get all elements
    is_crowd_removed_count = 0

    for parent_element in elements_with_attributes_to_check:
        attributes_to_remove = []
        for attribute_tag in parent_element.findall('attribute'):
            if attribute_tag.get('name') == 'is_crowd':
                attributes_to_remove.append(attribute_tag)
        
        for attr_to_remove in attributes_to_remove:
            parent_element.remove(attr_to_remove)
            is_crowd_removed_count +=1
            
    if is_crowd_removed_count > 0:
        print(f"  Removed {is_crowd_removed_count} 'is_crowd' attribute tags.")


    # 3. Replace all "unlabelled" typos with "unlabeled"
    # This can be in <label><name>text</name></label> or in <polygon label="unlabelled">
    typo_corrected_count = 0

    # Check <name> tags within <labels>
    for name_tag in root.findall('.//meta/task/labels/label/name'):
        if name_tag.text and 'unlabelled' in name_tag.text:
            original_text = name_tag.text
            name_tag.text = name_tag.text.replace('unlabelled', 'unlabeled')
            if original_text != name_tag.text:
                typo_corrected_count += 1
                print(f"  Corrected typo in <name> tag: '{original_text}' -> '{name_tag.text}'")

    # Check 'label' attributes in tags like <polygon>, <box>, etc.
    # Using .//* to find any element that might have a 'label' attribute
    for element_with_label_attr in root.findall(".//*[@label]"):
        original_label_value = element_with_label_attr.get('label')
        if original_label_value and 'unlabelled' in original_label_value:
            new_label_value = original_label_value.replace('unlabelled', 'unlabeled')
            if original_label_value != new_label_value:
                element_with_label_attr.set('label', new_label_value)
                typo_corrected_count += 1
                print(f"  Corrected typo in attribute: label='{original_label_value}' -> label='{new_label_value}' for tag <{element_with_label_attr.tag}>")
    
    if typo_corrected_count > 0 :
        print(f"  Corrected 'unlabelled' typo {typo_corrected_count} times.")

    # Write the modified XML to the output file
    # ET.indent is available in Python 3.9+ for pretty-printing
    if hasattr(ET, 'indent'):
        ET.indent(tree)
    
    try:
        tree.write(output_xml_file_path, encoding='utf-8', xml_declaration=True)
    except IOError as e:
        print(f"Error writing {output_xml_file_path}: {e}")


def process_cvat_folder(xml_input_folder, tif_image_folder, xml_output_folder):
    """
    Processes all XML files in a given folder.
    """
    if not os.path.isdir(xml_input_folder):
        print(f"Error: XML input folder not found: {xml_input_folder}")
        return
    if not os.path.isdir(tif_image_folder):
        print(f"Error: TIF image folder not found: {tif_image_folder}")
        # Continue if you want to process XMLs without filtering images, 
        # but it's safer to ensure it exists if filtering is a key requirement.
        # For this script, we will stop if the image folder is invalid as it's key for step 1
        return
    
    if not os.path.exists(xml_output_folder):
        os.makedirs(xml_output_folder)
        print(f"Created output folder: {xml_output_folder}")

    # Pre-scan the TIF image folder to get a set of existing .tif file names for quick lookup
    print(f"\nScanning TIF image folder: {tif_image_folder}...")
    existing_tif_files = set()
    for filename in os.listdir(tif_image_folder):
        if filename.lower().endswith(('.tif', '.tiff')):
            existing_tif_files.add(filename)
    print(f"Found {len(existing_tif_files)} TIF files in the image folder.")
    if not existing_tif_files:
        print("Warning: No .tif/.tiff files found in the specified image folder. "
              "All <image> tags referencing .tif files will be removed from XMLs if this is not intended.")


    xml_files = glob.glob(os.path.join(xml_input_folder, '*.xml'))
    if not xml_files:
        print(f"No XML files found in {xml_input_folder}")
        return

    print(f"\nProcessing {len(xml_files)} XML file(s) from {xml_input_folder}...")
    for xml_file_path in xml_files:
        base_name = os.path.basename(xml_file_path)
        output_xml_file_path = os.path.join(xml_output_folder, base_name)
        print(f"\nProcessing file: {xml_file_path}")
        clean_cvat_xml(xml_file_path, tif_image_folder, output_xml_file_path, existing_tif_files)
        print(f"  Saved modified XML to: {output_xml_file_path}")
    
    print("\nAll XML processing complete.")

input_xml_directory = r"C:\Users\kevin\Documents\xmls - Copy"
input_image_directory = r"C:\Users\kevin\dev\tornado-tree-destruction-ef\dataset\images" # Folder containing the .tif files
output_xml_directory = r"C:\Users\kevin\Documents\xmls_out"

process_cvat_folder(input_xml_directory, input_image_directory, output_xml_directory)

Created output folder: C:\Users\kevin\Documents\xmls_out

Scanning TIF image folder: C:\Users\kevin\dev\tornado-tree-destruction-ef\dataset\images...
Found 37 TIF files in the image folder.

Processing 10 XML file(s) from C:\Users\kevin\Documents\xmls - Copy...

Processing file: C:\Users\kevin\Documents\xmls - Copy\centreglassville.xml
  Removed 21 'is_crowd' attribute tags.
  Corrected typo in <name> tag: 'unlabelled' -> 'unlabeled'
  Corrected typo in attribute: label='unlabelled' -> label='unlabeled' for tag <box>
  Corrected typo in attribute: label='unlabelled' -> label='unlabeled' for tag <box>
  Corrected 'unlabelled' typo 3 times.
  Saved modified XML to: C:\Users\kevin\Documents\xmls_out\centreglassville.xml

Processing file: C:\Users\kevin\Documents\xmls - Copy\crozier.xml
  Marking image '22_Crozier_461000_5386000.tif' for removal (not found in image folder).
  Removed 1 image entries.
  Removed 193 'is_crowd' attribute tags.
  Corrected typo in <name> tag: 'unlabelled' -> '