In [None]:
import os
import xml.etree.ElementTree as ET
import shutil
import random

In [29]:
# Root directory of the dataset. NOTE: this only records the path; nothing here
# checks or creates the folder.
extract_dir = './dataset'

In [59]:
# Destination folders for the split (train/valid) and the source folder (test)
# whose contents are redistributed.
train_folder = os.path.join(extract_dir, 'train')
valid_folder = os.path.join(extract_dir, 'valid')
test_folder = os.path.join(extract_dir, 'test')


# Keep only image and label files; sort so the shuffle below is reproducible
# regardless of the order os.listdir happens to return.
image_files = sorted(
    f for f in os.listdir(test_folder)
    if os.path.splitext(f)[1].lower() in {".jpg", ".jpeg", ".png"}
)
label_files = sorted(
    f for f in os.listdir(test_folder)
    if os.path.splitext(f)[1].lower() in {".xml", ".txt"}
)

if len(image_files) != len(label_files):
    raise ValueError("Jumlah file gambar dan label tidak sesuai. Harap periksa dataset.")

# Pair each image with the label that shares its base name. Zipping two
# independently sorted lists only works by coincidence: equal counts with
# mismatched names would silently attach the wrong label to an image.
images_by_stem = {os.path.splitext(f)[0]: f for f in image_files}
labels_by_stem = {os.path.splitext(f)[0]: f for f in label_files}

if len(images_by_stem) != len(image_files) or len(labels_by_stem) != len(label_files):
    # Two files share a base name (e.g. a.jpg + a.png, or a.xml + a.txt).
    raise ValueError("Jumlah file gambar dan label tidak sesuai. Harap periksa dataset.")

unmatched = sorted(set(images_by_stem) ^ set(labels_by_stem))
if unmatched:
    raise ValueError(
        f"Pasangan gambar dan label tidak cocok (tanpa pasangan: {unmatched}). "
        "Harap periksa dataset."
    )

# Iterate in sorted image-filename order so that, for a consistent dataset, the
# pair order (and therefore the seeded shuffle) matches the previous behaviour.
paired_files = [
    (img, labels_by_stem[os.path.splitext(img)[0]]) for img in image_files
]

# Shuffle the pairs with a fixed seed for a reproducible split
random.seed(42)
random.shuffle(paired_files)

# 70% of the pairs go to training, the remainder to validation
split_index = int(0.7 * len(paired_files))

train_files = paired_files[:split_index]
valid_files = paired_files[split_index:]
# Helper that moves image/label pairs between folders
def move_files(file_pairs, source_folder, target_folder):
    """Move every (image, label) pair from ``source_folder`` to ``target_folder``.

    Args:
        file_pairs: iterable of ``(image_filename, label_filename)`` tuples;
            both names are relative to ``source_folder``.
        source_folder: directory that currently holds the files.
        target_folder: directory the files are moved into. It is created
            (including parents) when it does not exist yet.
    """
    # shutil.move does not create missing destination directories; without
    # this the first move fails with FileNotFoundError on a fresh dataset.
    os.makedirs(target_folder, exist_ok=True)
    for img_file, label_file in file_pairs:
        shutil.move(os.path.join(source_folder, img_file), os.path.join(target_folder, img_file))
        shutil.move(os.path.join(source_folder, label_file), os.path.join(target_folder, label_file))

# Move each split out of the source folder into its destination folder
move_files(train_files, test_folder, train_folder)
move_files(valid_files, test_folder, valid_folder)


# The second group is moved into valid_folder, so report it as the validation
# split rather than as "testing".
print(f"Data berhasil dibagi: {len(train_files)} untuk training, {len(valid_files)} untuk validasi.")


Data berhasil dibagi: 53 untuk training, 24 untuk testing.


In [60]:
# Maps each dataset sub-folder to the prefix used for its renamed files
folder_prefix_map = {
    "train": "train",
    "valid": "valid"
}

image_extensions = {".jpg", ".jpeg", ".png"}
label_extensions = {".xml", ".txt"}

# Class names that are kept; any other <object> is removed from the XML
allowed_labels = {'struk_belanja'}


def _renamed_index(stem, prefix):
    """Return N if ``stem`` is ``<prefix>_image_N`` or ``<prefix>_label_N``, else None."""
    for kind in ("image", "label"):
        head = f"{prefix}_{kind}_"
        tail = stem[len(head):]
        if stem.startswith(head) and tail.isdecimal():
            return int(tail)
    return None


for folder, prefix in folder_prefix_map.items():
    folder_path = os.path.join(extract_dir, folder)
    if not os.path.isdir(folder_path):
        print(f"Folder {folder_path} tidak ditemukan.")
        continue

    # Group image/label files by base name. Extensions are compared
    # case-insensitively, matching the filter used when the data was split.
    file_map = {}
    highest_index = 0
    for filename in sorted(os.listdir(folder_path)):
        name, ext = os.path.splitext(filename)
        ext = ext.lower()
        if ext not in image_extensions and ext not in label_extensions:
            continue
        # Files already carrying the target naming scheme are left alone, so
        # re-running the cell neither renumbers them nor lets os.rename
        # overwrite an existing file (it replaces silently on POSIX).
        existing_index = _renamed_index(name, prefix)
        if existing_index is not None:
            highest_index = max(highest_index, existing_index)
            continue
        file_map.setdefault(name, {})[ext] = filename

    # One shared counter per pair: image N always belongs to label N, even
    # when a pair is missing its image or its label.
    pair_index = highest_index + 1
    for base_name, files in file_map.items():
        # sorted() makes the choice deterministic if several candidates exist
        image_ext = next((ext for ext in sorted(image_extensions) if ext in files), None)
        label_ext = next((ext for ext in sorted(label_extensions) if ext in files), None)

        new_image_filename = ""
        # Rename the image
        if image_ext is not None:
            old_image_filename = files[image_ext]
            new_image_filename = f"{prefix}_image_{pair_index}{image_ext}"
            os.rename(os.path.join(folder_path, old_image_filename), os.path.join(folder_path, new_image_filename))
            print(f"Renamed: {old_image_filename} -> {new_image_filename}")

        # Rename the label
        if label_ext is not None:
            old_label_filename = files[label_ext]
            new_label_filename = f"{prefix}_label_{pair_index}{label_ext}"
            old_label_path = os.path.join(folder_path, old_label_filename)
            new_label_path = os.path.join(folder_path, new_label_filename)

            os.rename(old_label_path, new_label_path)
            print(f"Renamed: {old_label_filename} -> {new_label_filename}")

            # Update the XML content and drop unwanted classes
            if label_ext == ".xml":
                tree = ET.parse(new_label_path)
                root = tree.getroot()

                # Point <filename> and <path> at the renamed image; skipped
                # when no image was found so they are not blanked out.
                if new_image_filename:
                    for tag in ("filename", "path"):
                        element = root.find(tag)
                        if element is not None:
                            element.text = new_image_filename

                # findall returns a list, so removing while looping is safe.
                # An <object> without a <name> cannot be an allowed class.
                for obj in root.findall('object'):
                    name_element = obj.find('name')
                    label = name_element.text if name_element is not None else None
                    if label not in allowed_labels:
                        root.remove(obj)

                # Save the filtered XML in place
                tree.write(new_label_path)
                print(f"Filtered and updated XML: {new_label_filename}")

        pair_index += 1


Renamed: 0_jpg.rf.edfc36b893bb5852b605b16fba630ac5.jpg -> train_image_1.jpg
Renamed: 0_jpg.rf.edfc36b893bb5852b605b16fba630ac5.xml -> train_label_1.xml
Filtered and updated XML: train_label_1.xml
Renamed: 100_jpg.rf.c788257ba5f3206ce8d9f20e2afe5bed.jpg -> train_image_2.jpg
Renamed: 100_jpg.rf.c788257ba5f3206ce8d9f20e2afe5bed.xml -> train_label_2.xml
Filtered and updated XML: train_label_2.xml
Renamed: 10210397_20180607101646_jpg.rf.f4a2894bbb2936c1232f6f79d1dbe895.jpg -> train_image_3.jpg
Renamed: 10210397_20180607101646_jpg.rf.f4a2894bbb2936c1232f6f79d1dbe895.xml -> train_label_3.xml
Filtered and updated XML: train_label_3.xml
Renamed: 102_jpg.rf.d401759ffa3b36cceac4810d7d067ad6.jpg -> train_image_4.jpg
Renamed: 102_jpg.rf.d401759ffa3b36cceac4810d7d067ad6.xml -> train_label_4.xml
Filtered and updated XML: train_label_4.xml
Renamed: 105_jpg.rf.d49d24d0e291ed9aae619cf692273ac4.jpg -> train_image_5.jpg
Renamed: 105_jpg.rf.d49d24d0e291ed9aae619cf692273ac4.xml -> train_label_5.xml
Filtered

In [None]:
import os
import shutil
import xml.etree.ElementTree as ET

# Folder that receives annotation files (and their images) that fail validation
invalid_dir = os.path.join(extract_dir, "incorrect_annotations")
# Sub-folder names the validation loop iterates over
folders_to_check = ["train", "valid"]

# Make sure the destination folder exists
os.makedirs(invalid_dir, exist_ok=True)

# Expected image file extensions
image_extensions = {".jpg", ".jpeg", ".png"}

def validate_and_move(xml_file_path, folder_path):
    """Validate one annotation XML and quarantine it (with its image) if invalid.

    An annotation is moved via the two-argument ``move_files`` when:
      * the XML cannot be parsed,
      * it has no usable ``<filename>`` entry,
      * any ``<object>`` has a class outside the module-level ``allowed_labels``
        (an ``<object>`` without ``<name>`` counts as such), or
      * not every class in ``allowed_labels`` occurs in the file.

    Args:
        xml_file_path: path of the annotation file to check.
        folder_path: directory in which the image named by ``<filename>`` is
            looked up.

    Any other exception is reported with ``print`` and the files are left in
    place.
    """
    try:
        root = ET.parse(xml_file_path).getroot()

        # Resolve the image path from <filename>; the tag may be absent or empty.
        filename_element = root.find('filename')
        image_file_name = filename_element.text if filename_element is not None else None
        image_file_path = os.path.join(folder_path, image_file_name) if image_file_name else None

        # Collect every class in the file; a missing <name> is recorded as None
        # so it is rejected below instead of raising AttributeError.
        classes_in_file = []
        for obj in root.findall('object'):
            name_element = obj.find('name')
            classes_in_file.append(name_element.text if name_element is not None else None)

        # Previously a missing <filename> ended in the generic handler below
        # and the broken annotation stayed in the dataset.
        if image_file_path is None:
            print(f"Missing <filename> in: {xml_file_path}")
            move_files(xml_file_path, None)
            return

        # Reject the file on the first class that is not allowed
        for class_name in classes_in_file:
            if class_name not in allowed_labels:
                print(f"Invalid annotation found in: {xml_file_path}, class: {class_name}")
                move_files(xml_file_path, image_file_path)
                return

        # Every allowed class must be present at least once
        if not all(cls in classes_in_file for cls in allowed_labels):
            print(f"Missing required classes in: {xml_file_path}")
            move_files(xml_file_path, image_file_path)

    except ET.ParseError:
        # The image name is unknown when the XML is unreadable; move the XML only
        print(f"Error parsing XML file: {xml_file_path}. Moving to incorrect folder.")
        move_files(xml_file_path, None)
    except Exception as e:
        print(f"Unexpected error: {e}")

def move_files(xml_file_path, image_file_path):
    """Move an annotation file, and its image when present, into ``invalid_dir``.

    NOTE: this definition shadows the three-argument ``move_files`` defined in
    the earlier split cell of this notebook.

    Args:
        xml_file_path: path of the XML file to move.
        image_file_path: path of the matching image, or a falsy value when
            there is no image to move.
    """
    # The annotation is always moved
    xml_name = os.path.basename(xml_file_path)
    shutil.move(xml_file_path, os.path.join(invalid_dir, xml_name))
    print(f"Moved XML: {xml_name} -> {invalid_dir}")

    # No image was requested
    if not image_file_path:
        return

    # An image was requested but is not on disk: report it and stop
    if not os.path.exists(image_file_path):
        print(f"Image file not found: {image_file_path}")
        return

    image_name = os.path.basename(image_file_path)
    shutil.move(image_file_path, os.path.join(invalid_dir, image_name))
    print(f"Moved Image: {image_name} -> {invalid_dir}")

# Walk every folder that has to be checked
for folder in folders_to_check:
    # Resolve against extract_dir, the same root invalid_dir is built from.
    # The previous name, base_dir, is never defined in this notebook and
    # raises NameError on a fresh kernel.
    folder_path = os.path.join(extract_dir, folder)
    if not os.path.exists(folder_path):
        print(f"Folder {folder_path} tidak ditemukan. Melewati folder ini.")
        continue

    # Validate every XML file in the folder. os.listdir returns a list, so
    # files moved away during the loop do not disturb the iteration.
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith('.xml') and os.path.isfile(file_path):
            validate_and_move(file_path, folder_path)

print("Proses validasi dan pemindahan selesai.")


Missing required classes in: dataset\train\train_label_162.xml
Moved XML: train_label_162.xml -> dataset\incorrect_annotations
Moved Image: train_image_162.jpg -> dataset\incorrect_annotations
Missing required classes in: dataset\valid\valid_label_51.xml
Moved XML: valid_label_51.xml -> dataset\incorrect_annotations
Moved Image: valid_image_51.jpg -> dataset\incorrect_annotations
Proses validasi dan pemindahan selesai.
