In [None]:
import zipfile
# Unpack the dataset archive into the Syook folder on Drive.
archive_path = '/content/drive/MyDrive/Syook/datasets.zip'
extract_root = '/content/drive/MyDrive/Syook'

with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall(extract_root)

In [None]:
import os
import xml.etree.ElementTree as ET

VOC_ANNOTATIONS_PATH = "/content/drive/MyDrive/Syook/datasets/Voc_labels"  # XML annotation directory
YOLO_LABELS_PATH = "/content/drive/MyDrive/Syook/datasets/labels"          # Output YOLO format directory

# The position of a name in this list is the class id written to the label files.
CLASSES = ["person", "hard-hat", "gloves", "mask", "glasses", "boots", "vest", "ppe-suit", "ear-protector", "safety-harness"]

os.makedirs(YOLO_LABELS_PATH, exist_ok=True)

# Bookkeeping so that nothing is dropped silently.
skipped_files = []     # XML files whose <size> is zero/negative (cannot normalize)
skipped_classes = {}   # unrecognized class name -> number of objects skipped
dropped_boxes = 0      # boxes left with no area after clamping to the image

# Convert each XML file
for xml_file in os.listdir(VOC_ANNOTATIONS_PATH):
    if not xml_file.endswith(".xml"):
        continue

    # Parse XML
    tree = ET.parse(os.path.join(VOC_ANNOTATIONS_PATH, xml_file))
    root = tree.getroot()

    # Get image dimensions. Go through float() first so values written as
    # "640.0" do not raise ValueError the way a bare int() would.
    width = int(float(root.find("size/width").text))
    height = int(float(root.find("size/height").text))
    if width <= 0 or height <= 0:
        # Normalizing would divide by zero; writing an empty label file would
        # wrongly mark the image as background, so skip the file entirely.
        skipped_files.append(xml_file)
        continue

    yolo_data = []

    for obj in root.findall("object"):
        class_name = obj.find("name").text.strip()
        if class_name not in CLASSES:
            # Skip unrecognized classes, but remember them for the summary below
            skipped_classes[class_name] = skipped_classes.get(class_name, 0) + 1
            continue

        class_id = CLASSES.index(class_name)

        # Get bounding box coordinates (float() accepts "12" as well as "12.5"),
        # clamped to the image so the normalized values stay within [0, 1].
        bbox = obj.find("bndbox")
        xmin = min(max(float(bbox.find("xmin").text), 0.0), width)
        ymin = min(max(float(bbox.find("ymin").text), 0.0), height)
        xmax = min(max(float(bbox.find("xmax").text), 0.0), width)
        ymax = min(max(float(bbox.find("ymax").text), 0.0), height)

        if xmax <= xmin or ymax <= ymin:
            # Zero-area or inverted box: nothing meaningful to write
            dropped_boxes += 1
            continue

        # Convert to YOLO format (normalize values)
        x_center = (xmin + xmax) / (2 * width)
        y_center = (ymin + ymax) / (2 * height)
        box_width = (xmax - xmin) / width
        box_height = (ymax - ymin) / height

        # Append YOLO annotation
        yolo_data.append(f"{class_id} {x_center:.6f} {y_center:.6f} {box_width:.6f} {box_height:.6f}")

    # Swap only the final extension; str.replace would also rewrite any
    # ".xml" occurring earlier in the file name.
    txt_filename = os.path.join(YOLO_LABELS_PATH, os.path.splitext(xml_file)[0] + ".txt")
    with open(txt_filename, "w") as f:
        f.write("\n".join(yolo_data))

# Report anything that was left out of the converted labels.
if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) with invalid image size: {skipped_files}")
if skipped_classes:
    print(f"Skipped objects with unrecognized classes: {skipped_classes}")
if dropped_boxes:
    print(f"Dropped {dropped_boxes} zero-area bounding box(es)")

In [None]:
import os
import shutil
import random

# Set paths
base_dir = "/content/drive/MyDrive/Syook/datasets"
image_dir = os.path.join(base_dir, "images")
label_dir = os.path.join(base_dir, "labels")

# Define output directories
output_base = "/content/drive/MyDrive/Syook/datasets/data_split"
train_image_dir = os.path.join(output_base, "images/train")
train_label_dir = os.path.join(output_base, "labels/train")
val_image_dir = os.path.join(output_base, "images/val")
val_label_dir = os.path.join(output_base, "labels/val")

# Create output directories
os.makedirs(train_image_dir, exist_ok=True)
os.makedirs(train_label_dir, exist_ok=True)
os.makedirs(val_image_dir, exist_ok=True)
os.makedirs(val_label_dir, exist_ok=True)

# Get all image files (assuming images are .jpg or .png).
# os.listdir returns names in arbitrary order, so sort them first; without a
# fixed starting order the seeded shuffle below would not be reproducible.
# NOTE: if data_split already holds files from an earlier split, clear it
# before re-running, otherwise old and new assignments get mixed.
image_files = sorted(f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png')))

# Shuffle randomly
random.seed(42)  # For reproducibility
random.shuffle(image_files)

# Split into train (80%) and val (20%)
split_index = int(len(image_files) * 0.8)
train_files = image_files[:split_index]
val_files = image_files[split_index:]

# Function to copy images and their corresponding labels into a split folder
def move_files(file_list, src_img_dir, src_lbl_dir, dest_img_dir, dest_lbl_dir):
    """Copy each listed image and, when present, its label file.

    Despite the name, nothing is moved: files are copied with ``shutil.copy``
    and the sources stay in place.

    Args:
        file_list: Image file names (not paths) located in ``src_img_dir``.
        src_img_dir: Directory holding the source images.
        src_lbl_dir: Directory holding the source ``.txt`` label files.
        dest_img_dir: Existing directory that receives the image copies.
        dest_lbl_dir: Existing directory that receives the label copies.
    """
    for img_file in file_list:
        # The label shares the image's stem. Swap only the final extension:
        # chained str.replace would also rewrite ".jpg"/".png" appearing
        # earlier in the name and point at a label that does not exist.
        label_file = os.path.splitext(img_file)[0] + ".txt"

        # Copy image
        shutil.copy(os.path.join(src_img_dir, img_file), os.path.join(dest_img_dir, img_file))

        # Copy label only if it exists (images without a label are kept as-is)
        label_path = os.path.join(src_lbl_dir, label_file)
        if os.path.exists(label_path):
            shutil.copy(label_path, os.path.join(dest_lbl_dir, label_file))

# Populate the training split (images plus any matching labels)
move_files(
    file_list=train_files,
    src_img_dir=image_dir,
    src_lbl_dir=label_dir,
    dest_img_dir=train_image_dir,
    dest_lbl_dir=train_label_dir,
)

# Populate the validation split the same way
move_files(
    file_list=val_files,
    src_img_dir=image_dir,
    src_lbl_dir=label_dir,
    dest_img_dir=val_image_dir,
    dest_lbl_dir=val_label_dir,
)

print("✅ Train-Val split done! Check /content/drive/MyDrive/Syook/datasets/data_split/images/train")

✅ Train-Val split done! Check /content/drive/MyDrive/Syook/datasets/data_split/images/train


In [None]:
import os
import shutil
import random

# Input labels directory
label_dir = "/content/drive/MyDrive/Syook/datasets/labels"

# Image split that already exists on disk. The labels must follow it:
# shuffling the label files on their own can place a label in a different
# split than its image, because the two listings are not guaranteed to have
# the same order or even the same number of entries.
train_image_dir = "/content/drive/MyDrive/Syook/datasets/data_split/images/train"
val_image_dir = "/content/drive/MyDrive/Syook/datasets/data_split/images/val"

# Output directories
train_label_dir = "/content/drive/MyDrive/Syook/datasets/data_split/labels/train"
val_label_dir = "/content/drive/MyDrive/Syook/datasets/data_split/labels/val"

# Create output directories
os.makedirs(train_label_dir, exist_ok=True)
os.makedirs(val_label_dir, exist_ok=True)


def copy_split_labels(split_image_dir, src_label_dir, dest_label_dir):
    """Copy the label of every image in ``split_image_dir`` into ``dest_label_dir``.

    Args:
        split_image_dir: Folder with the images of one split (.jpg / .png).
            ``os.listdir`` raises ``FileNotFoundError`` if it does not exist,
            i.e. if the image split has not been created yet.
        src_label_dir: Folder with all ``.txt`` label files.
        dest_label_dir: Existing folder that receives the label copies.

    Returns:
        Sorted list of the label file names that were copied. Images without
        a label file are skipped.
    """
    copied = []
    for img_file in sorted(os.listdir(split_image_dir)):
        if not img_file.endswith(('.jpg', '.png')):
            continue
        label_file = os.path.splitext(img_file)[0] + ".txt"
        label_path = os.path.join(src_label_dir, label_file)
        if os.path.exists(label_path):
            shutil.copy(label_path, os.path.join(dest_label_dir, label_file))
            copied.append(label_file)

    # Labels left behind by an earlier, differently-ordered split would pair
    # with images of the other split; report them instead of deleting anything.
    stale = sorted(
        set(f for f in os.listdir(dest_label_dir) if f.endswith(".txt")) - set(copied)
    )
    if stale:
        print(f"⚠️ {len(stale)} label file(s) in {dest_label_dir} have no image in {split_image_dir}")
    return copied


# Get all label files (kept for inspection; the split itself follows the images)
label_files = [f for f in os.listdir(label_dir) if f.endswith(".txt")]

# Copy label files so each one lands in the same split as its image
train_labels = copy_split_labels(train_image_dir, label_dir, train_label_dir)
val_labels = copy_split_labels(val_image_dir, label_dir, val_label_dir)

print("✅ Labels split into train and val folders!")

✅ Labels split into train and val folders!


In [None]:
import os
# Number of entries in the training image folder
train_images_path = '/content/drive/MyDrive/Syook/datasets/data_split/images/train'
len(os.listdir(train_images_path))

332

In [None]:
# Number of entries in the validation image folder
val_images_path = '/content/drive/MyDrive/Syook/datasets/data_split/images/val'
len(os.listdir(val_images_path))

84

In [None]:
ls /content/drive/MyDrive/Syook/datasets/data_split/labels/train

001003.txt
001029.txt
001038.txt
001042.txt
001045.txt
001054.txt
001056_jpg.rf.fb5d9fbc2ccfa43ca89d84be6d2a98ea.txt
001059.txt
001060.txt
001062.txt
001071.txt
001073.txt
001080.txt
001082.txt
001083.txt
001085.txt
001086.txt
001092.txt
001096.txt
001107_jpg.rf.ddc4b21edf46aaa9518dfe33a381ff29.txt
001109.txt
001124.txt
001142.txt
001143.txt
001146.txt
001147.txt
001155.txt
001158.txt
001164.txt
001175.txt
001180.txt
001184.txt
001186.txt
001187.txt
001188.txt
001191.txt
001196.txt
001198.txt
001199.txt
001201.txt
001202.txt
001209.txt
001213.txt
001214_jpg.rf.1341753c952df6e0889b1f781af22c77.txt
001216_jpg.rf.c7de195db643cb4d72f58f262b39b050.txt
001221.txt
001222.txt
001223.txt
001224.txt
001225.txt
001229.txt
001231.txt
001232.txt
001236.txt
001246_jpg.rf.05724a1c67f05c4fbd6fb3d872bc98b4.txt
001265.txt
001268.txt
001271.txt
001282.txt
001284.txt
001295.txt
001297.txt
001302_jpg.rf.6e51fb4e9255ceda9bca16f35d4ae32b.txt
001302.txt
001303.txt
001305.txt
001308.txt
001318.txt
001320.txt
0

In [None]:
# Entry counts per split folder; images and labels should match pairwise
print(f"Train Images: {len(os.listdir(train_image_dir))}")
print(f"Train Labels: {len(os.listdir(train_label_dir))}")
print(f"Val Images: {len(os.listdir(val_image_dir))}")
print(f"Val Labels: {len(os.listdir(val_label_dir))}")


Train Images: 332
Train Labels: 332
Val Images: 84
Val Labels: 84


In [None]:
import os

# Peek at the first five names (alphabetical order) of the training images
# and of the training labels, to eyeball that the stems line up.
image_dir = "/content/drive/MyDrive/Syook/datasets/data_split/images/train"
label_dir = "/content/drive/MyDrive/Syook/datasets/data_split/labels/train"

image_files = [f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png'))]
image_files.sort()
image_files = image_files[:5]

label_files = os.listdir(label_dir)
label_files.sort()
label_files = label_files[:5]

print(f"Sample image files: {image_files}")
print(f"Sample label files: {label_files}")

Sample image files: ['-1532-_png_jpg.rf.08a5b6985f24bfe7efefdb45c04469c2.jpg', '-1571-_png_jpg.rf.ccf919d5ea025ddc24cf0a707b331249.jpg', '-1579-_png_jpg.rf.c8f91ec3791bf03ccf9eca6c29f62aec.jpg', '-1597-_png_jpg.rf.3bd5df66feaa51e0d65197b4acaf356f.jpg', '-1817-_png_jpg.rf.0c0c9d7ee4b875c6ad49937fc72182f6.jpg']
Sample label files: ['-1532-_png_jpg.rf.08a5b6985f24bfe7efefdb45c04469c2.txt', '-1571-_png_jpg.rf.ccf919d5ea025ddc24cf0a707b331249.txt', '-1579-_png_jpg.rf.c8f91ec3791bf03ccf9eca6c29f62aec.txt', '-1597-_png_jpg.rf.3bd5df66feaa51e0d65197b4acaf356f.txt', '-1817-_png_jpg.rf.0c0c9d7ee4b875c6ad49937fc72182f6.txt']
