# Data split for YOLO

In [1]:
import os
import shutil
import random

def create_dir(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first and creating afterwards.
    os.makedirs(path, exist_ok=True)

def split_dataset(
    images_dir, labels_dir,
    output_root,
    train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1,
    seed=42
):
    """Copy matched image/label pairs into train/valid/test folders.

    An image (``.jpg``, ``.jpeg`` or ``.png``, extension matched
    case-insensitively) is kept only if ``labels_dir`` contains a file named
    ``<image stem>.txt``. The matched pairs are shuffled with ``seed`` and
    copied to ``<output_root>/<split>/images`` and
    ``<output_root>/<split>/labels`` for ``split`` in train, valid, test.

    Split sizes are ``int(ratio * total)`` for train and valid; test receives
    whatever remains, so rounding leftovers end up in the test split.

    Files already present in the output folders are not removed, so re-running
    with a different seed or ratios adds to the earlier result.

    Args:
        images_dir: Directory holding the source images.
        labels_dir: Directory holding the ``.txt`` label files.
        output_root: Directory under which the split folders are created.
        train_ratio: Fraction of pairs assigned to the train split.
        valid_ratio: Fraction of pairs assigned to the valid split.
        test_ratio: Fraction of pairs assigned to the test split.
        seed: Seed for the shuffle; the same seed gives the same split.

    Raises:
        AssertionError: If the three ratios do not sum to 1.
    """
    assert abs(train_ratio + valid_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1."

    # Create output folders
    for split in ('train', 'valid', 'test'):
        for subfolder in ('images', 'labels'):
            os.makedirs(os.path.join(output_root, split, subfolder), exist_ok=True)

    # Sorted listing keeps the pre-shuffle order independent of the filesystem,
    # which is what makes the seeded shuffle reproducible.
    image_files = sorted(
        f for f in os.listdir(images_dir)
        if f.lower().endswith(('.jpg', '.png', '.jpeg'))
    )
    # A set gives constant-time membership checks while pairing.
    label_names = {f for f in os.listdir(labels_dir) if f.lower().endswith('.txt')}

    # Keep only images whose label file exists
    paired_files = []
    for img in image_files:
        lbl = f"{os.path.splitext(img)[0]}.txt"
        if lbl in label_names:
            paired_files.append((img, lbl))

    print(f"Found {len(paired_files)} matched image-label pairs.")

    # Shuffle with a private generator: same order as seeding the global one,
    # but the caller's global `random` state is left untouched.
    random.Random(seed).shuffle(paired_files)

    # Split
    total = len(paired_files)
    train_end = int(train_ratio * total)
    valid_end = train_end + int(valid_ratio * total)

    train_files = paired_files[:train_end]
    valid_files = paired_files[train_end:valid_end]
    test_files = paired_files[valid_end:]

    # Helper to copy files
    def copy_files(file_list, split_name):
        images_out = os.path.join(output_root, split_name, 'images')
        labels_out = os.path.join(output_root, split_name, 'labels')
        for img_file, lbl_file in file_list:
            shutil.copy2(os.path.join(images_dir, img_file), os.path.join(images_out, img_file))
            shutil.copy2(os.path.join(labels_dir, lbl_file), os.path.join(labels_out, lbl_file))

    # Copy files
    copy_files(train_files, 'train')
    copy_files(valid_files, 'valid')
    copy_files(test_files, 'test')

    print("Split completed:")
    print(f"  Train: {len(train_files)}")
    print(f"  Valid: {len(valid_files)}")
    print(f"  Test : {len(test_files)}")

# ==== Example Usage ====
# Keyword arguments for the YOLO split; the ratios are spelled out even though
# they equal the function defaults so the cell documents the split it produced.
yolo_split_args = dict(
    images_dir='D:/Pill_Identification/dataset/For_model/Pill_jpg_2025_Resized',
    labels_dir='D:/Pill_Identification/model/YOLOv8/Pill_YOLO_Labels',
    output_root='D:/Pill_Identification/model/YOLOv8',
    train_ratio=0.7,
    valid_ratio=0.2,
    test_ratio=0.1,
)
split_dataset(**yolo_split_args)


Found 1296 matched image-label pairs.
Split completed:
  Train: 907
  Valid: 259
  Test : 130


# Data split for RetinaNet

In [2]:
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split

# Inputs (annotation CSV, source images) and the root that receives the split
csv_path = "D:/Pill_Identification/RetinaNet/pill_bounding_boxes.csv"
images_root = "D:/Pill_Identification/Pill_Jpeg_Processed"
output_base = "D:/Pill_Identification/RetinaNet"

# Layout: <output_base>/<split>/images/ and <output_base>/<split>/labels.csv
train_img_dir, test_img_dir = (
    os.path.join(output_base, split_name, "images")
    for split_name in ("train", "test")
)
train_label_csv, test_label_csv = (
    os.path.join(output_base, split_name, "labels.csv")
    for split_name in ("train", "test")
)

# Creating the image folders also creates the split folders the CSVs go into
for path in (train_img_dir, test_img_dir):
    os.makedirs(path, exist_ok=True)

# Full annotation table
df = pd.read_csv(csv_path)

# Split on distinct image ids, so all rows sharing an image_id land in the
# same split
unique_imgs = df['image_id'].unique()
train_imgs, test_imgs = train_test_split(
    unique_imgs,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Rows belonging to each split
train_df = df.loc[df['image_id'].isin(train_imgs)]
test_df = df.loc[df['image_id'].isin(test_imgs)]

# Write one label CSV per split
train_df.to_csv(train_label_csv, index=False)
test_df.to_csv(test_label_csv, index=False)

# Function to copy images
def copy_images(img_list, target_dir, source_dir=None):
    """Copy ``<image id>.jpg`` for every id in ``img_list`` into ``target_dir``.

    Source files that do not exist are reported on stdout and skipped; the
    remaining images are still copied.

    Args:
        img_list: Iterable of image ids, i.e. file names without the ``.jpg``
            extension. Ids are formatted into the file name, so non-string
            ids (for example integers) are accepted.
        target_dir: Directory that receives the copies. It must already
            exist; this function does not create it.
        source_dir: Directory to read the images from. When omitted, the
            module-level ``images_root`` is used.
    """
    if source_dir is None:
        source_dir = images_root
    for img in img_list:
        # f-string formatting instead of `img + ".jpg"` so numeric ids do not
        # raise a TypeError.
        filename = f"{img}.jpg"
        src_path = os.path.join(source_dir, filename)
        if os.path.exists(src_path):
            shutil.copy2(src_path, os.path.join(target_dir, filename))
        else:
            print(f"⚠️ Missing image: {src_path}")

# Copy each split's images into that split's image folder (train first, then test)
for split_imgs, split_dir in ((train_imgs, train_img_dir), (test_imgs, test_img_dir)):
    copy_images(split_imgs, split_dir)

print("✅ Done: labels and images split into train/test folders.")


✅ Done: labels and images split into train/test folders.
