In [19]:
import os
import json
import numpy as np
from tqdm import tqdm
import shutil

In [None]:
def split_coco_dataset(json_path, out_dir, train_ratio=0.6, val_ratio=0.2, seed=42):
    """Split a COCO annotation file into train/val/test annotation files.

    The images listed in ``json_path`` are shuffled with a seeded RNG and cut
    into three consecutive slices: the first ``train_ratio`` share becomes the
    training split, the next ``val_ratio`` share the validation split and the
    remainder the test split. Every annotation follows the image named by its
    ``image_id``. The results are written to ``train.json``, ``val.json`` and
    ``test.json`` inside ``out_dir`` (created if missing).

    Args:
        json_path: Path to a COCO-style JSON file with ``images``,
            ``annotations`` and ``categories`` keys.
        out_dir: Directory that receives the three split files.
        train_ratio: Fraction of images assigned to the training split.
        val_ratio: Fraction of images assigned to the validation split.
        seed: Seed for the shuffle; the same seed gives the same split.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
        KeyError: If ``images``, ``annotations`` or ``categories`` is missing.
    """
    # Small tolerance so float sums that should equal 1.0 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    os.makedirs(out_dir, exist_ok=True)

    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    images = coco_data['images']
    annotations = coco_data['annotations']
    categories = coco_data['categories']
    # Any other top-level sections (e.g. "info", "licenses") are carried over
    # unchanged so the split files keep the metadata of the source file.
    extra_sections = {
        key: value
        for key, value in coco_data.items()
        if key not in ('images', 'annotations', 'categories')
    }

    # A private RandomState yields the same permutation as seeding the global
    # NumPy RNG with `seed`, without clobbering the caller's global RNG state.
    rng = np.random.RandomState(seed)
    rng.shuffle(images)

    total_images = len(images)
    train_end = int(total_images * train_ratio)
    val_end = train_end + int(total_images * val_ratio)

    split_images = {
        "train": images[:train_end],
        "val": images[train_end:val_end],
        "test": images[val_end:],  # whatever is left after train and val
    }

    for split_name, split_imgs in split_images.items():
        image_ids = {img['id'] for img in split_imgs}
        split_coco = {
            "images": split_imgs,
            "annotations": [ann for ann in annotations if ann['image_id'] in image_ids],
            "categories": categories,
        }
        split_coco.update(extra_sections)

        with open(os.path.join(out_dir, f"{split_name}.json"), 'w', encoding='utf-8') as f:
            json.dump(split_coco, f, indent=4)

    print("TVT SPLIT DONE!")

In [None]:
# Source COCO annotation file and the directory that receives the split JSON files.
# NOTE(review): `out_dir` is reassigned in a later cell (image output root);
# re-run this cell before calling split_coco_dataset again.
json_path = "/data/ephemeral/home/aihub/all_mite_coco.json"
out_dir = "/data/ephemeral/home/aihub"

In [None]:
# Write train.json / val.json / test.json into out_dir using the function's
# default ratios (0.6 train, 0.2 val, remainder test) and default seed.
split_coco_dataset(json_path, out_dir)

In [15]:
def move_images(json_path, img_dir, out_dir):
    """Copy the images listed in a COCO JSON file into a per-split folder.

    The destination folder is ``out_dir/<stem of json_path>`` (for example
    ``train.json`` -> ``out_dir/train``) and is created if missing. Despite the
    function name, files are copied with ``shutil.copy``; the originals stay
    in ``img_dir``. Images that are listed in the JSON but missing on disk are
    reported with a warning and skipped.

    Args:
        json_path: Path to a COCO-style JSON file with an ``images`` list whose
            entries carry a ``file_name``.
        img_dir: Directory that ``file_name`` values are resolved against.
        out_dir: Root directory under which the split folder is created.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    split_name = os.path.splitext(os.path.basename(json_path))[0]
    target_dir = os.path.join(out_dir, split_name)
    os.makedirs(target_dir, exist_ok=True)

    for image_info in tqdm(coco_data['images'], desc=f"move image to {split_name}"):
        file_name = image_info['file_name']
        src_path = os.path.join(img_dir, file_name)
        dest_path = os.path.join(target_dir, file_name)
        if os.path.exists(src_path):
            # file_name may contain sub-directories; make sure the destination
            # parent exists, otherwise shutil.copy raises FileNotFoundError.
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            shutil.copy(src_path, dest_path)
        else:
            # tqdm.write keeps the progress bar intact, unlike a bare print.
            tqdm.write(f"Warning: Image {src_path} does not exist!")
    print("Image moved to DONE")

In [16]:
# Directory holding the source images, and the root under which move_images
# creates one sub-folder per split.
# NOTE(review): this rebinds `out_dir`, which an earlier cell used as the
# destination for the split JSON files.
img_dir = "/data/ephemeral/home/aihub/images"
out_dir = "/data/ephemeral/home/aihub/out"

In [17]:
# Per-split annotation files; move_images uses each file's stem
# (train / val / test) as the name of the output sub-folder.
train_json = "/data/ephemeral/home/aihub/train.json"
val_json = "/data/ephemeral/home/aihub/val.json"
test_json = "/data/ephemeral/home/aihub/test.json"

In [20]:
# Copy the images referenced by train.json into <out_dir>/train.
move_images(train_json, img_dir, out_dir)

move image to train: 100%|██████████| 2211/2211 [00:03<00:00, 732.95it/s]

Image moved to DONE





In [21]:
# Copy the images referenced by val.json into <out_dir>/val.
move_images(val_json, img_dir, out_dir)

move image to val: 100%|██████████| 737/737 [00:00<00:00, 766.67it/s]

Image moved to DONE





In [22]:
# Copy the images referenced by test.json into <out_dir>/test.
move_images(test_json, img_dir, out_dir)

move image to test: 100%|██████████| 737/737 [00:01<00:00, 732.63it/s]


Image moved to DONE
