# Settings

## Google Drive Mount

In [1]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive; the dataset paths used below live under /content/drive/MyDrive.
drive.mount('/content/drive')


Mounted at /content/drive


## Set File Path

In [2]:
# COCO-format dataset folders that will be merged into one dataset.
coco_dataset_paths = [
    '/content/drive/MyDrive/Pill.individual.coco',
    '/content/drive/MyDrive/Pills Detection.v1i.coco',
]

# Destination folder for the merged YOLO-format dataset.
yolo_output_path = '/content/drive/MyDrive/yolo_dataset_final'

print("경로 확인:")
print(f"\n병합할 데이터셋 ({len(coco_dataset_paths)}개):")

# Stays True only if every dataset folder is found; later cells gate on it.
all_exist = True

for index, dataset_path in enumerate(coco_dataset_paths, start=1):
    found = os.path.exists(dataset_path)
    print(f"  {'✓' if found else '✗'} Dataset {index}: {dataset_path}")

    if not found:
        all_exist = False
        continue

    # List only the immediate sub-directories of the dataset folder.
    subfolders = [
        name
        for name in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, name))
    ]
    print(f"      폴더: {', '.join(subfolders)}")

print(f"\n출력 경로: {yolo_output_path}")

if not all_exist:
    print("\n일부 경로를 찾을 수 없습니다.")

경로 확인:

병합할 데이터셋 (2개):
  ✓ Dataset 1: /content/drive/MyDrive/Pill.individual.coco
      폴더: train, test, valid
  ✓ Dataset 2: /content/drive/MyDrive/Pills Detection.v1i.coco
      폴더: train, valid, test

출력 경로: /content/drive/MyDrive/yolo_dataset_final


## Step 3: 변환 함수 정의

In [3]:
import json
import shutil
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm

def coco_to_yolo_bbox(bbox, img_width, img_height, clip=True):
    """Convert a COCO bounding box to a normalized YOLO bounding box.

    Args:
        bbox: Sequence ``(x, y, w, h)`` in pixels, where ``(x, y)`` is the
            box's top-left corner and ``(w, h)`` its size.
        img_width: Image width in pixels.
        img_height: Image height in pixels.
        clip: When True (the default), a box that extends past the image is
            trimmed to the image rectangle before normalizing, so every
            returned value lies in ``[0, 1]``. A box lying completely outside
            the image collapses to zero width and/or height on the nearest
            edge. Pass False to get the raw, unclipped conversion.

    Returns:
        Tuple ``(x_center, y_center, width, height)``; x values are divided
        by ``img_width`` and y values by ``img_height``.

    Raises:
        ZeroDivisionError: If ``img_width`` or ``img_height`` is 0.
    """
    x, y, w, h = bbox

    if clip:
        left = min(max(x, 0), img_width)
        top = min(max(y, 0), img_height)
        right = min(max(x + w, 0), img_width)
        bottom = min(max(y + h, 0), img_height)

        # Only rewrite the box when clipping actually moved an edge, so that
        # in-bounds boxes follow exactly the same arithmetic as before.
        if (left, top, right, bottom) != (x, y, x + w, y + h):
            x, y, w, h = left, top, right - left, bottom - top

    x_center = x + w / 2
    y_center = y + h / 2

    x_center_norm = x_center / img_width
    y_center_norm = y_center / img_height
    w_norm = w / img_width
    h_norm = h / img_height

    return x_center_norm, y_center_norm, w_norm, h_norm

def process_coco_split(coco_json_path, images_dir, output_images_dir, output_labels_dir, dataset_name=""):
    """Copy one COCO split's images and write a YOLO label file for each.

    For every image listed in the COCO JSON that exists in ``images_dir`` and
    is not already present in ``output_images_dir``, a ``<stem>.txt`` label
    file is written to ``output_labels_dir`` (one line per annotation, empty
    if the image has none) and the image is copied to ``output_images_dir``.

    Args:
        coco_json_path: Path to the split's COCO annotation JSON file.
        images_dir: Directory containing the images named by ``file_name``.
        output_images_dir: Existing directory that receives the image copies.
        output_labels_dir: Existing directory that receives the label files.
        dataset_name: Text shown as the progress-bar description.

    Returns:
        Tuple ``(processed_images, processed_annotations)`` counting only the
        images copied and label lines written by this call; images skipped as
        duplicates or because the source file is missing are not included.
    """
    print(f"\n  Processing: {coco_json_path}")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(coco_json_path, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    id_to_image = {img['id']: img for img in coco_data['images']}

    # Group annotations by the image they belong to.
    image_annotations = defaultdict(list)
    for ann in coco_data['annotations']:
        image_annotations[ann['image_id']].append(ann)

    processed_images = 0
    processed_annotations = 0
    skipped_duplicates = 0
    missing_images = 0

    for img_id, img_info in tqdm(id_to_image.items(), desc=f"  {dataset_name}", leave=False):
        filename = img_info['file_name']
        img_width = img_info['width']
        img_height = img_info['height']

        src_image_path = Path(images_dir) / filename

        if not src_image_path.exists():
            # Listed in the JSON but absent on disk: count it so the loss is
            # visible in the summary instead of disappearing silently.
            missing_images += 1
            continue

        dst_image_path = Path(output_images_dir) / filename
        label_filename = Path(filename).stem + '.txt'
        label_path = Path(output_labels_dir) / label_filename

        # An image already present in the output (from an earlier dataset or
        # an earlier run) is left untouched together with its label.
        if dst_image_path.exists():
            skipped_duplicates += 1
            continue

        annotations = image_annotations.get(img_id, [])

        # Write the label BEFORE copying the image. The copied image is the
        # "already done" marker checked above, so if the run is interrupted
        # between the two steps the next run redoes this image rather than
        # skipping it and leaving it without a label file.
        with open(label_path, 'w', encoding='utf-8') as f:
            for ann in annotations:
                bbox = ann['bbox']
                x_center, y_center, w, h = coco_to_yolo_bbox(bbox, img_width, img_height)

                # Every category is collapsed into the single class 0 ("Pill").
                class_id = 0

                f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}\n")
                processed_annotations += 1

        shutil.copy2(src_image_path, dst_image_path)
        processed_images += 1

    summary = f"    ✓ Images: {processed_images}, Annotations: {processed_annotations}"
    if skipped_duplicates > 0:
        summary += f", Skipped duplicates: {skipped_duplicates}"
    if missing_images > 0:
        summary += f", Missing source images: {missing_images}"
    print(summary)

    return processed_images, processed_annotations

def merge_coco_datasets_to_yolo(coco_dataset_paths, output_path):
    """Merge several COCO dataset folders into one YOLO dataset folder.

    For each of the splits ``train``, ``valid`` and ``test`` this creates
    ``<output_path>/images/<split>`` and ``<output_path>/labels/<split>``,
    then runs ``process_coco_split`` on every dataset that has a
    ``<split>/_annotations.coco.json`` file. Finally a ``data.yaml``
    describing the single-class dataset and its statistics is written to
    ``output_path``.

    Args:
        coco_dataset_paths: Sequence of COCO dataset root folders.
        output_path: Root folder of the merged YOLO dataset.

    Returns:
        Dict mapping each split name to ``{'images': int, 'annotations': int}``
        as accumulated from the ``process_coco_split`` return values.
    """
    yolo_root = Path(output_path)
    rule = "=" * 60
    split_names = ('train', 'valid', 'test')

    # One counter pair per split, filled in while the splits are processed.
    totals = {name: {'images': 0, 'annotations': 0} for name in split_names}

    print(rule)
    print("Merging COCO Datasets to YOLO Format")
    print(rule)
    print(f"\nDatasets to merge: {len(coco_dataset_paths)}")

    for position, dataset_root in enumerate(coco_dataset_paths, 1):
        print(f"  {position}. {dataset_root}")

    for name in split_names:
        print(f"\n{rule}")
        print(f"Processing split: {name}")
        print(rule)

        images_out = yolo_root / 'images' / name
        labels_out = yolo_root / 'labels' / name
        for directory in (images_out, labels_out):
            directory.mkdir(parents=True, exist_ok=True)

        counters = totals[name]

        for position, dataset_root in enumerate(coco_dataset_paths, 1):
            source_dir = Path(dataset_root) / name
            if not source_dir.exists():
                print(f"\n  Dataset {position}: {name}/ not found, skipping...")
                continue

            coco_json = source_dir / '_annotations.coco.json'
            if not coco_json.exists():
                print(f"\n  Dataset {position}: _annotations.coco.json not found, skipping...")
                continue

            # Images sit directly inside the split folder, next to the JSON.
            copied, written = process_coco_split(
                coco_json,
                source_dir,
                images_out,
                labels_out,
                dataset_name=f"Dataset {position}"
            )
            counters['images'] += copied
            counters['annotations'] += written

        print(f"\n  {name} total: {counters['images']} images, {counters['annotations']} annotations")

    grand_images = sum(entry['images'] for entry in totals.values())
    grand_annotations = sum(entry['annotations'] for entry in totals.values())

    yaml_text = f"""# Pill Detection Dataset
# Merged from {len(coco_dataset_paths)} COCO datasets

# Paths
path: {yolo_root}  # dataset root dir
train: images/train  # train images (relative to 'path')
val: images/valid   # val images (relative to 'path')
test: images/test   # test images (optional)

# Classes
nc: 1  # number of classes
names: ['Pill']  # class names

# Dataset Statistics
train_images: {totals['train']['images']}
train_annotations: {totals['train']['annotations']}
valid_images: {totals['valid']['images']}
valid_annotations: {totals['valid']['annotations']}
test_images: {totals['test']['images']}
test_annotations: {totals['test']['annotations']}
total_images: {grand_images}
total_annotations: {grand_annotations}
"""

    with open(yolo_root / 'data.yaml', 'w') as handle:
        handle.write(yaml_text)

    return totals

print("✓ 함수 정의 완료!")

✓ 함수 정의 완료!


## Step 4: 병합 및 변환 실행

In [4]:
# Run the merge only when the path check found every source dataset.
if all_exist:
    stats = merge_coco_datasets_to_yolo(coco_dataset_paths, yolo_output_path)

    banner = "=" * 60
    print(f"\n{banner}")
    print("✓ Merge & Conversion Complete!")
    print(banner)
    print("\n Final Statistics:")

    # Headings are left-padded to 8 characters so the counts line up.
    for heading, split_key in (("Train:", 'train'), ("Valid:", 'valid'), ("Test:", 'test')):
        print(f"  {heading:<8}{stats[split_key]['images']} images, {stats[split_key]['annotations']} annotations")

    total_images = sum(split_stats['images'] for split_stats in stats.values())
    total_annotations = sum(split_stats['annotations'] for split_stats in stats.values())
    print(f"  Total:  {total_images} images, {total_annotations} annotations")

Merging COCO Datasets to YOLO Format

Datasets to merge: 2
  1. /content/drive/MyDrive/Pill.individual.coco
  2. /content/drive/MyDrive/Pills Detection.v1i.coco

Processing split: train

  Processing: /content/drive/MyDrive/Pill.individual.coco/train/_annotations.coco.json




    ✓ Images: 337, Annotations: 337

  Processing: /content/drive/MyDrive/Pills Detection.v1i.coco/train/_annotations.coco.json




    ✓ Images: 483, Annotations: 9932

  train total: 820 images, 10269 annotations

Processing split: valid

  Processing: /content/drive/MyDrive/Pill.individual.coco/valid/_annotations.coco.json




    ✓ Images: 164, Annotations: 164

  Processing: /content/drive/MyDrive/Pills Detection.v1i.coco/valid/_annotations.coco.json




    ✓ Images: 135, Annotations: 2602

  valid total: 299 images, 2766 annotations

Processing split: test

  Processing: /content/drive/MyDrive/Pill.individual.coco/test/_annotations.coco.json




    ✓ Images: 41, Annotations: 41

  Processing: /content/drive/MyDrive/Pills Detection.v1i.coco/test/_annotations.coco.json


                                                            

    ✓ Images: 78, Annotations: 1681

  test total: 119 images, 1722 annotations

✓ Merge & Conversion Complete!

 Final Statistics:
  Train:  820 images, 10269 annotations
  Valid:  299 images, 2766 annotations
  Test:   119 images, 1722 annotations
  Total:  1238 images, 14757 annotations




## Step 5: 결과 확인

In [5]:
# Inspect the generated dataset: per-split file counts, data.yaml, one label file.
if all_exist and os.path.exists(yolo_output_path):
    dataset_root = Path(yolo_output_path)
    divider = "=" * 60

    print("생성된 파일 확인:\n")

    # Count the files of each split.
    for split in ('train', 'valid', 'test'):
        images_path = dataset_root / 'images' / split
        if not images_path.exists():
            continue

        labels_path = dataset_root / 'labels' / split
        num_images = sum(1 for _ in images_path.glob('*'))
        num_labels = sum(1 for _ in labels_path.glob('*.txt'))
        print(f"{split}:\n  - Images: {num_images}\n  - Labels: {num_labels}")

    # Show data.yaml.
    yaml_path = dataset_root / 'data.yaml'
    if yaml_path.exists():
        print(f"\n{divider}")
        print("data.yaml 내용:")
        print(divider)
        with open(yaml_path, 'r') as yaml_file:
            yaml_text = yaml_file.read()
        print(yaml_text)

    # Show the first lines of one training label file as a sample.
    train_labels = list((dataset_root / 'labels' / 'train').glob('*.txt'))
    if train_labels:
        print(f"\n{divider}")
        print("샘플 레이블 파일:")
        print(divider)

        sample = train_labels[0]
        print(f"파일: {sample.name}")

        with open(sample, 'r') as label_file:
            lines = label_file.readlines()

        for number, text in enumerate(lines[:5], start=1):
            print(f"  {number}: {text.strip()}")

        remaining = len(lines) - 5
        if remaining > 0:
            print(f"  ... ({remaining} more lines)")

생성된 파일 확인:

train:
  - Images: 820
  - Labels: 820
valid:
  - Images: 299
  - Labels: 299
test:
  - Images: 119
  - Labels: 119

data.yaml 내용:
# Pill Detection Dataset
# Merged from 2 COCO datasets

# Paths
path: /content/drive/MyDrive/yolo_dataset_final  # dataset root dir
train: images/train  # train images (relative to 'path')
val: images/valid   # val images (relative to 'path')
test: images/test   # test images (optional)

# Classes
nc: 1  # number of classes
names: ['Pill']  # class names

# Dataset Statistics
train_images: 820
train_annotations: 10269
valid_images: 299
valid_annotations: 2766
test_images: 119
test_annotations: 1722
total_images: 1238
total_annotations: 14757


샘플 레이블 파일:
파일: cefalexin--31-_jpg.rf.00640cefb7a6f7bbcfd6c1030b07155a.txt
  1: 0 0.591406 0.525000 0.182812 0.512500


In [6]:
import os

# Folder whose files are counted.
folder_path = "/content/drive/MyDrive/Pills Detection.v1i.yolo/labels/test"

# Count regular files only; sub-directories are not included.
file_count = sum(
    1
    for entry_name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry_name))
)

print(f"'{folder_path}' 폴더 안의 파일 개수: {file_count}")


'/content/drive/MyDrive/Pills Detection.v1i.yolo/labels/test' 폴더 안의 파일 개수: 78
