In [15]:
from pathlib import Path
import json
from tqdm import tqdm
import shutil
from collections import defaultdict

In [16]:
# Source dataset directory (read from) and the directory the filtered,
# YOLO-formatted copy is written to.
root = Path("..", "data", "raw", "DocLayNet_core")
out_root = Path("..", "data", "raw", "DocLayNet_filtered")

In [17]:
# Source category names whose annotations are kept by coco_to_yolo;
# every other category is skipped.
target_classes = [
    "Picture",
    "Caption",
    "Table",
    "Page-footer",
    "Page-header",
]

# Class names written to the YOLO labels. The position in this list is the
# YOLO class index, so the order must not change between runs.
used_classes = [
    "Caption",
    "PageNumber",
    "Picture",
]

# Only images whose "doc_category" is listed here are exported.
categories = ["scientific_articles"]

In [18]:
def coco_to_yolo(coco_path, out_label_dir, src_img_dir, out_img_dir, limit=None):
    """Export a filtered subset of a COCO annotation file as YOLO labels + images.

    Reads the module-level lists ``categories`` (allowed ``doc_category``
    values), ``target_classes`` (source category names to keep) and
    ``used_classes`` (output class names; list position = YOLO class index).
    "Table" annotations are exported as "Picture", and "Page-header" /
    "Page-footer" annotations as "PageNumber".

    For every exported image a ``<stem>.txt`` label file is written to
    ``out_label_dir`` and ``<stem>.png`` is copied from ``src_img_dir`` to
    ``out_img_dir``. Images with no kept annotation, or whose PNG is missing
    from ``src_img_dir``, are skipped and do not count towards ``limit``.

    Args:
        coco_path: Path to the COCO-style JSON file.
        out_label_dir: Directory for the YOLO ``.txt`` files (created if needed).
        src_img_dir: Directory holding the source ``.png`` images.
        out_img_dir: Directory the images are copied to (created if needed).
        limit: Maximum number of images to export; ``None`` means no limit.

    Returns:
        The number of images exported (label file written and image copied).
    """
    out_label_dir = Path(out_label_dir)
    src_img_dir = Path(src_img_dir)
    out_img_dir = Path(out_img_dir)
    out_label_dir.mkdir(parents=True, exist_ok=True)
    out_img_dir.mkdir(parents=True, exist_ok=True)

    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(coco_path, 'r', encoding='utf-8') as f:
        coco = json.load(f)

    cat_map = {c['id']: c['name'] for c in coco['categories']}
    class_to_idx = {name: i for i, name in enumerate(used_classes)}
    # Source classes that are folded into another output class.
    merged_into = {
        "Table": "Picture",
        "Page-header": "PageNumber",
        "Page-footer": "PageNumber",
    }
    count = 0

    # Group annotations per image once instead of rescanning them per image.
    anns_by_image = defaultdict(list)
    for a in tqdm(coco['annotations'], desc='Appending to dict'):
        anns_by_image[a['image_id']].append(a)

    for img in tqdm(coco['images'], desc='Processing imgs'):
        if limit is not None and count >= limit:
            break
        if img["doc_category"] not in categories:
            continue

        file_name = Path(img["file_name"]).stem
        img_w, img_h = img['width'], img['height']
        if img_w <= 0 or img_h <= 0:
            # Cannot normalise against an empty image.
            continue

        yolo_lines = []
        for ann in anns_by_image.get(img['id'], ()):
            cls_name = cat_map[ann['category_id']]
            if cls_name not in target_classes:
                continue
            cls_name = merged_into.get(cls_name, cls_name)

            # COCO bbox is [x_min, y_min, width, height] in pixels. Clip it to
            # the image so the normalised values always stay within [0, 1].
            x, y, w, h = ann['bbox']
            x1 = min(max(x, 0), img_w)
            y1 = min(max(y, 0), img_h)
            x2 = min(max(x + w, 0), img_w)
            y2 = min(max(y + h, 0), img_h)
            box_w, box_h = x2 - x1, y2 - y1
            if box_w <= 0 or box_h <= 0:
                # Degenerate or fully out-of-image box: nothing to label.
                continue

            x_c = (x1 + box_w / 2) / img_w
            y_c = (y1 + box_h / 2) / img_h
            yolo_lines.append(
                f"{class_to_idx[cls_name]} {x_c:.6f} {y_c:.6f} "
                f"{box_w / img_w:.6f} {box_h / img_h:.6f}"
            )

        if not yolo_lines:
            continue

        # Check for the image first so a missing file neither leaves an orphan
        # label behind nor uses up part of the limit.
        src_img = src_img_dir / f"{file_name}.png"
        if not src_img.exists():
            print(f"Image not found for {file_name}")
            continue

        (out_label_dir / f"{file_name}.txt").write_text(
            "\n".join(yolo_lines), encoding='utf-8'
        )
        shutil.copy2(src_img, out_img_dir / f"{file_name}.png")  # copy, not move
        count += 1

    return count

In [19]:
# Training split: annotation file and source image directory.
coco_path_train = root / "COCO" / "train.json"
src_img_dir_train = root / "PNG"

# Output directories for the training split, created up front.
out_label_dir_train = out_root / "train" / "labels"
out_label_dir_train.mkdir(parents=True, exist_ok=True)

out_img_dir_train = out_root / "train" / "images"
out_img_dir_train.mkdir(parents=True, exist_ok=True)

In [20]:
# Export up to 6000 training images.
coco_to_yolo(
    coco_path=coco_path_train,
    out_label_dir=out_label_dir_train,
    src_img_dir=src_img_dir_train,
    out_img_dir=out_img_dir_train,
    limit=6000,
)

Appending to dict: 100%|██████████| 941123/941123 [00:03<00:00, 266745.65it/s]
Processing imgs:  37%|███▋      | 25503/69375 [00:19<00:33, 1292.16it/s]  


In [21]:
# Test split: annotation file and source image directory.
coco_path_test = root / "COCO" / "test.json"
src_img_dir_test = root / "PNG"

# Output directories for the test split, created up front.
out_label_dir_test = out_root / "test" / "labels"
out_label_dir_test.mkdir(parents=True, exist_ok=True)

out_img_dir_test = out_root / "test" / "images"
out_img_dir_test.mkdir(parents=True, exist_ok=True)

In [22]:
# Export up to 1200 test images.
coco_to_yolo(
    coco_path=coco_path_test,
    out_label_dir=out_label_dir_test,
    src_img_dir=src_img_dir_test,
    out_img_dir=out_img_dir_test,
    limit=1200,
)

Appending to dict: 100%|██████████| 66531/66531 [00:00<00:00, 1023729.43it/s]
Processing imgs: 100%|██████████| 4999/4999 [00:03<00:00, 1573.74it/s] 


In [23]:
# Top up the *test* output directories with images taken from the validation
# annotations.
# NOTE(review): 936 is presumably the number of images the test.json export
# above produced, making 1200 - 936 the remaining quota - TODO confirm, and
# derive it from the data instead of hard-coding it.
coco_path_val = root / "COCO" / "val.json"
coco_to_yolo(
    coco_path=coco_path_val,
    out_label_dir=out_label_dir_test,
    src_img_dir=src_img_dir_test,
    out_img_dir=out_img_dir_test,
    limit=1200 - 936,
)

Appending to dict: 100%|██████████| 99816/99816 [00:00<00:00, 1886478.59it/s]
Processing imgs:  27%|██▋       | 1762/6489 [00:00<00:02, 1914.41it/s] 


In [24]:
import cv2
import os

def draw_yolo_boxes(image_path, label_path, save_path=None):
    """Draw the boxes of a YOLO label file on an image, then save or show it.

    Each non-blank line of ``label_path`` must hold five numbers:
    ``class x_center y_center width height``, with the four box values
    expressed as fractions of the image width / height.

    Args:
        image_path: Image to annotate (str or path-like).
        label_path: YOLO ``.txt`` label file for that image.
        save_path: If given, the annotated image is written there; otherwise
            it is shown in an OpenCV window that blocks until a key is pressed.

    Returns:
        None. Prints a message and returns early if the image cannot be loaded.
    """
    # Pass plain strings to OpenCV so path-like arguments are accepted as well.
    img = cv2.imread(str(image_path))
    if img is None:
        print(f"Cannot load image: {image_path}")
        return

    # Only height and width are needed; slicing avoids assuming 3 channels.
    h, w = img.shape[:2]

    with open(label_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    for line in lines:
        fields = line.split()
        if not fields:
            # Blank line (e.g. trailing newline at end of file): nothing to draw.
            continue
        cls, x_c, y_c, bw, bh = map(float, fields)

        # Scale the normalised centre/size back to pixels.
        x_center = x_c * w
        y_center = y_c * h
        box_width = bw * w
        box_height = bh * h

        # Convert centre/size to the corner points cv2.rectangle expects.
        x1 = int(x_center - box_width / 2)
        y1 = int(y_center - box_height / 2)
        x2 = int(x_center + box_width / 2)
        y2 = int(y_center + box_height / 2)

        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # Put the label just above the box, but never above the image's top edge.
        cv2.putText(img, f"class {int(cls)}", (x1, max(0, y1 - 5)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)

    if save_path:
        cv2.imwrite(str(save_path), img)
        print(f"Saved preview → {save_path}")
    else:
        cv2.imshow("YOLO Bounding Boxes", img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()


In [27]:
image_dir = out_img_dir_test
label_dir = out_label_dir_test

# os.listdir() returns entries in arbitrary order, so sort the names to make
# the previewed sample the same on every run and every machine.
image_files = sorted(
    f for f in os.listdir(image_dir)
    if f.lower().endswith((".jpg", ".jpeg", ".png"))
)

# take only first 10
image_files = image_files[:10]

for file in image_files:
    img_path = os.path.join(image_dir, file)
    # The label file shares the image's stem and lives in label_dir.
    txt_name = os.path.splitext(file)[0] + ".txt"
    txt_path = os.path.join(label_dir, txt_name)

    if os.path.exists(txt_path):
        # save_path=None: each preview is shown in an OpenCV window that waits
        # for a key press before moving on.
        draw_yolo_boxes(img_path, txt_path, None)
    else:
        print(f"No label found for {file}")