## Split Test

In [8]:
import os, glob, random, shutil

In [9]:
# Settings for the hold-out split.
#   TEST_RATIO  - fraction of all images set aside as the test subset.
#   OUT         - root directory that receives the `test/` and `remain/` subsets.
#   DATASET_DIR - source dataset root; `images/` and `labels/` are read from it.
TEST_RATIO = 0.10
OUT = "split10"
DATASET_DIR = "../data/annotated_pictures/combined"

In [10]:
# Fixed seed so the held-out test set is the same on every run. Without it the
# shuffle below differs per kernel session, and `test_imgs` / `remain_imgs` in
# memory can disagree with subsets already written to disk by an earlier run.
SPLIT_SEED = 42

# Gather every image under DATASET_DIR/images that has a supported extension.
images = []
for ext in ["jpg", "jpeg", "png"]:
    images.extend(glob.glob(f"{DATASET_DIR}/images/*.{ext}"))

# Sort first so the shuffle starts from an order that does not depend on how the
# filesystem lists files, then shuffle with a private seeded generator (this
# leaves the global `random` state untouched).
images = sorted(images)
random.Random(SPLIT_SEED).shuffle(images)

n = len(images)
n_test = int(TEST_RATIO * n)  # int() truncates, so the test share never exceeds TEST_RATIO

# The first n_test shuffled images form the test set; the rest feed the K-fold stage.
test_imgs = images[:n_test]
remain_imgs = images[n_test:]

print("Total:", n)
print("Test:", len(test_imgs))
print("Remain for K-fold:", len(remain_imgs))

Total: 300
Test: 30
Remain for K-fold: 270


In [None]:
# Pre-create the destination tree for both subsets (harmless if it already exists).
for d in ["test/images", "test/labels", "remain/images", "remain/labels"]:
    os.makedirs(f"{OUT}/{d}", exist_ok=True)


def copy_pair(imgs, subset, out_root=OUT, labels_dir=f"{DATASET_DIR}/labels"):
    """Copy each image and its same-named ``.txt`` label into one subset folder.

    Args:
        imgs: Iterable of image file paths to copy.
        subset: Sub-directory name under ``out_root`` (e.g. ``"test"``).
        out_root: Root directory that holds the subset folders. The default is
            bound when this function is defined, so rebinding the global
            ``OUT`` later in the notebook (the K-fold section does this) does
            not redirect copies made by re-running the copy cell.
        labels_dir: Directory the label files are read from.

    Images go to ``{out_root}/{subset}/images`` and labels to
    ``{out_root}/{subset}/labels``; both directories are created if missing.
    A label file that does not exist makes ``shutil.copy`` raise.

    NOTE: files already present in the destination are not removed, so subsets
    left over from a run with a different split stay on disk.
    """
    img_dest = f"{out_root}/{subset}/images"
    lbl_dest = f"{out_root}/{subset}/labels"
    os.makedirs(img_dest, exist_ok=True)
    os.makedirs(lbl_dest, exist_ok=True)

    for img in imgs:
        fname = os.path.basename(img)
        # The label shares the image's stem and always has a .txt extension.
        lbl = os.path.splitext(fname)[0] + ".txt"
        shutil.copy(img, f"{img_dest}/{fname}")
        shutil.copy(f"{labels_dir}/{lbl}", f"{lbl_dest}/{lbl}")

In [None]:
# Write both subsets to disk: the held-out test images first, then the pool
# that remains for the K-fold stage.
for subset_name, subset_imgs in (("test", test_imgs), ("remain", remain_imgs)):
    copy_pair(subset_imgs, subset_name)

## K-Fold

In [11]:
import os
import glob
import random
from sklearn.model_selection import KFold

In [12]:
# K-fold settings:
#   REMAIN - directory holding the pool of images/labels to be folded.
#   OUT    - root directory for the per-fold output (rebinds the earlier OUT).
#   K      - number of folds.
REMAIN = "split10/remain"
OUT = "kfold"
K = 5

# List the pool's images (jpg / jpeg / png) from disk, in sorted order.
images = sorted(
    path
    for ext in ("jpg", "jpeg", "png")
    for path in glob.glob(f"{REMAIN}/images/*.{ext}")
)

print("Images used for K-fold:", len(images))

Images used for K-fold: 270


In [13]:
# K-way splitter with shuffling enabled and a fixed random_state.
kf = KFold(
    n_splits=K,
    shuffle=True,
    random_state=42,
)

# Ensure the K-fold output root exists before any fold directory is created.
os.makedirs(OUT, exist_ok=True)

In [14]:
def _copy_fold_subset(img_paths, labels_dir, dest_dir):
    """Copy images and their same-named ``.txt`` labels into ``dest_dir``.

    Images are written to ``{dest_dir}/images`` and labels (read from
    ``labels_dir``) to ``{dest_dir}/labels``; both directories are created if
    they do not exist.
    """
    os.makedirs(f"{dest_dir}/images", exist_ok=True)
    os.makedirs(f"{dest_dir}/labels", exist_ok=True)
    for img in img_paths:
        fname = os.path.basename(img)
        lbl = os.path.splitext(fname)[0] + ".txt"
        shutil.copy(img, f"{dest_dir}/images/{fname}")
        shutil.copy(f"{labels_dir}/{lbl}", f"{dest_dir}/labels/{lbl}")


# Fold over `images`, the list read from REMAIN on disk, rather than the
# in-memory `remain_imgs` left behind by the Split section. `remain_imgs` comes
# from a shuffle that may not match what is stored under REMAIN, which could
# place images from the held-out test set into the folds.
# NOTE: existing files in the fold directories are not removed; clear OUT first
# if the pool under REMAIN has changed since the last run.
for fold, (train_idx, val_idx) in enumerate(kf.split(images)):
    print(f"\n=== FOLD {fold} ===")

    fold_dir = f"{OUT}/fold{fold}"

    train_files = [images[i] for i in train_idx]
    val_files = [images[i] for i in val_idx]

    print("Train:", len(train_files))
    print("Val:", len(val_files))

    # Labels are taken from REMAIN so this stage depends only on the remain pool.
    _copy_fold_subset(train_files, f"{REMAIN}/labels", f"{fold_dir}/train")
    _copy_fold_subset(val_files, f"{REMAIN}/labels", f"{fold_dir}/val")


=== FOLD 0 ===
Train: 216
Val: 54

=== FOLD 1 ===
Train: 216
Val: 54

=== FOLD 2 ===
Train: 216
Val: 54

=== FOLD 3 ===
Train: 216
Val: 54

=== FOLD 4 ===
Train: 216
Val: 54


In [16]:
import os
import yaml

# Value written to the "path" entry of every data.yaml.
# NOTE(review): this looks like a Colab Google Drive mount where the kfold
# folder is expected at training time - confirm before changing.
ROOT_PATH = "/content/drive/MyDrive/kfold"
# Local directory containing the fold0..fold{N_FOLDS-1} folders.
KFOLD_ROOT = "kfold"
# Number of fold directories to write a data.yaml into; keep in sync with the
# number of folds generated in the K-fold section.
N_FOLDS = 5

CLASSES = ["Caption", "PageNumber", "Picture"]
# Derived from CLASSES so the class count cannot drift from the name list.
NC = len(CLASSES)

for fold in range(N_FOLDS):
    fold_dir = f"{KFOLD_ROOT}/fold{fold}"
    # Built with "/" like every other path in this notebook, which avoids the
    # mixed separators os.path.join produces on Windows.
    yaml_path = f"{fold_dir}/data.yaml"

    # "train" and "val" are written relative to "path".
    # NOTE(review): key layout presumably follows the dataset YAML format of the
    # training framework - confirm against its documentation.
    data = {
        "path": f"{ROOT_PATH}/fold{fold}",
        "train": "train/images",
        "val": "val/images",
        "nc": NC,
        "names": CLASSES
    }

    with open(yaml_path, "w") as f:
        yaml.dump(data, f)

    print("Created:", yaml_path)


Created: kfold/fold0\data.yaml
Created: kfold/fold1\data.yaml
Created: kfold/fold2\data.yaml
Created: kfold/fold3\data.yaml
Created: kfold/fold4\data.yaml
