In [1]:
%pip install natsort tqdm torch torchvision


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from pathlib import Path
import glob
import re
import shutil
from random import shuffle

import torch
from torchvision.io import decode_image, write_jpeg
from natsort import natsorted
from tqdm import tqdm

In [3]:

DATASET_PATH = os.path.join(os.sep, 'ml', 'LIZA_dataset', 'original', '**')
OUTPUT_PATH = os.path.join(os.sep, 'ml', 'LIZA_dataset', 'original_split')
if os.path.isdir(OUTPUT_PATH):
    shutil.rmtree(OUTPUT_PATH)
os.mkdir(OUTPUT_PATH)

INFERENCE_SIZE = 640
STEP_SIZE = 580

In [4]:
images = [file for file in glob.glob(os.path.join(DATASET_PATH, '*'), recursive=True) if re.match(r'(.*\.jpg)|(.*\.JPG)', file)]
annotations = [file for file in glob.glob(os.path.join(DATASET_PATH, '*'), recursive=True) if re.match(r'(.*\.txt)', file)]

images = natsorted(images)
annotations = natsorted(annotations)

len(images), len(annotations)

(60295, 60295)

In [5]:
annotated = 0

unannotated_images = []
unannotated_annotations = []

for image, annotation in tqdm(zip(images, annotations)):
    if Path(image).stem != Path(annotation).stem:
        raise ValueError('Images and annotations indices do not match')
    stem = Path(image).stem

    T = decode_image(image)
    _, height, width = T.shape

    
    with open(annotation) as file:
        labels = file.read().splitlines()

    bboxes = torch.empty((len(labels), 4), dtype=torch.float32)
    for i in range(len(labels)):
        if not labels[i]:
            continue
            
        _, bbox_x, bbox_y, bbox_w, bbox_h = labels[i].split(' ')
        bbox_x, bbox_y, bbox_w, bbox_h = list(map(float, (bbox_x, bbox_y, bbox_w, bbox_h)))
        
        bbox_l = bbox_x - bbox_w / 2  # from center coordinate to left
        bbox_t = bbox_y - bbox_h / 2  # from center coordinate to top
        bboxes[i] = torch.as_tensor((bbox_l * width, bbox_t * height, bbox_w * width, bbox_h * height))

    idx = 0

    for y in range(0, height, STEP_SIZE):
        v_edge = height - y < INFERENCE_SIZE
        h = min(INFERENCE_SIZE, height - y)
        for x in range(0, width, STEP_SIZE):
            h_edge = width - x < INFERENCE_SIZE
            w = min(INFERENCE_SIZE, width - x)
            
            window = T[
                :, 
                y if not v_edge else height - INFERENCE_SIZE:y + h if not v_edge else height, 
                x if not h_edge else width - INFERENCE_SIZE:x + w if not h_edge else width
            ]

            window_labels = []
            for i in range(bboxes.shape[0]):
                l, t, w_, h_ = bboxes[i]
                l, t, w_, h_ = l.item(), t.item(), w_.item(), h_.item()

                bbox = [
                    max(0., l - x),
                    max(0., t - y),
                    min(w, w_ + l - x) - max(0., l - x),
                    min(h, h_ + t - y) - max(0., t - y)
                ]

                if bbox[2] > 0. and bbox[3] > 0.:
                    label = f'0 {(bbox[0] + bbox[2] / 2) / INFERENCE_SIZE} {(bbox[1] + bbox[3] / 2) / INFERENCE_SIZE} {bbox[2] / INFERENCE_SIZE} {bbox[3] / INFERENCE_SIZE}'
                    window_labels.append(label)

            image_path = os.path.join(OUTPUT_PATH, f'{stem}_{idx}.jpg')
            annotation_path = os.path.join(OUTPUT_PATH, f'{stem}_{idx}.txt')
                
            idx += 1

            if window_labels:
                annotated += 1
            else:
                unannotated_images.append(image_path)
                unannotated_annotations.append(annotation_path)

            with open(annotation_path, 'w') as file:
                    for window_label in window_labels:
                        file.write(f"{window_label}\n")
                
            write_jpeg(window, image_path)
            

60295it [2:06:43,  7.93it/s]


In [6]:
len(unannotated_images), len(unannotated_annotations)

(2455578, 2455578)

In [7]:
ratio = 1.  # how many times there should be more unannotated images than annoted? e.g. ratio of 1. means there will be 1 unannotated for 1 annotated

L = list(zip(unannotated_images, unannotated_annotations))
shuffle(L)
remove_images, remove_annotations = zip(*L)

remove_images = remove_images[int(ratio * annotated) + 1:]
remove_annotations = remove_annotations[int(ratio * annotated) + 1:]
for remove_image, remove_annotations in zip(remove_images, remove_annotations):
    for file in (remove_image, remove_annotations):
        if os.path.exists(file):
            os.remove(file)