In [1]:
! pip install opencv-python-headless


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import boto3
import os
import json
import cv2
import numpy as np
import urllib.request
import yaml

In [3]:
# --- S3 connection settings, read from the environment (None when unset) ---
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_S3_BUCKET = os.getenv("AWS_S3_BUCKET")
AWS_S3_ENDPOINT = os.getenv("AWS_S3_ENDPOINT")

# --- Local paths and service locations used by the cells below ---
DATASET_PATH = "dataset/"                 # root of the generated dataset tree
DATA_CONFIG_PATH = "utils/data.yaml"      # YAML file holding the class names
LOCAL_LABEL_FOLDER = "/tmp/labels/"       # where annotation files are downloaded
TARGET_BUCKET = "label-studio-sink"       # bucket the annotations are read from
LABEL_STUDIO_SVC = "http://label-studio-ls-app.label-studio.svc.cluster.local"

# --- Shared S3 client, pointed at the configured endpoint ---
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
client = session.client("s3", endpoint_url=AWS_S3_ENDPOINT)

In [4]:
def generate_dirs():
    """Create the dataset directory tree and the local label download folder.

    Directories that already exist are left untouched.
    """
    # "" yields DATASET_PATH itself; the rest are the label/image split folders.
    dataset_subdirs = (
        "",
        "labels/train",
        "labels/test",
        "labels/val",
        "images/train",
        "images/test",
        "images/val",
    )
    for subdir in dataset_subdirs:
        os.makedirs(DATASET_PATH + subdir, exist_ok=True)
    os.makedirs(LOCAL_LABEL_FOLDER, exist_ok=True)

In [5]:
def download_folder(bucket_name):
    """Download every object of ``bucket_name`` into ``LOCAL_LABEL_FOLDER``.

    Objects are stored flat under their base name (the part of the key after
    the last "/"), so two keys sharing a base name overwrite each other.

    Args:
        bucket_name: Name of the S3 bucket to list and download.
    """
    # Make the function usable on its own, even if generate_dirs() was not run.
    os.makedirs(LOCAL_LABEL_FOLDER, exist_ok=True)

    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        # Pages of an empty listing carry no 'Contents' entry.
        for obj in page.get('Contents', []):
            key = obj['Key']
            file_name = key.split("/")[-1]
            if not file_name:
                # A key ending in "/" is a folder placeholder with no file
                # name; downloading it would target the directory itself.
                continue
            client.download_file(bucket_name, key, LOCAL_LABEL_FOLDER + file_name)

In [6]:
def xyxy_to_yolo(bbox, image_width, image_height):
    """Convert a corner-format box into normalized centre/size format.

    Args:
        bbox: ``(x1, y1, x2, y2)`` corner coordinates.
        image_width: Width used to normalize the horizontal values.
        image_height: Height used to normalize the vertical values.

    Returns:
        ``(x_center, y_center, width, height)``, each divided by the
        corresponding image dimension.
    """
    left, top, right, bottom = bbox
    return (
        (left + right) / 2 / image_width,   # normalized centre x
        (top + bottom) / 2 / image_height,  # normalized centre y
        (right - left) / image_width,       # normalized box width
        (bottom - top) / image_height,      # normalized box height
    )

In [7]:
def ls_to_xyxy(bbox, img_w, img_h):
    """Scale a fractional ``(x, y, w, h)`` box to integer pixel corners.

    Args:
        bbox: ``(x, y, w, h)`` expressed as fractions of the image size.
        img_w: Image width in pixels.
        img_h: Image height in pixels.

    Returns:
        ``(x1, y1, x2, y2)`` as integers. The far edge is computed from the
        already-truncated near edge plus the scaled size, then truncated.
    """
    rel_x, rel_y, rel_w, rel_h = bbox

    def _edges(origin, extent, full):
        # Near edge is truncated first; the far edge is measured from it.
        near = int(origin * full)
        return near, int(near + extent * full)

    left, right = _edges(rel_x, rel_w, img_w)
    top, bottom = _edges(rel_y, rel_h, img_h)
    return left, top, right, bottom

In [8]:
def show_image(uri, bbox, bucket=None):
    """Download an image from S3 and display it with ``bbox`` drawn on top.

    Debugging helper. The image is saved to ``/tmp/test.jpg`` before being
    read back and plotted.

    Args:
        uri: Object location in the form ``s3://BUCKET/path``.
        bbox: ``(x, y, w, h)`` as fractions of the image size.
        bucket: Bucket to download from. Defaults to the bucket named in
            ``uri``.

    Raises:
        ValueError: If ``uri`` has no bucket/path part or the downloaded
            file cannot be decoded as an image.
    """
    # Imported here because only this optional helper needs matplotlib.
    import matplotlib.pyplot as plt

    # "s3://BUCKET/path".split("/") -> ["s3:", "", "BUCKET", "path", ...]
    parts = uri.split("/")
    if len(parts) < 4:
        raise ValueError(f"Expected a URI like s3://BUCKET/path, got: {uri}")
    if bucket is None:
        bucket = parts[2]
    image_path = "/".join(parts[3:])

    local_path = "/tmp/test.jpg"
    client.download_file(bucket, image_path, local_path)
    image = cv2.imread(local_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise ValueError(f"Could not read image downloaded from {uri}")

    img_h, img_w = image.shape[:2]
    x1, y1, x2, y2 = ls_to_xyxy(bbox, img_w, img_h)
    image = cv2.rectangle(image, (x1, y1), (x2, y2), color=(255,0,0), thickness=2)
    # OpenCV loads images as BGR; matplotlib expects RGB.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.imshow(image_rgb)

In [9]:
def add_image_to_dataset(split, uri, bbox, label_name):
    """Download one annotated image and write its single-line label file.

    The image is fetched from ``LABEL_STUDIO_SVC + uri`` into
    ``DATASET_PATH/images/<split>/`` and a ``.txt`` file with the same stem
    is written to ``DATASET_PATH/labels/<split>/`` containing
    ``<class_id> <x_center> <y_center> <width> <height>``.

    Args:
        split: Dataset split sub-folder name (e.g. "train" or "val").
        uri: Image path relative to the Label Studio service URL.
        bbox: ``(x, y, w, h)`` as fractions of the image size.
        label_name: Class name to look up in the ``names`` entry of
            ``DATA_CONFIG_PATH``.

    Raises:
        ValueError: If the downloaded file cannot be decoded as an image, or
            ``label_name`` is not listed in the data config.
    """
    image_remote_path = LABEL_STUDIO_SVC + uri
    image_name = image_remote_path.split("/")[-1]
    image_path = DATASET_PATH + "images/" + split + "/" + image_name
    # splitext keeps the whole stem even when the name has no extension.
    label_file_name = os.path.splitext(image_name)[0] + ".txt"
    label_path = DATASET_PATH + "labels/" + split + "/" + label_file_name

    # Prefer a token from the environment; the previous hard-coded value is
    # kept only as a fallback so existing setups keep working.
    token = os.environ.get("LABEL_STUDIO_TOKEN", "redhat")
    # A per-request header avoids install_opener(), which would change the
    # process-wide urllib opener on every call.
    request = urllib.request.Request(
        image_remote_path, headers={"Authorization": f"Token {token}"}
    )
    with urllib.request.urlopen(request) as response, open(image_path, "wb") as out:
        out.write(response.read())

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise ValueError(f"Could not decode downloaded image: {image_path}")
    img_h, img_w = image.shape[:2]

    bbox_xyxy = ls_to_xyxy(bbox, img_w, img_h)
    x, y, w, h = xyxy_to_yolo(bbox_xyxy, img_w, img_h)

    with open(DATA_CONFIG_PATH, 'r') as f:
        data_config = yaml.safe_load(f)
    names = data_config["names"]
    # `names` is handled both as an {id: name} mapping and as a plain list
    # whose positions are the class ids (NOTE(review): confirm which form
    # utils/data.yaml actually uses).
    entries = names.items() if isinstance(names, dict) else enumerate(names)
    label_id = next(
        (class_id for class_id, name in entries if name == label_name), None
    )
    if label_id is None:
        raise ValueError(f"Label {label_name!r} not found in {DATA_CONFIG_PATH}")

    label_line = f"{label_id} {x:.6f} {y:.6f} {w:.6f} {h:.6f}"
    with open(label_path, 'w') as f:
        f.write(label_line)

In [10]:
def determine_split(i, nb_data, train_ratio=0.9):
    """Pick the dataset split for the ``i``-th of ``nb_data`` samples.

    With fewer than 5 samples everything goes to "train". Otherwise the first
    ``train_ratio`` share of samples goes to "train" and the rest to "val".

    Args:
        i: 0-based position of the sample.
        nb_data: Total number of samples.
        train_ratio: Fraction of samples assigned to "train".

    Returns:
        "train" or "val".
    """
    if nb_data < 5:
        return "train"
    # Compare the 1-based rank: with the 0-based index the last sample of a
    # 5-10 item dataset still satisfied `i / nb_data <= 0.9`, leaving the
    # validation split empty.
    return "train" if (i + 1) / nb_data <= train_ratio else "val"

In [11]:
generate_dirs()
download_folder(TARGET_BUCKET)

# os.listdir() returns entries in arbitrary order; sort so the index-based
# train/val assignment is the same on every run.
files = sorted(LOCAL_LABEL_FOLDER + label for label in os.listdir(LOCAL_LABEL_FOLDER))

# First pass: keep only annotation files that contain a rectangle label, so
# that the split ratio is computed over usable samples only.
samples = []
for file in files:
    with open(file) as f:
        payload = json.load(f)
    rectangle = next(
        (
            result['value']
            for result in payload.get('result', [])
            if result.get('value', {}).get('rectanglelabels')
        ),
        None,
    )
    if rectangle is None:
        # Tasks without any rectangle result cannot produce a label line.
        print(f"Skipping {file}: no rectangle annotation found")
        continue
    samples.append((payload['task']['data']['image'], rectangle))

# Second pass: download each image and write its label file.
nb_data = len(samples)
for i, (image_uri, rectangle) in enumerate(samples):
    split = determine_split(i, nb_data)
    label_name = rectangle['rectanglelabels'][0]
    # Box values are divided by 100 to turn them into fractions of the image
    # size (presumably they are stored as percentages - confirm against the
    # annotation export format).
    bbox = (
        rectangle['x'] / 100,
        rectangle['y'] / 100,
        rectangle['width'] / 100,
        rectangle['height'] / 100,
    )
    add_image_to_dataset(split, image_uri, bbox, label_name)

In [12]:
# Bundle the generated `dataset` directory into a gzip-compressed tarball.
! tar -czf dataset.tar.gz dataset