# Dataset Conversion

This code converts the Supervisely dataset into the format required for the KP dataset (using our Penguin v1 extract).


In [3]:
#
# This code is made for building datasets from Supervisely with HIK Thermal Cameras
# Note: supports only a single class for all instance masks
#

import os
import glob
from PIL import Image
from sklearn.model_selection import train_test_split
# This utility class replaces argparse

class DotDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``d.key`` is equivalent to ``d["key"]`` and ``d.key = value`` is
    equivalent to ``d["key"] = value``.
    """

    def __getattr__(self, attr):
        # __getattr__ is only consulted after normal attribute lookup fails.
        # It must raise AttributeError (not KeyError) for unknown names,
        # otherwise hasattr(), getattr(obj, name, default) and the copy
        # module, which all probe for optional attributes, blow up.
        try:
            return self[attr]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            ) from None

    def __setattr__(self, attr, value):
        # Attribute assignment is stored as a regular dictionary item.
        self[attr] = value
        

def load_directory(root_dir):
    """
    List the entries of ``<root_dir>/img`` that are not directories.

    :param root_dir: Dataset root containing an ``img`` sub-directory
    :return: Entry names (not full paths) in sorted order
    """
    img_dir = os.path.join(root_dir, "img")
    return [
        entry
        for entry in sorted(os.listdir(img_dir))
        if not os.path.isdir(os.path.join(img_dir, entry))
    ]


def load_resized_img(path, crop: bool, *, crop_box=(705, 429, 2629, 1872), size=(640, 512)):
    """
    Load an image as RGB, optionally crop it, and resize it.

    :param path: Path of the image file to load
    :param crop: Apply FOV cropping before resizing
    :param crop_box: ``(left, top, right, bottom)`` crop rectangle used when
        ``crop`` is true; the default is specific to HIKVision thermal cameras
    :param size: ``(width, height)`` of the returned image
    :return: The converted (and optionally cropped) image resized to ``size``
    """
    # The context manager closes the underlying file deterministically;
    # convert() yields an in-memory image that no longer needs the file.
    with Image.open(path) as source:
        image = source.convert('RGB')
    if crop:
        image = image.crop(crop_box)
    return image.resize(size)


def render_image(image, output_dir, filename):
    """
    Paste ``image`` onto a fresh 640x512 RGB canvas and save it as a JPEG.

    :param image: Image pasted at the top-left corner of the canvas
    :param output_dir: Directory the file is written to
    :param filename: Name of the file created inside ``output_dir``
    """
    canvas = Image.new('RGB', (640, 512))
    canvas.paste(image, (0, 0))
    target_path = os.path.join(output_dir, filename)
    # quality=100 together with subsampling=0 keeps JPEG losses minimal.
    canvas.save(target_path, format='JPEG', subsampling=0, quality=100)


def get_mask_names(input_dir, base_name):
    """
    Return the sorted paths of all instance masks stored for one image.

    Masks are looked up in ``<input_dir>/masks_instances/<base_name>/``.

    :param input_dir: Dataset root directory
    :param base_name: Image name without extension
    :return: Paths of the mask files in sorted order, or an empty list when
        no mask directory exists for ``base_name``
    """
    mask_root = os.path.join(input_dir, "masks_instances", base_name)
    # isdir (rather than exists) also covers the case where the path is a
    # regular file, which would otherwise make os.listdir raise.
    if not os.path.isdir(mask_root):
        return []
    return [os.path.join(mask_root, entry) for entry in sorted(os.listdir(mask_root))]


def process_dataset(input_dir, filenames, output_dir, validation_phase: bool):
    """
    Render one output folder per input image.

    For every entry of ``filenames`` a folder
    ``<output_dir>/<phase>/<stem>/`` is created, where ``<phase>`` is
    ``test`` for the validation phase and ``train`` otherwise, and
    ``<stem>`` is the file name without extension and without a trailing
    ``_01``. The folder receives:

    * ``lwir.jpg`` - ``<input_dir>/img/<filename>`` resized
    * ``rgb.jpg`` - ``<input_dir>/vis/<stem>_VIS.jpeg`` cropped and resized
    * ``maskNNN.jpg`` - one file per instance mask, resized

    :param input_dir: Dataset root containing ``img``, ``vis`` and
        ``masks_instances``
    :param filenames: Names of the files inside ``<input_dir>/img`` to process
    :param output_dir: Root directory of the converted dataset
    :param validation_phase: Write to the ``test`` split when true,
        otherwise to ``train``
    """
    save_phase = 'test' if validation_phase else 'train'
    savedir = os.path.join(output_dir, save_phase)
    os.makedirs(savedir, exist_ok=True)
    for filename in filenames:
        base_name = os.path.splitext(filename)[0]
        has_suffix = base_name.endswith("_01")
        stem = base_name[:-3] if has_suffix else base_name
        # Masks may be stored under either the plain name or the "_01"
        # variant; look at both and keep whichever holds more masks.
        sibling_name = stem if has_suffix else f"{stem}_01"
        mask_names = get_mask_names(input_dir, base_name)
        alt_mask_names = get_mask_names(input_dir, sibling_name)
        if len(alt_mask_names) > len(mask_names):
            mask_names = alt_mask_names
        final_output_dir = os.path.join(savedir, stem)
        os.makedirs(final_output_dir, exist_ok=True)
        print(f"Rendering {final_output_dir}")
        lwir_path = os.path.join(input_dir, "img", filename)
        render_image(load_resized_img(lwir_path, False), final_output_dir, "lwir.jpg")
        # Only the visible-light image gets the FOV crop.
        vis_path = os.path.join(input_dir, "vis", f"{stem}_VIS.jpeg")
        render_image(load_resized_img(vis_path, True), final_output_dir, "rgb.jpg")
        for index, mask_name in enumerate(mask_names):
            render_image(load_resized_img(mask_name, False), final_output_dir, f"mask{index:03}.jpg")


# Conversion settings, held in a DotDict in place of parsed CLI arguments.
opt: DotDict = DotDict(
    input_dir="datasets/src_PV1dataset",
    output_dir="datasets/PV1dataset_clean",
)

# 80/20 split with a fixed seed so the partition is reproducible.
train_set, val_set = train_test_split(
    load_directory(opt.input_dir),
    test_size=0.2,
    random_state=42,
)

print('Preparing Dataset for val phase')
process_dataset(opt.input_dir, val_set, opt.output_dir, True)

print('Preparing Dataset for train phase')
process_dataset(opt.input_dir, train_set, opt.output_dir, False)

print('Done')

Preparing Dataset for val phase
Rendering datasets/PV1dataset_clean/test/HM20240724150709
Rendering datasets/PV1dataset_clean/test/HM20240724150016
Rendering datasets/PV1dataset_clean/test/HM20240710145743
Rendering datasets/PV1dataset_clean/test/HM20240717150212
Rendering datasets/PV1dataset_clean/test/HM20240717145437
Rendering datasets/PV1dataset_clean/test/HM20240717145426
Rendering datasets/PV1dataset_clean/test/HM20240710134700
Rendering datasets/PV1dataset_clean/test/HM20240710145457
Rendering datasets/PV1dataset_clean/test/HM20240710134733
Rendering datasets/PV1dataset_clean/test/HM20240717144256
Rendering datasets/PV1dataset_clean/test/HM20240724145821
Rendering datasets/PV1dataset_clean/test/HM20240717144114
Rendering datasets/PV1dataset_clean/test/HM20240724144640
Rendering datasets/PV1dataset_clean/test/HM20240710145223
Rendering datasets/PV1dataset_clean/test/HM20240710145420
Rendering datasets/PV1dataset_clean/test/HM20240717145423
Rendering datasets/PV1dataset_clean/test