In [1]:
import os
import csv
import random
from PIL import Image
from tqdm import tqdm

# Split Train and Test

In [2]:
def split(origin_image_path, origin_mask_path, seed=None):
    """Randomly split paired image/mask files into ./data/train and ./data/test.

    Every entry of ``origin_image_path`` must have a mask with the same file
    name in ``origin_mask_path``. Existing ``./data/train`` and ``./data/test``
    trees are removed first, then each pair is copied into
    ``./data/{train,test}/{image,mask}``.

    Args:
        origin_image_path: Directory holding the source images.
        origin_mask_path: Directory holding the source masks.
        seed: Optional seed for a reproducible shuffle. With ``None`` (the
            default) the module-level ``random`` state is used.

    Raises:
        AssertionError: If the two directories hold a different number of entries.
        ValueError: If an image has no mask with the same file name.
    """
    # Local import keeps this cell self-contained; shutil replaces the former
    # `rm -rf` / `cp` shell calls, which broke on names containing spaces or
    # shell metacharacters and silently ignored copy failures.
    import shutil

    # Sorted so that a given seed always yields the same partition
    # (os.listdir order is arbitrary).
    images = sorted(os.listdir(origin_image_path))
    masks = os.listdir(origin_mask_path)
    assert len(images) == len(masks)

    # Validate the pairing before anything is deleted.
    missing_masks = sorted(set(images) - set(masks))
    if missing_masks:
        raise ValueError(f"No mask found for {len(missing_masks)} image(s), e.g. {missing_masks[:5]}")

    num_data = len(images)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(images)

    for subset in ("train", "test"):
        subset_root = os.path.join("./data", subset)
        shutil.rmtree(subset_root, ignore_errors=True)
        for kind in ("image", "mask"):
            os.makedirs(os.path.join(subset_root, kind), exist_ok=True)

    # NOTE: this reproduces the original `idx <= num_data // 10` rule, i.e. the
    # test set receives num_data // 10 + 1 items; kept so split sizes match
    # earlier runs.
    num_test = num_data // 10 + 1
    for idx, image_name in enumerate(tqdm(images)):
        subset = "test" if idx < num_test else "train"
        shutil.copy(
            os.path.join(origin_image_path, image_name),
            os.path.join("./data", subset, "image", image_name),
        )
        shutil.copy(
            os.path.join(origin_mask_path, image_name),
            os.path.join("./data", subset, "mask", image_name),
        )

In [3]:
# Build ./data/train and ./data/test from the original image/mask folders.
split(
    origin_image_path="./data/original/image/",
    origin_mask_path="./data/original/mask/",
)

100%|██████████| 2900/2900 [00:07<00:00, 413.31it/s]


# Split Image

In [4]:
def split_images(input_dir, output_dir):
    """Cut every image in ``input_dir`` into a left half and a mirrored right half.

    Files are selected by extension (case-insensitive). Each image is cut at
    ``width // 2``; for odd widths the right half is one pixel wider. The right
    half is flipped horizontally before saving. Results are written to
    ``output_dir`` as ``left_<name>`` and ``right_<name>``.

    Args:
        input_dir: Directory containing the images to split.
        output_dir: Directory receiving the halves; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".bmp")
    image_files = [f for f in os.listdir(input_dir) if f.lower().endswith(image_extensions)]
    for image_file in tqdm(image_files):
        input_path = os.path.join(input_dir, image_file)
        # The context manager releases the underlying file even if a crop or
        # save fails part-way through a long run.
        with Image.open(input_path) as original_image:
            width, height = original_image.size
            split_point = width // 2
            left_image = original_image.crop((0, 0, split_point, height))
            right_image = original_image.crop((split_point, 0, width, height))
            # Mirror the right half horizontally.
            right_image = right_image.transpose(Image.FLIP_LEFT_RIGHT)
            left_image.save(os.path.join(output_dir, f"left_{image_file}"))
            right_image.save(os.path.join(output_dir, f"right_{image_file}"))

In [5]:
# Halve the images and masks of both subsets into sibling "*_splited" folders.
split_jobs = [
    ("./data/train/image", "./data/train/image_splited"),
    ("./data/train/mask", "./data/train/mask_splited"),
    ("./data/test/image", "./data/test/image_splited"),
    ("./data/test/mask", "./data/test/mask_splited"),
    # ("./data/predict/image", "./data/predict/image_splited"),
]
for source_dir, target_dir in split_jobs:
    split_images(source_dir, target_dir)

100%|██████████| 2609/2609 [01:29<00:00, 29.12it/s]
100%|██████████| 2609/2609 [00:06<00:00, 407.35it/s]
100%|██████████| 291/291 [00:10<00:00, 28.95it/s]
100%|██████████| 291/291 [00:00<00:00, 374.17it/s]


In [None]:
def split_images_overlap(input_dir, output_dir, overlap=30):
    """Cut every image into two halves that are shifted towards the centre.

    With ``split_point = width // 2`` the left crop spans
    ``[overlap, split_point + overlap)`` and the right crop spans
    ``[split_point - overlap, width - overlap)``. The crops therefore share a
    band of ``2 * overlap`` pixels around the centre, and the outermost
    ``overlap`` pixels on each side are not included in either crop. The right
    crop is flipped horizontally before saving. Results are written to
    ``output_dir`` as ``left_<name>`` and ``right_<name>``.

    Args:
        input_dir: Directory containing the images to split.
        output_dir: Directory receiving the crops; created if missing.
        overlap: Number of pixels each crop is shifted towards the centre.
            Defaults to 30, the previously hard-coded value.
    """
    os.makedirs(output_dir, exist_ok=True)
    image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".bmp")
    image_files = [f for f in os.listdir(input_dir) if f.lower().endswith(image_extensions)]
    for image_file in tqdm(image_files):
        input_path = os.path.join(input_dir, image_file)
        # The context manager releases the underlying file even if a crop or
        # save fails part-way through a long run.
        with Image.open(input_path) as original_image:
            width, height = original_image.size
            split_point = width // 2
            left_image = original_image.crop((overlap, 0, split_point + overlap, height))
            right_image = original_image.crop((split_point - overlap, 0, width - overlap, height))
            # Mirror the right crop horizontally.
            right_image = right_image.transpose(Image.FLIP_LEFT_RIGHT)
            left_image.save(os.path.join(output_dir, f"left_{image_file}"))
            right_image.save(os.path.join(output_dir, f"right_{image_file}"))

In [None]:
# split_images_overlap("./data/train/image", "./data/train/image_splited")
# split_images_overlap("./data/train/mask", "./data/train/mask_splited")
# split_images_overlap("./data/test/image", "./data/test/image_splited")
# split_images_overlap("./data/test/mask", "./data/test/mask_splited")
# split_images_overlap("./data/predict/image", "./data/predict/image_splited")

# Filter Illegal Data

In [6]:
def is_binary_image_all_zeros(image_path):
    """Return True when every pixel of the image is 0 after conversion to mode "L".

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        bool: True if the grayscale image contains no non-zero pixel.
    """
    with Image.open(image_path) as img:
        # getbbox() returns the bounding box of the non-zero region, or None
        # when there is none; this avoids building a Python list of all pixels.
        return img.convert("L").getbbox() is None
    

def get_illegal_data(mask_dir):
    """Collect the names of mask files in ``mask_dir`` that are entirely zero.

    Only files with an image extension are inspected. The match is
    case-insensitive so that files such as ``x.PNG`` are checked as well.

    Args:
        mask_dir: Directory containing the mask images.

    Returns:
        list[str]: File names (not full paths) for which
        ``is_binary_image_all_zeros`` returned True, in directory-listing order.
    """
    image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".bmp")
    illegal_data = []
    for filename in tqdm(os.listdir(mask_dir)):
        if not filename.lower().endswith(image_extensions):
            continue
        if is_binary_image_all_zeros(os.path.join(mask_dir, filename)):
            illegal_data.append(filename)
    return illegal_data


def perform_delete(iliiegal_data, image_dir, mask_dir):
    """Remove each listed file from ``image_dir`` and then from ``mask_dir``.

    Args:
        iliiegal_data: Iterable of file names to delete (name kept as-is for
            compatibility with existing callers).
        image_dir: Directory holding the image files.
        mask_dir: Directory holding the mask files with the same names.
    """
    for name in tqdm(iliiegal_data):
        # Image first, then its mask.
        for folder in (image_dir, mask_dir):
            target = os.path.join(folder, name)
            os.remove(target)


def filter_illegal_data(image_dir, mask_dir):
    """Find all-zero masks, ask for confirmation, then delete each image/mask pair.

    The names found are printed first. Pressing Enter or answering ``y`` /
    ``yes`` proceeds with the deletion; any other answer aborts without
    touching the files. When nothing was found there is no prompt.

    Args:
        image_dir: Directory holding the images.
        mask_dir: Directory holding the masks with the same file names.
    """
    illegal_data = get_illegal_data(mask_dir)
    print(f"Found {len(illegal_data)} illegal data:")
    print(illegal_data)
    if not illegal_data:
        # Nothing to delete, so do not block a full notebook run on a prompt.
        return
    # The answer used to be ignored, which made it impossible to decline.
    answer = input("Confirm? [Y/n] ").strip().lower()
    if answer not in ("", "y", "yes"):
        print("Aborted, nothing was deleted.")
        return
    perform_delete(illegal_data, image_dir, mask_dir)

In [7]:
# Drop image/mask pairs whose mask is entirely zero, for both subsets.
filter_jobs = [
    ("./data/train/image_splited", "./data/train/mask_splited"),
    ("./data/test/image_splited", "./data/test/mask_splited"),
]
for images_dir, masks_dir in filter_jobs:
    filter_illegal_data(images_dir, masks_dir)

100%|██████████| 5218/5218 [00:07<00:00, 677.85it/s]


Found 8 illegal data:
['left_train_0.png', 'left_train_1.png', 'right_A-17.png', 'left_train_898.png', 'right_train_419.png', 'right_train_1.png', 'right_train_0.png', 'left_A-5.png']


100%|██████████| 8/8 [00:00<00:00, 13706.88it/s]
100%|██████████| 582/582 [00:00<00:00, 680.74it/s]


Found 0 illegal data:
[]


0it [00:00, ?it/s]


# Generate CSV

In [10]:
def generate_csv(type, image_dir, mask_dir=None):
    """Write ``./data/<type>.csv`` listing image paths and their mask paths.

    The file has the header ``#, img, seg`` followed by one row per image:
    the row index, ``<type>/<image folder>/<file>`` and either
    ``<type>/<mask folder>/<file>`` or ``./placeholder.png`` when no mask
    directory is given.

    Images and masks are each sorted by file name and paired by position, so
    the pairing no longer depends on the arbitrary order of ``os.listdir``.

    Args:
        type: Subset name, used for the CSV file name and as path prefix.
        image_dir: Directory containing the images.
        mask_dir: Directory containing the masks, or ``None`` for data
            without masks.

    Raises:
        AssertionError: If the image and mask directories differ in entry count.
    """
    images = sorted(os.listdir(image_dir))
    # normpath strips a trailing slash, which would otherwise make basename
    # return "" and produce paths like "train//x.png".
    image_folder = os.path.basename(os.path.normpath(image_dir))

    if mask_dir is None:
        segs = ["./placeholder.png"] * len(images)
    else:
        masks = sorted(os.listdir(mask_dir))
        assert len(images) == len(masks)
        mask_folder = os.path.basename(os.path.normpath(mask_dir))
        segs = [f"{type}/{mask_folder}/{mask}" for mask in masks]

    with open(f"./data/{type}.csv", "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["#", "img", "seg"])
        for i, (image, seg) in enumerate(zip(images, segs)):
            writer.writerow([i, f"{type}/{image_folder}/{image}", seg])

In [11]:
# Write one CSV index per subset from the split (and filtered) folders.
csv_jobs = [
    ("train", "./data/train/image_splited", "./data/train/mask_splited"),
    ("test", "./data/test/image_splited", "./data/test/mask_splited"),
    # ("predict", "./data/predict/image"),
]
for csv_job in csv_jobs:
    generate_csv(*csv_job)