In [1]:
import json
import math
import os
import time
from random import seed, random, choices

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from PIL import Image

%matplotlib inline

In [39]:
# Machine identifiers. They are compared against the `machine` argument in
# cut_layout() and save(), and are also formatted into the output directory name.
MACHINE_ASTROPHYSICS = "Astrophysics"
MACHINE_RAPISCAN = "Rapiscan"
MACHINE_SMITH = "Smith"

# Astrophysics: cut_layout() keeps image rows [0, this value).
ASTROPHYSICS_LAYOUT_HEIGHT_BOUNDARY = 1000

# Rapiscan: cut_layout() keeps image rows [Y_BOUNDARY, HEIGHT_BOUNDARY).
RAPISCAN_LAYOUT_Y_BOUNDARY = 40
RAPISCAN_LAYOUT_HEIGHT_BOUNDARY = 980

# Smith: cut_layout() keeps image rows [0, this value).
SMITH_LAYOUT_HEIGHT_BOUNDARY = 920

# get_boundary_box() ignores contours whose cv2.contourArea() is not below this value.
CONTOUR_AREA_BOUNDARY = 4000000

In [40]:
def get_path(annotation_path: str) -> tuple:
    """Load an annotation JSON file and flatten it into (image path, label) pairs.

    The file must contain an ``images`` list whose entries carry ``path``,
    ``width`` and ``height``, and an ``annotations`` list whose entries carry
    ``image_id`` and ``category_id``. Each distinct category annotated on an
    image produces one (path, label) pair, so an image with several categories
    appears several times.

    Args:
        annotation_path: Path of the annotation JSON file.

    Returns:
        ``(image_path_list, image_label_list, (avg_width, avg_height))`` where
        the two lists are index-aligned and the averages are the truncated
        integer means over every entry of ``images``.

    Raises:
        ValueError: If ``images`` is empty (no average size can be computed).
        KeyError: If an annotation references an ``image_id`` that no image
            entry declares.
    """
    # Open the annotation file.
    with open(annotation_path) as json_file:
        entire_data = json.load(json_file)

    image_data_list = entire_data['images']
    annotation_data_list = entire_data['annotations']

    if not image_data_list:
        raise ValueError("No images listed in annotation file: {}".format(annotation_path))

    # Average width / height over all listed images (truncated to int).
    image_count = len(image_data_list)
    avg_width = int(sum(image_data['width'] for image_data in image_data_list) / image_count)
    avg_height = int(sum(image_data['height'] for image_data in image_data_list) / image_count)

    # Collect the distinct categories per image. Keying on image_id (instead of
    # comparing with the previous annotation only) also merges annotations of
    # the same image that are not adjacent in the file.
    categories_by_image_id = {}
    for annotation_data in annotation_data_list:
        image_id = annotation_data['image_id']
        categories_by_image_id.setdefault(image_id, set()).add(annotation_data['category_id'])

    # Resolve paths through the image id so that images without annotations do
    # not shift the pairing. Assumes COCO-style 'id' keys on image entries; when
    # they are absent, fall back to the former positional pairing.
    has_image_ids = all('id' in image_data for image_data in image_data_list)
    path_by_image_id = {}
    if has_image_ids:
        path_by_image_id = {image_data['id']: image_data['path'] for image_data in image_data_list}

    # Emit one (image_path, label) pair per image/category combination.
    image_path_list = list()
    image_label_list = list()
    for position, (image_id, category_ids) in enumerate(categories_by_image_id.items()):
        if has_image_ids:
            if image_id not in path_by_image_id:
                raise KeyError("Annotation references unknown image_id: {}".format(image_id))
            image_path = path_by_image_id[image_id]
        else:
            image_path = image_data_list[position]['path']

        for category_id in category_ids:
            image_path_list.append(image_path)
            image_label_list.append(category_id)

    return image_path_list, image_label_list, (avg_width, avg_height)

In [41]:
def get_boundary_box(img_data):
    """Compute a single box enclosing every qualifying contour of an image.

    The image is converted to grayscale, blurred, inverse-thresholded at its
    mean intensity and morphologically opened; contours are then extracted and
    those with an area of ``CONTOUR_AREA_BOUNDARY`` or more are discarded.

    Args:
        img_data: Image array accepted by ``cv2.cvtColor`` with
            ``COLOR_BGR2GRAY`` (presumably a BGR image from ``cv2.imread``).

    Returns:
        ``[x_min, y_min, x_max, y_max]`` covering all remaining contours, in
        the corner form expected by ``img[y_min:y_max, x_min:x_max]``, or
        ``[-1, -1, -1, -1]`` when no contour qualifies.
    """
    img_gray = cv2.cvtColor(img_data, cv2.COLOR_BGR2GRAY)
    img_gray = cv2.blur(img_gray, (15, 15))

    # Inverse binary threshold at the (floored) mean gray level.
    _, thresh = cv2.threshold(img_gray, math.floor(np.average(img_gray)), 255, cv2.THRESH_BINARY_INV)
    # Morphological opening with a 10x10 elliptical kernel.
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (10, 10))
    opened = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)
    contours, _ = cv2.findContours(opened, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

    # Keep only contours below the area limit and take their bounding rects.
    rects = [
        cv2.boundingRect(contour)
        for contour in contours
        if cv2.contourArea(contour) < CONTOUR_AREA_BOUNDARY
    ]

    if not rects:
        return [-1, -1, -1, -1]

    # Union of all rects as corner coordinates. The first rect is converted to
    # corners too, so slots 2 and 3 never hold a bare width/height.
    return [
        min(x for x, _, _, _ in rects),
        min(y for _, y, _, _ in rects),
        max(x + w for x, _, w, _ in rects),
        max(y + h for _, y, _, h in rects),
    ]

In [42]:
def cut_layout(img_data, machine):
    """Crop an image to the row range configured for the given machine.

    Args:
        img_data: Image array indexable as ``img_data[rows, cols]``.
        machine: One of ``MACHINE_ASTROPHYSICS``, ``MACHINE_RAPISCAN`` or
            ``MACHINE_SMITH``.

    Returns:
        The slice of ``img_data`` restricted to the machine's row range; all
        columns are kept.

    Raises:
        ValueError: If ``machine`` is not one of the known machines. Failing
            here avoids handing ``None`` to the caller's next image operation.
    """
    if machine == MACHINE_ASTROPHYSICS:
        row_start, row_stop = None, ASTROPHYSICS_LAYOUT_HEIGHT_BOUNDARY
    elif machine == MACHINE_RAPISCAN:
        row_start, row_stop = RAPISCAN_LAYOUT_Y_BOUNDARY, RAPISCAN_LAYOUT_HEIGHT_BOUNDARY
    elif machine == MACHINE_SMITH:
        row_start, row_stop = None, SMITH_LAYOUT_HEIGHT_BOUNDARY
    else:
        raise ValueError("Unknown machine: {!r}".format(machine))

    return img_data[row_start:row_stop, :]

In [43]:
def save_image(save_dir_path: str, data_path_list: list, resize_img_size: tuple, idx: int):
    """Read one image, crop its layout rows, resize it and write it out.

    Args:
        save_dir_path: Directory the processed image is written into.
        data_path_list: List of "/"-separated source image paths.
        resize_img_size: ``(width, height)`` passed to ``cv2.resize``.
        idx: Index into ``data_path_list`` of the image to process.

    Raises:
        OSError: If the source image cannot be read or the result cannot be
            written (``cv2.imread`` / ``cv2.imwrite`` report these silently).
        ValueError: Propagated from ``cut_layout`` for an unknown machine.
    """
    resize_width = resize_img_size[0]
    resize_height = resize_img_size[1]

    data_path = data_path_list[idx]
    path_parts = data_path.split("/")
    img_id = path_parts[-1]
    # The machine name is the third path component; presumably paths look like
    # "./datasets/<machine>/..." - verify against the annotation files.
    machine = path_parts[2]

    img_data = cv2.imread(data_path)
    if img_data is None:
        raise OSError("Image is missing or unreadable: {}".format(data_path))

    img_data = cut_layout(img_data, machine)

    # Auto cropping via get_boundary_box() is currently disabled.

    # Resize to the requested size.
    img_data = cv2.resize(img_data, dsize=(resize_width, resize_height), interpolation=cv2.INTER_AREA)

    # Save image
    output_path = "{}/{}".format(save_dir_path, img_id)
    if not cv2.imwrite(output_path, img_data):
        raise OSError("Failed to write image: {}".format(output_path))

In [44]:
def save_csv(save_path: str, data_path_list: list, label_list: list):
    """Write an (ImageFileName, Label) table to a CSV file.

    Args:
        save_path: Destination path of the CSV file.
        data_path_list: "/"-separated image paths; only the last component of
            each path is written.
        label_list: Labels paired with ``data_path_list`` by position.
    """
    # Strip everything up to the last "/" to obtain the file names.
    file_names = [data_path.rsplit("/", 1)[-1] for data_path in data_path_list]

    rows = list(zip(file_names, label_list))
    label_table = pd.DataFrame(rows, columns=['ImageFileName', 'Label'])

    # No index column in the output file.
    label_table.to_csv(save_path, index=False)

### Save Images

In [45]:
def save(annotation_path: str, machine: str, divide_prop: float=0.125):
    """Build a resized image dataset plus a label CSV for one machine.

    Images referenced by the annotation file are cropped, resized and written
    to ``dataset_<machine>_<width>_<height>/img_folder`` under the current
    working directory; ``data_label_info.csv`` is written next to that folder.

    Args:
        annotation_path: Path of the annotation JSON file read by ``get_path``.
        machine: One of ``MACHINE_ASTROPHYSICS``, ``MACHINE_RAPISCAN`` or
            ``MACHINE_SMITH``.
        divide_prop: Scale factor applied to the average image width and to the
            machine's cropped height to obtain the output size.

    Raises:
        ValueError: If ``machine`` is not one of the known machines.
        OSError: If the output directory cannot be created.
    """
    # Height that remains after the layout rows are cut, per machine.
    cropped_height_by_machine = {
        MACHINE_ASTROPHYSICS: ASTROPHYSICS_LAYOUT_HEIGHT_BOUNDARY,
        MACHINE_RAPISCAN: RAPISCAN_LAYOUT_HEIGHT_BOUNDARY - RAPISCAN_LAYOUT_Y_BOUNDARY,
        MACHINE_SMITH: SMITH_LAYOUT_HEIGHT_BOUNDARY,
    }
    if machine not in cropped_height_by_machine:
        raise ValueError("Unknown machine: {!r}".format(machine))

    # Load data
    image_path_list, label_list, (avg_width, _) = get_path(annotation_path)

    # Annotation paths are prefixed with "." to make them relative to the
    # current directory.
    new_image_path_list = ["." + image_path for image_path in image_path_list]
    # Each image is listed once per category; save every file only once.
    unique_image_path_list = list(dict.fromkeys(new_image_path_list))

    resize_width = int(avg_width * divide_prop)
    resize_height = int(cropped_height_by_machine[machine] * divide_prop)

    # Paths to be created, relative to the current working directory.
    target = "dataset_{}_{}_{}/img_folder".format(machine, resize_width, resize_height)
    csv_save_path = 'dataset_{}_{}_{}/data_label_info.csv'.format(machine, resize_width, resize_height)

    cur_path = os.getcwd()
    target_path = os.path.join(cur_path, target)
    csv_save_path = os.path.join(cur_path, csv_save_path)

    print("Resize Width : {}, Resize Height : {}".format(resize_width, resize_height))
    print("Image directory path : {}".format(target_path))
    print(".csv file path : {}".format(csv_save_path))

    # Make directory. exist_ok=True already tolerates an existing directory, so
    # any OSError raised here is a real failure and is left to propagate.
    os.makedirs(target_path, exist_ok=True)

    # Save images
    for i in range(len(unique_image_path_list)):
        save_image(target_path, unique_image_path_list, (resize_width, resize_height), i)
    print("Complete to save images")

    # Save .csv file
    save_csv(csv_save_path, new_image_path_list, label_list)
    print("Complete to save .csv file")

In [46]:
# Build the Astrophysics dataset with save() and print the elapsed wall-clock seconds.
start = time.time()

save(
    annotation_path="./datasets/Annotation/Astrophysics.json",
    machine=MACHINE_ASTROPHYSICS
)

end = time.time()

print("Elapsed Time : {}".format(end - start))

Resize Width : 240, Resize Height : 125
Image directory path : /root/local_vol/bk/dataset_Astrophysics_240_125/img_folder
.csv file path : /root/local_vol/bk/dataset_Astrophysics_240_125/data_label_info.csv
Complete to save images
Complete to save .csv file
Elapsed Time : 2.662433624267578


In [47]:
# Build the Rapiscan dataset with save() and print the elapsed wall-clock seconds.
start = time.time()
save(
    annotation_path="./datasets/Annotation/Rapiscan.json",
    machine=MACHINE_RAPISCAN
)
end = time.time()

print("Elapsed Time : {}".format(end - start))

Resize Width : 210, Resize Height : 117
Image directory path : /root/local_vol/bk/dataset_Rapiscan_210_117/img_folder
.csv file path : /root/local_vol/bk/dataset_Rapiscan_210_117/data_label_info.csv
Complete to save images
Complete to save .csv file
Elapsed Time : 4.385071516036987


In [48]:
# Build the Smith dataset with save() and print the elapsed wall-clock seconds.
start = time.time()
save(
    annotation_path="./datasets/Annotation/Smith.json",
    machine=MACHINE_SMITH
)
end = time.time()

print("Elapsed Time : {}".format(end - start))

Resize Width : 210, Resize Height : 115
Image directory path : /root/local_vol/bk/dataset_Smith_210_115/img_folder
.csv file path : /root/local_vol/bk/dataset_Smith_210_115/data_label_info.csv
Complete to save images
Complete to save .csv file
Elapsed Time : 2.727540969848633
