# Tiled HPA Dataset Basecode


In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
import random
import torch
from PIL import Image
from tqdm.auto import tqdm

In [3]:
import os
# Tell the Kaggle client which directory to use for its configuration
# (presumably where kaggle.json is looked up; verify the file actually lives there).
os.environ['KAGGLE_CONFIG_DIR'] = '/content/gdrive/MyDrive/AI/1.code/HPA_Hacking_the_human_body/dataset/'
# Colab-only: mount Google Drive so that paths under /content/gdrive resolve.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
cd /content/gdrive/MyDrive/AI/1.code/HPA_Hacking_the_human_body/

/content/gdrive/MyDrive/AI/1.code/HPA_Hacking_the_human_body


In [5]:
# Seed handed to seed_everything() for all random number generators.
SEED = 2022

# Dataset root, relative to the working directory. The trailing slash is
# intentional: the paths below are built by plain string concatenation.
HPA_path = './dataset/HPA_Hubmap_dataset/'
TRAIN = f'{HPA_path}train_images/'        # directory of training images
MASKS = f'{HPA_path}train_annotations/'   # directory of training annotations
LABELS = f'{HPA_path}train.csv'           # CSV read into df_train

In [6]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    reproducible execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only
    # influences processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different algorithms from
    # run to run, which breaks reproducibility; it must be off here.
    torch.backends.cudnn.benchmark = False
# Seed all random number generators once with the notebook-wide SEED.
seed_everything(SEED)

# Export masks
해당 코드는 rle mask 정보를 이용하여 이진 마스크와 class 마스크를 만들어 각각 디렉터리에 저장하는 코드이다. 디렉터리에 해당 이미지가 저장이 되어있는 경우, 실행하지 않도록 구현하였다.

In [7]:
def rle_decode(mask_rle: str, img_shape: tuple = None) -> np.ndarray:
    """Decode a run-length-encoded mask into a binary ``uint8`` array.

    ``mask_rle`` is a whitespace-separated sequence of ``start length``
    pairs. Each pair sets ``length`` consecutive entries of the flattened
    mask, beginning at offset ``start``, to 1. The flat mask is then
    reshaped to ``img_shape`` in row-major order.

    Args:
        mask_rle: The run-length encoding, e.g. ``"1 3 10 2"``. An empty
            string yields an all-zero mask.
        img_shape: Shape of the returned array. Required despite the
            ``None`` default, which is kept for signature compatibility.

    Returns:
        Array of shape ``img_shape`` and dtype ``uint8`` holding 0s and 1s.

    Raises:
        ValueError: If ``img_shape`` is missing or ``mask_rle`` does not
            contain an even number of tokens.
    """
    if img_shape is None:
        raise ValueError("img_shape is required to decode an RLE mask")

    tokens = mask_rle.split()
    if len(tokens) % 2:
        raise ValueError(
            f"RLE must consist of start/length pairs, got {len(tokens)} tokens"
        )

    # Explicit integer dtype keeps the arrays usable as slice bounds even
    # when the encoding is empty (an empty list would default to float64).
    # NOTE(review): starts are used as 0-based offsets; many Kaggle RLE
    # encodings are 1-based - confirm against the dataset description.
    starts = np.array([int(tok) for tok in tokens[0::2]], dtype=np.int64)
    lengths = np.array([int(tok) for tok in tokens[1::2]], dtype=np.int64)
    ends = starts + lengths

    # np.prod instead of np.product: the latter was a deprecated alias and
    # no longer exists in NumPy 2.0.
    img = np.zeros(int(np.prod(img_shape)), dtype=np.uint8)
    for begin, end in zip(starts, ends):
        img[begin:end] = 1
    return img.reshape(img_shape)

In [8]:
# Decode every RLE annotation in train.csv and store it twice as PNG:
#   * binary mask  (0 = background, 1 = annotated region)
#   * class mask   (0 = background, k = 1-based index of the row's organ)
df_train = pd.read_csv(LABELS)
binary_masks = HPA_path + 'train_binary_masks'
mclass_masks = HPA_path + 'train_mclass_masks'
label_list = list(df_train["organ"].unique())
# Class id per organ: position in label_list + 1, so 0 stays background.
organ_to_class = {organ: idx + 1 for idx, organ in enumerate(label_list)}

# Create the output directories up front; os.listdir() and Image.save()
# both fail on a missing directory.
for mask_dir in (binary_masks, mclass_masks):
    os.makedirs(mask_dir, exist_ok=True)

# Skip the export only when BOTH directories already hold one file per row
# of train.csv (replaces the hard-coded count of 351 on a single directory).
masks_exported = all(
    len(os.listdir(mask_dir)) == len(df_train)
    for mask_dir in (binary_masks, mclass_masks)
)

if not masks_exported:
    for _, row in tqdm(df_train.iterrows(), total=len(df_train)):
        # The transpose implies the RLE is stored column-major, so the flat
        # mask must be reshaped to (width, height) for the transposed result
        # to be (height, width). Identical to the previous behaviour for
        # square images; fixes non-square ones.
        mask = rle_decode(row['rle'], img_shape=(row["img_width"], row["img_height"])).T
        segm_path = os.path.join(binary_masks, f"{row['id']}.png")
        Image.fromarray(mask).save(segm_path)
        segm_path = os.path.join(mclass_masks, f"{row['id']}.png")
        # Keep uint8 explicitly so the PNG stays a single-channel 8-bit image.
        mask = (mask * organ_to_class[row["organ"]]).astype(np.uint8)
        Image.fromarray(mask).save(segm_path)

# Process Dataset
tile image 데이터셋 생성을 위해 tiled mask image와 tiled 원본 image를 저장하는 코드를 구현하였다.

In [9]:
def tile_image(p_img, folder, size: int = 768) -> tuple:
    """Cut an image into square tiles and save each tile as a PNG.

    The image is scanned row by row in steps of ``size``. Tiles on the
    bottom/right border (or the only tile of an image smaller than
    ``size``) are zero-padded to ``size x size`` before saving. Tiles are
    written to ``folder`` as ``<image stem>_<tile index>.png``.

    Args:
        p_img: Path of the image to tile.
        folder: Output directory; created if it does not exist.
        size: Edge length of a tile in pixels.

    Returns:
        A tuple ``(files, idxs)``: ``files`` lists the saved tile paths and
        ``idxs`` the matching ``(row_start, row_end, col_start, col_end)``
        windows. The end values are nominal (``start + size``) and may lie
        beyond the image border for padded tiles.
    """
    w = h = size
    # Context manager closes the underlying file once the pixels are copied.
    with Image.open(p_img) as img:
        im = np.array(img)
    os.makedirs(folder, exist_ok=True)

    origins = [(i, j)
               for i in range(0, im.shape[0], h)
               for j in range(0, im.shape[1], w)]
    idxs = [(i, i + h, j, j + w) for i, j in origins]
    name, _ = os.path.splitext(os.path.basename(p_img))
    files = []

    for k, (i, j) in enumerate(origins):
        tile = im[i:i + h, j:j + w, ...]
        if tile.shape[:2] != (h, w):
            # Pad against an explicit full-size canvas. Using the first tile
            # as template is wrong when that tile is itself undersized.
            padded = np.zeros((h, w) + im.shape[2:], dtype=im.dtype)
            padded[:tile.shape[0], :tile.shape[1], ...] = tile
            tile = padded
        tile_path = os.path.join(folder, f"{name}_{k:02}.png")
        Image.fromarray(tile).save(tile_path)
        files.append(tile_path)

    return files, idxs

In [11]:
from joblib import Parallel, delayed
from fastai.vision.all import *
# Imported explicitly (and after the star import) so that `glob` is
# guaranteed to be the standard-library module used below.
import glob

TILE_SIZE = 1024
HPA_tiled_path = './dataset/tiled_HPA_Hubmap_dataset/'
tile_image_path = HPA_tiled_path + "tiled_images"
tile_mask_path = HPA_tiled_path + "tiled_masks"

# Tile the training images and the multi-class masks with the same tile
# size, writing each set into its own target directory.
for dir_source, dir_target in [(TRAIN, tile_image_path), (mclass_masks, tile_mask_path)]:
    # Saving a tile fails if the target directory is missing.
    os.makedirs(dir_target, exist_ok=True)
    # Sorted so the processing order is the same on every run.
    ls = sorted(glob.glob(os.path.join(dir_source, '*')))
    # Lists the working directory; all dataset paths above are relative to it.
    print(os.listdir())
    _ = Parallel(n_jobs=3)(
        delayed(tile_image)(p_img, dir_target, size=TILE_SIZE) for p_img in tqdm(ls)
    )

['dataset', 'kaggle.json', 'EDA.ipynb', '.idea', 'DataCheck.ipynb', 'DataDownload.ipynb', 'HPA Dataset Basecode.ipynb']


  0%|          | 0/351 [00:00<?, ?it/s]

['dataset', 'kaggle.json', 'EDA.ipynb', '.idea', 'DataCheck.ipynb', 'DataDownload.ipynb', 'HPA Dataset Basecode.ipynb']


  0%|          | 0/351 [00:00<?, ?it/s]