# 0. 초기 세팅

In [None]:
import os
import cv2
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Select the compute device for PyTorch: use the GPU ("cuda") when one is
# available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# torch ver 1.12.1
# torchaudio ver 0.12.1
# torchvision 0.13.1

In [None]:
import random

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and, when present, every CUDA device), and exports
    ``PYTHONHASHSEED`` through ``os.environ``.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Exported for child processes; it does not change hashing in the
    # interpreter that is already running.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # PyTorch keeps its own generators, so without these calls torch-based
    # randomness would not be reproducible between runs.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable

seed_everything(42) # Fix the seed for reproducibility

In [None]:
# Report the installed PyTorch release and the CUDA version it was built with.
# `torch.version` is a submodule, so printing it shows a module repr; the
# release string lives in `torch.__version__`.
print(torch.__version__)
print(torch.version.cuda)  # None on CPU-only builds

# 1. 데이터 준비

In [None]:
!pip install patchify

In [None]:
from matplotlib import pyplot as plt
from patchify import patchify
import tifffile as tiff
from PIL import Image
import random

In [None]:
from tensorflow import keras
import segmentation_models as sm

In [None]:
import tensorflow
from keras.metrics import MeanIoU

In [None]:
# RLE decoding function
def rle_decode(mask_rle, shape):
    """Turn a run-length-encoded string into a binary mask.

    Args:
        mask_rle: Whitespace-separated string of alternating
            ``start length`` values; starts are 1-based positions in the
            flattened mask.
        shape: ``(height, width)`` of the mask to build.

    Returns:
        ``np.uint8`` array of the given shape with 1 inside every run and 0
        elsewhere.
    """
    tokens = mask_rle.split()
    # Even-positioned tokens are run starts (converted to 0-based),
    # odd-positioned tokens are run lengths.
    starts = np.asarray(tokens[0::2], dtype=int) - 1
    lengths = np.asarray(tokens[1::2], dtype=int)
    ends = starts + lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(starts, ends):
        flat[begin:stop] = 1
    return flat.reshape(shape)


# RLE encoding function
def rle_encode(mask):
    """Run-length encode a binary mask.

    Args:
        mask: Array whose flattened values are 0 for background and non-zero
            constant for foreground.

    Returns:
        Space-separated string of alternating ``start length`` values, with
        1-based starts into the flattened mask; an empty string when the
        mask contains no foreground.
    """
    # Pad with a zero on both sides so runs touching either end still
    # produce a rising and a falling edge.
    padded = np.concatenate([[0], mask.flatten(), [0]])
    edges = np.where(padded[1:] != padded[:-1])[0] + 1
    # Edges alternate start, end, start, end...; turn each end into a length.
    edges[1::2] -= edges[::2]
    return " ".join(map(str, edges))

`1` Quick understanding of the dataset

In [None]:
# Load the training and test index tables from the working directory.
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')

In [None]:
# Preview the on-disk path of the first training image: the first character
# of the stored `img_path` is dropped and './data' is prepended.
# NOTE(review): presumably `img_path` starts with '.', giving './data/...' — confirm.
'./data' + df_train.loc[0, 'img_path'][1:]

In [None]:
# Inspect the first training sample: show one image channel and list the label
# values present in its decoded mask.
temp_img = cv2.imread('./data' + df_train.loc[0, 'img_path'][1:]) # 3-channel image; OpenCV loads channels in BGR order
plt.imshow(temp_img[:,:,2]) # Show a single channel (index 2, i.e. red under BGR ordering)
temp_mask = rle_decode(df_train.loc[0, 'mask_rle'], shape = (1024, 1024)) # 2-D mask; assumes the image is 1024x1024 — TODO confirm
labels, count = np.unique(temp_mask[:, :], return_counts=True) # Distinct label values in the mask and how many pixels carry each
print("Labels are: ", labels, " and the counts are: ", count)

`2` Now, crop each large image into patches of 224x224 and save them into a directory, so we can use data augmentation and read directly from the drive.

In [None]:
# Side length, in pixels, of the square patches cut from every image and mask.
patch_size = 224

In [None]:
# Cut every training image into non-overlapping patch_size x patch_size tiles
# and save each tile as a PNG.
patch_image_dir = "./224_patches/images/"
# cv2.imwrite returns False instead of raising when the target folder is
# missing, so create it up front.
os.makedirs(patch_image_dir, exist_ok=True)

for i in range(len(df_train)):
    img_path = './data' + df_train.loc[i, 'img_path'][1:]
    image = cv2.imread(img_path)       #Read each image as BGR
    if image is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise FileNotFoundError("Could not read image: " + img_path)

    SIZE_X = (image.shape[1] // patch_size) * patch_size  #Largest width divisible by our patch size
    SIZE_Y = (image.shape[0] // patch_size) * patch_size  #Largest height divisible by our patch size
    # Crop from the top-left corner so the image tiles exactly into patches.
    image = np.ascontiguousarray(image[:SIZE_Y, :SIZE_X])

    #Extract patches from each image
    print("Now patchifying image:", img_path)
    patches_img = patchify(image, (patch_size, patch_size, 3), step=patch_size)

    for j in range(patches_img.shape[0]):
        for k in range(patches_img.shape[1]):

            # patchify adds a singleton axis for the channel window; index 0 drops it.
            single_patch_img = patches_img[j, k, 0]

            patch_path = patch_image_dir + df_train.loc[i, 'img_id'] + "patch_"+str(j)+str(k)+".png"
            if not cv2.imwrite(patch_path, single_patch_img):
                raise OSError("Failed to write patch: " + patch_path)

In [None]:
# Decode every training mask, cut it into non-overlapping
# patch_size x patch_size tiles and save each tile as a PNG.
patch_mask_dir = "./224_patches/masks/"
# cv2.imwrite returns False instead of raising when the target folder is
# missing, so create it up front.
os.makedirs(patch_mask_dir, exist_ok=True)

for i in range(len(df_train)):
    # NOTE(review): assumes every training image is 1024x1024 — confirm.
    mask = rle_decode(df_train.loc[i, 'mask_rle'], shape = (1024, 1024))
    SIZE_X = (mask.shape[1] // patch_size) * patch_size  #Largest width divisible by our patch size
    SIZE_Y = (mask.shape[0] // patch_size) * patch_size  #Largest height divisible by our patch size
    # Crop from the top-left corner so the mask tiles exactly into patches.
    mask = np.ascontiguousarray(mask[:SIZE_Y, :SIZE_X])

    #Extract patches from each mask
    print("Now patchifying mask:", i)
    patches_mask = patchify(mask, (patch_size, patch_size), step=patch_size)

    for j in range(patches_mask.shape[0]):
        for k in range(patches_mask.shape[1]):

            single_patch_mask = patches_mask[j, k]

            # The file name keeps only the last four characters of img_id.
            patch_path = patch_mask_dir + "MASK_" + df_train.loc[i, 'img_id'][-4:] + "patch_"+str(j)+str(k)+".png"
            if not cv2.imwrite(patch_path, single_patch_mask):
                raise OSError("Failed to write patch: " + patch_path)

In [None]:
# Spot-check one saved patch pair. Flag 1 loads the image patch in colour;
# it is then converted from OpenCV's BGR order to RGB for matplotlib.
image_test = cv2.imread("./224_patches/images/TRAIN_0000patch_01.png", 1)
image_test = cv2.cvtColor(image_test, cv2.COLOR_BGR2RGB)
# Flag 0 loads the matching mask patch as a single-channel grayscale image.
mask_test = cv2.imread("./224_patches/masks/MASK_0000patch_01.png", 0)

In [None]:
plt.imshow(image_test)

In [None]:
plt.imshow(mask_test)

In [None]:
# Folders holding the image and mask patches.
train_img_dir = "224_patches/images/"
train_mask_dir = "224_patches/masks/"

# File names in each folder; os.listdir returns them in arbitrary order.
img_list = os.listdir(train_img_dir)
msk_list = os.listdir(train_mask_dir)

# Number of image patches found.
num_images = len(img_list)

In [None]:
# Sanity check: the image and mask folders should hold the same number of files.
print(len(img_list), len(msk_list))

In [None]:
# Sort both listings so the same index can be used for an image and its mask.
# NOTE(review): this relies on image and mask file names sorting in the same
# relative order — confirm.
img_list = sorted(img_list)
msk_list = sorted(msk_list)

In [None]:
# Draw one patch index at random and show the image next to its mask.
img_num = random.randint(0, num_images-1)

sample_image_path = train_img_dir + img_list[img_num]
sample_mask_path = train_mask_dir + msk_list[img_num]

# Colour read (flag 1) converted from BGR to RGB; mask read as grayscale (flag 0).
img_for_plot = cv2.cvtColor(cv2.imread(sample_image_path, 1), cv2.COLOR_BGR2RGB)
mask_for_plot = cv2.imread(sample_mask_path, 0)

fig, (ax_image, ax_mask) = plt.subplots(1, 2, figsize=(12, 8))
ax_image.imshow(img_for_plot)
ax_image.set_title('Image')
ax_mask.imshow(mask_for_plot, cmap='gray')
ax_mask.set_title('Mask')
plt.show()

In [None]:
img_list[img_num]

In [None]:
msk_list[img_num]

`3` Now, let us copy the images and masks that contain real information to a new folder.
Here, "real information" means that the mask has a decent amount of labels other than 0 (more than 5% of its pixels in the code below).

In [None]:
# Copy every patch whose mask has more than 5% non-zero pixels into the
# "images_with_useful_info" folders and count the patches that are skipped.
useful_img_out_dir = './224_patches/images_with_useful_info/images/'
useful_msk_out_dir = './224_patches/images_with_useful_info/masks/'
# cv2.imwrite returns False instead of raising when the target folder is
# missing, so create both up front.
os.makedirs(useful_img_out_dir, exist_ok=True)
os.makedirs(useful_msk_out_dir, exist_ok=True)

useless=0  #Useless image counter
for img in range(len(img_list)):   #img_list and msk_list are indexed in parallel
    img_name=img_list[img]
    mask_name = msk_list[img]
    print("Now preparing image and masks number: ", img)

    temp_image=cv2.imread(train_img_dir+img_name, 1)
    temp_mask=cv2.imread(train_mask_dir+mask_name, 0)
    if temp_image is None or temp_mask is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise FileNotFoundError("Could not read patch pair: " + img_name + " / " + mask_name)

    # Fraction of labelled (non-zero) pixels. Counting non-zeros directly
    # stays correct for an all-foreground patch, where np.unique has no
    # entry for 0 and "1 - counts[0] / total" would wrongly evaluate to 0.
    labelled_fraction = np.count_nonzero(temp_mask) / temp_mask.size

    if labelled_fraction > 0.05:  #At least 5% useful area with labels that are not 0
        print("Save Me")
        cv2.imwrite(useful_img_out_dir + img_name, temp_image)
        cv2.imwrite(useful_msk_out_dir + mask_name, temp_mask)

    else:
        print("I am useless")
        useless +=1

# The author's original run noted 48,843 useful and 65,397 useless patches;
# the counts may differ now that all-foreground patches are kept.
print("Total useful images are: ", len(img_list)-useless)
print("Total useless images are: ", useless)

In [None]:
# Folders holding the patches that passed the "useful information" filter.
useful_train_img_dir = './224_patches/images_with_useful_info/images/'
useful_train_mask_dir = './224_patches/images_with_useful_info/masks/'

In [None]:
# File names of the kept image and mask patches (arbitrary order from os.listdir).
useful_img_list = os.listdir(useful_train_img_dir)
useful_msk_list = os.listdir(useful_train_mask_dir)

In [None]:
# Sort both listings so the same index can be used for an image and its mask.
# NOTE(review): relies on image and mask file names sorting in the same
# relative order — confirm.
useful_img_list = sorted(useful_img_list)
useful_msk_list = sorted(useful_msk_list)
# Number of kept image patches.
useful_num_images = len(useful_img_list)

In [None]:
# Draw one kept patch index at random and show the image next to its mask.
img_num = random.randint(0, useful_num_images-1)

sample_image_path = useful_train_img_dir + useful_img_list[img_num]
sample_mask_path = useful_train_mask_dir + useful_msk_list[img_num]

# Colour read (flag 1) converted from BGR to RGB; mask read as grayscale (flag 0).
img_for_plot = cv2.cvtColor(cv2.imread(sample_image_path, 1), cv2.COLOR_BGR2RGB)
mask_for_plot = cv2.imread(sample_mask_path, 0)

fig, (ax_image, ax_mask) = plt.subplots(1, 2, figsize=(12, 8))
ax_image.imshow(img_for_plot)
ax_image.set_title('Image')
ax_mask.imshow(mask_for_plot, cmap='gray')
ax_mask.set_title('Mask')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the kept patches. Both lists go through the same call so each
# image stays paired with its mask; random_state fixes the shuffle.
# NOTE: this rebinds `mask_test`, which an earlier cell used for a sample mask patch.
img_train, img_test, mask_train, mask_test = train_test_split(useful_img_list, useful_msk_list, test_size=0.25, random_state=42)

In [None]:
# Tabulate the split as image/mask file-name pairs. The held-out portion
# (img_test / mask_test) serves as the validation set.
train_set = pd.DataFrame({'img_path': img_train, 'mask_path': mask_train})
valid_set = pd.DataFrame({'img_path': img_test, 'mask_path': mask_test})

In [None]:
valid_set

In [None]:
# Load the test index table (the same file an earlier cell read into `df_test`).
test_set = pd.read_csv('./test.csv')

In [None]:
test_set

`4` split folders

In [None]:
!pip install split-folders

In [None]:
import splitfolders  # or import split_folders

# Source folder (contains the `images/` and `masks/` sub-folders written above)
# and the destination for the split copy.
input_folder = './224_patches/images_with_useful_info/'
output_folder = './pro_data/data_for_training_and_testing/'

# Split with a ratio.
# A two-element `ratio` tuple produces only a training and a validation set (here 75% / 25%).
# NOTE(review): presumably each sub-folder is shuffled separately with the same seed, so
# image/mask pairs only stay aligned if both hold matching, equally ordered files — verify.
splitfolders.ratio(input_folder, output=output_folder, seed=42, ratio=(.75, .25), group_prefix=None) # seed and ratio are set explicitly here, not library defaults