In [None]:
# -*- coding: utf-8 -*-
import sys; print('Python %s on %s' % (sys.version, sys.platform))
import os
import time
import json
from glob import glob, iglob
from tqdm import tqdm
import matplotlib.pyplot as plt

import numpy as np; print('numpy', np.__version__)
import pandas as pd; print('pandas', pd.__version__)
import cv2; print('opencv2', cv2.__version__)

import settings
import helper
import visual

# 1. Load Annotation

In [None]:
# Read the preprocessed annotation table; rows are keyed by the 'seriesuid' column.
df_annotation = pd.read_csv(settings.PREPROCESS_ANNOTATION_FILE, index_col=['seriesuid'])
# Series UIDs are handled as strings everywhere below (e.g. the string EXAMPLE_SERIESUID lookup).
df_annotation.index = df_annotation.index.astype('str')
print('annotation:', df_annotation.shape, 'distinct lung:', df_annotation.index.nunique(dropna=False))

In [None]:
# Eyeball ten randomly chosen annotation rows (unseeded, so the rows differ per run).
df_annotation.sample(n=10)

In [None]:
# Distinct series UIDs present in the annotation table.
# Sorted so the processing order (and therefore the sequence of random draws in
# the augmentation cell) does not depend on set iteration order, which varies
# between interpreter runs for strings because of hash randomization.
lungs = sorted(set(df_annotation.index))
print('distinct lungs in annotation:', len(lungs))

# 2. Generate Positive Augmentation Labels

In [None]:
# Switch for the two cells below: when True they rebuild the augmented positive
# annotation list and overwrite settings.PREPROCESS_ANNOTATION_AUG_FILE.
IS_GENERATE = True

In [None]:
import random


def _offset_limit(diameter, cube_size, floor):
    """Return the jitter bound for one axis of one annotation.

    Half the diameter is used when the diameter is smaller than half the cube
    size; otherwise `floor` is used. The result is never smaller than `floor`.
    """
    limit = int(diameter // 2) if diameter < (cube_size // 2) else floor
    return max(limit, floor)


def _random_offset(limit):
    """Draw an integer offset from the half-open range [-limit, limit).

    Returns 0 when `limit` is not positive, because `random.randrange` raises
    ValueError on an empty range.
    NOTE(review): the range excludes +limit, so offsets are slightly biased
    towards negative values; kept as-is to preserve the existing distribution.
    """
    return random.randrange(-limit, limit) if limit > 0 else 0


# NOTE(review): the random module is not seeded, so every run produces a
# different set of augmented labels.
if IS_GENERATE:
    list_pos_aug = []
    min_offset_limit = settings.AUGMENTATION_MIN_OFFSET_LIMIT
    for uid in tqdm(lungs):
        labels = df_annotation.loc[[uid]]  # list selector -> always a DataFrame

        for _, item in labels.iterrows():
            # The bounds depend only on the annotation, so compute them once
            # per row instead of once per augmented copy.
            limitX = _offset_limit(item.diameterX, settings.CUBE_POS_SIZE, min_offset_limit)
            limitY = _offset_limit(item.diameterY, settings.CUBE_POS_SIZE, min_offset_limit)
            limitZ = _offset_limit(item.diameterZ, settings.CUBE_POS_SIZE, min_offset_limit)

            for _ in range(settings.AUGMENTATION_RATE):
                offsetX = _random_offset(limitX)
                offsetY = _random_offset(limitY)
                offsetZ = _random_offset(limitZ)

                # One augmented record: the original annotation with its
                # centre shifted by the sampled offset (the offset is stored too).
                list_pos_aug.append({
                    'seriesuid': uid,
                    'width': item.width,
                    'height': item.height,
                    'slice': item.slice,
                    'vcoordX': item.vcoordX + offsetX,
                    'vcoordY': item.vcoordY + offsetY,
                    'vcoordZ': item.vcoordZ + offsetZ,
                    'diameterX': item.diameterX,
                    'diameterY': item.diameterY,
                    'diameterZ': item.diameterZ,
                    'offsetX': offsetX,
                    'offsetY': offsetY,
                    'offsetZ': offsetZ,
                    'originX': item.originX,
                    'originY': item.originY,
                    'originZ': item.originZ,
                    'spacingX': item.spacingX,
                    'spacingY': item.spacingY,
                    'spacingZ': item.spacingZ,
                    'label': item.label,
                })

In [None]:
if IS_GENERATE:
    # Fixed column order for the augmented annotation CSV.
    aug_columns = [
        'seriesuid', 'width', 'height', 'slice',
        'vcoordX', 'vcoordY', 'vcoordZ',
        'diameterX', 'diameterY', 'diameterZ',
        'offsetX', 'offsetY', 'offsetZ',
        'originX', 'originY', 'originZ',
        'spacingX', 'spacingY', 'spacingZ', 'label',
    ]
    df_pos_aug = pd.DataFrame(list_pos_aug, columns=aug_columns).set_index('seriesuid')
    df_pos_aug.index = df_pos_aug.index.astype('str')

    # Volume dimensions are stored as integers.
    for dim_col in ('width', 'height', 'slice'):
        df_pos_aug[dim_col] = df_pos_aug[dim_col].astype('int')

    print('total:', df_pos_aug.shape[0], 'lung:', len(set(df_pos_aug.index)))

    # Persist the table; the extraction section reads it back from this file.
    df_pos_aug.to_csv(settings.PREPROCESS_ANNOTATION_AUG_FILE, encoding='utf-8')

# 3. Positive Samples Extraction

In [None]:
# Switch for the extraction cell below.
IS_EXTRACTION = True

# Make sure every directory the extraction cell writes into exists.
# makedirs(exist_ok=True) also repairs a partially created tree (parent present
# but a subdirectory missing) and covers the segmentation-label directories,
# which were previously never created here.
for out_dir in (
    settings.PREPROCESS_POS_DIR + 'lung/',
    settings.PREPROCESS_POS_DIR + 'medi/',
    settings.PREPROCESS_SEG_DIR + 'lung_label/',
    settings.PREPROCESS_SEG_DIR + 'medi_label/',
):
    os.makedirs(out_dir, exist_ok=True)

# Always reload the augmented annotations from disk so this section also works
# when the generation cells were skipped (IS_GENERATE = False).
df_pos_aug = pd.read_csv(filepath_or_buffer=settings.PREPROCESS_ANNOTATION_AUG_FILE, index_col=['seriesuid'])
df_pos_aug.index = df_pos_aug.index.astype('str')
print('anno aug:', df_pos_aug.shape, 'distinct lung:', len(set(df_pos_aug.index)))

In [None]:
if IS_EXTRACTION:
    for uid in tqdm(lungs):
        # `.loc[[uid]]` raises KeyError for an unknown label instead of
        # returning an empty frame, so test membership before selecting.
        if uid not in df_pos_aug.index:
            continue
        labels = df_pos_aug.loc[[uid]]

        # Volume dimensions are taken from the first row of this series.
        width = int(labels['width'].values[0])
        height = int(labels['height'].values[0])
        n_slice = int(labels['slice'].values[0])

        # Load both window types and blank out everything outside the mask.
        lung_l, mask_l = helper.load_lung_array(uid, width, height, n_slice, wtype='lung')
        lung_m, mask_m = helper.load_lung_array(uid, width, height, n_slice, wtype='medi')
        lung_l = lung_l*(mask_l>0)
        lung_m = lung_m*(mask_m>0)

        for idx, item in labels.iterrows():
            # Labels 1 and 5 are cut from the 'lung' volume, 31 and 32 from the 'medi' volume.
            label = int(item.label)
            if label in {1, 5}:
                lung, wtype = lung_l, 'lung'
            elif label in {31, 32}:
                lung, wtype = lung_m, 'medi'
            else:
                # Previously an unexpected label silently reused the volume chosen
                # for the preceding row (or raised NameError on the first row).
                tqdm.write(f'skip {idx}: unsupported label {label}')
                continue

            cube = helper.get_cube_from_lung_array(lung, item.vcoordX, item.vcoordY, item.vcoordZ, block_size=settings.CUBE_POS_SIZE)
            # Cubes whose voxel sum does not exceed the threshold are not saved.
            if np.sum(cube) > settings.THRESHOLD_VALID_CUBE:
                seg_label = helper.create_seg_label(diameter=np.array([item.diameterZ, item.diameterY, item.diameterX]),
                                                    offset=np.array([item.offsetZ, item.offsetY, item.offsetX]),
                                                    block_size=settings.CUBE_POS_SIZE)

                # The image and its segmentation label share one file name.
                # NOTE(review): two augmented copies with identical offsets map
                # to the same name, so the later one overwrites the earlier.
                file_name = (f'{idx}_x{int(item.vcoordX)}_y{int(item.vcoordY)}_z{int(item.vcoordZ)}'
                             f'_dx{int(round(item.diameterX))}_dy{int(round(item.diameterY))}'
                             f'_dz{int(round(item.diameterZ,0))}_l{label}.png')

                helper.save_cube_img(f'{settings.PREPROCESS_POS_DIR}{wtype}/{file_name}', cube, rows=8, cols=8)
                helper.save_cube_img(f'{settings.PREPROCESS_SEG_DIR}{wtype}_label/{file_name}', seg_label, rows=8, cols=8)
            

# 4. Validate the Positive Samples

In [None]:
# Sanity check: number of original annotations, number of augmented annotations,
# and number of PNG files found one directory level below PREPROCESS_POS_DIR.
# The PNG count can be lower than the augmented count because the extraction cell
# only saves cubes whose sum exceeds settings.THRESHOLD_VALID_CUBE.
print(len(df_annotation), len(df_pos_aug), len(glob(settings.PREPROCESS_POS_DIR + '*/*.png')))

In [None]:
# Series UID (as a string, matching the string index of df_pos_aug) and window
# type used as the worked example in the cells below.
EXAMPLE_SERIESUID = '660577'
WTYPE = 'medi'

In [None]:
# Augmented annotation rows that belong to the example series.
labels = df_pos_aug.loc[[EXAMPLE_SERIESUID], :]
print('labels:', labels.shape[0])

# 5. Visualize Samples By Labels

In [None]:
# Series UID and window type whose saved cubes are displayed by the next cell.
# These are the same values assigned in section 4; change them here to browse
# a different series or the other window type.
EXAMPLE_SERIESUID = '660577'
WTYPE = 'medi'

In [None]:
# Show every saved cube of the example series next to the same cube restricted
# to its segmentation label.
# Saved files are named '<seriesuid>_x..._y...', so anchoring the pattern on '_x'
# avoids also matching other series whose UID merely starts with the same digits.
sample_pattern = settings.PREPROCESS_POS_DIR + WTYPE + '/' + f'{EXAMPLE_SERIESUID}_x*.png'
for img_file in iglob(sample_pattern):
    # Build the label path from its parts. A blanket str.replace(WTYPE, ...)
    # would also rewrite 'lung'/'medi' wherever they occur in the directory names.
    seg_file = f'{settings.PREPROCESS_SEG_DIR}{WTYPE}_label/{os.path.basename(img_file)}'
    print(img_file, '\n', seg_file)

    img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
    seg = cv2.imread(seg_file, cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None instead of raising when a file cannot be read.
    if img is None or seg is None:
        print('skip: could not read image and/or label file')
        continue

    fig, axs = plt.subplots(1, 2, figsize=(32, 16))
    axs[0].imshow(img, cmap='gray')
    axs[0].set_title('cube')
    axs[1].imshow(img*(seg>0), cmap='gray')
    axs[1].set_title('cube inside segmentation label')
    plt.show()
    plt.close(fig)  # release the figure; many samples would otherwise accumulate