In [None]:
# -*- coding: utf-8 -*-
import sys; print('Python %s on %s' % (sys.version, sys.platform))
import os
import time
import json
from glob import glob, iglob
from tqdm import tqdm
from random import random
import matplotlib.pyplot as plt

import numpy as np; print('numpy', np.__version__)
import pandas as pd; print('pandas', pd.__version__)
import cv2; print('opencv2', cv2.__version__)

import settings
import helper
import visual

# 1. Load Meta File and Annotation

In [None]:
# Training-set metadata table, indexed by seriesuid.
# The index is cast to str so that series can be looked up by string ids.
df_meta_train = pd.read_csv(settings.PREPROCESS_TRAIN_META_FILE, index_col=['seriesuid'])
df_meta_train.index = df_meta_train.index.astype(str)

In [None]:
# Preview the first five metadata rows as a sanity check of the load.
df_meta_train.head(n=5)

In [None]:
# Annotation table, indexed by seriesuid (cast to str, one row per annotation).
df_annotation = pd.read_csv(settings.PREPROCESS_ANNOTATION_FILE, index_col=['seriesuid'])
df_annotation.index = df_annotation.index.astype(str)
# The index repeats per annotation row, so count distinct series separately.
print('annotation:', df_annotation.shape, 'distinct lung:', df_annotation.index.nunique())

In [None]:
# Show ten randomly chosen annotation rows (no random_state, so the rows differ between runs).
df_annotation.sample(n=10)

In [None]:
# Distinct series ids that have at least one annotation row.
# Index.unique() keeps first-appearance order, so the processing order (and the
# row order of any file generated from it) is reproducible between kernel runs;
# list(set(...)) over strings is ordered by per-process string hashing.
lungs = df_annotation.index.unique().tolist()
print('distinct lungs in annotation:', len(lungs))

# 2. Generate Negative Labels

In [None]:
# Switch for the two generation cells below: when True the false-positive
# candidates are filtered against the annotations and written back to disk.
IS_GENERATE = False

# False-positive candidate table, indexed by seriesuid (cast to str).
df_false_positive = pd.read_csv(filepath_or_buffer=settings.PREPROCESS_FALSE_POSITIVE_FILE, index_col=['seriesuid'])
df_false_positive.index = df_false_positive.index.astype('str')
# Count distinct series in the false-positive table itself, not in the annotation table.
print('false_positive:', df_false_positive.shape, 'distinct lung:', len(set(df_false_positive.index)))

In [None]:
if IS_GENERATE:
    # A candidate is rejected when its Euclidean distance (in the vcoord
    # columns) to any annotation of the same series is within this threshold.
    min_dist = settings.CUBE_POS_SIZE + settings.CUBE_FPOS_SIZE
    coord_cols = ['vcoordX', 'vcoordY', 'vcoordZ']

    # One dict per surviving candidate; turned into a DataFrame by the next cell.
    list_fp = []
    for uid in tqdm(lungs):
        # .loc[[uid]] raises KeyError for a label that is not in the index, so
        # annotated series without any candidate row are skipped explicitly.
        if uid not in df_false_positive.index:
            continue

        candidates = df_false_positive.loc[[uid]]  # dataframe
        meta = df_meta_train.loc[uid]  # single metadata row
        # (n_annotations, 3) array of the annotated positions of this series.
        label_xyz = df_annotation.loc[[uid], coord_cols].values.astype(float)

        for _, c in candidates.iterrows():
            vcoordX, vcoordY, vcoordZ = c.vcoordX, c.vcoordY, c.vcoordZ

            # Distance from this candidate to every annotation at once.
            dists = np.linalg.norm(label_xyz - np.array([vcoordX, vcoordY, vcoordZ], dtype=float), axis=1)
            if (dists <= min_dist).any():
                continue

            list_fp.append({
                'seriesuid': uid,
                'width': meta['width'],
                'height': meta['height'],
                'slice': meta['slice'],
                'vcoordX': vcoordX,
                'vcoordY': vcoordY,
                'vcoordZ': vcoordZ,
            })
                

In [None]:
if IS_GENERATE:
    # Assemble the filtered candidates into a table indexed by seriesuid.
    fp_columns = ['seriesuid', 'width', 'height', 'slice', 'vcoordX', 'vcoordY', 'vcoordZ']
    df_fp = pd.DataFrame(list_fp, columns=fp_columns).set_index('seriesuid')
    df_fp.index = df_fp.index.astype('str')
    # The three volume-dimension columns are stored as integers.
    for dim_col in ('width', 'height', 'slice'):
        df_fp[dim_col] = df_fp[dim_col].astype('int')

    n_rows = len(df_fp)
    n_lungs = len(set(df_fp.index))
    print('total:', n_rows, 'lung:', n_lungs)

    # NOTE: this writes to the same path the candidate table was read from,
    # so the unfiltered candidates on disk are replaced by the filtered ones.
    df_fp.to_csv(settings.PREPROCESS_FALSE_POSITIVE_FILE, encoding='utf-8')
    

# 3. False Positive Samples Extraction

In [None]:
# Switch for the extraction cell below.
IS_EXTRACTION = True

# Create the output tree idempotently. makedirs(exist_ok=True) creates any
# missing parent and also fills in a partially created tree (root present but
# 'lung/' or 'medi/' missing), where a single exists() check on the root
# followed by mkdir() would leave the sub-directories absent.
for sub_dir in ('lung/', 'medi/'):
    os.makedirs(settings.PREPROCESS_FPOS_DIR + sub_dir, exist_ok=True)

# Reload the false-positive table from disk so this section sees the current file contents.
df_false_positive = pd.read_csv(filepath_or_buffer=settings.PREPROCESS_FALSE_POSITIVE_FILE, index_col=['seriesuid'])
df_false_positive.index = df_false_positive.index.astype('str')
print('false_positive:', df_false_positive.shape, 'distinct lung:', len(set(df_false_positive.index)))

In [None]:
if IS_EXTRACTION:
    for uid in tqdm(lungs):
        # Nothing to extract for series without false-positive rows.
        if uid not in df_false_positive.index:
            continue

        labels = df_false_positive.loc[[uid]]

        # Volume dimensions are read from the first row of this series.
        width = int(labels['width'].values[0])
        height = int(labels['height'].values[0])
        n_slices = int(labels['slice'].values[0])

        # Load both window types and zero out everything outside the mask.
        lung_l, mask_l = helper.load_lung_array(uid, width, height, n_slices, wtype='lung')
        lung_m, mask_m = helper.load_lung_array(uid, width, height, n_slices, wtype='medi')
        masked_volumes = (
            ('lung', lung_l * (mask_l > 0)),
            ('medi', lung_m * (mask_m > 0)),
        )

        for idx, item in labels.iterrows():
            # Same candidate position, one cube per window type ('lung' first, then 'medi').
            for wtype, lung in masked_volumes:
                cube = helper.get_cube_from_lung_array(lung, item.vcoordX, item.vcoordY, item.vcoordZ, block_size=settings.CUBE_FPOS_SIZE)
                # Cubes whose voxel sum does not exceed the threshold are not saved.
                if np.sum(cube) > settings.THRESHOLD_VALID_CUBE:
                    cube_path = f'{settings.PREPROCESS_FPOS_DIR}{wtype}/{idx}_x{int(item.vcoordX)}_y{int(item.vcoordY)}_z{int(item.vcoordZ)}.png'
                    helper.save_cube_img(cube_path, cube, rows=8, cols=8)
            

# 3.1 Validate False Positive Samples

In [None]:
# Compare the number of candidate rows with the number of cube images on disk
# (the glob counts the PNG files of all sub-directories together).
n_candidates = len(df_false_positive)
n_cube_files = len(glob(settings.PREPROCESS_FPOS_DIR + '*/*.png'))
print(n_candidates, n_cube_files)

In [None]:
# Series id used for the spot check in the following cells.
EXAMPLE_SERIESUID = '660577'
# Window-type sub-directory whose files are looked for first during validation.
WTYPE = 'medi'

In [None]:
# All false-positive rows of the example series; the list-style key keeps the
# result a DataFrame even when there is only one row.
labels = df_false_positive.loc[[EXAMPLE_SERIESUID]]
print('labels:', labels.shape[0])

In [None]:
if len(labels) > 0:
    # Look in the WTYPE directory first, then in the other window type.
    # The load path always matches the path whose existence was checked.
    fallback_wtype = 'lung' if WTYPE == 'medi' else 'medi'
    for idx, item in labels.iterrows():
        filename = f'{idx}_x{int(item.vcoordX)}_y{int(item.vcoordY)}_z{int(item.vcoordZ)}.png'
        candidate_paths = [
            settings.PREPROCESS_FPOS_DIR + wtype + '/' + filename
            for wtype in (WTYPE, fallback_wtype)
        ]
        cube_path = next((p for p in candidate_paths if os.path.exists(p)), None)

        if cube_path is None:
            # Extraction only saves cubes whose voxel sum exceeds
            # settings.THRESHOLD_VALID_CUBE, so a candidate may have no image.
            print(filename, 'not found in', WTYPE, 'or', fallback_wtype, '- skipped')
            print('--'*30)
            continue

        cube = helper.load_cube_img(cube_path, rows=8, cols=8)

        # Every loaded cube is expected to come back as a 64x64x64 array.
        assert cube.shape == (64, 64, 64)
        print(filename, '\n', cube)
        print('--'*30)

# 4. Visualize Samples by Label

In [None]:
# Series id whose saved cube images are displayed below (used as the filename prefix in the glob patterns).
EXAMPLE_SERIESUID = '660577'

In [None]:
# Display every saved 'medi' cube image of the example series.
# Cube files are named '<seriesuid>_x<X>_y<Y>_z<Z>.png'; anchoring the pattern
# on '_x' keeps series whose id merely starts with EXAMPLE_SERIESUID out of the match.
for img_file in iglob(settings.PREPROCESS_FPOS_DIR + 'medi/' + f'{EXAMPLE_SERIESUID}_x*.png'):
    img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
    print(img_file, np.sum(img))
    fig, ax = plt.subplots(figsize=(16, 16))
    ax.imshow(img, cmap='gray')
    plt.show()
    # Release the figure so a long list of images does not accumulate open figures.
    plt.close(fig)

In [None]:
# Display every saved 'lung' cube image of the example series.
# Cube files are named '<seriesuid>_x<X>_y<Y>_z<Z>.png'; anchoring the pattern
# on '_x' keeps series whose id merely starts with EXAMPLE_SERIESUID out of the match.
for img_file in iglob(settings.PREPROCESS_FPOS_DIR + 'lung/' + f'{EXAMPLE_SERIESUID}_x*.png'):
    img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
    print(img_file, np.sum(img))
    fig, ax = plt.subplots(figsize=(16, 16))
    ax.imshow(img, cmap='gray')
    plt.show()
    # Release the figure so a long list of images does not accumulate open figures.
    plt.close(fig)