# Part I - pre-processing #

In [1]:
import os, shutil, gc, subprocess, sys, json, time

import scipy
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image
import cv2
#from tqdm import tqdm
from tqdm.notebook import tqdm
import skimage.io
from skimage.transform import resize, rescale
from math import sqrt
import statistics


In [2]:
import PIL.Image
#PIL.Image.MAX_IMAGE_PIXELS = 400000000
PIL.Image.MAX_IMAGE_PIXELS = None

In [3]:
%matplotlib inline

In [4]:
from panda_bvv_config import *

### Part 1 - Input data ###

In [5]:
Zoom = 2
pred_fold = 'testdata320_inv_isup'
new_tile_size = 80
tile_number = 16

#model_folder = 'effnB5'
#pred_file = 'ground456simple.npz'

In [6]:
# wrapping for inputs:
pred_fold = os.path.join(base_path, pred_fold)


os.mkdir(pred_fold)
#os.makedirs(pred_fold)
input_folder = cancer_s2 #test_size2 if Zoom == 2 else test_size1
input_mask_folder = mask_size2 if Zoom == 2 else mask_size1

### 2. Standard functions ###

In [7]:
def prepare_df_data(test_df = test_cnn_labels,
                    data_fold = pred_fold):
    test_data = []
    test_labels = []
    df_f = []
    for i,row in enumerate(test_df.iterrows()): 
        img_num = row[1]['image_id']
        biopsy = skimage.io.imread(os.path.join(data_fold, 'testf',img_num + '.png'))
        test_data.append(biopsy)
        test_labels.append(row[1]['isup_grade'])
        df_f.append(img_num)
                           
    return np.array(test_data), np.array(test_labels)

In [8]:
#wrapper for image processing function

def process_all_images(proc_func):
    
    def wrapper(zoom = Zoom,
                input_biopsy_folder = input_folder,
                output_data_path = test_cnn,
                df_name = test_cnn_labels,
                with_mask = False,
                input_mask_folder = mask_size2,
                **kw):
        
        for  i, row in enumerate(df_name.iterrows()):
            img_num = row[1]['image_id']
            provider = row[1]['data_provider']                         
            mask = None
            try:
                biopsy = skimage.io.imread(os.path.join(input_biopsy_folder, img_num + '.png'))
                shape = biopsy.shape
                if shape[0]<shape[1]: 
                    biopsy = np.rot90(biopsy)
                    shape = biopsy.shape
            except:
                print(f'can not proceed with {img_num}')   
            if with_mask:
                try:
                    mask = skimage.io.imread(os.path.join(input_mask_folder, img_num + '_mask.png'))
                except:
                    print('Failed to process mask:', img_num)
            try:
                data_new = proc_func(biopsy, mask, provider, **kw)
                cv2.imwrite(os.path.join(output_data_path, img_num + '.png'),\
                                         data_new, [int(cv2.IMWRITE_PNG_COMPRESSION), 9])
            except Exception as ee:
                print('Processing mistake:\n', ee, '\n', img_num)            
            try:
                del biopsy, mask, data_new, mask_new
            except:
                pass
        gc.collect()
        return
    
    return wrapper

In [9]:
def change_karolinska_mask_for_isup_grade_direct(mask_data, chan):
    """
    This function returns pre-processed radbound mask for image multiplication
    """

    try:
        new_mask = np.zeros(mask_data.shape)#.astype('uint8')
        cancerous_tissue = False
        try:             
            mask_area = np.where(mask_data[:,:,chan] == 2)
            new_mask[mask_area[0], mask_area[1], :] = [1, 1, 1] 
            del mask_area, mask_data
            if new_mask.any() >0: 
                cancerous_tissue = True    
        except Exception as ee:
            print("Mask {%s} processing mistake: " %mask_image, ee)
    except Exception as ee:
        print("Mask {%s} read mistake: "  %mask_image, str(ee))

    gc.collect()                   
    
    return cancerous_tissue, new_mask

In [10]:
def change_radboud_mask_for_isup_grade_direct(mask_data, chan,
                                      gauss_bl = 10):
    """
    This function returns pre-processed radbound mask for image multiplication
    """

    try:
        new_mask = np.zeros(mask_data.shape)#.astype('uint8')
        cancerous_tissue = False
        try: 
            for area_value in range(3,5):
                mask_area = np.where(mask_data[:,:,chan] == area_value)
                new_mask[mask_area[0], mask_area[1], :] = [1, 1, 1]
                del mask_area
            if new_mask.any() >0: 
                cancerous_tissue = True
            del mask_data,    
        except Exception as ee:
            print("Mask {%s} processing mistake: " %mask_image, ee)
    except Exception as ee:
        print("Mask {%s} read mistake: "  %mask_image, str(ee))
        
    #new_mask[:,:,:] = cv2.blur(new_mask[:,:, :],(gauss_bl,gauss_bl))
    new_mask[:,:,:] = cv2.bilateralFilter(np.float32(new_mask[:,:,:]),gauss_bl,25,25)
    #medianBlur(new_mask[:,:, 2],5)
    #new_mask[:,:,:] = cv2.GaussianBlur(new_mask[:,:,:],(gauss_bl,gauss_bl), 0)
    #new_mask[:,:,:] =  np.rint(new_mask[:,:,:]) #.astype(int) 
    new_mask[:,:,:] =  np.ceil(new_mask[:,:,:]) #.astype(int) 
    #gc.collect()                   
    
    return cancerous_tissue, new_mask

In [11]:
def get_cancer_area_from_mask(cancer_mask, biopsy):
    """
    Returns multiplication of mask and image
    """
    try:
        only_cancer = np.multiply(cancer_mask,biopsy).astype(int)
        #cv2.imwrite(save_path, only_cancer, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
        #del cancer_mask, biopsy #, only_cancer
    except Exception as ee:
        print(str(ee))
    
    return only_cancer  

### 3. Processing functions ###

In [12]:
@process_all_images
def tile16_simple(img, mask, provider, **kw):        
    sz=kw.get('sz') if kw.get('sz') else 32    
    N = kw.get('N') if kw.get('N') else 16
    final_dim = kw.get('final_dim') if kw.get('final_dim') else N**0.5 
    result = []
    shape = img.shape
    pad0,pad1 = (sz - shape[0]%sz)%sz, (sz - shape[1]%sz)%sz
    img = np.pad(img,[[pad0//2,pad0-pad0//2],[pad1//2,pad1-pad1//2],[0,0]],
                constant_values=255)
    img = img.reshape(img.shape[0]//sz,sz,img.shape[1]//sz,sz,3)
    img = img.transpose(0,2,1,3,4).reshape(-1,sz,sz,3)
    if len(img) < N:
        img = np.pad(img,[[0,N-len(img)],[0,0],[0,0],[0,0]],constant_values=255)
    idxs = np.argsort(img.reshape(img.shape[0],-1).sum(-1))[:N]
    img = img[idxs]
    a, b, c, d = img.shape
    data_new = (img.reshape(int(final_dim), int(final_dim),\
                                                    b, c, d).swapaxes(1,2).reshape(b*int(final_dim),\
                                                    c*int(final_dim), d))
    data_new = cv2.bitwise_not(data_new)
    
    return data_new


In [13]:
@process_all_images
def get_cancer_area_for_isup_grade(biopsy, mask, provider, **kw):
    """
    Extracts cancer area only (if cancerous), from image and mask; or unchanged (if non-cancerous).
    Do NOT split cancer areas on Gleason.
    This function applicable for ISUP-grade based training
    Do NOT tile image
    """
    
    gauss_bl = kw.get('gauss_bl') if kw.get('gauss_bl') else 20    
    channel = 0 if len(np.unique(mask[:,:,0])) >1 else 2    
              
    if provider == 'karolinska':
        cancerous, new_mask  = change_karolinska_mask_for_isup_grade_direct(mask, chan = channel)
    elif provider == 'radboud':
        cancerous, new_mask  = change_radboud_mask_for_isup_grade_direct(mask, chan = channel,
                                      gauss_bl = gauss_bl)
    if cancerous:
        temp_im = get_cancer_area_from_mask(new_mask, biopsy)
        temp_im = temp_im.astype('uint8') * 255
        inv = cv2.bitwise_not(temp_im )
        #inv = biopsy
    else:
        inv = biopsy
    
    data_new = inv #cv2.bitwise_not(inv)
    return data_new
    
    #eliminate_white_direct(biopsy = inv,
    #                       save_file_name = save_path,
    #                       zoom =2,                          
    #                     fault_name = 'faults_size2_large_mask.txt', 
    #                      tile_square_yside = tile_square_yside,
    #                        tile_square_xside = tile_square_xside, 
    #                      tile_fill = tile_fill,
    #                      threshold = threshold,
    #                          transform_ratios = transform_ratios,
    #                    file_list = None,                         
    #                      compression = compression)
        
    #cv2.imwrite(save_path, inv, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
    #else:
        #execute_cp_command(biopsy_file_path, save_path)

### 4. Process test files for generator use and NN debugging ###

In [14]:
temp_fold = os.getcwd()
os.chdir(base_path)
if os.path.exists(test_cnn):
    ! rm -rf testf
    print("[INFO] re-creating testf directory")
! mkdir testf
os.chdir(temp_fold)

In [15]:
%%time
kw = {'sz': new_tile_size, 'N':tile_number}
tile16_simple(**kw)

can not proceed with 3ae66332c2dd50c6ce458937d49f8232
Processing mistake:
 local variable 'biopsy' referenced before assignment 
 3ae66332c2dd50c6ce458937d49f8232
can not proceed with c3ca4a75be0055aefe79f6849804d62c
Processing mistake:
 local variable 'biopsy' referenced before assignment 
 c3ca4a75be0055aefe79f6849804d62c
can not proceed with 3756141b86ca5f2afcacf5113d4e8f83
Processing mistake:
 local variable 'biopsy' referenced before assignment 
 3756141b86ca5f2afcacf5113d4e8f83
can not proceed with eec36dafc7b16caaadf5418529dc29cd
Processing mistake:
 local variable 'biopsy' referenced before assignment 
 eec36dafc7b16caaadf5418529dc29cd
can not proceed with bc6e1ebec08a9503689da9a095690dfe
Processing mistake:
 local variable 'biopsy' referenced before assignment 
 bc6e1ebec08a9503689da9a095690dfe
can not proceed with f9b2162716bb1679a53424b0e6bc7f9b
Processing mistake:
 local variable 'biopsy' referenced before assignment 
 f9b2162716bb1679a53424b0e6bc7f9b
can not proceed with 6

In [16]:
# %%time
# kw = {'gauss_blur':20}
# get_cancer_area_for_isup_grade(with_mask = True,
#                                df_name = test_cnn_labels,
#                                **kw)

#### we make sandbox trainf and validf for NN model testing ####

In [17]:
#test_cnn_labels[['image_id','isup_grade']].groupby('isup_grade').count()

In [18]:
temp_fold = os.getcwd()
os.chdir(base_path)
if os.path.exists(train_cnn):
    ! rm -rf trainf
    print("[INFO] re-creating trainf directory")
if os.path.exists(valid_cnn):
    ! rm -rf validf
    print("[INFO] re-creating validf directory")
os.chdir(temp_fold)

os.mkdir(train_cnn)
os.mkdir(valid_cnn)

for grade in isup_classes:
    os.mkdir(os.path.join(train_cnn, f'isup{grade}'))
    os.mkdir(os.path.join(valid_cnn, f'isup{grade}'))
    
for j, row in tqdm(test_cnn_labels.iterrows()):
    filename = row['image_id'] + '.png'       
    path_to_train_class = os.path.join(train_cnn, 'isup' + str(row['isup_grade']), filename)
    path_to_valid_class = os.path.join(valid_cnn, 'isup' + str(row['isup_grade']), filename) 
    path_to_copy_from = os.path.join(test_cnn, filename)
    bashCommand = "cp " + path_to_copy_from + ' ' + path_to_train_class
    process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    if error: print(str(error))
    bashCommand = "cp " + path_to_copy_from + ' ' + path_to_valid_class
    process = subprocess.Popen(bashCommand.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    if error: print(str(error))    

[INFO] re-creating trainf directory
[INFO] re-creating validf directory


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [19]:
#move testf to wrapping folder for generator use
shutil.move(test_cnn, pred_fold)

'/run/media/admin/kagg/panda/testdata320_inv_isup/testf'

In [20]:
model_folder = os.path.join(model_path, model_folder)
os.mkdir(model_folder)
test_data, test_labels = prepare_df_data()
np.savez(os.path.join(model_folder, pred_file.split('.')[0]),\
         test_data = test_data, test_labels=test_labels)


NameError: name 'model_folder' is not defined

#### quick exploration of test data ####

In [None]:
#test_labels[:10]

In [None]:
sum(sum(test_data[1]))

In [None]:
skimage.io.imshow(test_data[1])