## **Imports**

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
pd.set_option('chained_assignment',None)
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import nibabel as nib

import os
import cv2
from glob import glob
import gc

from sklearn.model_selection import StratifiedKFold, KFold, StratifiedGroupKFold,GroupKFold
import pandas as pd
pd.options.mode.chained_assignment = None 

## **Goal**

The goal is to reconstruct 3D masks from the provided RLE encodings.

In [2]:
# Root folders of the train and test image trees (relative to the notebook's working directory).
TRAIN_ROOT_DIR = '../input/uw-madison-gi-tract-image-segmentation/train'
TEST_ROOT_DIR = '../input/uw-madison-gi-tract-image-segmentation/test'

In [3]:
# Load the annotation table: one row per (slice id, class) with the RLE string in 'segmentation'.
train_df_original = pd.read_csv('../input/uw-madison-gi-tract-image-segmentation/train.csv')

# Quick sanity check: table size and the first few rows.
print(train_df_original.shape)
train_df_original.head()

(115488, 3)


Unnamed: 0,id,class,segmentation
0,case123_day20_slice_0001,large_bowel,
1,case123_day20_slice_0001,small_bowel,
2,case123_day20_slice_0001,stomach,
3,case123_day20_slice_0002,large_bowel,
4,case123_day20_slice_0002,small_bowel,



## **Helper function: Prepare a DataFrame from the Images Folder**


In [4]:
def prepare_df(df, root_dir):
    """Attach per-image metadata to every row of the annotation table.

    Recursively collects all ``*.png`` files below ``root_dir``. Each file is
    expected to sit at ``.../case<C>/case<C>_day<D>/<folder>/slice_<S>_<a>_<b>_...png``.
    The case, day and slice numbers plus the two integer size fields are parsed
    from that path and inner-joined onto ``df`` through its ``id`` column
    (``case<C>_day<D>_slice_<S>``).

    All path components are read relative to the image file itself (its name and
    its parent folders), so the result does not depend on how many directory
    levels ``root_dir`` has or on the path separator of the platform.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with an ``id`` column.
    root_dir : str or path-like
        Folder that contains the ``case*`` directories.

    Returns
    -------
    pandas.DataFrame
        ``df`` rows that have a matching image, extended with the columns
        ``case``, ``day``, ``slice``, ``path``, ``height``, ``width`` and
        ``subject`` (``"<C>_<D>"``). Empty when no image is found.
    """
    image_columns = ['case', 'day', 'slice', 'path', 'join_col', 'height', 'width', 'subject']

    all_images = []
    for path in Path(root_dir).rglob('*.png'):
        # <case dir>/<case_day dir>/<scan folder>/<file>: walk up from the file.
        case_str = path.parents[2].name[4:]                    # 'case123' -> '123'
        day_str = path.parents[1].name.split('_')[1][3:]       # 'case123_day20' -> '20'
        name_fields = path.name.split('_')                     # ['slice', '0001', '<a>', '<b>', ...]
        slice_str = '_'.join(name_fields[0:2])                 # 'slice_0001'

        all_images.append({
            'case': int(case_str),
            'day': int(day_str),
            'slice': int(name_fields[1]),
            'path': str(path),
            'join_col': 'case' + case_str + '_' + 'day' + day_str + '_' + slice_str,
            # NOTE(review): field order kept exactly as before (3rd field -> height,
            # 4th field -> width); confirm against the dataset's file-name convention.
            'height': int(name_fields[2]),
            'width': int(name_fields[3]),
            'subject': case_str + '_' + day_str,
        })

    # One row per image, i.e. 1/3 of the annotation table where each slice id is repeated
    # once per class. Passing the columns explicitly keeps the merge valid even when no
    # image was found.
    df_1_3 = pd.DataFrame(all_images, columns=image_columns)

    # Bring the image metadata onto every annotation row of the matching slice.
    df_final = pd.merge(df, df_1_3, left_on='id', right_on='join_col')
    df_final = df_final.drop('join_col', axis=1)

    return df_final


## **Helper function: Rearrange the DataFrame**


In [5]:
def rearrange_df(df):
    """
    Collapse the three per-class rows of every slice into a single row.

    The input is read positionally in groups of three consecutive rows: the
    1st row of each group supplies 'large_bowel', the 2nd 'small_bowel' and the
    3rd 'stomach'. All other columns (id, case, day, subject, slice, path,
    width, height) are taken from the 1st row of the group.

    Missing values are replaced by "" and a 'count' column is added holding the
    number of classes (0 to 3) that have a non-empty RLE string for the slice.
    """
    organ_columns = ("large_bowel", "small_bowel", "stomach")
    carried_columns = ("case", "day", "subject", "slice", "path", "width", "height")

    # Every third row, starting at the first one, represents its slice.
    per_slice = df.iloc[::3]
    wide = per_slice[["id"]].reset_index(drop=True)

    # Row offset inside each group of three decides which organ the RLE belongs to.
    for offset, organ in enumerate(organ_columns):
        wide[organ] = df["segmentation"].iloc[offset::3].values

    for column in carried_columns:
        wide[column] = per_slice[column].values

    wide = wide.fillna("")

    # Number of organs with an RLE encoding on this slice:
    # 0 = no segmentation, 1 / 2 = one / two classes, 3 = all three classes.
    has_rle = wide[list(organ_columns)] != ""
    wide["count"] = has_rle.sum(axis=1).values

    return wide

In [6]:
# Add the image metadata parsed from the training image tree to the annotation table,
# then collapse the three per-class rows of each slice into one row.
train_df = prepare_df(train_df_original, TRAIN_ROOT_DIR)
train_df_rearranged = rearrange_df(train_df)


In [7]:
# According to this discussion:
# https://www.kaggle.com/competitions/uw-madison-gi-tract-image-segmentation/discussion/319963#1763869
# the scans listed below are mislabeled, so they are removed from the training data.
mislabeled_scans = [(7, 0), (81, 30)]  # (case, day) pairs

for bad_case, bad_day in mislabeled_scans:
    is_mislabeled = (train_df_rearranged['case'] == bad_case) & (train_df_rearranged['day'] == bad_day)
    train_df_rearranged = train_df_rearranged[~is_mislabeled].reset_index(drop=True)

### **Selected fold: `fold_selected = 4`**

In [8]:
# It is important to have a good validation strategy:
# a whole case (with all its slices) must never be present in both sets (train & validation),
# so the folds are built with the case number as the group.

sgkf = GroupKFold(n_splits=5)

# split() yields positional row indices. Collect the fold ids in a positional array rather
# than writing them through the label-based .loc accessor, so the result does not depend on
# the DataFrame index being a fresh 0..n-1 range.
fold_per_row = np.zeros(len(train_df_rearranged), dtype=np.uint8)
for fold, (_, val_idx) in enumerate(sgkf.split(X=train_df_rearranged, groups=train_df_rearranged["case"]), 1):  # the 1st fold has index 1 not 0
    fold_per_row[val_idx] = fold

train_df_rearranged["fold"] = fold_per_row

In [9]:
# Hold out the selected fold for validation; all remaining folds form the training set.
fold_selected = 4
in_valid_fold = train_df_rearranged["fold"] == fold_selected

train_ids = train_df_rearranged.index[~in_valid_fold.values]
valid_ids = train_df_rearranged.index[in_valid_fold.values]

X_train = train_df_rearranged.loc[train_df_rearranged.index.isin(train_ids)]
X_valid = train_df_rearranged.loc[train_df_rearranged.index.isin(valid_ids)]


## **Create masks arrays from RLE encodings**

In [10]:
def rle2mask(mask_rle, shape, color=1):
    '''
    Decode a run-length encoded mask into a 2D uint8 array.

    mask_rle: run-length string "start length start length ..." with 1-based
              starts counted on the flattened image. An empty string, None or
              NaN means "no mask" and yields an all-background array.
    shape:    shape of the returned array as passed to numpy reshape, i.e.
              (rows, cols); the flat buffer is laid out in row-major order.
    color:    value written into the mask pixels (background stays 0).

    Returns numpy array of dtype uint8: color - mask, 0 - background.
    Raises ValueError when the string holds an odd number of values, because
    starts and lengths can then no longer be paired.
    '''
    img = np.zeros(shape[0] * shape[1], dtype=np.uint8)

    # Missing annotations reach us as None / NaN when the caller did not fill them with "".
    if mask_rle is None or (isinstance(mask_rle, float) and np.isnan(mask_rle)):
        return img.reshape(shape)

    tokens = mask_rle.split()
    if len(tokens) % 2:
        # Without this check numpy would broadcast the shorter array and decode a wrong mask.
        raise ValueError(
            'RLE string must contain start/length pairs, got %d values' % len(tokens))

    starts = np.asarray(tokens[0::2], dtype=int) - 1  # RLE starts are 1-based
    lengths = np.asarray(tokens[1::2], dtype=int)
    ends = starts + lengths

    for lo, hi in zip(starts, ends):
        img[lo:hi] = color
    return img.reshape(shape)

### **Developing a function to handle overlapping pixels**

See the following notebook for further information on this matter of overlapping pixels (between classes):  
"02_Preprocess[3D masks]OverlappingPixels issue"

In [11]:
# Create a function that deals with overlapping pixels according to the slice type
#===========================
# When count=1 => when we have one segmentation: large_bowel or small_bowel or stomach, it is
# impossible to have overlapping pixels (since the RLE encoding was for one column, we can't have
# for example, two columns of "small_bowel")
#===========================
# When count=2 => We have 3 cases of possible overlapping pixels: 
# according to each case, I will assign the overlapping pixels to one class of the two

# large_bowel + small_bowel : 1+2 = 3 (type_slices['bk_large_small_bowel']=6) => assign to 1
# (left as 3, these pixels would be mistaken for the stomach, whose label is also 3)
# large_bowel + stomach : 1+3 = 4 (type_slices['bk_largebowel_stomach']=5) => assign to 3(stomach)
# small_bowel + stomach : 2+3 = 5 (type_slices['bk_smallbowel_stomach']=4) => assign to 3(stomach)
#==========================
# When count =3 ==> We have 4 cases of possible overlapping pixels
# the 3 cases of count=2, above (because it is possible that only 2 classes overlap)
# and the case when the 3 classes overlap

# When count = 3, we have (04) possible cases of overlapping pixels.
# The three cases of count = 2 above (because we can have only two classes that may overlap)
# As well as the case where the three classes overlap (large_bowel+small_bowel+stomach = 6)
# type_slices['bk_small_large_bowel_stomach']=7 

# So when count=3 and When I have these 2 classes overlapping (small_bowel + large_bowel) 
#==> they have a value of 3, which is the same value as the stomach that coexists with them. 
# ==> I'll treat them using the column'mask_small_large' at the beginning, 
# before creating mask column.
# in all the other cases :  
# 3 classes overlap: large_bowel+small_bowel+stomach (1+2+3=6) => assign overlapping pixels to 3 (stomach)
# 2 classes overlap: large_bowel + stomach (1+3=4) => assign overlapping pixels to 3 (stomach)
# 2 classes overlap: small_bowel + stomach (2+3=5) => assign overlapping pixels to 3 (stomach)
#===========================
def create_2d_masks(df, size=(128, 128)):
    """Build one multi-class 2D label mask per slice, without overlapping classes.

    For every row the three RLE strings are decoded with ``rle2mask``, resized
    to ``size`` and merged into a single uint8 label image:
    0 = background, 1 = large_bowel, 2 = small_bowel, 3 = stomach.

    Overlapping pixels are resolved as follows:
    * large_bowel + small_bowel  -> 1 (large_bowel); this is done before the
      stomach is added, because the raw sum 1+2 equals the stomach label 3;
    * any overlap that involves the stomach (sums 4, 5, 6) -> 3 (stomach).

    The masks are resized with nearest-neighbour interpolation. The default
    (bilinear) interpolation blends label values at organ borders, e.g. a
    stomach edge (3 next to 0) would turn into 1 or 2, i.e. into another class.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'large_bowel', 'small_bowel', 'stomach' (RLE strings),
        'width' and 'height'. It is modified in place: the three RLE columns
        are removed and a 'mask' column is added.
    size : tuple of int, optional
        Target size handed to ``cv2.resize`` as ``dsize``. Defaults to (128, 128).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    organ_labels = (('large_bowel', 1), ('small_bowel', 2), ('stomach', 3))

    def _row_to_mask(row):
        # Decode and resize each organ separately; nearest-neighbour keeps the values in {0, label}.
        layers = {}
        for organ, label in organ_labels:
            decoded = rle2mask(row[organ], shape=(row['width'], row['height']), color=label)
            layers[organ] = cv2.resize(
                decoded, size, interpolation=cv2.INTER_NEAREST).astype(np.uint8)

        # Resolve the bowel/bowel overlap first: 1+2=3 would collide with the stomach label.
        bowels = layers['large_bowel'] + layers['small_bowel']
        bowels[bowels == 3] = 1

        # Every remaining sum above 3 involves the stomach, so it is assigned to the stomach.
        merged = bowels + layers['stomach']
        merged[merged > 3] = 3
        return merged

    df['mask'] = df.apply(_row_to_mask, axis=1)
    df.drop(['large_bowel', 'small_bowel', 'stomach'], axis=1, inplace=True)

    gc.collect()
    return df

In [12]:
# Create 2D masks without overlapping pixels between classes.
# create_2d_masks modifies the frame it is given and returns that same object, so after this
# cell X_train / X_valid no longer have the per-class RLE columns and carry a 'mask' column.
X_train_with_2dmasks = create_2d_masks(X_train)
X_valid_with_2dmasks = create_2d_masks(X_valid)

## **Create the 3D masks**

In [13]:
def create_3d_masks(df, root):
    """Stack the 2D masks of every subject into a 3D volume and save it as NIfTI.

    One file ``<root>/<subject>.nii.gz`` is written per value of the 'subject'
    column. The volume has the slice axis last: the per-slice masks are stacked
    to (num_slices, d1, d2) and rearranged to (d1, d2, num_slices). An identity
    affine is stored with the image.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'subject' and 'mask' (equally shaped 2D arrays). When
        a 'slice' column is present the masks are stacked in ascending slice
        order; otherwise the row order of ``df`` is used.
    root : str or path-like
        Output folder; created if missing. A trailing separator is not required.
    """
    root_path = Path(root)
    root_path.mkdir(parents=True, exist_ok=True)

    # Make the depth axis follow the slice number instead of the incidental row order.
    ordered = df.sort_values('slice', kind='stable') if 'slice' in df.columns else df

    for subject, subject_masks in ordered.groupby('subject')['mask']:
        volume = np.stack(list(subject_masks)).astype(np.uint8)
        volume = np.transpose(volume, (1, 2, 0))  # (num_slices, d1, d2) --> (d1, d2, num_slices)
        volume_nifti = nib.Nifti1Image(volume, affine=np.eye(4))
        nib.save(volume_nifti, str(root_path / f'{subject}.nii.gz'))
        
    

In [14]:
# Write one 3D mask volume per training subject into ./masks/train/.
create_3d_masks(X_train_with_2dmasks, root='./masks/train/')

In [15]:
# Write one 3D mask volume per validation subject into ./masks/val/.
create_3d_masks(X_valid_with_2dmasks, root='./masks/val/')