## **Imports**

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import nibabel as nib

import os
import cv2
from glob import glob
import gc

from sklearn.model_selection import StratifiedKFold, KFold, StratifiedGroupKFold,GroupKFold

## **Goal**
The goal is to reconstruct a 3D MRI image for each subject from all of its slices.

In [2]:
# Root folders of the train and test splits, both located under the same dataset directory.
_DATASET_DIR = '../input/uw-madison-gi-tract-image-segmentation'
TRAIN_ROOT_DIR = _DATASET_DIR + '/train'
TEST_ROOT_DIR = _DATASET_DIR + '/test'

In [3]:
# Load the training annotations (columns: id, class, segmentation).
train_csv_path = '../input/uw-madison-gi-tract-image-segmentation/train.csv'
train_df_original = pd.read_csv(train_csv_path)

print(train_df_original.shape)
train_df_original.head()


(115488, 3)


Unnamed: 0,id,class,segmentation
0,case123_day20_slice_0001,large_bowel,
1,case123_day20_slice_0001,small_bowel,
2,case123_day20_slice_0001,stomach,
3,case123_day20_slice_0002,large_bowel,
4,case123_day20_slice_0002,small_bowel,


### **Helper function: Prepare a DataFrame from the Images Folder**

In [4]:
def prepare_df(df, root_dir):
    """Attach metadata parsed from the image paths under ``root_dir`` to ``df``.

    Every ``*.png`` found recursively under ``root_dir`` is expected to be laid
    out as ``.../case<C>/case<C>_day<D>/<folder>/slice_<N>_<W>_<H>_....png``.
    The fields are parsed from the file name and its parent folders, so the
    result does not depend on how deep ``root_dir`` is or on the OS path
    separator.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with an ``id`` column of the form
        ``case<C>_day<D>_slice_<N>`` (one row per slice and class).
    root_dir : str or path-like
        Folder that is searched recursively for PNG slices.

    Returns
    -------
    pandas.DataFrame
        ``df`` inner-joined with one row of metadata per image, adding the
        columns ``case``, ``day``, ``slice``, ``path``, ``height``, ``width``
        and ``subject`` (``"<case>_<day>"``). Rows of ``df`` without a matching
        image are dropped by the inner join.
    """
    image_columns = ['case', 'day', 'slice', 'path', 'join_col', 'height', 'width', 'subject']
    all_images = []
    for path in Path(root_dir).rglob('*.png'):
        # Folder names closest to the file: .../case<C>/case<C>_day<D>/<folder>/<file>
        case_str = path.parents[2].name[len('case'):]
        day_str = path.parents[1].name.split('_')[1][len('day'):]

        # File name fields: slice_<number>_<width>_<height>_<spacing>_<spacing>.png
        name_fields = path.name.split('_')
        slice_str = '_'.join(name_fields[0:2])

        all_images.append({
            'case': int(case_str),
            'day': int(day_str),
            'slice': int(name_fields[1]),
            'path': str(path),
            'join_col': 'case' + case_str + '_' + 'day' + day_str + '_' + slice_str,
            # Per the competition's data description the file name stores the
            # width first and the height second.
            'height': int(name_fields[3]),
            'width': int(name_fields[2]),
            'subject': case_str + '_' + day_str,
        })

    # One row per image, i.e. 1/3 of the original df (where each slice is repeated 3 times).
    # Passing the column names keeps the merge below valid even when no image was found.
    df_1_3 = pd.DataFrame(all_images, columns=image_columns)
    # Get the final dataframe with the same length as the original one.
    df_final = pd.merge(df, df_1_3, left_on='id', right_on='join_col')
    df_final = df_final.drop('join_col', axis=1)

    return df_final

In [5]:
# Enrich the annotation table with the metadata parsed from the image paths.
train_df = prepare_df(df=train_df_original, root_dir=TRAIN_ROOT_DIR)
print(train_df.shape[0])
train_df.head(n=2)


115488


Unnamed: 0,id,class,segmentation,case,day,slice,path,height,width,subject
0,case123_day20_slice_0001,large_bowel,,123,20,1,../input/uw-madison-gi-tract-image-segmentatio...,266,266,123_20
1,case123_day20_slice_0001,small_bowel,,123,20,1,../input/uw-madison-gi-tract-image-segmentatio...,266,266,123_20


## **Helper function: Rearrange the DataFrame**

In [6]:
def rearrange_df(df):
    """Collapse the three per-class rows of every id into a single row.

    The input is expected to hold, for each id, three consecutive rows in the
    order ``large_bowel``, ``small_bowel``, ``stomach``. The output has one row
    per id with one segmentation column per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Prepared frame with the columns ``id``, ``segmentation``, ``case``,
        ``day``, ``subject``, ``slice``, ``path``, ``width`` and ``height``.
        If a ``class`` column is present it is used to validate the row order.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``large_bowel``, ``small_bowel``, ``stomach``, ``case``,
        ``day``, ``subject``, ``slice``, ``path``, ``width``, ``height`` and
        ``count``. Missing values are replaced by ``""``; ``count`` is the
        number of classes (0 to 3) that have a non-empty segmentation.

    Raises
    ------
    ValueError
        If the rows are not organised as consecutive triplets per id in the
        expected class order (the positional slicing below would otherwise
        silently assign masks to the wrong class or id).
    """
    class_names = ["large_bowel", "small_bowel", "stomach"]
    n_classes = len(class_names)

    if len(df) % n_classes != 0:
        raise ValueError(
            f"Expected {n_classes} rows per id, got {len(df)} rows in total."
        )

    ids = df["id"].to_numpy()
    for offset in range(1, n_classes):
        if not (ids[0::n_classes] == ids[offset::n_classes]).all():
            raise ValueError("Rows of the same id must be consecutive.")

    if "class" in df.columns:
        classes = df["class"].to_numpy()
        for offset, class_name in enumerate(class_names):
            if not (classes[offset::n_classes] == class_name).all():
                raise ValueError(
                    f"Rows must be ordered as {class_names} within each id."
                )

    # Keep only one row per id (so that it is not repeated 3 times).
    first_rows = df.iloc[::n_classes]
    df_rearranged = pd.DataFrame({"id": first_rows["id"]})
    for offset, class_name in enumerate(class_names):
        df_rearranged[class_name] = df["segmentation"].iloc[offset::n_classes].to_numpy()

    # Carry over the per-slice metadata (identical for the three rows of an id).
    for column in ["case", "day", "subject", "slice", "path", "width", "height"]:
        df_rearranged[column] = first_rows[column].to_numpy()

    df_rearranged = df_rearranged.reset_index(drop=True)
    df_rearranged = df_rearranged.fillna("")

    # Number of classes with an RLE segmentation, used for the validation strategy:
    # 0: no segmentation, 1 / 2: only one / two classes segmented, 3: all three classes.
    df_rearranged["count"] = (df_rearranged[class_names] != "").sum(axis=1).to_numpy()

    return df_rearranged

In [7]:
# One row per slice, with one segmentation column per class.
train_df_rearranged = rearrange_df(df=train_df)
print(train_df_rearranged.shape[0])
train_df_rearranged.head(n=3)

38496


Unnamed: 0,id,large_bowel,small_bowel,stomach,case,day,subject,slice,path,width,height,count
0,case123_day20_slice_0001,,,,123,20,123_20,1,../input/uw-madison-gi-tract-image-segmentatio...,266,266,0
1,case123_day20_slice_0002,,,,123,20,123_20,2,../input/uw-madison-gi-tract-image-segmentatio...,266,266,0
2,case123_day20_slice_0003,,,,123,20,123_20,3,../input/uw-madison-gi-tract-image-segmentatio...,266,266,0


In [8]:
# According to this discussion:
# https://www.kaggle.com/competitions/uw-madison-gi-tract-image-segmentation/discussion/319963#1763869
# the scans below are mislabeled, so they are removed from the training data.

# (case, day) pairs to drop, removed one after the other.
mislabeled_scans = [(7, 0), (81, 30)]
for bad_case, bad_day in mislabeled_scans:
    is_bad_scan = (train_df_rearranged['case'] == bad_case) & (train_df_rearranged['day'] == bad_day)
    train_df_rearranged = train_df_rearranged[~is_bad_scan].reset_index(drop=True)


In [9]:
# We have 85 cases, but one case may have several MRI scans: MRI scan = case_day.
np.unique(train_df_rearranged['case']).size


85

## **Before further preprocessing: Validation Strategy**

In [10]:
# It is important to have a good validation strategy:
# all the slices of a case must end up in the same set (train or validation),
# so the folds are built with the case as the group.

# NOTE: despite its name, `sgkf` is a plain (non-stratified) GroupKFold.
sgkf = GroupKFold(n_splits=5)

# `split` yields positional row indices. Collect the fold ids in a positional
# array instead of writing them through the label-based `.loc`, which is only
# correct when the index happens to be exactly 0..n-1.
fold_ids = np.zeros(len(train_df_rearranged), dtype=np.uint8)
fold_splits = sgkf.split(X=train_df_rearranged, groups=train_df_rearranged["case"])
for fold, (_, val_idx) in enumerate(fold_splits, 1):  # the 1st fold has index 1, not 0
    fold_ids[val_idx] = fold

train_df_rearranged["fold"] = fold_ids

In [11]:
# Number of slices assigned to each fold.
fold_sizes = train_df_rearranged.groupby("fold").size()
fold_sizes

fold
1    7696
2    7632
3    7632
4    7648
5    7600
dtype: int64

## **Selected_fold = 1**


In [12]:
fold_selected = 1

# Index labels of the rows inside / outside the held-out fold.
in_selected_fold = (train_df_rearranged["fold"] == fold_selected).to_numpy()
valid_ids = train_df_rearranged.index[in_selected_fold]
train_ids = train_df_rearranged.index[~in_selected_fold]

# Training and validation frames, selected through those index labels.
X_train = train_df_rearranged.loc[train_df_rearranged.index.isin(train_ids)]
X_valid = train_df_rearranged.loc[train_df_rearranged.index.isin(valid_ids)]

In [13]:
# Number of slices available for each subject of the training set.
subject_slice_counts = X_train['subject'].value_counts()
subject_slice_counts

123_20    144
108_13    144
43_18     144
43_22     144
43_26     144
         ... 
118_0      80
117_15     80
117_17     80
117_13     80
89_21      80
Name: subject, Length: 215, dtype: int64

In [14]:
# Distinct slice counts per subject, and how many subjects have each of them.
slices_per_subject = X_train['subject'].value_counts()
np.unique(slices_per_subject, return_counts=True)

(array([ 80, 144]), array([  7, 208]))

## **Helper function: Create the list of subjects (subject=case_day) with their corresponding slices**

In [15]:
def create_list_subjects_slices(rearranged_df, target_size=(128, 128)):
    """Load and resize the slice images of every subject.

    Parameters
    ----------
    rearranged_df : pandas.DataFrame
        Frame with a ``subject`` column and a ``path`` column pointing to the
        slice images. When a ``slice`` column is present, the slices of each
        subject are loaded in increasing slice order; otherwise the row order
        is used.
    target_size : tuple of int, optional
        Size passed to ``cv2.resize`` for every slice. Defaults to
        ``(128, 128)``.

    Returns
    -------
    list_subjects_slices : list of list of numpy.ndarray
        For each subject, the list of its resized slices.
    list_subjects : list
        The subject keys, in the same order as ``list_subjects_slices``
        (the sorted order produced by ``groupby``).

    Raises
    ------
    OSError
        If an image cannot be read (``cv2.imread`` returned ``None``).
    """
    if 'slice' in rearranged_df.columns:
        # The slices are stacked along the depth axis later on, so their order
        # matters; a stable sort keeps the row order for equal slice numbers.
        rearranged_df = rearranged_df.sort_values('slice', kind='stable')

    list_subjects_slices = []
    list_subjects = []
    for subject, slice_paths in rearranged_df.groupby('subject')['path']:
        subject_slices = []
        for path_slice in slice_paths.values:
            arr_img = cv2.imread(path_slice, cv2.IMREAD_ANYDEPTH)
            if arr_img is None:
                # imread does not raise on failure; fail here with the offending path
                # instead of a cryptic error inside cv2.resize.
                raise OSError(f"Could not read image: {path_slice}")
            resized_arr_image = cv2.resize(arr_img, target_size, interpolation=cv2.INTER_NEAREST)
            subject_slices.append(resized_arr_image)
        list_subjects_slices.append(subject_slices)
        list_subjects.append(subject)

    return list_subjects_slices, list_subjects


In [16]:
# Load the resized slices of every training subject.
list_slices, list_subjects = create_list_subjects_slices(rearranged_df=X_train)

### **Create the 3D MRI scans (NIfTI format)**

In [17]:
def create_3d_mri_images(list_all_slices, list_subjects, root):
    """Stack the slices of each subject into a volume and save it as NIfTI.

    Parameters
    ----------
    list_all_slices : list of list of numpy.ndarray
        For each subject, its 2D slices (all with the same shape).
    list_subjects : list of str
        Subject names, aligned with ``list_all_slices``; each one is used as
        the output file name.
    root : str or path-like
        Output folder, created if needed. A trailing slash is optional.

    Returns
    -------
    None
        One ``<root>/<subject>.nii.gz`` file is written per subject, with an
        identity affine.
    """
    root_path = Path(root)
    root_path.mkdir(parents=True, exist_ok=True)

    for subject, slices_subject in zip(list_subjects, list_all_slices):
        mri_subject = np.asarray(slices_subject)
        # Move the slice axis last: (num_slices, rows, cols) --> (rows, cols, num_slices)
        mri_subject = np.transpose(mri_subject, (1, 2, 0))
        mri_subject_nifti = nib.Nifti1Image(mri_subject, affine=np.eye(4))
        # Join through the Path so the file lands inside `root` even when
        # `root` was given without a trailing separator.
        nib.save(mri_subject_nifti, str(root_path / f"{subject}.nii.gz"))

    gc.collect()
    

In [18]:
# Write one NIfTI volume per training subject.
create_3d_mri_images(
    list_all_slices=list_slices,
    list_subjects=list_subjects,
    root='./mri_images/train/',
)

### **Validation set: Construct the 3D MRI scans**

In [19]:
# Load the resized slices of every validation subject.
list_slices_val, list_subjects_val = create_list_subjects_slices(rearranged_df=X_valid)

In [20]:
# Write one NIfTI volume per validation subject.
create_3d_mri_images(
    list_all_slices=list_slices_val,
    list_subjects=list_subjects_val,
    root='./mri_images/val/',
)