# Compile the dataset for training and validation
Load the images (fluo and afluo channels) and the corresponding vessel/punctae masks.
This notebook is used to compile both the vessel and punctae datasets using the same splits.

Imports:

In [None]:
import numpy as np
import h5py
import cv2
from os import walk, makedirs
from os.path import join, exists, basename, splitext
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import segmentation_models as sm
from tqdm import tqdm
from segmentation_models.metrics import f1_score, iou_score
from skimage import filters
import os
from os import walk, makedirs, listdir
from os.path import join, exists, expanduser
import pandas as pd
import re
import xarray as xr
import os
from scipy import ndimage
from sklearn.cluster import KMeans
from glob import glob 

Parameters:

In [None]:
# Directory holding the per-ROI NetCDF files (<id>_fluo.nc, <id>_afluo.nc, <id>_mask.nc).
# Switch the two lines below to compile the punctae dataset instead of the vessel one.
data_dir = 'auto_fluo_data_vessel'
#data_dir = 'auto_fluo_data_punctae'
# Fraction of the ROIs held out for validation.
validation_fraction = 0.15
random_seed = 42
# NumPy's global seeding function is np.random.seed (np.random_seed does not exist).
np.random.seed(random_seed) # make sure to use the same split for vessel/punctae

Locate the files:

In [None]:
# List everything in the data directory. An ROI id is the file name with its
# last '_'-separated token (the channel / mask suffix) stripped off.
files = glob(join(data_dir, '*'))
ids = np.unique(['_'.join(basename(path).split('_')[:-1]) for path in files])

# Map each ROI id to the paths of its two image channels and its mask.
img_pairs = {}
for img_id in ids:
    pair = {
        key: join(data_dir, img_id + suffix)
        for key, suffix in (('X1', '_fluo.nc'), ('X2', '_afluo.nc'), ('y', '_mask.nc'))
    }
    img_pairs[img_id] = pair

    # Every ROI must come with all three files.
    assert all(exists(pair[key]) for key in ('X1', 'X2', 'y'))
print('ROIs found =', len(img_pairs))

## Function for loading and processing the images
Pad to largest size and standardize pixel intensities to range [0,1]

In [None]:
def _read_array_and_resolution(path):
    """Read one NetCDF DataArray into memory and return ``(pixels, (x_res, y_res))``.

    The file is opened in a ``with`` block so that its handle is released as
    soon as the pixel data has been materialised as a NumPy array.
    """
    with xr.open_dataarray(path) as data:
        pixels = np.asarray(data.data)
        # Resolution = spacing between the first two coordinates of each axis.
        resolution = (data.x.diff('x')[0].item(), data.y.diff('y')[0].item())
    return pixels, resolution


def _pad_to_max_size(arr, max_size):
    """Zero-pad axes 1 and 2 of a 3-D array (at their end) up to ``max_size``."""
    return np.pad(
        arr,
        pad_width=((0, 0), (0, max_size[0] - arr.shape[1]), (0, max_size[1] - arr.shape[2])),
        mode='constant',
        constant_values=0,
    )


def _rescale_intensities(img, min_val, effective_range):
    """Shift by ``min_val``, clip to ``effective_range`` and map that range onto [0, 1]."""
    low, high = effective_range
    img = np.clip(img.astype(np.float32) - min_val, low, high)
    # With a lower bound of 0 this reduces to ``img / high``.
    return (img - low) / (high - low)


def load_image_and_masks(data_paths_map, target_resolution=0.12435661944309118, max_size=(576, 608), min_vals={'X1': 1.3632197, 'X2': -1.7582703}, effective_ranges={'X1': (0, 3950), 'X2': (0, 1785)}):
    """Load, validate, pad and normalise the image channels and masks of every ROI.

    Parameters
    ----------
    data_paths_map : dict
        Maps an ROI id to a dict with the keys ``'X1'``, ``'X2'`` and ``'y'``,
        each holding the path of a file readable by ``xr.open_dataarray``.
    target_resolution : float
        Required pixel spacing along x and y. Rescaling is not implemented, so
        data at any other resolution is rejected.
    max_size : tuple of int
        Size that axes 1 and 2 of every array are zero-padded to.
    min_vals : dict
        Per-channel offset (keys ``'X1'``/``'X2'``) subtracted before clipping.
        The default dicts are only read, never modified.
    effective_ranges : dict
        Per-channel ``(low, high)`` range (keys ``'X1'``/``'X2'``) the shifted
        intensities are clipped to and then rescaled from, giving values in [0, 1].

    Returns
    -------
    tuple of np.ndarray
        ``(X1, X2, y)``. For a non-empty input each has shape
        ``(n_rois, 5, max_size[0], max_size[1], 1)``; the mask ``y`` is uint8.
        For an empty input three empty arrays are returned.

    Raises
    ------
    AssertionError
        If a file is missing, the three arrays of an ROI differ in shape or
        resolution, an array exceeds ``max_size``, the resolution differs from
        ``target_resolution``, there are not exactly 5 time slices, or the mask
        contains values other than 0 and 1.
    """
    X1, X2, y = [], [], []

    for data_map in data_paths_map.values():
        assert exists(data_map['X1']) and exists(data_map['X2']) and exists(data_map['y']), 'Data did not exist.'

        img_X1, X1_res = _read_array_and_resolution(data_map['X1'])
        img_X2, X2_res = _read_array_and_resolution(data_map['X2'])
        msk_y, y_res = _read_array_and_resolution(data_map['y'])

        assert img_X1.shape == img_X2.shape and img_X1.shape == msk_y.shape, 'Data have different shapes'
        assert img_X1.shape[1] <= max_size[0] and img_X1.shape[2] <= max_size[1], 'Data have too large shape'
        assert X1_res == X2_res and X1_res == y_res, 'Data resolutions are different'
        assert np.isclose(X1_res[0], X1_res[1]), 'Data resolution is not consistent for x and y axis'
        assert np.isclose(X1_res[0], target_resolution), 'Rescaling not supported yet'
        assert img_X1.shape[0] == 5, 'Only 5 timeslices accepted'
        # Element-wise membership test: accepts all-0, all-1 and mixed masks, and
        # fails cleanly (instead of with a shape mismatch) for any other value.
        assert np.isin(msk_y, (0, 1)).all(), 'Mask is not binary'

        # NOTE(review): padding happens before the min_vals shift, so padded
        # pixels end up at clip(0 - min_val) / range rather than exactly 0 when
        # min_val is negative. Kept as-is to leave the produced data unchanged.
        img_X1 = _rescale_intensities(_pad_to_max_size(img_X1, max_size), min_vals['X1'], effective_ranges['X1'])
        img_X2 = _rescale_intensities(_pad_to_max_size(img_X2, max_size), min_vals['X2'], effective_ranges['X2'])
        msk_y = _pad_to_max_size(msk_y, max_size).astype(np.uint8)

        # Append a trailing singleton channel axis.
        X1.append(np.expand_dims(img_X1, axis=-1))
        X2.append(np.expand_dims(img_X2, axis=-1))
        y.append(np.expand_dims(msk_y, axis=-1))

    return np.array(X1), np.array(X2), np.array(y)


Call the function and load data:

In [None]:
# Read every ROI located above: X1 / X2 are the two image channels, y the masks.
X1, X2, y = load_image_and_masks(data_paths_map=img_pairs)

Randomly split to train and validation data:

In [None]:
def _flatten_time_slices(volumes, roi_indices, dtype):
    """Stack the slices of the selected ROIs along one leading axis.

    ``volumes`` is indexed as (roi, slice, ...). The result is indexed as
    (roi * slice, ...), in the ROI order given by ``roi_indices`` and with the
    slice order inside each ROI preserved.
    """
    selected = volumes[roi_indices]
    return selected.reshape((-1,) + selected.shape[2:]).astype(dtype, copy=False)


assert len(X1) == len(X2) == len(y), 'X1, X2 and y must contain the same number of ROIs'

# Draw the permutation from a dedicated generator seeded with `random_seed`, so
# re-running this cell always gives the same split (the global NumPy RNG would
# advance on every run). This matches np.random.seed(random_seed) immediately
# followed by np.random.permutation(...).
rng = np.random.RandomState(random_seed)
perm = rng.permutation(np.arange(len(X1)))
# The last `validation_fraction` of the shuffled ROIs is held out for validation.
split_idx = len(X1) - round(len(X1) * validation_fraction)
train_indices = perm[:split_idx]
val_indices = perm[split_idx:]

# The split is made per ROI; afterwards each slice becomes its own sample.
X1_train = _flatten_time_slices(X1, train_indices, np.float32)
X2_train = _flatten_time_slices(X2, train_indices, np.float32)
y_train = _flatten_time_slices(y, train_indices, np.uint8)
X1_val = _flatten_time_slices(X1, val_indices, np.float32)
X2_val = _flatten_time_slices(X2, val_indices, np.float32)
y_val = _flatten_time_slices(y, val_indices, np.uint8)

print(X1_train.shape, X1_val.shape)
print(X2_train.shape, X2_val.shape)
print(y_train.shape, y_val.shape)

Plot an example of the training data with fluo, afluo, and mask:

In [None]:
# One column per consecutive training sample, one row per array (fluo, afluo, mask).
fig, axes = plt.subplots(3, 5, figsize=(60, 20))
random_img_idx = 42

# Clamp the first sample so the columns stay inside the training set even when
# it holds fewer than random_img_idx + 5 samples.
first_idx = max(0, min(random_img_idx, len(X1_train) - axes.shape[1]))

for slice_i, column in enumerate(axes.T):
    sample_idx = first_idx + slice_i
    for ax, stack, label in zip(column, (X1_train, X2_train, y_train), ('fluo', 'afluo', 'mask')):
        ax.axis('off')
        if sample_idx >= len(stack):
            # Fewer samples than columns: leave the remaining panels empty.
            continue
        ax.imshow(stack[sample_idx, ..., 0], cmap='hot')
        ax.set_title(f'{label} [{sample_idx}]')

fig.tight_layout()
plt.show()

Save data to file for loading in training notebooks:

In [None]:
# Output file; switch the two lines below when compiling the punctae dataset.
output_path = 'data/compiled_vessel_seg_data_final.h5'
#output_path = 'data/compiled_punctae_seg_data_final.h5'

# Create the output directory on a fresh checkout instead of failing on open.
makedirs(os.path.dirname(output_path), exist_ok=True)

# The context manager closes the file even if one of the writes raises.
with h5py.File(output_path, 'w') as h5f:
    h5f.create_dataset('X1_train', data=X1_train)
    h5f.create_dataset('X1_val', data=X1_val)
    h5f.create_dataset('X2_train', data=X2_train)
    h5f.create_dataset('X2_val', data=X2_val)
    h5f.create_dataset('y_train', data=y_train)
    h5f.create_dataset('y_val', data=y_val)