# Dataset pre-processing

## TotalSegmentator dataset normalization (mask combination)

In [None]:
import os
import numpy as np
import nibabel as nib

from tqdm import tqdm

from datasets import get_dataset
from utils.med_utils import load_medical_seg

def _has_nifti_extension(filename):
    """ Return True when `filename` ends with `.nii` or `.nii.gz` """
    return filename.endswith(('.nii', '.nii.gz'))

def combine_masks(masks, labels, out_file, overwrite = False, check = False):
    """
        Combine per-label masks into a single multi-channel mask stored in `out_file`
        
        Arguments :
            - masks     : mapping `{label : filename}`, each file is read with `nib.load`
            - labels    : ordered list of labels, the position of a label is its channel index
                          on the last axis of the combined mask
            - out_file  : output file, either NIfTI (`.nii` / `.nii.gz`) or `.npz`
            - overwrite : whether to rebuild `out_file` even if it already exists
            - check     : if `out_file` exists (and `overwrite = False`), try to load it, and
                          remove + rebuild it if the loading fails
        Return :
            - True once `out_file` is available
        
        Raises :
            - ValueError   : unsupported `out_file` extension, or masks with different shapes
            - RuntimeError : none of the `labels` has a file in `masks`
        
        Shortcuts :
            - `out_file` is `.npz` and the sibling `.nii.gz` exists : the NIfTI data is converted
            - `out_file` is `.nii.gz` and the uncompressed `.nii` exists : the `.nii` is re-saved
              compressed, then removed
        
        The `.npz` output stores the indices of the non-zero voxels (`mask`), the dense `shape`,
        the `affine` and the `pixdim` field of the header
    """
    if os.path.exists(out_file) and not overwrite:
        if not check: return True
        try:
            # the format is decided on the extension, not on a substring of the full path :
            # a `.npz` stored under a directory containing "nii" must not be read as NIfTI
            if _has_nifti_extension(out_file):
                file = nib.load(out_file)
                file.get_fdata()
                file.uncache()
            elif out_file.endswith('.npz'):
                with np.load(out_file) as file:
                    file['mask']
            return True
        except Exception as e:
            print('[ERROR] Error when loading file {} : {}'.format(out_file, e))
            os.remove(out_file)
    
    is_nifti = _has_nifti_extension(out_file)
    if not is_nifti and not out_file.endswith('.npz'):
        # fail before the (expensive) combination rather than silently writing nothing
        raise ValueError('Unsupported output format for {} (expected .nii, .nii.gz or .npz)'.format(out_file))
    
    # `segmented` is only stored in the NIfTI output, but is always defined
    segmented, stale_file = [], None
    
    nifti_sibling = out_file[: -len('.npz')] + '.nii.gz' if out_file.endswith('.npz') else None
    if nifti_sibling is not None and os.path.exists(nifti_sibling):
        data = nib.load(nifti_sibling)
        
        comb_mask, affine, headers = data.get_fdata(), data.affine, data.header
    elif out_file.endswith('.nii.gz') and os.path.exists(out_file[:-3]):
        data = nib.load(out_file[:-3])
        
        comb_mask, affine, headers = data.get_fdata(), data.affine, data.header
        segmented = [
            label for i, label in enumerate(labels) if np.any(comb_mask[..., i])
        ]
        # removed only after the compressed version has been written
        stale_file = out_file[:-3]
    else:
        comb_mask, affine, headers = None, None, None
        for i, label in enumerate(tqdm(labels)):
            if label not in masks:
                print('[WARNING] Label {} is not in masks !'.format(label))
                continue

            file = nib.load(masks[label])
            mask_bool = file.get_fdata() > 0.5
            if comb_mask is None:
                comb_mask = np.zeros(mask_bool.shape + (len(labels), ), dtype = np.uint8)
                affine, headers = file.affine, file.header

            if mask_bool.shape != comb_mask.shape[:-1]:
                raise ValueError('Mask of shape {} != reference mask {}'.format(
                    mask_bool.shape, comb_mask.shape[:-1]
                ))

            if np.any(mask_bool):
                segmented.append(label)

            # each label owns its channel, which is only written here : no overlap is possible
            comb_mask[..., i] = mask_bool

        if comb_mask is None:
            raise RuntimeError('No valid mask !')
    
    if is_nifti:
        result = nib.Nifti1Image(
            comb_mask, affine, header = headers, extra = {'labels' : labels, 'present' : segmented}
        )
        nib.save(result, out_file)
    else:
        np.savez_compressed(
            out_file,
            mask   = np.stack(np.where(comb_mask), axis = -1),
            shape  = np.array(comb_mask.shape),
            affine = affine,
            pixdim = headers['pixdim']
        )
    
    if stale_file is not None:
        os.remove(stale_file)
    return True

dataset = get_dataset('total_segmentator')

# the sorted labels of the first row fix the channel order used for every patient
labels = sorted(dataset.loc[0, 'label'])
for i, (_, row) in enumerate(dataset.iterrows()):
    print('Patient {} / {}'.format(i + 1, len(dataset)))
    
    # only a list of per-label files can be paired with the labels : any other value is
    # treated as "no individual mask", so that it is never zipped element-wise by mistake
    seg_files = row['segmentation'] if isinstance(row['segmentation'], list) else []
    out_file  = row['images'].replace('ct.nii.gz', 'masks.npz')
    
    res = combine_masks(
        dict(zip(row['label'], seg_files)),
        labels, out_file = out_file, overwrite = False, check = True
    )


2023-04-07 08:30:33.497925: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-07 08:30:34.660120: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-07 08:30:35.180813: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading dataset total_segmentator...
Patient 1 / 1204
Patient 2 / 1204
Patient 3 / 1204
Patient 4 / 1204
Patient 5 / 1204
Patient 6 / 1204
Patient 7 / 1204
Patient 8 / 1204
Patient 9 / 1204
Patient 10 / 1204
Patient 11 / 1204
Patient 12 / 1204
Patient 13 / 1204
Patient 14 / 1204
Patient 15 / 1204
Patient 16 / 1204
Patient 17 / 1204
Patient 18 / 1204
Patient 19 / 1204
Patient 20 / 1204
Patient 21 / 1204
Patient 22 / 1204
Patient 23 / 1204
Patient 24 / 1204
Patient 25 / 1204
Patient 26 / 1204
Patient 27 / 1204
Patient 28 / 1204
Patient 29 / 1204
Patient 30 / 1204
Patient 31 / 1204
Patient 32 / 1204
Patient 33 / 1204
Patient 34 / 1204
Patient 35 / 1204
Patient 36 / 1204
Patient 37 / 1204
Patient 38 / 1204
Patient 39 / 1204
Patient 40 / 1204
Patient 41 / 1204
Patient 42 / 1204
Patient 43 / 1204
Patient 44 / 1204
Patient 45 / 1204
Patient 46 / 1204
Patient 47 / 1204
Patient 48 / 1204
Patient 49 / 1204
Patient 50 / 1204
Patient 51 / 1204
Patient 52 / 1204
Patient 53 / 1204
Patient 54 / 1204


In [3]:
%%time

import numpy as np
import nibabel as nib
import tensorflow as tf

# Timing of the sparse `.npz` format : the combined mask of one patient is rebuilt as a
# `tf.sparse.SparseTensor` from the stored voxel indices and dense shape
path_npz = dataset.iloc[7]['segmentation'].replace('.nii.gz', '.npz')
path_nii = path_npz.replace('.npz', '.nii.gz')

with np.load(path_npz) as data_npz:
    indexes = data_npz['mask']
    mask_npz = tf.sparse.SparseTensor(
        indices = indexes, values = tf.ones((len(indexes), ), dtype = tf.uint8), dense_shape = data_npz['shape']
    )


CPU times: user 958 ms, sys: 909 ms, total: 1.87 s
Wall time: 2.82 s


2023-04-05 19:29:35.173483: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-05 19:29:36.726308: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14783 MB memory:  -> device: 0, name: Quadro RTX 5000, pci bus id: 0000:17:00.0, compute capability: 7.5
2023-04-05 19:29:36.727064: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1616] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 14485 MB memory:  -> device: 1, name: Quadro RTX 5000, pci bus id: 0000:65:00.0, compute capability: 7.5


In [4]:
%%time

# Timing of the dense NIfTI format : the whole volume is read, then cast to uint8.
# `caching = 'unchanged'` is forwarded to `get_fdata` so the read does not alter the image cache.
data_nii = nib.load(path_nii)
dense_volume = data_nii.get_fdata(caching = 'unchanged')
mask_nii = dense_volume.astype(np.uint8)

CPU times: user 10.1 s, sys: 3.42 s, total: 13.5 s
Wall time: 13.6 s


## Dataset extraction

In [2]:
import os
import zipfile
import nibabel as nib

from tqdm import tqdm

path = '/storage'
zip_filename = '../Totalsegmentator_dataset.zip'

# Resumable extraction : members already on disk are skipped, so the cell can be re-run
# after an interruption without extracting everything again
with zipfile.ZipFile(zip_filename, 'r') as file:
    for name in tqdm(file.namelist()):
        filename = os.path.join(path, name)
        if name.endswith('/'):
            os.makedirs(filename[:-1], exist_ok = True)
            continue
        elif os.path.exists(filename):
            try:
                # NOTE(review): substring test on the full path ; presumably meant to only
                # validate the CT volumes (loading every file would be slow) - confirm
                if 'ct' in filename:
                    nib.load(filename).get_fdata(caching = 'unchanged')
                continue
            except Exception as e:
                print('[ERROR] Error while loading {} : {}'.format(filename, e))
                os.remove(filename)
        
        try:
            file.extract(name, path = path)
        except Exception as e:
            print('[ERROR] when extracting {} : {}'.format(name, e))
            # a failed extraction (e.g. bad CRC) leaves the partially written file on disk :
            # remove it, otherwise the next run would skip it as "already extracted"
            if os.path.isfile(filename):
                os.remove(filename)


 60%|█████████████████████████████████████████████▏                             | 77685/128830 [04:43<03:20, 254.85it/s]

[ERROR] Error while loading /storage/Totalsegmentator_dataset/s0864/ct.nii.gz : Compressed file ended before the end-of-stream marker was reached


 60%|█████████████████████████████████████████████▎                             | 77899/128830 [04:45<04:10, 203.15it/s]

[ERROR] when extracting Totalsegmentator_dataset/s0864/ct.nii.gz : Bad CRC-32 for file 'Totalsegmentator_dataset/s0864/ct.nii.gz'


100%|██████████████████████████████████████████████████████████████████████████| 128830/128830 [08:14<00:00, 260.78it/s]
