# Files tree

```
./datasets
|
+---------MedicalDecathlon
|                   |
|                   +-----training
|                   |         |
|                   |         |
|                   |         +--- archives with the CT scans in .nii.gz format
|                   |
|                   |
|                   |
|                   +-----validation
|                             |
|                             |
|                             +--- archives with the CT scans in .nii.gz format
|
|
|
+---------MedicalDecathlonTensors
                    |
                    +-----training
                    |         |
                    |         |
                    |         +--- images/ and labels/ folders with the CT scans and masks saved as .pt tensors
                    |
                    |
                    |
                    +-----validation
                              |
                              |
                              +--- images/ and labels/ folders with the CT scans and masks saved as .pt tensors

```


### IMPORTS

In [2]:
import os
import warnings
warnings.filterwarnings("ignore") # remove some scikit-image warnings


from monai.apps import DecathlonDataset
from monai.data import DataLoader
from monai.transforms import (
    LoadImageD,
    EnsureChannelFirstD,
    Compose,
    OrientationD,
    OrientationD,
)

import torch
import numpy as np
import random
import sys
from tqdm import tqdm
import pickle as pkl

### HYPERPARAMS ###


In [4]:
# Every path in this notebook is resolved relative to the current working directory.
crt_dir = os.getcwd()
# Root folder of the raw Medical Decathlon data (note the trailing slash: split
# names are appended to it directly).
datasets_path = crt_dir + '/datasets/MedicalDecathlon/'

# Dictionary keys the dictionary-based transforms operate on.
KEYS = ["image", "label"]

BATCH_SIZE = 1          # samples per DataLoader batch
DOWNLOAD_FLAG = False   # forwarded as `download=` to DecathlonDataset

### PREPROCESSING TRANSFORMS

In [5]:
# Preprocessing pipeline handed to the datasets below. Every step is a
# dictionary transform applied to the entries named in KEYS.
base_transform = Compose(
    transforms=[
        LoadImageD(keys=KEYS),                   # load the data referenced by each key
        EnsureChannelFirstD(keys=KEYS),          # put the channel dimension first
        OrientationD(keys=KEYS, axcodes="RAS"),  # reorient the volumes to RAS axis codes
    ]
)

### MAIN ###

In [7]:
# Reproducibility: seed PyTorch, NumPy and Python's built-in `random` module with
# the same fixed value (0) before any data is loaded.
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(0)

In [8]:
### TRAINING DATA ###
# The split name is used both as the `section` argument and as the sub-folder
# of `datasets_path` that holds the data.
train_name = 'training'  # From Monai: ['training', 'validation', 'test']

train_dataset = DecathlonDataset(
    root_dir=f'{datasets_path}{train_name}/',
    task="Task06_Lung",
    section=train_name,
    transform=base_transform,
    download=DOWNLOAD_FLAG,
)

# Unshuffled loader over the training split (num_workers left at its default).
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

Loading dataset:   0%|          | 0/51 [00:00<?, ?it/s]

Loading dataset: 100%|██████████| 51/51 [03:28<00:00,  4.08s/it]


In [9]:
def save_dataset(dataset, dataset_name, split_name, root_dir=None):
    '''
    Save every sample of a dataset to disk as two tensor files per patient.

    For the sample at position ``i`` the image is written to
    ``<root_dir>/datasets/<dataset_name>/<split_name>/images/patient_<i>.pt`` and
    the mask to the matching path under ``labels/``. Both target folders are
    created when they do not exist yet.

    Args:
        dataset: iterable of dict-like samples with an ``'image'`` and a
            ``'label'`` entry; both values must provide ``.cpu()``.
        dataset_name: name of the output folder below ``<root_dir>/datasets``.
        split_name: name of the split sub-folder (e.g. ``'training'``); also
            used in the summary message.
        root_dir: base directory the ``datasets`` folder lives in. When omitted,
            the notebook-level ``crt_dir`` is used.

    Returns:
        None. The number of saved patients is printed once the loop finishes.
    '''
    base_dir = crt_dir if root_dir is None else root_dir
    split_dir = f'{base_dir}/datasets/{dataset_name}/{split_name}'
    images_dir = f'{split_dir}/images'
    labels_dir = f'{split_dir}/labels'

    # torch.save does not create missing parent folders, so make sure they exist.
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    saved = 0  # stays 0 when the dataset is empty
    for index, data in enumerate(tqdm(dataset)):
        # Make sure both tensors are on the CPU before serialising them.
        img = data['image'].cpu()
        seg = data['label'].cpu()

        torch.save(img, f'{images_dir}/patient_{index}.pt')
        torch.save(seg, f'{labels_dir}/patient_{index}.pt')

        saved = index + 1

    print(f'Saved {saved} images and masks for {split_name} split')

In [10]:
# Write the training split to datasets/MedicalDecathlonTensors/<train_name>/.
save_dataset(
    dataset=train_dataset,
    dataset_name='MedicalDecathlonTensors',
    split_name=train_name,
)

100%|██████████| 51/51 [00:31<00:00,  1.61it/s]

Saved 51 images and masks for training split





In [8]:
### VALIDATION DATA ###
# The split name is used both as the `section` argument and as the sub-folder
# of `datasets_path` that holds the data.
val_name = 'validation'  # From Monai: ['training', 'validation', 'test']

val_dataset = DecathlonDataset(
    root_dir=f'{datasets_path}{val_name}/',
    task="Task06_Lung",
    section=val_name,
    transform=base_transform,
    download=DOWNLOAD_FLAG,
)

# Unshuffled loader over the validation split (num_workers left at its default).
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)


Loading dataset:   0%|          | 0/12 [00:00<?, ?it/s]

Loading dataset: 100%|██████████| 12/12 [00:46<00:00,  3.91s/it]


In [9]:
# Write the validation split to datasets/MedicalDecathlonTensors/<val_name>/.
save_dataset(
    dataset=val_dataset,
    dataset_name='MedicalDecathlonTensors',
    split_name=val_name,
)

100%|██████████| 12/12 [00:07<00:00,  1.68it/s]

Saved 12 images and masks for validation split



