In [1]:
from tqdm import tqdm
from glob import glob
import tifffile
import numpy as np
import os
from EmbedSeg.utils.preprocess_data import extract_data, split_train_val, split_train_test
from EmbedSeg.utils.generate_crops import *

### Download Data

The images and corresponding masks are downloaded from an external URL, specified by `zip_url`, to the path specified by the variables `data_dir` and `project_name`. The following structure is generated after executing the `extract_data` method below:

```
data
└───basel-2020
    |───download
        |───train
            └───images
            └───masks
    |───basel-2020.zip
```

<div class="alert alert-warning">
The `basel-2020` dataset is 1.25 GB large. Downloading may take a while!<br>
Here, we have pre-saved phase-contrast channel images and corresponding masks in the `images` and `masks` sub-directories respectively.
</div>

In [2]:
# Dataset identifier; it is also the name of the dataset folder created under `data_dir`.
project_name = 'basel-2020'
# Relative path of the root folder that holds the downloaded and extracted data.
data_dir = '../../../data'

In [None]:
# Download the dataset archive from `zip_url` and extract it under
# `data_dir`/`project_name` (see the directory layout described above).
extract_data(
    zip_url='https://github.com/juglab/EmbedSeg/releases/download/v1.0/basel-2020.zip',
    data_dir=data_dir,
    project_name=project_name,
)

### Split Data into `train` \& `test`

Since the train-test data partition doesn't exist by itself in the original data, we can execute the following cell to reserve some data (10 \% by default) as evaluation or test data.

In [None]:
# Reserve a fraction of the `train` data (here 10 %) as evaluation / test data.
split_train_test(
    data_dir=data_dir,
    project_name=project_name,
    train_test_name='train',
    subset=0.1,
)

### Split Data into `train`, `val` \& `test`

Now, we would like to reserve a small fraction (15 % by default) of the provided train dataset as validation data. If you would like to repeat multiple experiments with the same partition, simply press <kbd>Shift</kbd> + <kbd>Enter</kbd> on the next cell. If you would instead like a different partition each time, add the `seed` argument and set it to a different integer, for example:
```
split_train_val(
data_dir = data_dir, 
project_name = project_name, 
train_val_name = 'train', 
subset = 0.15,
seed = 1000)
```

In [None]:
# Hold out a fraction of the `train` data (here 15 %) as validation data.
split_train_val(
    data_dir=data_dir,
    project_name=project_name,
    train_val_name='train',
    subset=0.15,
    mode='TYX',
)

### Specify desired centre location for spatial embedding of pixels

Interior pixels of an object instance can either be embedded at the `centroid` (evaluated in $\mathcal{O}(n)$ operations, where $n$ is the number of pixels in an object instance), or the `approximate-medoid` (also evaluated in $\mathcal{O}(n)$ operations) or the `medoid` (evaluated in $\mathcal{O}(n^{2})$ operations). Please note that evaluating the `medoid` of the instances could be slow, especially if you choose a large `crop_size` later: in such a scenario, a quicker alternative is opting for the `approximate-medoid` option, which gives comparable results.

In [3]:
# Location at which the interior pixels of each object instance are embedded.
center = 'centroid'

# Validate with an explicit exception instead of `assert`: assertions are
# stripped when Python runs with `-O`, which would let an invalid value through.
if center not in ('medoid', 'approximate-medoid', 'centroid'):
    raise ValueError(
        'Please specify center as one of : {"medoid", "approximate-medoid", "centroid"}'
        ' (got {!r})'.format(center)
    )
print("Spatial Embedding Location chosen as : {}".format(center))



Spatial Embedding Location chosen as : centroid


### Specify cropping configuration parameters

Images and the corresponding masks are cropped into patches centred around an object instance, which are pre-saved prior to initiating the training. **Run the following two cells twice** - first time set `data_subset = 'train'` and the second time set `data_subset = 'val'`.  Note that the cropped images, masks and center-images would be saved at the path specified by `crops_dir`. Please set `one_hot = True` in case the instances are encoded in a one-hot style. 

In [4]:
# Which split to crop: run this cell and the next one once with 'train', then again with 'val'.
data_subset = 'train'
# Folder (relative path) under which cropped images, masks and center-images are saved.
crops_dir = 'crops'
# Crop size along y and x; both values are forwarded to `process` in the next cell.
crop_size_y, crop_size_x = 200, 64
# Set to True only when the instances are encoded in a one-hot style.
one_hot = False
# Forwarded to `process` as `norm`.
# NOTE(review): presumably False disables intensity normalisation of the crops - confirm.
norm = False

### Generate Crops



<div class="alert alert-warning">
The cropped images and masks are saved in the same location as the example notebooks. <br>
Generating the crops would take a little while!
</div>

In [5]:
# Collect the image / instance-mask files of the chosen `data_subset`.
image_dir = os.path.join(data_dir, project_name, data_subset, 'images')
instance_dir = os.path.join(data_dir, project_name, data_subset, 'masks')
image_names = sorted(glob(os.path.join(image_dir, '*.tif')))
instance_names = sorted(glob(os.path.join(instance_dir, '*.tif')))

# Images and masks are paired purely by their sorted position, so both listings
# must have the same length; fail before any crop is written rather than hitting
# an IndexError mid-run or silently ignoring surplus masks.
if len(image_names) != len(instance_names):
    raise ValueError(
        "Found {} images in {} but {} masks in {}".format(
            len(image_names), image_dir, len(instance_names), instance_dir
        )
    )

if one_hot:
    # This cell previously passed an undefined `crop_size` to `process_one_hot`
    # (NameError). Only `crop_size_y` / `crop_size_x` are configured, so derive
    # the single value from them and insist that the crop is square.
    # NOTE(review): assumes `process_one_hot` expects one (square) crop size as
    # its fifth positional argument - confirm against the installed EmbedSeg.
    if crop_size_y != crop_size_x:
        raise ValueError(
            "one_hot = True requires crop_size_y == crop_size_x (got {} and {})".format(
                crop_size_y, crop_size_x
            )
        )
    crop_size = crop_size_y

# Loop-invariant output location for this project's crops.
crops_project_dir = os.path.join(crops_dir, project_name)

for image_name, instance_name in tqdm(zip(image_names, instance_names), total=len(image_names)):
    if one_hot:
        process_one_hot(image_name, instance_name, crops_project_dir, data_subset, crop_size, center, one_hot=one_hot)
    else:
        process(
            image_name,
            instance_name,
            crops_project_dir,
            data_subset,
            crop_size_y=crop_size_y,
            crop_size_x=crop_size_x,
            center=center,
            one_hot=one_hot,
            norm=norm,
        )
print("Cropping of images, instances and centre_images for data_subset = `{}` done!".format(data_subset))

  0%|          | 6/15842 [00:00<04:26, 59.36it/s]

Created new directory : crops/basel-2020/train/images/
Created new directory : crops/basel-2020/train/masks/
Created new directory : crops/basel-2020/train/center-centroid/


100%|██████████| 15842/15842 [27:24<00:00,  9.64it/s]

Cropping of images, instances and centre_images for data_subset = `train` done!



