# Data Generation

### Import modules

In [1]:
# General
import os
import math
import re
from glob import glob
import pickle
from tqdm.notebook import tqdm

# Data handling
import numpy as np
import pandas as pd
 
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Image handling
import cv2
import rasterio
from skimage.draw import polygon2mask
from skimage import color

from utilities import *
import tensorflow as tf

In [2]:
import resource

def memory_limit(fraction=0.5):
    """Cap the address space of this process at a share of the available RAM.

    Args:
        fraction: Share of the memory reported by ``get_memory()`` that the
            process may use. Defaults to 0.5, i.e. half of it.

    Raises:
        ValueError / OSError: Propagated from ``resource.setrlimit`` when the
            limit cannot be applied.
    """
    _, hard = resource.getrlimit(resource.RLIMIT_AS)

    # get_memory() sums /proc/meminfo values (kB); setrlimit expects bytes
    soft = int(get_memory() * 1024 * fraction)

    # A soft limit above a finite hard limit makes setrlimit raise ValueError
    if hard != resource.RLIM_INFINITY:
        soft = min(soft, hard)

    resource.setrlimit(resource.RLIMIT_AS, (soft, hard))

def get_memory():
    """Return the summed MemFree, Buffers and Cached values of /proc/meminfo.

    The numbers are added exactly as they appear in the file (the kernel
    reports them in kB).
    """
    wanted_keys = ('MemFree:', 'Buffers:', 'Cached:')
    with open('/proc/meminfo', 'r') as meminfo:
        # Every line is "<Key>: <value> [unit]"; field 0 is the key, field 1 the value
        fields_per_line = (line.split() for line in meminfo)
        return sum(int(fields[1]) for fields in fields_per_line
                   if fields[0] in wanted_keys)

# Apply the address-space cap to this kernel process for the rest of the session
memory_limit()

##  Load Data

In [3]:
# One row per sample: its id and the run-length encoded ground-truth segmentation
train_segmentation = pd.read_csv('data/train.csv')
train_segmentation.head()

Unnamed: 0,id,encoding
0,2f6ecfcdf,296084587 4 296115835 6 296115859 14 296147109...
1,8242609fa,96909968 56 96941265 60 96972563 64 97003861 6...
2,aaa6a05cc,30989109 59 31007591 64 31026074 68 31044556 7...
3,cb2d976f4,78144363 5 78179297 15 78214231 25 78249165 35...
4,b9a3865fc,61271840 4 61303134 13 61334428 22 61365722 30...


This CSV file links the id of each sample to its ground-truth segmentation, stored as a run-length encoding.

In [4]:
# Image dimensions, annotation file names and patient metadata for every image file
train_patient_data = pd.read_csv('data/HuBMAP-20-dataset_information.csv')
train_patient_data.head()

Unnamed: 0,image_file,width_pixels,height_pixels,anatomical_structures_segmention_file,glomerulus_segmentation_file,patient_number,race,ethnicity,sex,age,weight_kilograms,height_centimeters,bmi_kg/m^2,laterality,percent_cortex,percent_medulla
0,aa05346ff.tiff,47340,30720,aa05346ff-anatomical-structure.json,aa05346ff.json,67347,White,Not Hispanic or Latino,Female,58,59.0,160.0,23.0,Right,80,20
1,afa5e8098.tiff,43780,36800,afa5e8098-anatomical-structure.json,afa5e8098.json,67347,White,Not Hispanic or Latino,Female,58,59.0,160.0,23.0,Right,55,45
2,54f2eec69.tiff,22240,30440,54f2eec69-anatomical-structure.json,54f2eec69.json,67548,Black or African American,Not Hispanic or Latino,Male,58,79.9,190.5,22.0,Right,75,25
3,d488c759a.tiff,29020,46660,d488c759a-anatomical-structure.json,d488c759a.json,68138,White,Not Hispanic or Latino,Female,66,81.5,158.8,32.2,Left,100,0
4,1e2425f28.tiff,32220,26780,1e2425f28-anatomical-structure.json,1e2425f28.json,63921,White,Not Hispanic or Latino,Male,48,131.5,193.0,35.3,Right,65,35


This dataframe holds information about the size of each image as well as patient data such as height, weight, sex, etc. Some of the information is redundant, like the file names, which can be derived from the sample id. I will merge this dataframe with train_segmentation.

In [5]:
# The sample id is the image file name up to its first dot (e.g. "aa05346ff.tiff" -> "aa05346ff")
train_patient_data['sample_id'] = [
    file_name.split('.')[0] for file_name in train_patient_data.image_file
]

In [6]:
# Merge train_segmentation and train_patient_data on the sample id.
# The inner join (pandas' default, spelled out here) keeps only samples present in both tables.
train = pd.merge(train_segmentation, train_patient_data,
                 left_on='id', right_on='sample_id', how='inner')

# Remove non desired columns ('id' duplicates 'sample_id' after the merge)
train = train.drop(columns=['id', 'image_file', 'anatomical_structures_segmention_file',
                            'glomerulus_segmentation_file', 'patient_number', 'encoding'])

# Shorten the verbose column names. Renaming by name rather than assigning a
# positional list to `train.columns` cannot mislabel a column if the CSV layout
# ever changes; every other column keeps its original name.
train = train.rename(columns={
    'weight_kilograms': 'weight',
    'height_centimeters': 'height',
    'bmi_kg/m^2': 'bmi',
})

## Save Images

In [21]:
# Sample ids that the patch-extraction loop skips
defective_images = [
    'e79de561c',
    '54f2eec69',
]

In [40]:
size = 1000                   # side, in full-resolution pixels, of the window cut around each sampled point
out_shape = (250, 250)        # resolution of the low-res masks and of every saved patch
n_patches = 3000              # number of patches drawn per sample
output_dir = 'data/processed'

# Seeded generator so that re-running the cell regenerates the same patches
rng = np.random.default_rng(0)

# cv2.imwrite only returns False (it does not raise) when the folder is missing
os.makedirs(output_dir, exist_ok=True)

for sample_id in tqdm(train.sample_id):
    if sample_id in defective_images:
        continue

    # Path of the image of the sample
    path = f"data/train/{sample_id}.tiff"

    print(sample_id)
    cortex_mask = get_mask(sample_id, 'cortex', train, out_shape=out_shape)
    glom_mask = get_mask(sample_id, 'glom', train, out_shape=out_shape)

    # Label map: 0 outside, 1 cortex, 2 glomerulus
    cortex_mask = cortex_mask.astype(int)
    cortex_mask[glom_mask] = 2

    # (2, n) array with the row/column of every non-zero pixel: candidate patch centres
    cortex_pixels = np.stack(np.where(cortex_mask))
    if cortex_pixels.shape[1] == 0:
        print(f'{sample_id}: empty cortex/glom mask, nothing to sample')
        continue

    # Open the image once per sample instead of once per patch
    with rasterio.open(path) as src:
        shape = src.shape
        half = size // 2

        for i in tqdm(range(n_patches)):
            # Random centre in mask coordinates, rescaled to full-resolution coordinates
            r_point = cortex_pixels[:, rng.integers(cortex_pixels.shape[1])]
            r0, r1 = r_point / np.array(out_shape) * shape

            # ((row_start, row_stop), (col_start, col_stop)) around the centre
            w = ((r0 - half, r0 + half), (r1 - half, r1 + half))

            glom_window = get_mask(sample_id, 'glom', train, out_shape=out_shape, window=w)
            glom_window = (glom_window * 255).astype(np.uint8)

            sample_image = src.read(window=w, out_shape=out_shape)
            # Move the channel axis from the first to the last position
            sample_image = np.moveaxis(sample_image, 0, -1)

            if sample_image.shape[-1] == 3:
                # rgb2gray returns floats (scaled to [0, 1] for 8-bit input)
                sample_image = color.rgb2gray(sample_image)

            # Only float data needs rescaling to 0-255; multiplying an integer
            # image by 255 would overflow its dtype.
            # NOTE(review): assumes integer sources are 8-bit - confirm for new data.
            if np.issubdtype(sample_image.dtype, np.floating):
                sample_image = np.clip(sample_image * 255, 0, 255)
            sample_image = sample_image.astype(np.uint8)

            # NOTE(review): JPEG is lossy, so the saved masks are not strictly
            # binary. The extension is kept because the tfrecord cells look for
            # "*_mask.jpg"; threshold the mask when decoding it.
            for suffix, array in (('image', sample_image), ('mask', glom_window)):
                target = f'{output_dir}/{sample_id}_{i}_{suffix}.jpg'
                if not cv2.imwrite(target, array):
                    raise IOError(f'Could not write {target}')



  0%|          | 0/15 [00:00<?, ?it/s]

2f6ecfcdf


  s = DatasetReader(path, driver=driver, sharing=sharing, **kwargs)


  0%|          | 0/3000 [00:00<?, ?it/s]

8242609fa


  0%|          | 0/3000 [00:00<?, ?it/s]

aaa6a05cc


  polygon = np.array(cd['geometry']['coordinates']).reshape(-1, 2)


  0%|          | 0/3000 [00:00<?, ?it/s]

cb2d976f4


  0%|          | 0/3000 [00:00<?, ?it/s]

b9a3865fc


  0%|          | 0/3000 [00:00<?, ?it/s]

b2dc8411c


  0%|          | 0/3000 [00:00<?, ?it/s]

0486052bb


  0%|          | 0/3000 [00:00<?, ?it/s]

095bf7a1f


  0%|          | 0/3000 [00:00<?, ?it/s]

4ef6695ce


  0%|          | 0/3000 [00:00<?, ?it/s]

26dc41664


  0%|          | 0/3000 [00:00<?, ?it/s]

c68fe75ea


  0%|          | 0/3000 [00:00<?, ?it/s]

afa5e8098


  0%|          | 0/3000 [00:00<?, ?it/s]

1e2425f28


  0%|          | 0/3000 [00:00<?, ?it/s]

## Generate tfrecords

In [8]:
# Count the number of images generated per image_id
im_list = glob('data/processed/*')

file_list = pd.DataFrame([im.split('/')[-1].split('_')[:2] for im in im_list],
                         columns=['id', 'wid'])

file_list.groupby('id').wid.count()

id
0486052bb    6000
095bf7a1f    6000
1e2425f28    6000
26dc41664    6000
2f6ecfcdf    6000
4ef6695ce    6000
8242609fa    6000
aaa6a05cc    6000
afa5e8098    6000
b2dc8411c    6000
b9a3865fc    6000
c68fe75ea    6000
cb2d976f4    6000
Name: wid, dtype: int64

In [9]:
def serialize_example(image, mask):
    """Pack an image and its mask into a serialized ``tf.train.Example``.

    Args:
        image: Raw bytes of the image, stored under the ``'image'`` key.
        mask: Raw bytes of the mask, stored under the ``'mask'`` key.

    Returns:
        The serialized example, as returned by ``SerializeToString()``.
    """
    def _bytes_feature(value):
        # Wrap a single bytes value in a one-element bytes_list feature
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    features = tf.train.Features(feature={
        'image': _bytes_feature(image),
        'mask': _bytes_feature(mask),
    })
    return tf.train.Example(features=features).SerializeToString()

In [13]:
def save_tfrecord(file_base_name, generator, max_size_mb=100):
    """Write (image, mask) pairs into numbered TFRecord files.

    Files are named ``{file_base_name}_{n}.tfrecords`` with ``n`` starting at 1.
    Once a file holds more than ``max_size_mb`` megabytes, the next record
    goes to a new file.

    Args:
        file_base_name: Path prefix of the output files.
        generator: Iterable of ``(image_path, mask_path)`` tuples.
        max_size_mb: Size, in MB (1e6 bytes), after which a new file is started.
    """
    # Uncompressed TFRecord framing adds 16 bytes per record:
    # 8 (length) + 4 (CRC of the length) + 4 (CRC of the data)
    record_overhead = 16
    max_bytes = max_size_mb * 1e6

    # TFRecordWriter does not create missing folders
    out_dir = os.path.dirname(file_base_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    counter = 1
    # Track the size ourselves: the writer buffers its output, so the size on
    # disk can lag behind what has actually been written.
    written = 0
    writer = tf.io.TFRecordWriter(f'{file_base_name}_{counter}.tfrecords')

    try:
        for im_path, mask_path in generator:
            # Rotate only when another record arrives, so that no empty
            # trailing file is left behind.
            if written > max_bytes:
                writer.close()
                counter += 1
                written = 0
                writer = tf.io.TFRecordWriter(f'{file_base_name}_{counter}.tfrecords')

            with open(im_path, 'rb') as image_file:
                image_string = image_file.read()
            with open(mask_path, 'rb') as mask_file:
                mask_string = mask_file.read()

            tf_example = serialize_example(image_string, mask_string)
            writer.write(tf_example)
            written += len(tf_example) + record_overhead
    finally:
        # Flush and close the current file even if a read or write failed
        writer.close()


In [14]:
def data_generator(im_id):
    """Yield ``(image_path, mask_path)`` for every saved patch of one sample.

    Args:
        im_id: Sample id used as the file-name prefix in ``data/processed``.

    Yields:
        Tuples with the path of a patch image and the path of its mask.
    """
    image_suffix = 'image.jpg'

    # "_*_" anchors the match to this exact id (a bare "*" would also match ids
    # that merely start with im_id); sorting makes the record order reproducible.
    image_paths = sorted(glob(f'data/processed/{im_id}_*_{image_suffix}'))

    for im_path in tqdm(image_paths):
        # The mask shares the image's name except for the trailing "image.jpg"
        mask_path = im_path[:-len(image_suffix)] + 'mask.jpg'

        yield im_path, mask_path

In [15]:
# Write the tfrecord files of every source image found in data/processed
for im_id in tqdm(file_list.id.unique()):
    save_tfrecord(f'data/tf_records/{im_id}', data_generator(im_id))

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]