In [1]:
import tensorflow as tf
import os 
import numpy as np
import cv2
from tqdm import trange
import random
import segmentation_models as sm
import albumentations as A
sm.set_framework('tf.keras')

Segmentation Models: using `tf.keras` framework.


# Augmentation

In [None]:
# if you want to augment your data you can put functions on here and pass it down on the dataset_split

# Parameters

In [16]:
# Dataset Constants
#DATASET_SPLIT = {"train": [4646, None], "val" : [995, None], "test" : [995, None]} # split_name: [amount_of_files, preprocessing_func] | you can leave amount of files None if you don't wan't to split
DATASET_SPLIT = {"train": 6486, "val" : 150} # split_name: [amount_of_files, preprocessing_func] | you can leave amount of files None if you don't wan't to split
DATASET_PATH = ".\\dataset"
TRAIN_DIR = "train"
VAL_DIR = "validation"
TEST_DIR = "test"

BACKBONE = 'efficientnetb3' # enter the preprocessing for model to here, leave none if don't want to the preprocessing

DATA_DIR = "data"
LABEL_DIR = "label"

IMG_EXT = "png"

OUT_PATH = "./outdata/tfrecord/"

CLASS_VALUES = [1, 2] # unlabelled 0, iskemik 1, hemorajik 2 so output : iskemik 0, hemorajik 1, bg 2
CLASS_NAMES = ["Iskemik", "Hemorajik", "Unlabelled"]

ENCODING_TYPE = "ZLIB" # zlib, gzip or none - encoding  increases preprocessing time but reduces size by HUGE AMOUNTS (about %96 percent) 

MAX_FILES = 600

# Helper Functions

In [22]:
def image_feature(value):
    """Returns a bytes_list from a string / byte."""
    return tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[serialize_array(value)])
    )

def bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def float_feature(value):
    """Returns a float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def int64_feature(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def int64_feature_list(value):
    """Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def float_feature_list(value):
    """Returns a list of float_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

# non keras
def serialize_array(array):
  array = tf.io.serialize_tensor(array).numpy()
  return array

def get_preprocessing(preprocessing_fn):
    """Construct preprocessing transform
    
    Args:
        preprocessing_fn (callbale): data normalization function 
            (can be specific for each pretrained neural network)
    Return:
        transform: albumentations.Compose
    
    """
    
    _transform = [
        A.Lambda(image=preprocessing_fn),
    ]
    return A.Compose(_transform)

In [23]:
def parse_single_image(image, label):
  
  #define the dictionary -- the structure -- of our single example
  data = {
        'image/raw_image' : image_feature(image),
        'label/raw' : image_feature(label)
    }
  #create an Example, wrapping the single features
  out = tf.train.Example(features=tf.train.Features(feature=data))
  return out

In [24]:
def generate_label_filenames_merge(img_filenames):
    label_filenames = []
    for i in img_filenames:
        current_dir = i.rsplit('\\')[-2]
        label_filenames.append(i.replace(current_dir, f"{current_dir}annot"))
    return label_filenames

def merge_paths(img_path, randomize=True, generate_labels=True):
    img_filenames = []
    for i in img_path:
        img_filenames.extend(tf.io.gfile.glob(f"{i}/*.{IMG_EXT}"))
    if randomize:
        random.shuffle(img_filenames)
    if generate_labels:
        return img_filenames, generate_label_filenames_merge(img_filenames)
    else:
        return img_filenames

def generate_label_filenames(img_filenames, img_path, label_path):
    label_filenames = []
    for i in img_filenames:
        label_filenames.append(i.replace(img_path, label_path))
    return label_filenames

def get_file_paths(img_path, label_path, randomize=True, generate_labels=True):
    img_filenames = []
    img_filenames = tf.io.gfile.glob(f"{img_path}/*.{IMG_EXT}")
    if randomize:
        random.shuffle(img_filenames)
    if generate_labels:
        label_filenames = []
        for i in img_filenames:
            label_filenames.append(i.replace(img_path, label_path))
        return img_filenames, label_filenames
    else:
        return img_filenames

# Main Processing Function

In [18]:
def write_image_batches_to_tfr(img_filenames, label_filenames, filename:str="batch", max_files:int=100, out_dir:str="/data/tfrecord/", augmentation=None, preprocessing=None):
    double_labelled_num = 0
    num_of_files = []
    assert len(img_filenames) == len(label_filenames)
    for i in range(len(CLASS_VALUES) + 1):
        num_of_files.append(0)
    # determine the number of shards (single TFRecord files) we need:
    splits = (len(img_filenames)//max_files) + 1
    if len(img_filenames)%max_files == 0:
        splits-=1
    print(f"\nUsing {splits} shard(s) for {len(img_filenames)} files, with up to {max_files} samples per shard")
    os.makedirs(out_dir, exist_ok=True)
    file_count = 0
    for i in trange(splits):
        current_shard_name = f"{out_dir}tfrecord_{i+1}in{splits}_{filename}.tfrecords"
        if ENCODING_TYPE is not None:
            options = tf.io.TFRecordOptions(compression_type=ENCODING_TYPE)
            writer = tf.io.TFRecordWriter(current_shard_name, options=options)
        else:
            writer = tf.io.TFRecordWriter(current_shard_name)

        current_shard_count = 0
        while current_shard_count < max_files: #as long as our shard is not full
            #get the index of the file that we want to parse now
            index = i*max_files+current_shard_count
            if index == len(img_filenames): #when we have consumed the whole data, preempt generation
                break
            
            #img = None
            #with open(img_filenames[index], 'rb') as file_reader:
            #    img = file_reader.read()
            img = cv2.imread(img_filenames[index])
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            mask = cv2.imread(label_filenames[index], 0)
            masks = [(mask == v) for v in CLASS_VALUES]
            unlabelled = True

            for count, d in enumerate(masks):
                if np.any(d):
                    if count == 1:
                        if not unlabelled:
                            double_labelled_num += 1
                    unlabelled = False
                    num_of_files[count] += 1        
            
            if unlabelled:
                num_of_files[-1] += 1

            mask = np.stack(masks, axis=-1).astype('float32')
            # add background if mask is not binary
            if mask.shape[-1] != 1:
                background = 1 - mask.sum(axis=-1, keepdims=True)
                mask = np.concatenate((mask, background), axis=-1)
                
            #if augmentation is not None:
            #    sample = augmentation()(image=img, mask=mask)
            #    img, mask = sample['image'], sample['mask']

            #if preprocessing is not None:
            #    sample = preprocessing(image=img, mask=mask)
            #    img, mask = sample['image'], sample['mask']

            #create the required Example representation
            out = parse_single_image(image=img, label=mask)
            
            writer.write(out.SerializeToString())
            current_shard_count+=1
            file_count += 1
        writer.close()
    print(f"\nWrote {file_count} elements to TFRecord")
    for count, i in enumerate(num_of_files):
        print(f"{i} files for class {CLASS_NAMES[count]}")
    print(f"{double_labelled_num} files for class double")

In [25]:
print(f"Starting the process.")
img_path = os.path.join(DATASET_PATH, DATA_DIR)
label_path = os.path.join(DATASET_PATH, LABEL_DIR)

img_filenames, label_filenames = get_file_paths(img_path, label_path)

last_index = 0
print(f"Info: Total amount of files is {len(img_filenames)}")
for split in DATASET_SPLIT:
    file_amount = DATASET_SPLIT[split]
    print(f"Info: Starting to process split **{split}** with {file_amount} files")
    split_img_files = img_filenames[last_index:last_index+file_amount]
    split_label_files = label_filenames[last_index:last_index+file_amount]
    write_image_batches_to_tfr(split_img_files, split_label_files, filename=split, max_files=MAX_FILES, out_dir=OUT_PATH, augmentation=None, preprocessing=None)
    last_index = last_index+file_amount
print(f"Info: {len(img_filenames) - last_index} left over files")
#write_image_batches_to_tfr(split_img, split_label, filename="teknofest", max_files=MAX_FILES, out_dir=OUT_PATH)

Starting the process.
Info: Total amount of files is 6636
Info: Starting to process split **train** with 6486 files

Using 11 shard(s) for 6486 files, with up to 600 samples per shard


100%|██████████| 11/11 [10:17<00:00, 56.09s/it]



Wrote 6486 elements to TFRecord
1066 files for class Iskemik
1103 files for class Hemorajik
4317 files for class Unlabelled
0 files for class double
Info: Starting to process split **val** with 150 files

Using 1 shard(s) for 150 files, with up to 600 samples per shard


100%|██████████| 1/1 [00:12<00:00, 12.84s/it]


Wrote 150 elements to TFRecord
27 files for class Iskemik
27 files for class Hemorajik
96 files for class Unlabelled
0 files for class double
Info: 0 left over files





# Make tf record files with train, val and test splitting

In [40]:
split_imgs = []
last_index = 0
for split in DATASET_SPLIT:
    split_imgs.append(os.path.join(DATASET_PATH, split))
split_imgs = merge_paths(split_imgs, randomize=False, generate_labels=False)
print(split_imgs[1861])
random.shuffle()
print(f"Info: Total amount of files is {len(split_imgs)}")
for split in DATASET_SPLIT:
    split_data = DATASET_SPLIT[split]
    file_amount = split_data[0]
    print(f"Info: Starting to process split **{split}** with {file_amount} files")
    split_img_files = split_imgs[last_index:last_index+file_amount]

    split_label_files = split_labels[last_index:last_index+file_amount]
    write_image_batches_to_tfr(split_img_files, split_label_files, filename=split, max_files=MAX_FILES, out_dir=OUT_PATH, augmentation=split_data[1](), preprocessing=get_preprocessing(sm.get_preprocessing(BACKBONE)))
print(f"Info: {len(split_imgs) - last_index} left over files")

.\data\dataset1\train\12459.png


TypeError: shuffle() missing 1 required positional argument: 'x'

1


In [10]:
split_imgs = []
for split in DATASET_SPLIT:
    split_imgs.append(os.path.join(DATASET_PATH, split))
split_imgs, split_labels = merge_paths(split_imgs)
write_image_batches_to_tfr(split_imgs, split_labels, filename='teknofest', max_files=MAX_FILES, out_dir=OUT_PATH, preprocessing=get_preprocessing(sm.get_preprocessing(BACKBONE)))


Using 34 shard(s) for 6636 files, with up to 200 samples per shard


100%|██████████| 34/34 [16:57<00:00, 29.92s/it]


Wrote 6636 elements to TFRecord
1093 files for class Iskemik
1130 files for class Hemorajik
4413 files for class Unlabelled
0 files for class double





# Make tf record files without train, val and test splitting

In [64]:
print(f"Starting the process.")
img_path = os.path.join(DATASET_PATH, DATA_DIR)
train_img_path = os.path.join(DATASET_PATH, 'train')
train_label_path = os.path.join(DATASET_PATH, 'trainannot')
label_path = os.path.join(DATASET_PATH, LABEL_DIR)

img_filenames = tf.io.gfile.glob(f"{train_img_path}/*.{IMG_EXT}") # first get train exceptions
label_filenames = generate_label_filenames(img_filenames, train_img_path, train_label_path)
mixed_img, mixed_label = get_file_paths(img_path, label_path)
img_filenames.extend(mixed_img)
label_filenames.extend(mixed_label)

last_index = 0
print(f"Info: Total amount of files is {len(img_filenames)}")
for split in DATASET_SPLIT:
    split_data = DATASET_SPLIT[split]
    file_amount = split_data[0]
    print(f"Info: Starting to process split **{split}** with {file_amount} files")
    split_img_files = img_filenames[last_index:last_index+file_amount]
    split_label_files = label_filenames[last_index:last_index+file_amount]
    write_image_batches_to_tfr(split_img_files, split_label_files, filename=split, max_files=MAX_FILES, out_dir=OUT_PATH, augmentation=split_data[1], preprocessing=get_preprocessing(sm.get_preprocessing(BACKBONE)))
    last_index = last_index+file_amount
print(f"Info: {len(split_imgs) - last_index} left over files")
#write_image_batches_to_tfr(split_img, split_label, filename="teknofest", max_files=MAX_FILES, out_dir=OUT_PATH)

Starting the process.
Info: Total amount of files is 8497
Info: Starting to process split **train** with 4646 files

Using 24 shard(s) for 4646 files, with up to 200 samples per shard


 12%|█▎        | 3/24 [01:44<12:14, 34.97s/it]


KeyboardInterrupt: 