In [1]:
import tensorflow as tf
import os 
import numpy as np
import cv2
from tqdm import trange
import random
import albumentations as A

# Parameters

In [9]:
# Dataset constants
DATASET_SPLIT = {"train": 6486, "val" : 150} # split_name: amount_of_files; the splits are cut one after another from the shuffled file list
NORMAL_SPLIT = {"train": 3310, "val" : 1103} # split_name: amount_of_files | NOTE(review): not referenced by the visible code
ANORMAL_SPLIT = {"train": 1668 , "val" : 555} # same layout as NORMAL_SPLIT | NOTE(review): not referenced by the visible code

# Root folder of the dataset and the two class sub-folders inside it.
# Images found in NORMAL_DIR get label 0, images found in ANORMAL_DIR get label 1.
DATASET_PATH = "./data"
NORMAL_DIR = "inme_yok"
ANORMAL_DIR = "inme_var"

# Class names and label values; used to report per-class file counts after writing.
CLASS_NAMES = ['inme_yok', 'inme_var']
CLASSES = [0, 1]
BACKBONE = 'efficientnetb3' # presumably the backbone whose preprocessing function should be used | NOTE(review): not referenced by the visible code

# NOTE(review): DATA_DIR and LABEL_DIR are not referenced by the visible code.
DATA_DIR = "data"
LABEL_DIR = "label"

# File extension (without the dot) used when globbing for images.
IMG_EXT = "png"

# Directory the TFRecord shards are written to.
OUT_PATH = "./outdata/tfrecord/"

ENCODING_TYPE = "ZLIB" # "ZLIB", "GZIP" or None (no compression); compression increases preprocessing time but greatly reduces file size (about 96% according to the original note)

# Maximum number of samples stored in a single TFRecord shard.
MAX_FILES = 600

# Helper Functions

In [4]:
def image_feature(value):
    """Wrap an array/tensor in a bytes Feature.

    The value is first serialized with ``serialize_array``; the resulting
    byte string becomes the single entry of the Feature's ``bytes_list``.
    """
    serialized = serialize_array(value)
    payload = tf.train.BytesList(value=[serialized])
    return tf.train.Feature(bytes_list=payload)

def bytes_feature(value):
    """Return a Feature whose bytes_list holds ``value`` as its only entry.

    A value of the same type as ``tf.constant(0)`` is converted with
    ``.numpy()`` before being stored; anything else is stored as given.
    """
    eager_tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    payload = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=payload)


def float_feature(value):
    """Return a Feature whose float_list holds the single number ``value``."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)


def int64_feature(value):
    """Return a Feature whose int64_list holds the single value ``value``."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

def int64_feature_list(value):
    """Return a Feature whose int64_list holds every element of ``value``.

    Unlike ``int64_feature``, ``value`` is passed through as the whole list
    instead of being wrapped as a single entry.
    """
    payload = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=payload)

def float_feature_list(value):
    """Return a Feature whose float_list holds every element of ``value``.

    Unlike ``float_feature``, ``value`` is passed through as the whole list
    instead of being wrapped as a single entry.
    """
    payload = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=payload)

# non keras
def serialize_array(array):
    """Serialize ``array`` with ``tf.io.serialize_tensor`` and return the result's ``.numpy()`` value."""
    return tf.io.serialize_tensor(array).numpy()

def get_preprocessing(preprocessing_fn):
    """Build an albumentations pipeline that applies ``preprocessing_fn`` to images.

    Args:
        preprocessing_fn (callable): data normalization function
            (can be specific for each pretrained neural network)
    Returns:
        albumentations.Compose containing a single ``A.Lambda`` step whose
        ``image`` handler is ``preprocessing_fn``.
    """
    return A.Compose([A.Lambda(image=preprocessing_fn)])

In [5]:
def parse_single_image(image, label):
    """Pack one image and its label into a ``tf.train.Example``.

    The Example has two features: ``'image'`` (built by ``image_feature``)
    and ``'label'`` (built by ``float_feature``, i.e. stored in a float list).
    """
    # The feature dictionary defines the structure of a single example.
    features = tf.train.Features(
        feature={
            'image': image_feature(image),
            'label': float_feature(label),
        }
    )
    return tf.train.Example(features=features)

In [6]:
def get_file_paths(img_normal_path, img_anormal_path, randomize=True):
    """Collect image paths from both class folders together with their labels.

    Files matching ``*.{IMG_EXT}`` in ``img_normal_path`` are labelled 0 and
    those in ``img_anormal_path`` are labelled 1.

    Args:
        img_normal_path: folder holding the label-0 images.
        img_anormal_path: folder holding the label-1 images.
        randomize: when true, the combined list is shuffled in place with
            ``random.shuffle`` before being returned.

    Returns:
        list of ``[path, label]`` entries (label-0 entries first, then
        label-1 entries, unless shuffled).
    """
    normal_entries = [
        [path, 0] for path in tf.io.gfile.glob(f"{img_normal_path}/*.{IMG_EXT}")
    ]
    anormal_entries = [
        [path, 1] for path in tf.io.gfile.glob(f"{img_anormal_path}/*.{IMG_EXT}")
    ]
    labelled_paths = normal_entries + anormal_entries

    if randomize:
        random.shuffle(labelled_paths)

    return labelled_paths

# Main Processing Function

In [7]:
def write_image_batches_to_tfr(img_w_labels, filename:str="batch", max_files:int=100, out_dir:str="/data/tfrecord/", augmentation=None, preprocessing=None):
    """Write ``[image_path, label]`` entries into sharded TFRecord files.

    Every image is read with OpenCV, converted from BGR to RGB, wrapped in a
    ``tf.train.Example`` by ``parse_single_image`` and appended to the current
    shard. Shards are named ``tfrecord_{i}in{total}_{filename}.tfrecords`` and
    are placed inside ``out_dir``. They are compressed according to the
    module-level ``ENCODING_TYPE`` (no compression options are passed when it
    is None). A summary with the total and per-class counts is printed at the
    end.

    Args:
        img_w_labels: sequence of ``[image_path, label]`` entries.
        filename: tag embedded in every shard file name.
        max_files: maximum number of samples per shard; must be at least 1.
        out_dir: output directory; created if it does not exist. A trailing
            path separator is optional.
        augmentation: currently not applied; accepted so existing callers
            keep working.
        preprocessing: currently not applied; accepted so existing callers
            keep working.

    Raises:
        ValueError: if ``max_files`` is smaller than 1.
        OSError: if an image cannot be read by ``cv2.imread``.
    """
    if max_files < 1:
        raise ValueError(f"max_files must be at least 1, got {max_files}")

    total = len(img_w_labels)
    # one counter per entry of CLASSES, in the same order
    num_of_files = [0] * len(CLASSES)
    # number of shards (single TFRecord files) we need: ceil(total / max_files)
    splits = -(-total // max_files)
    print(f"\nUsing {splits} shard(s) for {total} files, with up to {max_files} samples per shard")
    os.makedirs(out_dir, exist_ok=True)

    # options=None makes TFRecordWriter fall back to its uncompressed default
    options = None
    if ENCODING_TYPE is not None:
        options = tf.io.TFRecordOptions(compression_type=ENCODING_TYPE)

    file_count = 0
    for i in trange(splits):
        # os.path.join keeps the shard inside out_dir whether or not out_dir
        # ends with a path separator
        current_shard_name = os.path.join(out_dir, f"tfrecord_{i+1}in{splits}_{filename}.tfrecords")
        shard_start = i * max_files
        shard_items = img_w_labels[shard_start:shard_start + max_files]

        # the context manager closes the shard even if an image fails mid-way
        with tf.io.TFRecordWriter(current_shard_name, options=options) as writer:
            for item in shard_items:
                img_path, label = item[0], item[1]

                img = cv2.imread(img_path)
                if img is None:
                    # cv2.imread signals failure by returning None instead of raising
                    raise OSError(f"Could not read image file: {img_path}")
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

                for counter, value in enumerate(CLASSES):
                    if label == value:
                        num_of_files[counter] += 1

                # create the required Example representation
                out = parse_single_image(image=img, label=label)

                writer.write(out.SerializeToString())
                file_count += 1
    print(f"\nWrote {file_count} elements to TFRecord")
    for count, i in enumerate(num_of_files):
        print(f"{i} files for class {CLASS_NAMES[count]}")

In [10]:
# Build the shuffled [path, label] list once, then cut it into consecutive
# slices, one per entry of DATASET_SPLIT, and write each slice as TFRecords.
print("Starting the process.")
normal_img_path = os.path.join(DATASET_PATH, NORMAL_DIR)
anormal_img_path = os.path.join(DATASET_PATH, ANORMAL_DIR)

img_filenames_w_labels = get_file_paths(normal_img_path, anormal_img_path, randomize=True)
total_files = len(img_filenames_w_labels)
last_index = 0
print(f"Info: Total amount of files is {total_files}")
for split, file_amount in DATASET_SPLIT.items():
    if file_amount is None:
        # a split size of None means "take every file that is still left"
        split_data = img_filenames_w_labels[last_index:]
    else:
        split_data = img_filenames_w_labels[last_index:last_index+file_amount]
        if len(split_data) < file_amount:
            print(f"Warning: split **{split}** requested {file_amount} files but only {len(split_data)} are available")
    print(f"Info: Starting to process split **{split}** with {len(split_data)} files")
    write_image_batches_to_tfr(split_data, filename=split, max_files=MAX_FILES, out_dir=OUT_PATH, augmentation=None, preprocessing=None)
    # advance by what was actually consumed so the left-over count below can
    # never go negative when a split asks for more files than exist
    last_index += len(split_data)
print(f"Info: {total_files - last_index} left over files")

Starting the process.
Info: Total amount of files is 6636
Info: Starting to process split **train** with 6486 files

Using 11 shard(s) for 6486 files, with up to 600 samples per shard


  0%|          | 0/11 [00:00<?, ?it/s]2021-09-21 15:32:14.667315: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-21 15:32:14.777363: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-21 15:32:14.779149: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-21 15:32:14.787086: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  FMA
To enable them in other operations, rebuild TensorFlow wi


Wrote 6486 elements to TFRecord
4310 files for class inme_yok
2176 files for class inme_var
Info: Starting to process split **val** with 150 files

Using 1 shard(s) for 150 files, with up to 600 samples per shard


100%|██████████| 1/1 [00:05<00:00,  5.20s/it]


Wrote 150 elements to TFRecord
103 files for class inme_yok
47 files for class inme_var
Info: 0 left over files



