# Imports

In [1]:
import tensorflow as tf
import math
import cv2
import re
from tqdm.notebook import tqdm
from glob import glob
import os
import numpy as np

from Utils import make_dir

In [2]:
def _find_files_with_extension(base_path, extension):
    """Return every file below ``base_path`` (recursively) matching ``*<extension>``."""
    return [
        path for folder, _, _ in os.walk(base_path)
        for path in glob(os.path.join(folder, '*' + extension))
    ]


def _swap_extension(path, new_extension):
    """Return ``path`` with only its final extension replaced by ``new_extension``."""
    return os.path.splitext(path)[0] + new_extension


def convert_images_to_jpg(base_path='./COVID-19 Dataset/X-ray/', to_bw=False):
    """Normalise every image below ``base_path`` to a ``.jpeg`` file, in place.

    Three passes are made over the directory tree:

    1. ``*.jpg`` files are renamed to ``*.jpeg``.
    2. ``*.png`` files are re-encoded as ``*.jpeg`` and the PNG is removed.
    3. If ``to_bw`` is true, every ``*.jpeg`` is overwritten with its
       grayscale version.

    Args:
        base_path: Root directory that is walked recursively.
        to_bw: When true, convert all resulting ``.jpeg`` files to grayscale.

    Returns:
        None. The function only modifies files on disk.
    """
    # Rename jpg to jpeg for consistency
    jpg_files = _find_files_with_extension(base_path, '.jpg')
    if jpg_files:
        for file in tqdm(jpg_files, desc='Renaming .jpg to .jpeg'):
            # Only the trailing extension is swapped; str.replace would also
            # rewrite a '.jpg' that happens to appear in a directory name.
            os.rename(file, _swap_extension(file, '.jpeg'))

    # Convert png to jpeg
    png_files = _find_files_with_extension(base_path, '.png')
    if png_files:
        for file in tqdm(png_files, desc='Converting .png to .jpeg'):
            im = cv2.imread(file)
            if im is None:
                # Unreadable image: keep the original and carry on with the rest.
                print('Skipping unreadable image: {}'.format(file))
                continue
            # Delete the source only once the JPEG has actually been written,
            # otherwise a failed write would lose the image altogether.
            if cv2.imwrite(_swap_extension(file, '.jpeg'), im):
                os.remove(file)
            else:
                print('Could not write JPEG for {}; keeping the original.'.format(file))

    if to_bw:
        # Convert to bw
        files = _find_files_with_extension(base_path, '.jpeg')
        if files:
            for file in tqdm(files, desc='Converting images to b&w'):
                im = cv2.imread(file)
                if im is None:
                    print('Skipping unreadable image: {}'.format(file))
                    continue
                im = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
                cv2.imwrite(file, im)

In [58]:
def _float32_list(floats):
    """Wrap an iterable of floats in a ``tf.train.Feature`` holding a ``FloatList``."""
    float_list = tf.train.FloatList(value=floats)
    return tf.train.Feature(float_list=float_list)

def _int64_list(values):
    """Wrap an iterable of integers in a ``tf.train.Feature`` holding an ``Int64List``."""
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)

def to_tfrecord(tfrec_filewriter, image, label):
    """Build a ``tf.train.Example`` with an ``"image"`` and a ``"label"`` feature.

    Args:
        tfrec_filewriter: Accepted for call compatibility; not used here.
        image: Array-like exposing ``ravel()``; stored flattened as floats.
        label: Single integer label; stored as a one-element int64 list.

    Returns:
        The assembled ``tf.train.Example`` (not yet serialised).
    """
    image_feature = _float32_list(image.ravel())
    label_feature = _int64_list([label])
    features = tf.train.Features(feature={
        "image": image_feature,
        "label": label_feature,
    })
    return tf.train.Example(features=features)

def encode_image(file_name):
    """Load one JPEG and derive its class label from the folder it lives in.

    The label is the index of the folder pattern that fully matches
    ``file_name``: 0 for the ``Non-COVID`` pattern, 1 for the ``COVID`` pattern.

    Args:
        file_name: Path of the image file (string / scalar string tensor).

    Returns:
        A ``(image, image_label)`` tuple: the decoded image resized to
        224x224 and divided by 255.0, and the int32 class index.
    """
    # Raw strings: '\.' and '\/' are regex escapes, not Python escapes. The
    # runtime patterns are unchanged, but non-raw literals trigger Python's
    # invalid-escape-sequence warning.
    # NOTE(review): the patterns are tied to './COVID-19 Dataset/X-ray/';
    # paths under any other root match neither pattern, and argmax over an
    # all-False vector presumably yields 0 (Non-COVID) — confirm.
    class_folders = tf.constant([
        r'\.\/COVID-19 Dataset\/X-ray\/Non-COVID.+',
        r'\.\/COVID-19 Dataset\/X-ray\/COVID.+'
    ])
    matches = tf.map_fn(
        lambda pattern: tf.strings.regex_full_match(file_name, pattern),
        class_folders,
        fn_output_signature=tf.bool)
    image_label = tf.argmax(matches, output_type=tf.dtypes.int32)

    image = tf.io.read_file(file_name)
    # NOTE(review): no `channels` argument is given, so the channel count
    # presumably follows each file; mixed grayscale/colour inputs would then
    # produce differently shaped tensors — confirm before batching.
    image = tf.image.decode_jpeg(image)
    image = tf.image.resize(image, [224, 224]) / 255.0
    return image, image_label


def generate_tfrecord_files(tfrecords_path='./dataset/train/',
                            images_path='./COVID-19 Dataset/X-ray/',
                            images_per_file=512,
                            shuffle_seed=42):
    """Encode every ``*.jpeg`` one folder below ``images_path`` into TFRecord shards.

    Images are listed, passed through ``encode_image`` and grouped into
    batches of ``images_per_file``; each batch is written to its own
    ``NN-<images_per_file>.tfrecord`` file inside ``tfrecords_path``. Shards
    that already exist on disk are left untouched.

    Args:
        tfrecords_path: Output directory (created through ``make_dir``).
        images_path: Directory whose sub-folders contain the ``.jpeg`` images.
        images_per_file: Number of images per shard; the last shard may hold fewer.
        shuffle_seed: Seed for the file-order shuffle. A fixed seed keeps the
            image-to-shard assignment identical between runs, which the
            "skip existing shard" logic relies on when a run is resumed.
            Pass ``None`` for a different random order on every run.

    Returns:
        None. The shards are written to disk.
    """
    make_dir(tfrecords_path)

    images_path_pattern = images_path + '*/*.jpeg'
    found_images = len(tf.io.gfile.glob(images_path_pattern))
    print(
        'Pattern matches {} images which will be rewritten as {} TFRecord files containing ~{} images each.'
        .format(found_images, math.ceil(found_images / images_per_file),
                images_per_file))
    images = tf.data.Dataset.list_files(images_path_pattern,
                                        shuffle=True,
                                        seed=shuffle_seed)
    dataset = images.map(encode_image).batch(images_per_file)

    for file_number, (image, label) in enumerate(tqdm(dataset, desc='Generating TFRecords')):
        # NOTE(review): the name embeds the nominal images_per_file, not the
        # actual size of this batch, so the last (smaller) shard is labelled
        # with the full size as well.
        tfrecord_filename = os.path.join(
            tfrecords_path,
            "{:02d}-{}.tfrecord".format(file_number, images_per_file))

        if os.path.isfile(tfrecord_filename):
            continue

        # Convert the batch to NumPy once; doing it per image would copy the
        # whole batch again for every single record.
        image_batch = image.numpy()
        label_batch = label.numpy()
        with tf.io.TFRecordWriter(tfrecord_filename) as out_file:
            for single_image, single_label in zip(image_batch, label_batch):
                example = to_tfrecord(out_file, single_image, single_label)
                out_file.write(example.SerializeToString())

In [None]:
# convert_images_to_jpg() has to run before this cell: only '*/*.jpeg' files
# are picked up. generate_tfrecord_files() writes the shards to disk and
# returns nothing, so there is no result worth binding to a name.
generate_tfrecord_files()

Pattern matches 8380 images which will be rewritten as 17 TFRecord files containing ~512 images each.


HBox(children=(FloatProgress(value=0.0, description='Generating TFRecords', max=17.0, style=ProgressStyle(desc…

In [4]:
# Normalise every image under the dataset folder to a .jpeg file, in place.
# generate_tfrecord_files() only globs '*/*.jpeg', so on a fresh kernel this
# call has to run before the TFRecord generation cell.
convert_images_to_jpg()