<a href="https://colab.research.google.com/github/Angelvj/TFG/blob/main/code/generate_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
# Clone my repo.
!git clone -l -s https://github.com/Angelvj/TFG.git cloned-repo
# Change directory into code dir
%cd cloned-repo/code
# List repo contents
!ls

Cloning into 'cloned-repo'...
remote: Enumerating objects: 240, done.[K
remote: Counting objects: 100% (240/240), done.[K
remote: Compressing objects: 100% (184/184), done.[K
remote: Total 240 (delta 80), reused 61 (delta 15), pack-reused 0[K
Receiving objects: 100% (240/240), 3.06 MiB | 14.24 MiB/s, done.
Resolving deltas: 100% (80/80), done.
/content/cloned-repo/code/cloned-repo/code/cloned-repo/code/cloned-repo/code/cloned-repo/code
image_reading.py  main.ipynb


In [34]:
import numpy as np, os, glob
import tensorflow as tf
import nibabel as nib
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import csv
from image_reading import *

In [35]:
from google.colab import drive

# Mount Google Drive; every path below points inside the mounted drive.
drive.mount('/content/drive')

# CLASS SUBFOLDERS AND LABEL FOR EACH ONE
# LABELS maps a class sub-folder name to the integer label used for its images;
# CLASS_SUBPATHS is the same sub-folder list, in the same order.
LABELS = {'ppNOR': 0, 'ppAD': 1, 'ppMCI': 2}
CLASS_SUBPATHS = list(LABELS)

# PATHS TO IMAGES
PET_PATH = '/content/drive/My Drive/Machine learning/data/preprocessed/PET'
MRI_PATH = '/content/drive/My Drive/Machine learning/data/preprocessed/MRI'
# Sub-folders found under each MRI class folder.
MRI_SUBPATHS = ['GREY', 'WHITE']

# PATHS TO TFRECORDS
PET_TFREC_PATH = '/content/drive/My Drive/Machine learning/data/tfrecords/PET'
MRI_TFREC_PATH = '/content/drive/My Drive/Machine learning/data/tfrecords/MRI'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The feature-conversion helpers below are taken from [TensorFlow's TFRecord tutorial](https://www.tensorflow.org/tutorials/load_data/tfrecord?hl=en#data_types_for_tftrainexample).


In [36]:
# The following functions convert a value to a type compatible
# with tf.train.Example
def _bytes_feature(value):
  """Wrap a single string / bytes value in a bytes_list tf.train.Feature."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList won't unpack a string from an EagerTensor, so take its value first.
    value = value.numpy()
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap a single float / double value in a float_list tf.train.Feature."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint value in an int64_list tf.train.Feature."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def serialize_example(feature0, feature1, feature2):
    """
    Build a serialized tf.train.Example ready to be written to a TFRecord file.

    feature0 is stored under 'image' (bytes), feature1 under 'name' (bytes) and
    feature2 under 'label' (int64). Returns the serialized message.
    """
    # Convert each value to its tf.train.Example-compatible type first.
    image_feature = _bytes_feature(feature0)
    name_feature = _bytes_feature(feature1)  # TODO: not needed
    label_feature = _int64_feature(feature2)

    # Map feature names to the converted values and wrap them in a Features message.
    features = tf.train.Features(feature={
        'image': image_feature,
        'name': name_feature,
        'label': label_feature,
    })
    return tf.train.Example(features=features).SerializeToString()

# Functions for generating TFRecords

In [37]:
def generate_labeled_tfrecords(img_paths, img_labels, tfrecords_dir, tfrecords_labels, num_groups=10):
    """
    For each class, create num_groups tfrecords. The label of the images contained
    in each tfrecord is saved into the CSV file tfrecords_labels.

    Args:
        img_paths: sequence of image paths ('/'-separated strings).
        img_labels: sequence of labels, aligned with img_paths.
        tfrecords_dir: output directory; created if it does not exist.
        tfrecords_labels: path of the CSV file to write, with a
            'tfrecord_name,label' header and one row per tfrecord.
        num_groups: number of tfrecords written for each class.

    Output files are named 'tfrecord_l{label}_{n}.tfrec'.
    """
    # The boolean-mask indexing below needs ndarrays; plain lists are accepted too.
    img_paths = np.asarray(img_paths)
    img_labels = np.asarray(img_labels)

    os.makedirs(tfrecords_dir, exist_ok=True)

    # A single `with` keeps the CSV open for the whole run and closes it even if
    # loading or writing an image raises (the handle used to leak in that case).
    with open(tfrecords_labels, 'w', encoding='UTF8', newline='') as labels_file:
        csv_writer = csv.writer(labels_file)
        csv_writer.writerow(['tfrecord_name', 'label'])

        for label in np.unique(img_labels):

            imgs_current_label = img_paths[img_labels == label]
            # Only the held-out side of each fold is used: the folds partition this
            # class's images into num_groups groups.
            skfold = StratifiedKFold(n_splits = num_groups) # TODO: stratified not needed
            groups_generator = skfold.split(imgs_current_label, np.full(imgs_current_label.shape, label))

            for n, (_, indices) in enumerate(groups_generator):

                tfrecord_name = f'tfrecord_l{label}_{n}.tfrec'
                csv_writer.writerow([tfrecord_name, str(label)])

                with tf.io.TFRecordWriter(os.path.join(tfrecords_dir, tfrecord_name)) as writer:

                    for index in indices:
                        path = imgs_current_label[index]
                        # Replace NaN/inf values before storing the raw bytes.
                        img = np.nan_to_num(load_image(path), copy = False)
                        img_bytes = img.tobytes()
                        # File name only (last '/'-separated component of the path).
                        img_name = str.encode(path.split('/')[-1])
                        example = serialize_example(img_bytes, img_name, label)
                        writer.write(example)


def generate_tfrecords(img_paths, img_labels, tfrecords_dir, tfrecords_labels = None, num_samples=10):
    """
    Create tfrecords containing num_samples imgs each. If a file for saving tfrecord
    labels is passed, each tfrecord will contain only one image, and its label will
    be saved into that file.

    Args:
        img_paths: sequence of image paths ('/'-separated strings).
        img_labels: sequence of labels, aligned with img_paths.
        tfrecords_dir: output directory; created if it does not exist.
        tfrecords_labels: optional path of a CSV file ('tfrecord_name,label'
            header, one row per tfrecord). When given, num_samples is forced to 1.
        num_samples: images per tfrecord; the last tfrecord may hold fewer.

    Raises:
        ValueError: if num_samples is smaller than 1.

    Output files are named 'tfrecord_{j}.tfrec'.
    """
    if tfrecords_labels is not None:
        num_samples = 1
    if num_samples < 1:
        raise ValueError(f'num_samples must be >= 1, got {num_samples}')

    os.makedirs(tfrecords_dir, exist_ok=True)

    num_images = len(img_paths)
    # Ceiling division: a trailing partial group still gets its own tfrecord.
    num_tfrecords = -(-num_images // num_samples)

    # The labels CSV is optional. Previously the handle was closed unconditionally,
    # which raised NameError whenever no labels file had been requested.
    labels_file = None
    csv_writer = None
    if tfrecords_labels is not None:
        labels_file = open(tfrecords_labels, 'w', encoding='UTF8', newline='')

    try:
        if labels_file is not None:
            csv_writer = csv.writer(labels_file)
            csv_writer.writerow(['tfrecord_name', 'label'])

        for j in range(num_tfrecords):

            tfrec_name = f'tfrecord_{j}.tfrec'
            first = num_samples * j
            last = min(first + num_samples, num_images)

            with tf.io.TFRecordWriter(os.path.join(tfrecords_dir, tfrec_name)) as writer:

                for i in range(first, last):
                    path = img_paths[i]
                    label = img_labels[i]
                    # Replace NaN/inf values before storing the raw bytes.
                    img = np.nan_to_num(load_image(path), copy=False)
                    # File name only (last '/'-separated component of the path).
                    img_name = str.encode(path.split('/')[-1])
                    example = serialize_example(img.tobytes(), img_name, label)
                    writer.write(example)

                    if csv_writer is not None:
                        csv_writer.writerow([tfrec_name, label])
    finally:
        # Close the CSV even if loading or writing an image failed.
        if labels_file is not None:
            labels_file.close()

# Functions for splitting into train and test sets

In [38]:
def unison_shuffled(a, b):
    """ Apply one shared random permutation to two ndarrays of equal length """
    length = len(a)
    assert length == len(b)
    # A single permutation keeps a[i] and b[i] paired after the shuffle.
    order = np.random.permutation(length)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

def stratified_train_test_split(y, test_size = 0.2, shuffle = True, random_state = None):
    """
    Given the labels of a dataset, split it into train and test sets, maintaining
    the proportion of each class.

    Args:
        y: sequence of labels, one per sample.
        test_size: fraction of each class sent to the test set; the per-class
            count is int(class_size * test_size), i.e. rounded down.
        shuffle: whether to permute the samples before splitting.
        random_state: optional integer seed. When None (the default) the global
            NumPy random state is used, so np.random.seed() is honoured; when
            given, a private generator is used and the global state is untouched.

    Returns:
        (train_idx, test_idx): two integer arrays of indices into y. They are
        disjoint and together cover every sample.
    """
    y = np.asarray(y)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    initial_idx = np.arange(y.shape[0])

    if shuffle:
        # One shared permutation keeps labels and original positions aligned.
        perm = rng.permutation(len(y))
        y, initial_idx = y[perm], initial_idx[perm]

    # Start from empty integer arrays so the concatenations below also work when
    # y has no samples at all.
    train_parts = [np.zeros((0,), np.intp)]
    test_parts = [np.zeros((0,), np.intp)]

    for label in np.unique(y):
        idx = np.where(y == label)[0]
        test_idx_aux = rng.choice(idx, int(idx.shape[0] * test_size), replace=False)
        test_parts.append(test_idx_aux)
        # Everything of this class that was not drawn for test goes to train.
        train_parts.append(np.setdiff1d(idx, test_idx_aux, assume_unique=True))

    train_idx = np.concatenate(train_parts)
    test_idx = np.concatenate(test_parts)

    # Map positions in the (possibly shuffled) array back to the original indices.
    return initial_idx[train_idx], initial_idx[test_idx]

# Read filenames and split into train and test sets
Then write the train and test TFRecords (plus a labels CSV for each split) for the PET, MRI-GREY and MRI-WHITE images.

In [39]:
# The same read -> split -> write pipeline runs for PET, MRI-GREY and MRI-WHITE,
# so it is written once here instead of being copy-pasted per modality.

# Arbitrary fixed seed: stratified_train_test_split draws from the global NumPy
# random state, so seeding it makes the train/test split repeatable across runs.
SPLIT_SEED = 42


def _list_labeled_images(class_dir_for):
    """
    Collect every regular file of every class folder together with its label.

    class_dir_for maps a class sub-folder name (an entry of CLASS_SUBPATHS) to the
    directory holding that class's images. Returns (paths, labels) as ndarrays.
    """
    paths, labels = [], []
    for class_subpath in CLASS_SUBPATHS:
        class_dir = class_dir_for(class_subpath)
        # os.listdir order is arbitrary; sort it so the listing is deterministic.
        for filename in sorted(os.listdir(class_dir)):
            abs_path = os.path.join(class_dir, filename)
            if os.path.isfile(abs_path):
                paths.append(abs_path)
                labels.append(LABELS[class_subpath])
    # Build the arrays once instead of concatenating per file.
    return np.array(paths, dtype=str), np.array(labels, dtype=np.int64)


def _split_and_write(class_dir_for, tfrec_root):
    """Split one image collection 80/20 per class and write train/test tfrecords."""
    img_paths, img_labels = _list_labeled_images(class_dir_for)
    train_idx, test_idx = stratified_train_test_split(img_labels, test_size=0.2)

    for split_name, split_idx in (('train', train_idx), ('test', test_idx)):
        split_dir = tfrec_root + '/' + split_name
        generate_tfrecords(img_paths[split_idx], img_labels[split_idx],
                           split_dir, split_dir + '/tfrecords_labels.csv')


np.random.seed(SPLIT_SEED)

# ================= PET =================
_split_and_write(lambda class_subpath: os.path.join(PET_PATH, class_subpath),
                 PET_TFREC_PATH)

# ================= MRI (GREY, then WHITE) =================
for mri_subpath in MRI_SUBPATHS:
    # Bind mri_subpath as a default so the lambda does not depend on the loop variable.
    _split_and_write(
        lambda class_subpath, mri_subpath=mri_subpath: os.path.join(MRI_PATH, class_subpath, mri_subpath),
        MRI_TFREC_PATH + '/' + mri_subpath)