<a href="https://colab.research.google.com/github/Angelvj/TFG/blob/main/code/generate_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the project repository into ./cloned-repo so that the helper module
# image_reading.py (imported in the next cell) is available locally.
# NOTE(review): -l/-s are git's local-clone options; they presumably have no
# effect for an https URL -- confirm and drop if so.
!git clone -l -s https://github.com/Angelvj/TFG.git cloned-repo
# Make the repo's code directory the working directory, so image_reading.py
# can be imported from it.
%cd cloned-repo/code
# List the directory contents as a quick check that the clone succeeded.
!ls

Cloning into 'cloned-repo'...
remote: Enumerating objects: 256, done.[K
remote: Counting objects: 100% (256/256), done.[K
remote: Compressing objects: 100% (200/200), done.[K
remote: Total 256 (delta 87), reused 61 (delta 15), pack-reused 0[K
Receiving objects: 100% (256/256), 3.07 MiB | 17.35 MiB/s, done.
Resolving deltas: 100% (87/87), done.
/content/cloned-repo/code
generate_tfrecords.ipynb  image_reading.py  main.ipynb


In [2]:
import numpy as np, os
import tensorflow as tf
import nibabel as nib
from sklearn.model_selection import StratifiedKFold
import csv
from image_reading import *

Most of the following code is adapted from the [TFRecord and tf.train.Example tutorial](https://www.tensorflow.org/tutorials/load_data/tfrecord?hl=en#data_types_for_tftrainexample) in TensorFlow's documentation.


In [3]:
# The following functions convert a value to a type compatible
# with tf.train.Example
def _bytes_feature(value):
  """Wrap a single string / byte value in a tf.train.Feature (bytes_list)."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList cannot unpack a string held by an EagerTensor, so pull out the
  # underlying value first.
  raw = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a single float / double in a tf.train.Feature (float_list)."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _float_list_feature(value):
  """Wrap a sequence of floats / doubles in a tf.train.Feature (float_list)."""
  float_list = tf.train.FloatList(value=value)
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint in a tf.train.Feature (int64_list)."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

def _int64_list_feature(value):
  """Wrap a sequence of bool / enum / int / uint in a tf.train.Feature (int64_list)."""
  int64_list = tf.train.Int64List(value=value)
  return tf.train.Feature(int64_list=int64_list)

def serialize_example(image, shape, name, label):
    """
    Build a tf.train.Example message from one image record and return it
    serialized, ready to be written to a file.

    image -> float list feature, shape -> int64 list feature,
    name -> bytes feature, label -> int64 feature.
    """
    # Convert each field to its tf.train.Example-compatible data type.
    image_feature = _float_list_feature(image)
    shape_feature = _int64_list_feature(shape)
    name_feature = _bytes_feature(name) #TODO: not needed
    label_feature = _int64_feature(label)

    # Map feature names to the converted values and wrap them in a Features message.
    features = tf.train.Features(feature={
        'image': image_feature,
        'shape': shape_feature,
        'name': name_feature,
        'label': label_feature,
    })
    return tf.train.Example(features=features).SerializeToString()

# Functions for generating TFRecords

In [4]:
def generate_labeled_tfrecords(img_paths, img_labels, tfrecords_dir, tfrecords_labels, num_groups=10):
    """
    For each class, create num_groups tfrecords. The label of the images
    contained in each tfrecord is saved into the CSV file tfrecords_labels.

    Parameters
    ----------
    img_paths : array-like of str
        Paths of the images to serialize.
    img_labels : array-like of int
        Label of each image, aligned with img_paths.
    tfrecords_dir : str
        Output directory for the .tfrec files (created if missing).
    tfrecords_labels : str
        Path of the CSV file to (over)write with one
        ('tfrecord_name', 'label') row per generated tfrecord.
    num_groups : int
        Number of tfrecords generated per class.
    """
    # Boolean-mask indexing below needs ndarrays; accept plain sequences too.
    img_paths = np.asarray(img_paths)
    img_labels = np.asarray(img_labels)

    os.makedirs(tfrecords_dir, exist_ok=True)

    # A single `with` block keeps the CSV open for the whole run and guarantees
    # it is closed even if loading or writing an image raises.
    with open(tfrecords_labels, 'w', encoding='UTF8', newline='') as labels_file:
        csv_writer = csv.writer(labels_file)
        csv_writer.writerow(['tfrecord_name', 'label'])

        for label in np.unique(img_labels):

            imgs_current_label = img_paths[img_labels == label]
            # Only the test-fold indices are used: they partition the images of
            # this class into num_groups groups.
            skfold = StratifiedKFold(n_splits = num_groups) # TODO: stratified not needed
            groups_generator = skfold.split(imgs_current_label, np.full(imgs_current_label.shape, label))

            for n, (_, indices) in enumerate(groups_generator):

                tfrecord_name = f'tfrecord_l{label}_{n}.tfrec'
                csv_writer.writerow([tfrecord_name, str(label)])

                with tf.io.TFRecordWriter(os.path.join(tfrecords_dir, tfrecord_name)) as writer:

                    for index in indices:
                        path = imgs_current_label[index]
                        # Replace NaNs before serializing the image.
                        img = np.nan_to_num(load_image(path), copy=False)
                        img_name = str.encode(path.split('/')[-1])
                        example = serialize_example(img.ravel(), img.shape, img_name, label)
                        writer.write(example)


def generate_tfrecords(img_paths, img_labels, tfrecords_dir, tfrecords_labels = None, num_samples=10):
    """
    Create tfrecords containing num_samples imgs each. If a file for saving
    tfrecord labels is passed, each tfrecord will contain only one image, and
    its label will be saved into that file.

    Parameters
    ----------
    img_paths : sequence of str
        Paths of the images to serialize.
    img_labels : sequence of int
        Label of each image, aligned with img_paths.
    tfrecords_dir : str
        Output directory for the .tfrec files (created if missing).
    tfrecords_labels : str or None
        Optional path of a CSV file to (over)write with one
        ('tfrecord_name', 'label') row per tfrecord. When given, num_samples
        is forced to 1.
    num_samples : int
        Maximum number of images per tfrecord (the last one may hold fewer).

    Raises
    ------
    ValueError
        If num_samples is smaller than 1.
    """
    os.makedirs(tfrecords_dir, exist_ok=True)

    labels_file = None
    csv_writer = None
    if tfrecords_labels is not None:
        # One image per tfrecord, so each CSV row maps a file to a single label.
        num_samples = 1
        labels_file = open(tfrecords_labels, 'w', encoding='UTF8', newline='')
        csv_writer = csv.writer(labels_file)

    try:
        if num_samples < 1:
            raise ValueError('num_samples must be >= 1')

        if csv_writer is not None:
            csv_writer.writerow(['tfrecord_name', 'label'])

        num_images = len(img_paths)
        # Ceiling division: a trailing partial group gets its own tfrecord.
        num_tfrecords = -(-num_images // num_samples)

        for j in range(num_tfrecords):

            tfrec_name = f'tfrecord_{j}.tfrec'
            first = num_samples * j
            last = min(first + num_samples, num_images)

            with tf.io.TFRecordWriter(os.path.join(tfrecords_dir, tfrec_name)) as writer:

                for i in range(first, last):
                    path = img_paths[i]
                    label = img_labels[i]
                    # Replace NaNs before serializing the image.
                    img = np.nan_to_num(load_image(path), copy=False)
                    img_name = str.encode(path.split('/')[-1])
                    example = serialize_example(img.ravel(), img.shape, img_name, label)
                    writer.write(example)

                    # Only write label rows when a labels file was requested;
                    # previously this was keyed on num_samples == 1 and crashed
                    # when no labels file had been opened.
                    if csv_writer is not None:
                        csv_writer.writerow([tfrec_name, label])
    finally:
        # Close the CSV only if it was opened; the old unconditional close
        # raised NameError whenever tfrecords_labels was None.
        if labels_file is not None:
            labels_file.close()

# Functions for splitting into train and test sets

In [5]:
def unison_shuffled(a, b):
    """ Return both ndarrays reordered by one shared random permutation """
    n_items = len(a)
    assert n_items == len(b)
    order = np.random.permutation(n_items)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

def stratified_train_test_split(y, test_size = 0.2, shuffle = True, random_state = None):
    """
    Given the labels of a dataset, split it into train and test sets,
    maintaining the proportion of each class (returns indices, not data).

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Label of each sample.
    test_size : float
        Fraction of every class assigned to the test set. The per-class test
        count is int(n_class * test_size), i.e. rounded down.
    shuffle : bool
        Whether to shuffle the samples before drawing the per-class test sets.
    random_state : int or None
        Seed for a private np.random.RandomState, making the split
        reproducible. None (default) draws from NumPy's global random state,
        exactly as before this parameter existed.

    Returns
    -------
    (train_idx, test_idx) : tuple of integer ndarrays
        Positions into the original `y`; the two sets are disjoint and together
        cover every sample.
    """
    y = np.asarray(y)

    # np.random (module) and RandomState expose the same permutation/choice API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    initial_idx = np.arange(y.shape[0])

    if shuffle:
        # One permutation applied to labels and positions keeps them aligned.
        perm = rng.permutation(y.shape[0])
        y, initial_idx = y[perm], initial_idx[perm]

    # Seed both lists with an empty integer array so the concatenations below
    # are valid (and integer-typed) even when there are no labels at all.
    empty = np.zeros((0,), dtype=np.intp)
    train_parts, test_parts = [empty], [empty]

    for label in np.unique(y):
        idx = np.where(y == label)[0]
        test_part = rng.choice(idx, int(idx.shape[0]*test_size), replace=False)
        test_parts.append(test_part)
        # Whatever was not drawn for test goes to train.
        train_parts.append(np.setdiff1d(idx, test_part, assume_unique=True))

    train_idx = np.concatenate(train_parts)
    test_idx = np.concatenate(test_parts)

    # Map positions in the (possibly shuffled) order back to original positions.
    return initial_idx[train_idx], initial_idx[test_idx]

# Initial variables about data organization

In [6]:
# Mount Google Drive at /content/drive; every data path below lives under
# this mount point, so this cell only works inside Colab.
from google.colab import drive
drive.mount('/content/drive')

# ============== PATH TO IMAGES ==============

# Root data folder (inside the mounted Drive)
ROOT = '/content/drive/My Drive/Machine learning/data'

# Path to brain images, split into preprocessed and raw versions
BRAIN = ROOT + '/alzheimer_pet&mri'
BRAIN_PREPROCESSED = BRAIN + '/preprocessed'
BRAIN_RAW = BRAIN + '/raw'

# One folder per imaging modality (PET / MRI) under each version
PET_PREPROCESSED = BRAIN_PREPROCESSED + '/PET'
MRI_PREPROCESSED= BRAIN_PREPROCESSED + '/MRI'
PET_RAW = BRAIN_RAW + '/PET'
MRI_RAW = BRAIN_RAW + '/MRI'

# Preprocessed MRI subtypes (divided into grey and white matter); used as a
# subfolder name under each class folder when listing the MRI images.
MRI_MATTER = ['GREY', 'WHITE']

# Class subfolders and the integer label assigned to each one
CLASSES = ['ppNOR', 'ppAD', 'ppMCI'] 
LABELS = {'ppNOR': 0, 'ppAD': 1, 'ppMCI': 2}

# ============== PATH TO TFRECORDS ==============

# Output folders for the generated tfrecords, one per modality and version
PET_PREPROCESSED_TFREC = PET_PREPROCESSED + '/tfrecords'
MRI_PREPROCESSED_TFREC = MRI_PREPROCESSED + '/tfrecords'
PET_RAW_TFREC = PET_RAW + '/tfrecords'
MRI_RAW_TFREC = MRI_RAW + '/tfrecords'

Mounted at /content/drive


# Read images, split into train and test sets
Then write each split to TFRecord files.

In [7]:
# Seed for the train/test partition, so re-running the notebook reproduces
# the same split instead of a new random one.
SPLIT_SEED = 42


def _list_labeled_images(dir_label_pairs):
    """
    List the image files found directly inside each directory.

    dir_label_pairs is an iterable of (directory, label) tuples. Returns
    (paths, labels) as aligned ndarrays sorted by path, so that two modalities
    listed the same way share the same patient order.
    """
    paths, labels = [], []
    for directory, label in dir_label_pairs:
        for entry in os.listdir(directory):
            full_path = os.path.join(directory, entry)
            if os.path.isfile(full_path):
                paths.append(full_path)
                labels.append(label)

    # Build the arrays once instead of concatenating inside the loop.
    paths = np.array(paths, dtype=str)
    labels = np.array(labels, dtype=np.int64)
    order = np.argsort(paths)
    return paths[order], labels[order]


# ================= create PREPROCESSED PET tfrecords =================
# Sorted by path so the patient order matches the MRI listing below.
img_paths, img_labels = _list_labeled_images(
    [(os.path.join(PET_PREPROCESSED, c), LABELS[c]) for c in CLASSES])

# IMPORTANT: we want to make an ensemble classifier with PET and MRI images,
# so this same partition must also be used for MRI; otherwise we would be
# leaking data between train and test.
np.random.seed(SPLIT_SEED)
train_idx, test_idx = stratified_train_test_split(img_labels, test_size=0.2)

X_train, y_train = img_paths[train_idx], img_labels[train_idx]
X_test, y_test = img_paths[test_idx], img_labels[test_idx]

# Generate train and test tfrecords
generate_tfrecords(X_train, y_train, PET_PREPROCESSED_TFREC + '/train', PET_PREPROCESSED_TFREC + "/train/tfrecords_labels.csv")
generate_tfrecords(X_test, y_test, PET_PREPROCESSED_TFREC + '/test', PET_PREPROCESSED_TFREC + "/test/tfrecords_labels.csv")

# Kept to verify that each MRI listing lines up with the PET one.
pet_labels = img_labels

# ================= create PREPROCESSED MRI tfrecords =================
for matter in MRI_MATTER:

    img_paths, img_labels = _list_labeled_images(
        [(os.path.join(MRI_PREPROCESSED, c, matter), LABELS[c]) for c in CLASSES])

    # The PET indices are only valid here if both listings have the same
    # length and the same label at every position. This is a necessary (not
    # sufficient) check that the patients are aligned.
    if not np.array_equal(img_labels, pet_labels):
        raise ValueError(
            f'MRI {matter} images do not line up with the PET images; '
            'reusing the PET train/test partition would mix up patients.')

    # Same idx as before
    X_train, y_train = img_paths[train_idx], img_labels[train_idx]
    X_test, y_test = img_paths[test_idx], img_labels[test_idx]

    # Generate train and test tfrecords
    generate_tfrecords(X_train, y_train, MRI_PREPROCESSED_TFREC + f'/{matter}/train', MRI_PREPROCESSED_TFREC + f"/{matter}/train/tfrecords_labels.csv")
    generate_tfrecords(X_test, y_test, MRI_PREPROCESSED_TFREC + f'/{matter}/test', MRI_PREPROCESSED_TFREC + f"/{matter}/test/tfrecords_labels.csv")