<a href="https://colab.research.google.com/github/Angelvj/Alzheimer-disease-classification/blob/main/code/generate_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook converts the original dataset into TFRecords, which give better performance on I/O operations, among other advantages. We run it on Google Colab because there we can organize the outputs into folders.

# Imports

In [1]:
# Colab only
from google.colab import drive

In [2]:
import numpy as np, os, shutil
import math
import tensorflow as tf
import nibabel as nib
from sklearn.model_selection import StratifiedKFold, KFold
import csv

In [3]:
if os.path.exists('cloned_repo'):
    shutil.rmtree('cloned_repo')
    
!git clone -l -s https://github.com/Angelvj/TFG.git cloned_repo

# Imports from my github repo
from cloned_repo.code.image_reading import *

Cloning into 'cloned_repo'...
remote: Enumerating objects: 279, done.[K
remote: Counting objects: 100% (279/279), done.[K
remote: Compressing objects: 100% (223/223), done.[K
remote: Total 279 (delta 100), reused 61 (delta 15), pack-reused 0[K
Receiving objects: 100% (279/279), 3.08 MiB | 2.91 MiB/s, done.
Resolving deltas: 100% (100/100), done.


# Input data path

In [4]:
DS = 'ad-preprocessed' # choose the input dataset

# Switch between the Kaggle and the Colab/Drive directory layouts
KAGGLE = False

if not KAGGLE:
    # Colab: the data lives in Google Drive, which has to be mounted first
    drive.mount('/content/drive')
    DATA_PATH = '/content/drive/My Drive/data/'
    DS_PATH = DATA_PATH + DS
else:
    # Kaggle: input datasets are read from /kaggle/input/<dataset name>
    DS_PATH = '/kaggle/input/' + DS

Mounted at /content/drive


# Output data path

In [5]:
# Name of the folder that will receive the generated TFRecords
OUT_DS = 'tfrecords-preprocessed-20skf'

# On Kaggle write into the current working directory; otherwise write next to
# the input data under DATA_PATH (only defined when KAGGLE is False).
OUT_PATH = ('./' + OUT_DS) if KAGGLE else (DATA_PATH + OUT_DS)

# Generate Tfrecords dataset from images

Links of interest: [1](https://www.tensorflow.org/tutorials/load_data/tfrecord?hl=en#data_types_for_tftrainexample), [2](https://colab.research.google.com/github/GoogleCloudPlatform/training-data-analyst/blob/master/courses/fast-and-lean-data-science/03_Flower_pictures_to_TFRecords.ipynb#scrollTo=9X82-4D2syG4)

In [6]:
# We can store three types of data in a TFRecord: bytestring, integer and floats. 
# They are always stored as lists, a single data element will be a list of size 1
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a ``tf.train.Feature`` (``bytes_list`` field)."""
    bytes_list = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(list_of_floats):
    """Wrap a list of floats in a ``tf.train.Feature`` (``float_list`` field, float32)."""
    float_list = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=float_list)

def _int_feature(list_of_ints):
    """Wrap a list of integers in a ``tf.train.Feature`` (``int64_list`` field, int64)."""
    int64_list = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=int64_list)

def to_tfrecord(image, label, shape, filename, num_classes=3):
    """Build the ``tf.train.Example`` that stores one image in a TFRecord.

    Parameters
    ----------
    image : sequence of float
        Flattened voxel values of the image.
    label : int
        Integer class label, used both as-is and as the one-hot index.
    shape : sequence of int
        Original shape of the image, stored next to the flattened values.
    filename : bytes
        Source file name of the image (kept for traceability).
    num_classes : int, optional
        Length of the one-hot encoded label. Defaults to 3, the value that
        was previously hard-coded.

    Returns
    -------
    tf.train.Example
        Example with the features 'image', 'label', 'one_hot_label',
        'shape' and 'filename'.
    """
    # Row `label` of the identity matrix is the one-hot vector for that class
    one_hot_label = np.eye(num_classes)[label]

    feature = {
        'image': _float_feature(image),
        'label': _int_feature([label]),
        'one_hot_label': _float_feature(one_hot_label.tolist()),
        'shape': _int_feature(shape),
        'filename': _bytestring_feature([filename]) # Delete later
    }

    # Create a Features message using tf.train.Example.
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [7]:
def generate_tfrecords(img_paths, img_labels, tfrecords_dir, tfrecords_metadata, num_folds=15, 
                       num_samples=None, stratify=True, shuffle=True, create_folders=True):
    """Write the given images into several TFRecord files plus a CSV summary.

    The images are partitioned into folds and every fold is written to its own
    TFRecord file. A metadata CSV gets one row per TFRecord with its name, its
    total number of samples and the number of samples of each class.

    Parameters
    ----------
    img_paths : array-like of str
        Paths of the images to store.
    img_labels : array-like of int
        Integer label of each image (index into ``CLASSES``).
    tfrecords_dir : str
        Output directory when ``create_folders`` is True; otherwise it is used
        as a plain prefix of each TFRecord file name.
    tfrecords_metadata : str
        Path of the metadata CSV file (overwritten if it exists).
    num_folds : int, optional
        Number of TFRecord files to create. Ignored when ``num_samples`` is given.
    num_samples : int, optional
        Approximate number of images per TFRecord; when given, the number of
        files is ``ceil(len(img_paths) / num_samples)``.
    stratify : bool, optional
        Use ``StratifiedKFold`` (class proportions kept in every file) instead
        of ``KFold``.
    shuffle : bool, optional
        Shuffle the samples before partitioning them.
    create_folders : bool, optional
        Create ``tfrecords_dir`` if needed and write the files inside it.
    """
    # Accept plain lists as well: the fancy indexing below needs numpy arrays
    img_paths = np.asarray(img_paths)
    img_labels = np.asarray(img_labels)

    if create_folders:
        os.makedirs(tfrecords_dir, exist_ok=True)

    if num_samples is not None:
        num_folds = math.ceil(len(img_paths) / num_samples)

    if num_folds == 1:
        # scikit-learn's K-fold splitters require n_splits >= 2, so a single
        # file holding every sample is handled here directly.
        all_indices = np.arange(len(img_paths))
        if shuffle:
            np.random.shuffle(all_indices)
        folds = [all_indices]
    else:
        splitter_cls = StratifiedKFold if stratify else KFold
        splitter = splitter_cls(n_splits=num_folds, shuffle=shuffle)
        # Only the "test" part of every split is used: together those parts
        # form a partition of the whole dataset.
        folds = (fold_indices for _, fold_indices in splitter.split(img_paths, img_labels))

    # A single `with` block guarantees the CSV is closed even if writing fails
    with open(tfrecords_metadata, 'w', encoding='UTF8', newline='') as metadata_file:
        csv_writer = csv.writer(metadata_file)
        csv_writer.writerow(['tfrecord_name', 'total_samples'] + list(CLASSES))

        for n, indices in enumerate(folds):
            fold_size = len(indices)

            # If folders are created, we don't need to add extra info in the name
            if create_folders:
                tfrecord_name = f'tfrecord{n}-{fold_size}.tfrec'
                tfrecord_path = os.path.join(tfrecords_dir, tfrecord_name)
            else:
                tfrecord_name = f'{tfrecords_dir}{n}-{fold_size}.tfrec'
                tfrecord_path = tfrecord_name

            # Samples per class in this fold (classes absent from the fold stay at 0)
            unique, counts = np.unique(img_labels[indices], return_counts=True)
            per_class_count = np.zeros(len(CLASSES), dtype=np.int64)
            per_class_count[unique] = counts
            csv_writer.writerow([tfrecord_name, str(fold_size)] + per_class_count.tolist())

            with tf.io.TFRecordWriter(tfrecord_path) as writer:
                for index in indices:
                    filename = img_paths[index]
                    # NaN voxels are replaced by 0 before the image is serialized
                    img = np.nan_to_num(load_image(filename), copy=False)
                    label = img_labels[index]
                    example = to_tfrecord(img.ravel(), label, img.shape, str.encode(filename))
                    writer.write(example.SerializeToString())

In [8]:
def stratified_train_test_split(y, test_size = 0.2, seed=None):
    """ 
    Given the labels of a dataset, split it into train and test sets, maintaining
    proportion of each class (return indices, not data).

    Parameters
    ----------
    y : array-like
        Label of every sample.
    test_size : float, optional
        Fraction of each class sent to the test set. The per-class count is
        ``int(n_class * test_size)``, i.e. it is rounded down.
    seed : int, optional
        Seed for a private random generator, making the split reproducible.
        When None (default) NumPy's global random state is used, exactly as
        before this parameter existed.

    Returns
    -------
    (train_idx, test_idx) : tuple of np.ndarray
        Integer positions into ``y``, grouped class by class in the order
        given by ``np.unique(y)``.
    """
    y = np.asarray(y)

    # The module-level functions share the RandomState API used here
    rng = np.random if seed is None else np.random.RandomState(seed)

    train_parts = []
    test_parts = []

    for label in np.unique(y):
        class_idx = np.where(y == label)[0]
        n_test = int(class_idx.shape[0] * test_size)
        class_test = rng.choice(class_idx, n_test, replace=False)
        test_parts.append(class_test)
        train_parts.append(np.setdiff1d(class_idx, class_test, assume_unique=True))

    # Leading empty array keeps np.concatenate valid when `y` has no samples
    # and fixes the result to a proper index dtype.
    empty = np.zeros((0,), dtype=np.intp)
    train_idx = np.concatenate([empty] + train_parts)
    test_idx = np.concatenate([empty] + test_parts)

    return train_idx, test_idx

In [9]:
# Classes in the dataset. The position of each name in this list is its
# integer label (labels are assigned with enumerate(CLASSES)), and each name
# is also the class sub-folder that is searched under DS_PATH.
CLASSES = ['NOR', 'AD', 'MCI']

In [10]:
def _collect_image_paths(modality_dir):
    """Return ``(paths, labels)`` of every ``*.nii`` file of one modality.

    For each class ``c`` in ``CLASSES`` the files matching
    ``DS_PATH/c/<modality_dir>/*.nii`` are listed; each file is labelled with
    the position of its class in ``CLASSES``.

    Parameters
    ----------
    modality_dir : str
        Sub-folder of every class folder that holds the modality,
        e.g. 'PET' or 'MRI/grey'.

    Returns
    -------
    (paths, labels) : tuple of np.ndarray
        String array of file paths and int64 array of the matching labels.
    """
    paths, labels = [], []
    for label, c in enumerate(CLASSES):
        pattern = os.path.join(DS_PATH, c, modality_dir, '*.nii')
        matches = tf.io.gfile.glob(pattern)
        paths.extend(matches)
        labels.extend([label] * len(matches))
    return np.array(paths, dtype=str), np.array(labels, dtype=np.int64)

# Path to PET images
pet_paths, pet_labels = _collect_image_paths('PET')

# Path to MRI, grey matter images
mri_grey_paths, mri_grey_labels = _collect_image_paths('MRI/grey')

# Path to MRI, white matter images
mri_white_paths, mri_white_labels = _collect_image_paths('MRI/white')

In [11]:
# Sort every modality by file path (labels are reordered with the same
# permutation). The intent is that position i refers to the same patient in
# the three datasets -- this assumes file names match across modalities (TODO confirm).
# Not needed yet; it will be useful for a future ensemble model with MRI and PET.
pet_order = np.argsort(pet_paths)
pet_paths = pet_paths[pet_order]
pet_labels = pet_labels[pet_order]

grey_order = np.argsort(mri_grey_paths)
mri_grey_paths = mri_grey_paths[grey_order]
mri_grey_labels = mri_grey_labels[grey_order]

white_order = np.argsort(mri_white_paths)
mri_white_paths = mri_white_paths[white_order]
mri_white_labels = mri_white_labels[white_order]

In [None]:
# Generating datasets with tfrecords
# The train/test split is computed once from the PET labels and reused for the
# MRI arrays; this assumes the three modalities hold the same patients in the
# same order.
train_idx, test_idx = stratified_train_test_split(pet_labels, test_size=0.2)

# (output sub-folder, image paths, labels) of every modality. Each modality is
# written under its own name: grey matter to MRI/grey, white matter to MRI/white.
modalities = [
    ('/PET', pet_paths, pet_labels),
    ('/MRI/grey', mri_grey_paths, mri_grey_labels),
    ('/MRI/white', mri_white_paths, mri_white_labels),
]

for subdir, paths, labels in modalities:
    train_dir = OUT_PATH + subdir + '/train'
    test_dir = OUT_PATH + subdir + '/test'

    # Train: 20 stratified, shuffled tfrecords
    generate_tfrecords(paths[train_idx], labels[train_idx], train_dir,
                       train_dir + '/tfrec_metadata.csv', num_folds=20, stratify=True,
                       shuffle=True, create_folders=True)
    # Test: 16 tfrecords, kept in order and without stratification
    generate_tfrecords(paths[test_idx], labels[test_idx], test_dir,
                       test_dir + '/tfrec_metadata.csv', num_folds=16, stratify=False,
                       shuffle=False, create_folders=True)

In [None]:
# shutil.rmtree('PET')
# shutil.rmtree('MRI')
# shutil.rmtree('cloned_repo')