<a href="https://colab.research.google.com/github/Angelvj/Alzheimer-disease-classification/blob/main/code/generate_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook converts the original dataset into TFRecords (better I/O performance, among other advantages). We run it on Google Colab because there we can organize the outputs into folders.

# Imports

In [46]:
# Colab only
from google.colab import drive

In [47]:
import sys
import numpy as np, os, shutil, math
import tensorflow as tf, csv
from sklearn.model_selection import StratifiedKFold, KFold
import nibabel as nib

In [48]:
def load_image(path):
    """Load the image stored at `path` with nibabel.

    The image data is converted to a float32 NumPy array and a singleton
    axis is inserted at position 3 to act as the channel dimension.
    """
    voxels = np.asarray(nib.load(path).dataobj, dtype=np.float32)
    # Dummy channel axis
    return np.expand_dims(voxels, axis=3)

def max_intensity_normalization(X, proportion):
    """Scale `X` in place by the mean of its highest-intensity values.

    The `proportion` fraction of elements with the largest values is selected
    (at least one element, at most all of them), their mean is computed, and
    every element of `X` is divided by that mean.

    Parameters
    ----------
    X : np.ndarray
        Floating-point array of any shape. It is modified in place.
    proportion : float
        Fraction of the elements of `X` (e.g. 0.01 for the top 1%) used to
        estimate the reference intensity.

    Returns
    -------
    None
        The normalized values are written into `X`.
    """
    if X.size == 0:
        return  # Nothing to normalize

    # Clamp the count to [1, X.size]. A count of 0 would make a `[-0:]` slice
    # select *every* element, silently turning this into a normalization by
    # the global mean instead of by the brightest values.
    n_max_values = min(max(int(X.size * proportion), 1), X.size)

    # np.partition leaves the n largest values in the last n positions of the
    # flattened array without paying for a full sort.
    top_values = np.partition(X, -n_max_values, axis=None)[-n_max_values:]
    mean = np.mean(top_values)

    # A zero reference (e.g. an all-zero image) would fill X with NaN/inf;
    # leave such data untouched.
    if mean != 0:
        X /= mean

def preprocess_image(X, steps, arguments):
    """Apply a sequence of in-place preprocessing functions to `X`.

    Each function in `steps` is called as ``f(X, *args)``, where ``args`` is
    the entry of `arguments` at the same position.

    Parameters
    ----------
    X : np.ndarray
        Image to preprocess. Steps are expected to modify it in place; their
        return values are ignored.
    steps : sequence of callables
        Functions to apply, in order.
    arguments : sequence or None
        One entry per step. An entry may be a tuple/list of extra positional
        arguments, a single bare value (treated as one argument), or None
        (no extra arguments). Passing None for `arguments` itself means no
        step takes extra arguments.

    Raises
    ------
    ValueError
        If `steps` and `arguments` do not have the same length.
    """
    steps = list(steps)
    arguments = [()] * len(steps) if arguments is None else list(arguments)

    if len(steps) != len(arguments):
        raise ValueError(
            f'Got {len(steps)} preprocessing steps but {len(arguments)} argument entries'
        )

    for f, args in zip(steps, arguments):
        if args is None:
            args = ()
        elif not isinstance(args, (tuple, list)):
            # `(0.01)` is just the float 0.01, not a 1-tuple: accept bare values too
            args = (args,)
        # Unpack this step's own arguments (not the whole `arguments` list)
        f(X, *args)

# Generate Tfrecords dataset from images

In [49]:
# We can store three types of data in a TFRecord: bytestring, integer and floats. 
# They are always stored as lists, a single data element will be a list of size 1
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a `tf.train.Feature` (bytes_list field)."""
    payload = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=payload)

def _float_feature(list_of_floats): # float32
    """Wrap a list of floats in a `tf.train.Feature` (float_list field)."""
    payload = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=payload)

def _int_feature(list_of_ints): # int64
    """Wrap a list of integers in a `tf.train.Feature` (int64_list field)."""
    payload = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=payload)

def to_tfrecord(image, label, num_classes=3):
    """Build a `tf.train.Example` holding one image and its one-hot label.

    Parameters
    ----------
    image : iterable of float
        Image values, stored as a float list under the 'image' key.
    label : int
        Class index, expected in ``[0, num_classes)``.
    num_classes : int, optional
        Length of the one-hot vector. Defaults to 3, the value that was
        previously hard-coded.

    Returns
    -------
    tf.train.Example
        Example with two float-list features: 'image' and 'one_hot_label'.

    Raises
    ------
    IndexError
        If `label` is outside ``[0, num_classes)``. Negative labels are
        rejected as well; plain indexing would silently wrap them around to
        the last classes.
    """
    if not 0 <= label < num_classes:
        raise IndexError(f'label {label} is out of range for {num_classes} classes')

    # Row `label` of the identity matrix is the one-hot encoding of `label`
    one_hot_label = np.eye(num_classes, dtype=np.int64)[label]

    feature = {
        'image': _float_feature(image),
        # Stored as a float list (not int64) to keep the existing record format
        'one_hot_label': _float_feature(one_hot_label.tolist())
    }

    # Create a Features message
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [54]:
def generate_tfrecords(img_paths, img_labels, tfrecords_dir, preprocess_steps=None, prepr_args=None, num_folds=15,
                       num_samples=None, stratify=True, shuffle=True, create_folders=True):
    """Write the given images and their labels to a set of TFRecord files.

    The samples are partitioned into disjoint groups using the test indices
    of a (Stratified)KFold split, and each group is written to its own file.

    Parameters
    ----------
    img_paths : np.ndarray of str
        Paths of the images to load with `load_image`.
    img_labels : np.ndarray of int
        Class label of each image, aligned with `img_paths`.
    tfrecords_dir : str
        Output folder when `create_folders` is True; otherwise it is used
        verbatim as a file-name prefix.
    preprocess_steps : list of callables, optional
        In-place preprocessing functions forwarded to `preprocess_image`.
    prepr_args : list, optional
        Arguments for `preprocess_steps`, forwarded to `preprocess_image`.
    num_folds : int
        Number of files to create. Ignored when `num_samples` is given.
    num_samples : int, optional
        Approximate number of samples per file; when given, the number of
        files is ``ceil(len(img_paths) / num_samples)``.
    stratify : bool
        Use StratifiedKFold (class proportions kept in every file) instead
        of KFold.
    shuffle : bool
        Shuffle the assignment of samples to files and the order of the
        records inside each file. With False, the output order is
        deterministic.
    create_folders : bool
        If True, create `tfrecords_dir` when missing and write files named
        ``tfrecord<n>-<count>.tfrec`` inside it. If False, write files named
        ``<tfrecords_dir><n>-<count>.tfrec``.

    Raises
    ------
    ValueError
        If `img_paths` is empty.
    """
    n_images = len(img_paths)
    if n_images == 0:
        raise ValueError('generate_tfrecords received no images to write')

    if create_folders:
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(tfrecords_dir, exist_ok=True)

    if num_samples is not None:
        num_folds = math.ceil(n_images / num_samples)

    if num_folds < 2:
        # KFold / StratifiedKFold reject n_splits < 2 (this happens whenever
        # num_samples >= number of images): put everything in a single file.
        fold_indices = [np.arange(n_images)]
    else:
        splitter_cls = StratifiedKFold if stratify else KFold
        splitter = splitter_cls(n_splits=num_folds, shuffle=shuffle)
        # Only the "test" part of each split is used: these parts are disjoint
        # and together cover every sample exactly once.
        fold_indices = (test for _, test in splitter.split(img_paths, img_labels))

    for n, indices in enumerate(fold_indices):

        # Randomize the record order inside the file only when requested,
        # so that shuffle=False really yields a deterministic output.
        if shuffle:
            np.random.shuffle(indices)

        n_records = len(indices)
        if create_folders:
            # If folders are created, we don't need to add extra info in the name
            out_path = os.path.join(tfrecords_dir, f'tfrecord{n}-{n_records}.tfrec')
        else:
            out_path = f'{tfrecords_dir}{n}-{n_records}.tfrec'

        with tf.io.TFRecordWriter(out_path) as writer:

            for index in indices:

                # Replace NaN/inf in place before any preprocessing
                img = np.nan_to_num(load_image(img_paths[index]), copy=False)
                if preprocess_steps is not None:
                    preprocess_image(img, preprocess_steps, prepr_args)
                example = to_tfrecord(img.ravel(), img_labels[index])
                writer.write(example.SerializeToString())

In [51]:
def stratified_train_test_split(y, test_size = 0.2):
    """
    Split the positions of `y` into train and test index arrays, class by class.

    For every distinct label, ``int(n_class * test_size)`` of its positions are
    drawn at random (without replacement) for the test set; the remaining
    positions of that class go to the train set. Only indices are returned,
    never the data itself.
    """
    labels = y if isinstance(y, np.ndarray) else np.array(y)

    # Start each list with an empty int64 array so that an empty `y` still
    # produces two empty int64 index arrays.
    train_parts = [np.zeros((0,), np.int64)]
    test_parts = [np.zeros((0,), np.int64)]

    for cls in np.unique(labels):
        members = np.where(labels == cls)[0]
        n_test = int(members.shape[0] * test_size)
        picked = np.random.choice(members, n_test, replace=False)
        test_parts.append(picked)
        train_parts.append(np.setdiff1d(members, picked, assume_unique=True))

    return np.concatenate(train_parts), np.concatenate(test_parts)

In [55]:
# Classes in the dataset. The position of each name in this list is the
# integer class label assigned to the images of that class.
CLASSES = ['NOR', 'AD', 'MCI']

# Mount Google Drive in the Colab runtime so the data folder below is reachable.
drive.mount('/content/drive')
# Root data folder on Drive. It ends with '/' because dataset and output
# paths are built from it by plain string concatenation.
DATA_PATH = '/content/drive/My Drive/data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. AD - PET images

## 1.1 : Spatially normalized (elastic deformations) PET images.

In [43]:
DS = 'ad-preprocessed'
DS_PATH =  DATA_PATH + DS

# Fixed seed so that the train/test split (and the shuffles that follow) can
# be reproduced after a kernel restart.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)

# Collect the PET image paths of every class; the integer label of a class is
# its position in CLASSES.
path_chunks, label_chunks = [], []

for label, c in enumerate(CLASSES):
    pattern = os.path.join(DS_PATH, c, 'PET') + '/*.nii'
    # Sort so that the index of each file does not depend on listing order
    class_paths = sorted(tf.io.gfile.glob(pattern))
    path_chunks.extend(class_paths)
    label_chunks.extend([label] * len(class_paths))

# Build the arrays once instead of re-concatenating inside the loop
pet_paths = np.array(path_chunks, dtype=str)
pet_labels = np.array(label_chunks, dtype=np.int64)
assert len(pet_paths) > 0, f'No PET images found under {DS_PATH}'

train_idx, test_idx = stratified_train_test_split(pet_labels, test_size=0.2)
np.random.shuffle(train_idx); np.random.shuffle(test_idx) # Add some randomness

### 1.1.1: Non intensity normalized (tfrecords)

In [44]:
OUT_DS = 'tfrec-20skf-PET-spatialnorm-elastic'
OUT_PATH = DATA_PATH + OUT_DS

# One entry per output subset:
# (sub-folder, sample indices, number of files, stratify, shuffle)
for subdir, subset_idx, n_files, use_stratify, use_shuffle in (
    ('/train', train_idx, 20, True, True),
    ('/test', test_idx, 16, False, False),
):
    generate_tfrecords(
        pet_paths[subset_idx],
        pet_labels[subset_idx],
        OUT_PATH + subdir,
        num_folds=n_files,
        stratify=use_stratify,
        shuffle=use_shuffle,
        create_folders=True,
    )

### 1.1.2: Intensity normalized (tfrecords)

In [45]:
OUT_DS = 'tfrec-20skf-PET-spatialnorm-elastic-intensitynorm'
OUT_PATH = DATA_PATH + OUT_DS

# Single preprocessing step: max-intensity normalization with proportion=0.01
preprocess_steps = [max_intensity_normalization]
preprocess_args = [0.01]

# Preprocessing settings shared by the train and test subsets
shared_kwargs = dict(preprocess_steps=preprocess_steps, prepr_args=preprocess_args)

# Train files: 20 files, default stratify/shuffle settings of generate_tfrecords
generate_tfrecords(
    pet_paths[train_idx],
    pet_labels[train_idx],
    OUT_PATH + '/train',
    num_folds=20,
    **shared_kwargs,
)

# Test files: 16 files, no stratification and no shuffling
generate_tfrecords(
    pet_paths[test_idx],
    pet_labels[test_idx],
    OUT_PATH + '/test',
    num_folds=16,
    stratify=False,
    shuffle=False,
    **shared_kwargs,
)

## 1.2: Spatially normalized (non-elastic deformations).

# 2. AD - MRI Images

In [None]:
# # Path to MRI, grey matter images
# mri_grey_paths = np.empty((0,), dtype=str)
# mri_grey_labels = np.empty((0,), dtype=np.int64)

# for label, c in enumerate(CLASSES):
#     pattern = os.path.join(DS_PATH, c, 'MRI/grey') + '/*.nii'
#     mri_grey_paths = np.concatenate((mri_grey_paths, np.array(tf.io.gfile.glob(pattern))))
#     mri_grey_labels = np.concatenate((mri_grey_labels, np.full(len(mri_grey_paths) - len(mri_grey_labels), label, dtype=np.int64)))
    
# # Path to MRI, white matter images
# mri_white_paths = np.empty((0,), dtype=str)
# mri_white_labels = np.empty((0,), dtype=np.int64)

# for label, c in enumerate(CLASSES):
#     pattern = os.path.join(DS_PATH, c, 'MRI/white') + '/*.nii'
#     mri_white_paths = np.concatenate((mri_white_paths, np.array(tf.io.gfile.glob(pattern))))
#     mri_white_labels = np.concatenate((mri_white_labels, np.full(len(mri_white_paths) - len(mri_white_labels), label, dtype=np.int64)))

In [None]:
# # Put all images in the same order so that each position in the three datasets corresponds to the same patient.
# # This is not needed for now; it will be useful for a future ensemble model with MRI and PET.
# idx = np.argsort(pet_paths)
# pet_paths, pet_labels = pet_paths[idx], pet_labels[idx]

# idx = np.argsort(mri_grey_paths)
# mri_grey_paths, mri_grey_labels = mri_grey_paths[idx], mri_grey_labels[idx]

# idx = np.argsort(mri_white_paths)
# mri_white_paths, mri_white_labels = mri_white_paths[idx], mri_white_labels[idx]

In [None]:
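# # Generating datasets with tfrecords
# # NOTE(review): stale draft. The 4th positional argument in the calls below (a metadata .csv path) would now
# # bind to `preprocess_steps` in generate_tfrecords, and the grey/white output folders are swapped relative to
# # the arrays being written (mri_grey_* -> '/MRI/white/...', mri_white_* -> '/MRI/grey/...'). Fix both before re-enabling.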
# train_idx, test_idx = stratified_train_test_split(pet_labels, test_size=0.2)

# np.random.shuffle(train_idx); np.random.shuffle(test_idx) # Add some randomness

# generate_tfrecords(pet_paths[train_idx], pet_labels[train_idx], OUT_PATH + '/PET/train', 
#                    OUT_PATH + '/PET/train/tfrec_metadata.csv', num_folds=20, stratify=True, 
#                    shuffle=True, create_folders=True)
# generate_tfrecords(pet_paths[test_idx], pet_labels[test_idx], OUT_PATH + '/PET/test', 
#                    OUT_PATH + '/PET/test/tfrec_metadata.csv', num_folds=16, stratify=False, 
#                    shuffle=False, create_folders=True)

# generate_tfrecords(mri_grey_paths[train_idx], mri_grey_labels[train_idx], OUT_PATH + '/MRI/white/train', 
#                    OUT_PATH + '/MRI/white/train/tfrec_metadata.csv',num_folds=20, stratify=True, 
#                    shuffle=True, create_folders=True)
# generate_tfrecords(mri_grey_paths[test_idx], mri_grey_labels[test_idx], OUT_PATH + '/MRI/white/test', 
#                    OUT_PATH + '/MRI/white/test/tfrec_metadata.csv', num_folds=16, stratify=False, 
#                    shuffle=False, create_folders=True)

# generate_tfrecords(mri_white_paths[train_idx], mri_white_labels[train_idx], OUT_PATH + '/MRI/grey/train', 
#                    OUT_PATH +'/MRI/grey/train/tfrec_metadata.csv',num_folds=20, stratify=True, 
#                    shuffle=True, create_folders=True)
# generate_tfrecords(mri_white_paths[test_idx], mri_white_labels[test_idx], OUT_PATH + '/MRI/grey/test', 
#                    OUT_PATH + '/MRI/grey/test/tfrec_metadata.csv',num_folds=16, stratify=False, 
#                    shuffle=False, create_folders=True)