<a href="https://colab.research.google.com/github/Angelvj/Alzheimer-disease-classification/blob/main/code/generate_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook converts the original dataset into TFRecord files, which give better I/O performance (among other advantages) than reading the raw images. It is meant to be run on Google Colab, where the outputs can be organized into folders on Google Drive.

# Imports

In [None]:
# Colab only
from google.colab import drive

In [None]:
import sys
import numpy as np, os, shutil, math
import tensorflow as tf, csv
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split
import nibabel as nib

In [None]:
def load_image(path):
    """Load the image stored at ``path`` with nibabel and return it as a
    float32 numpy array with one extra axis inserted at position 3
    (used as the channel axis)."""
    volume = np.asarray(nib.load(path).dataobj, dtype=np.float32)
    # Insert the channel axis at index 3
    return np.expand_dims(volume, axis=3)

def standarize(X):
    """Standardize ``X`` in place to zero mean and unit variance.

    The statistics are computed over the whole array. When the standard
    deviation is not strictly positive, ``X`` is multiplied by zero instead.
    Nothing is returned; the caller's array is modified.
    """
    sigma = np.std(X)
    mu = np.mean(X)

    # Degenerate case: no spread to divide by
    if not (sigma > 0):
        X *= 0
        return

    X -= mu
    X /= sigma

def max_intensity_normalization(X, proportion):
    """Divide ``X`` in place by the mean of its highest-intensity values.

    Parameters
    ----------
    X : numpy.ndarray
        Floating-point array, modified in place.
    proportion : float
        Fraction of the elements of ``X`` (taken from the top of the intensity
        ranking) whose mean is used as the normalization constant.

    Notes
    -----
    At least one element and at most ``X.size`` elements are always used.
    Without the lower bound, a proportion that rounds down to zero elements
    would make the slice ``[-0:]`` select *every* element, silently turning
    this into a normalization by the global mean.

    An empty array, or a reference mean equal to zero, leaves ``X`` untouched
    instead of filling it with NaN/inf.
    """
    if X.size == 0:
        return

    n_max_values = min(max(1, int(X.size * proportion)), X.size)

    # Only the top-k values are needed, not a full sort: after partitioning
    # the flattened copy, its last k entries are the k largest values.
    top_values = np.partition(X, X.size - n_max_values, axis=None)[-n_max_values:]
    mean = np.mean(top_values)

    if mean == 0:
        return

    X /= mean

def preprocess_image(X, steps, arguments):
    """Apply each callable in ``steps`` to ``X``, in order.

    ``arguments`` is iterated in parallel with ``steps``: an entry of ``None``
    means the step is called as ``step(X)``; any other entry is unpacked as
    extra positional arguments, ``step(X, *entry)``. Return values of the
    steps are ignored and nothing is returned.
    """
    for step, extra in zip(steps, arguments):
        positional = () if extra is None else extra
        step(X, *positional)

# Generate Tfrecords dataset from images

In [None]:
# Three types of data can be stored in a TFRecord: byte strings, integers and floats.
# They are always stored as lists; a single data element is a list of size 1.
def _bytestring_feature(list_of_bytestrings):
    """Wrap a list of byte strings in a ``tf.train.Feature``."""
    payload = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=payload)

def _float_feature(list_of_floats): # float32
    """Wrap a list of floats in a ``tf.train.Feature``."""
    payload = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=payload)

def _int_feature(list_of_ints): # int64
    """Wrap a list of integers in a ``tf.train.Feature``."""
    payload = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=payload)

def to_tfrecord(image, label, num_classes=3):
    """Build a ``tf.train.Example`` holding one image and its one-hot label.

    Parameters
    ----------
    image : sequence of float
        Flat sequence of values, stored under the ``'image'`` key.
    label : int
        Class index used to select the row of the identity matrix that
        becomes the one-hot label.
    num_classes : int, optional
        Length of the one-hot vector. Defaults to 3, the value that was
        previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    tf.train.Example
        Example with the float features ``'image'`` and ``'one_hot_label'``.
    """
    one_hot_label = np.eye(num_classes, dtype=np.float32)[label]

    feature = {
        'image': _float_feature(image),
        'one_hot_label': _float_feature(one_hot_label.tolist())
    }

    # Create a Features message
    return tf.train.Example(features=tf.train.Features(feature=feature))

In [None]:
def generate_tfrecords(filenames, labels, dir, tfrec_name, preprocess_steps=None,
                       prepr_args=None, num_folds=15, stratify=True, shuffle=True,
                       random_state=None, make_summary=True):
    """Write the images listed in ``filenames`` into ``num_folds`` TFRecord files.

    The samples are partitioned with a (stratified) k-fold splitter and the
    held-out indices of each fold become the content of one TFRecord file,
    named ``{tfrec_name}_{fold}-{n_samples}.tfrec`` inside ``dir``.

    Parameters
    ----------
    filenames : sequence of str
        Paths of the images to load with ``load_image``.
    labels : array-like of int
        Class index of each image; indices refer to positions in ``CLASSES``.
    dir : str
        Output directory; created if it does not exist.
    tfrec_name : str
        Prefix of the TFRecord files and of the summary CSV.
    preprocess_steps : list of callables, optional
        In-place transformations applied to every image (see
        ``preprocess_image``).
    prepr_args : list, optional
        Per-step extra arguments (``None`` or a tuple for each step). When
        omitted, every step is called without extra arguments.
    num_folds : int
        Number of TFRecord files to produce.
    stratify : bool
        Use ``StratifiedKFold`` (class-balanced files) instead of ``KFold``.
    shuffle : bool
        Whether the splitter shuffles the samples before splitting.
    random_state : int or None
        Seed for the shuffling; only forwarded when ``shuffle`` is true,
        because scikit-learn rejects a seed without shuffling.
    make_summary : bool
        When true, write ``{tfrec_name}_summary.csv`` in ``dir`` with one row
        per TFRecord file: its name, sample count and per-class counts.
    """
    # Fancy indexing (labels[indices]) below needs an ndarray, not a list
    labels = np.asarray(labels)

    os.makedirs(dir, exist_ok=True)

    if preprocess_steps is not None and prepr_args is None:
        prepr_args = [None] * len(preprocess_steps)

    # shuffle / random_state are keyword-only in current scikit-learn
    splitter_cls = StratifiedKFold if stratify else KFold
    kfold = splitter_cls(n_splits=num_folds, shuffle=shuffle,
                         random_state=random_state if shuffle else None)

    summary_file = None
    csv_writer = None
    try:
        if make_summary:
            summary_filename = os.path.join(dir, tfrec_name) + '_summary.csv'
            summary_file = open(summary_filename, 'w', encoding='UTF8', newline='')
            csv_writer = csv.writer(summary_file)
            csv_writer.writerow(['tfrec_id', '#samples'] + list(CLASSES))

        for n, (_, indices) in enumerate(kfold.split(filenames, labels)):

            name = f'{tfrec_name}_{n}-{len(indices)}.tfrec'

            if csv_writer is not None:
                num_samples = str(len(indices))
                classes, count = np.unique(labels[indices], return_counts=True)
                # Classes absent from this file keep a count of zero
                class_counts = np.zeros(len(CLASSES), dtype=np.int64)
                class_counts[classes] = count
                row = [name] + [num_samples] + list(class_counts.astype(str))
                csv_writer.writerow(row)

            with tf.io.TFRecordWriter(os.path.join(dir, name)) as writer:

                for index in indices:
                    filename = filenames[index]
                    label = labels[index]
                    img = np.nan_to_num(load_image(filename), copy=False)
                    if preprocess_steps is not None:
                        preprocess_image(img, preprocess_steps, prepr_args)
                    example = to_tfrecord(img.ravel(), label)
                    writer.write(example.SerializeToString())
    finally:
        # Always close the summary so its rows are flushed to disk,
        # even if writing one of the TFRecord files fails.
        if summary_file is not None:
            summary_file.close()

In [None]:
# Not used
# def stratified_train_test_split(y, test_size = 0.2):
#     """ 
#     Given the labels of a dataset, split it into train and test sets, maintaining
#     proportion of each class (return indices, not data).
#     """
#     if not isinstance(y, np.ndarray):
#         y = np.array(y)

#     train_idx = np.zeros((0,), np.int64)
#     test_idx = np.zeros((0,), np.int64)

#     for label in np.unique(y):
#         idx = np.where(y==label)[0]
#         test_idx_aux = np.random.choice(idx, int(idx.shape[0]*test_size), replace=False)
#         test_idx = np.concatenate((test_idx, test_idx_aux))
#         train_idx = np.concatenate((train_idx, np.setdiff1d(idx, test_idx_aux, assume_unique=True)))

#     return train_idx, test_idx

In [None]:
# Classes in the dataset. Note: the position in the list sets the class label
# (the index of each name is the integer label assigned to its images).
CLASSES = ['NOR', 'AD', 'MCI']

# Colab only: mount Google Drive so the dataset folder below is reachable.
drive.mount('/content/drive')
# Root folder for the input datasets and the generated TFRecords. It ends with
# a slash because dataset names are appended to it by plain concatenation.
DATA_PATH = '/content/drive/My Drive/data/'

# Seed for the train/test split
SEED = 27

Mounted at /content/drive


# 1. AD - PET images

## 1.1 : Spatially normalized (elastic deformations) PET images.

In [None]:
DS = 'ad-preprocessed'
DS_PATH =  DATA_PATH + DS

# Paths to the PET images and their integer labels (index of the class in CLASSES)
pet_paths = np.empty((0,), dtype=str)
pet_labels = np.empty((0,), dtype=np.int64)

for label, c in enumerate(CLASSES):
    pattern = os.path.join(DS_PATH, c, 'PET') + '/*.nii'
    # glob makes no promise about the order of its results; sort them so that
    # the seeded split below selects the same files on every run.
    # NOTE(review): this may change the train/test membership relative to
    # TFRecords generated earlier from unsorted paths.
    class_paths = sorted(tf.io.gfile.glob(pattern))
    pet_paths = np.concatenate((pet_paths, np.array(class_paths)))
    # Every path appended in this iteration receives the current label
    pet_labels = np.concatenate((pet_labels, np.full(len(pet_paths) - len(pet_labels), label, dtype=np.int64)))

# Stratified 80/20 split, reproducible through SEED
X_train, X_test, y_train, y_test = train_test_split(pet_paths, pet_labels,
                                                    test_size = 0.2,
                                                    random_state = SEED,
                                                    stratify = pet_labels)

### 1.1.1: Non intensity normalized

In [None]:
# No intensity preprocessing. num_folds equals the number of samples, so
# every TFRecord file holds a single image.
OUT_DS = 'tfrec-pet-spatialnorm-elastic'
OUT_PATH = f'{DATA_PATH}{OUT_DS}'

generate_tfrecords(
    filenames=X_train,
    labels=y_train,
    dir=f'{OUT_PATH}/train',
    tfrec_name='train',
    num_folds=len(X_train),
    stratify=False,
    shuffle=False,
    random_state=None,
)

generate_tfrecords(
    filenames=X_test,
    labels=y_test,
    dir=f'{OUT_PATH}/test',
    tfrec_name='test',
    num_folds=len(X_test),
    stratify=False,
    shuffle=False,
)

### 1.1.2: Max-intensity normalized

In [None]:
# Each image is divided by the mean of its top 1% intensities before writing.
OUT_DS = 'tfrec-PET-spatialnorm-elastic-maxintensitynorm'
OUT_PATH = f'{DATA_PATH}{OUT_DS}'

preprocess_steps = [max_intensity_normalization]
preprocess_args = [(0.01,)]

generate_tfrecords(
    filenames=X_train,
    labels=y_train,
    dir=f'{OUT_PATH}/train',
    tfrec_name='train',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_train),
    stratify=False,
    shuffle=False,
)

generate_tfrecords(
    filenames=X_test,
    labels=y_test,
    dir=f'{OUT_PATH}/test',
    tfrec_name='test',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_test),
    stratify=False,
    shuffle=False,
)

### 1.1.3 Standardized (zero mean, unit variance)

In [None]:
# Each image is standardized (zero mean, unit variance) before writing.
OUT_DS = 'tfrec-PET-spatialnorm-elastic-standarized'
OUT_PATH = f'{DATA_PATH}{OUT_DS}'

preprocess_steps = [standarize]
preprocess_args = [None]

generate_tfrecords(
    filenames=X_train,
    labels=y_train,
    dir=f'{OUT_PATH}/train',
    tfrec_name='train',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_train),
    stratify=False,
    shuffle=False,
)

generate_tfrecords(
    filenames=X_test,
    labels=y_test,
    dir=f'{OUT_PATH}/test',
    tfrec_name='test',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_test),
    stratify=False,
    shuffle=False,
)

### 1.1.4 Max-intensity normalized and standardized

In [None]:
# Max-intensity normalization (top 1%) followed by standardization.
OUT_DS = 'tfrec-PET-spatialnorm-elastic-maxintensitynorm-standarized'
OUT_PATH = f'{DATA_PATH}{OUT_DS}'

preprocess_steps = [max_intensity_normalization, standarize]
preprocess_args = [(0.01,), None]

generate_tfrecords(
    filenames=X_train,
    labels=y_train,
    dir=f'{OUT_PATH}/train',
    tfrec_name='train',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_train),
    stratify=False,
    shuffle=False,
)

generate_tfrecords(
    filenames=X_test,
    labels=y_test,
    dir=f'{OUT_PATH}/test',
    tfrec_name='test',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_test),
    stratify=False,
    shuffle=False,
)

## 1.2: Spatially normalized (non-elastic deformations).

### 1.2.1 Non intensity normalization

In [None]:
# NOTE(review): X_train / X_test used here are the ones built earlier from
# DS = 'ad-preprocessed'; no cell visible in this notebook reloads paths for a
# rigid-registration dataset. Confirm the inputs are the intended ones before
# running this cell.
OUT_DS = 'tfrec-pet-spatialnorm-rigid'
OUT_PATH = f'{DATA_PATH}{OUT_DS}'

generate_tfrecords(
    filenames=X_train,
    labels=y_train,
    dir=f'{OUT_PATH}/train',
    tfrec_name='train',
    num_folds=len(X_train),
    stratify=False,
    shuffle=False,
    random_state=None,
)

generate_tfrecords(
    filenames=X_test,
    labels=y_test,
    dir=f'{OUT_PATH}/test',
    tfrec_name='test',
    num_folds=len(X_test),
    stratify=False,
    shuffle=False,
)

### 1.2.2 Standardized

In [None]:
# NOTE(review): X_train / X_test used here are the ones built earlier from
# DS = 'ad-preprocessed'; confirm they point to the rigid-registration images
# before running this cell.
OUT_DS = 'tfrec-pet-spatialnorm-rigid-standarized'
OUT_PATH = f'{DATA_PATH}{OUT_DS}'

preprocess_steps = [standarize]
preprocess_args = [None]

generate_tfrecords(
    filenames=X_train,
    labels=y_train,
    dir=f'{OUT_PATH}/train',
    tfrec_name='train',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_train),
    stratify=False,
    shuffle=False,
)

generate_tfrecords(
    filenames=X_test,
    labels=y_test,
    dir=f'{OUT_PATH}/test',
    tfrec_name='test',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_test),
    stratify=False,
    shuffle=False,
)

### 1.2.3 Max-intensity normalized and standardized

In [None]:
# NOTE(review): X_train / X_test used here are the ones built earlier from
# DS = 'ad-preprocessed'; confirm they point to the rigid-registration images
# before running this cell.
OUT_DS = 'tfrec-PET-spatialnorm-rigid-maxintensitynorm-standarized'
OUT_PATH = f'{DATA_PATH}{OUT_DS}'

preprocess_steps = [max_intensity_normalization, standarize]
preprocess_args = [(0.01,), None]

generate_tfrecords(
    filenames=X_train,
    labels=y_train,
    dir=f'{OUT_PATH}/train',
    tfrec_name='train',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_train),
    stratify=False,
    shuffle=False,
)

generate_tfrecords(
    filenames=X_test,
    labels=y_test,
    dir=f'{OUT_PATH}/test',
    tfrec_name='test',
    preprocess_steps=preprocess_steps,
    prepr_args=preprocess_args,
    num_folds=len(X_test),
    stratify=False,
    shuffle=False,
)

# 2. AD - MRI Images

In [None]:
# # Path to MRI, grey matter images
# mri_grey_paths = np.empty((0,), dtype=str)
# mri_grey_labels = np.empty((0,), dtype=np.int64)

# for label, c in enumerate(CLASSES):
#     pattern = os.path.join(DS_PATH, c, 'MRI/grey') + '/*.nii'
#     mri_grey_paths = np.concatenate((mri_grey_paths, np.array(tf.io.gfile.glob(pattern))))
#     mri_grey_labels = np.concatenate((mri_grey_labels, np.full(len(mri_grey_paths) - len(mri_grey_labels), label, dtype=np.int64)))
    
# # Path to MRI, white matter images
# mri_white_paths = np.empty((0,), dtype=str)
# mri_white_labels = np.empty((0,), dtype=np.int64)

# for label, c in enumerate(CLASSES):
#     pattern = os.path.join(DS_PATH, c, 'MRI/white') + '/*.nii'
#     mri_white_paths = np.concatenate((mri_white_paths, np.array(tf.io.gfile.glob(pattern))))
#     mri_white_labels = np.concatenate((mri_white_labels, np.full(len(mri_white_paths) - len(mri_white_labels), label, dtype=np.int64)))

In [None]:
# # Put all images in the same order so that each position on the three datasets correspond to the same patient
# # This is not useful by now, it will be useful in a future ensemble model with MRI and PET.
# idx = np.argsort(pet_paths)
# pet_paths, pet_labels = pet_paths[idx], pet_labels[idx]

# idx = np.argsort(mri_grey_paths)
# mri_grey_paths, mri_grey_labels = mri_grey_paths[idx], mri_grey_labels[idx]

# idx = np.argsort(mri_white_paths)
# mri_white_paths, mri_white_labels = mri_white_paths[idx], mri_white_labels[idx]

In [None]:
# # Generating datasets with tfrecords
# train_idx, test_idx = stratified_train_test_split(pet_labels, test_size=0.2)

# np.random.shuffle(train_idx); np.random.shuffle(test_idx) # Add some randomness

# generate_tfrecords(pet_paths[train_idx], pet_labels[train_idx], OUT_PATH + '/PET/train', 
#                    OUT_PATH + '/PET/train/tfrec_metadata.csv', num_folds=20, stratify=True, 
#                    shuffle=True, create_folders=True)
# generate_tfrecords(pet_paths[test_idx], pet_labels[test_idx], OUT_PATH + '/PET/test', 
#                    OUT_PATH + '/PET/test/tfrec_metadata.csv', num_folds=16, stratify=False, 
#                    shuffle=False, create_folders=True)

# generate_tfrecords(mri_grey_paths[train_idx], mri_grey_labels[train_idx], OUT_PATH + '/MRI/white/train', 
#                    OUT_PATH + '/MRI/white/train/tfrec_metadata.csv',num_folds=20, stratify=True, 
#                    shuffle=True, create_folders=True)
# generate_tfrecords(mri_grey_paths[test_idx], mri_grey_labels[test_idx], OUT_PATH + '/MRI/white/test', 
#                    OUT_PATH + '/MRI/white/test/tfrec_metadata.csv', num_folds=16, stratify=False, 
#                    shuffle=False, create_folders=True)

# generate_tfrecords(mri_white_paths[train_idx], mri_white_labels[train_idx], OUT_PATH + '/MRI/grey/train', 
#                    OUT_PATH +'/MRI/grey/train/tfrec_metadata.csv',num_folds=20, stratify=True, 
#                    shuffle=True, create_folders=True)
# generate_tfrecords(mri_white_paths[test_idx], mri_white_labels[test_idx], OUT_PATH + '/MRI/grey/test', 
#                    OUT_PATH + '/MRI/grey/test/tfrec_metadata.csv',num_folds=16, stratify=False, 
#                    shuffle=False, create_folders=True)