<a href="https://colab.research.google.com/github/Angelvj/Alzheimer-disease-classification/blob/main/code/generate_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np, os, shutil
import math
import tensorflow as tf
import nibabel as nib
from sklearn.model_selection import StratifiedKFold, KFold
import csv

In [2]:
# Start from a clean checkout: remove any clone left over from a previous run
# before cloning into the same folder again.
if os.path.exists('cloned_repo'):
    shutil.rmtree('cloned_repo')
    
# IPython shell escape (not plain Python): clones the helper repository into ./cloned_repo
!git clone -l -s https://github.com/Angelvj/TFG.git cloned_repo

# Imports from my github repo
# NOTE(review): the star import hides which names are actually used; `load_image`
# (called in generate_tfrecords) is presumably defined in this module -- confirm.
from cloned_repo.code.image_reading import *

Cloning into 'cloned_repo'...
remote: Enumerating objects: 271, done.[K
remote: Counting objects: 100% (271/271), done.[K
remote: Compressing objects: 100% (215/215), done.[K
remote: Total 271 (delta 94), reused 61 (delta 15), pack-reused 0[K
Receiving objects: 100% (271/271), 3.07 MiB | 17.58 MiB/s, done.
Resolving deltas: 100% (94/94), done.


Most of the following code comes from TensorFlow's documentation: [here](https://www.tensorflow.org/tutorials/load_data/tfrecord?hl=en#data_types_for_tftrainexample)


In [3]:
# The following functions convert a value to a type compatible
# with tf.train.Example
def _bytes_feature(value):
    """Wraps a string / byte value in a tf.train.Feature holding a bytes_list."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor, so extract it first.
        value = value.numpy()
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wraps a single float / double in a tf.train.Feature holding a float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _float_list_feature(value):
    """Wraps a list of floats / doubles in a tf.train.Feature holding a float_list."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wraps a single bool / enum / int / uint in a tf.train.Feature holding an int64_list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _int64_list_feature(value):
    """Wraps a list of bool / enum / int / uint values in a tf.train.Feature holding an int64_list."""
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)

def serialize_example(image, shape, name, label):
    """
    Builds a tf.train.Example message from one sample and returns it serialized,
    ready to be written to a file.

    image: list of floats, stored under 'image'.
    shape: list of ints, stored under 'shape'.
    name:  string / bytes, stored under 'name'.
    label: single int, stored under 'label'.
    """
    # Map every feature name to its tf.train.Example-compatible value.
    feature = {}
    feature['image'] = _float_list_feature(image)
    feature['shape'] = _int64_list_feature(shape)
    feature['name'] = _bytes_feature(name)  # TODO: not needed
    feature['label'] = _int64_feature(label)

    # Wrap the mapping in a Features message and serialize the resulting Example.
    features = tf.train.Features(feature=feature)
    example_proto = tf.train.Example(features=features)
    return example_proto.SerializeToString()

In [4]:
def generate_tfrecords(img_paths, img_labels, tfrecords_dir, tfrecords_metadata, num_folds=15,
                       num_samples=None, stratify=True, shuffle=True, create_folders=True):
    """Splits a set of images into TFRecord files and writes a CSV summary of them.

    The images are partitioned with (Stratified)KFold; the held-out indices of
    every fold become the content of one TFRecord file.

    Args:
        img_paths: paths of the images to store (sequence or ndarray of str).
        img_labels: integer class label of every image, aligned with img_paths.
            Labels are used as indices into CLASSES for the per-class counts.
        tfrecords_dir: output folder when create_folders is True; otherwise it is
            used as a plain filename prefix for the generated files.
        tfrecords_metadata: path of the CSV file to write. It gets one header row
            and one row per TFRecord: name, total samples, samples per class.
        num_folds: number of TFRecord files to generate.
        num_samples: if given, num_folds is recomputed as
            ceil(len(img_paths) / num_samples).
        stratify: use StratifiedKFold (True) or KFold (False).
        shuffle: forwarded to the KFold splitter.
        create_folders: create tfrecords_dir if it is missing and write files
            named 'tfrecord_<n>.tfrec' inside it. When False, files are named
            '<tfrecords_dir><n>-<fold size>.tfrec'.
    """
    # Fancy indexing (img_labels[indices]) needs arrays, so accept plain lists too.
    img_paths = np.asarray(img_paths)
    img_labels = np.asarray(img_labels)

    if create_folders:
        os.makedirs(tfrecords_dir, exist_ok=True)

    if num_samples is not None:
        num_folds = math.ceil(len(img_paths) / num_samples)

    if stratify:
        kfold = StratifiedKFold(n_splits=num_folds, shuffle=shuffle)
    else:
        kfold = KFold(n_splits=num_folds, shuffle=shuffle)

    # A single context manager guarantees the metadata file is closed even if
    # reading an image or writing a record fails midway.
    with open(tfrecords_metadata, 'w', encoding='UTF8', newline='') as metadata_file:
        csv_writer = csv.writer(metadata_file)
        csv_writer.writerow(['tfrecord_name', 'total_samples'] + list(CLASSES))

        for n, (_, indices) in enumerate(kfold.split(img_paths, img_labels)):
            if create_folders:
                tfrecord_name = f'tfrecord_{n}.tfrec'
                tfrecord_path = os.path.join(tfrecords_dir, tfrecord_name)
            else:
                tfrecord_name = f'{tfrecords_dir}{n}-{len(indices)}.tfrec'
                tfrecord_path = tfrecord_name

            # Per-class sample count of this fold; classes absent from it stay at 0.
            unique, counts = np.unique(img_labels[indices], return_counts=True)
            per_class_count = np.zeros(len(CLASSES), dtype=np.int64)
            per_class_count[unique] = counts
            csv_writer.writerow([tfrecord_name, str(len(indices))]
                                + list(per_class_count.astype(str)))

            with tf.io.TFRecordWriter(tfrecord_path) as writer:
                for index in indices:
                    path = img_paths[index]
                    img = np.nan_to_num(load_image(path), copy=False)
                    img_name = str.encode(os.path.basename(path))
                    # The label must come from img_labels: relying on a global
                    # `label` would tag every record with the same class.
                    label = int(img_labels[index])
                    example = serialize_example(img.ravel(), img.shape, img_name, label)
                    writer.write(example)

In [5]:
def unison_shuffled(a, b):
    """ apply one common random permutation to two ndarrays of equal length """
    n_items = len(a)
    assert n_items == len(b)
    order = np.random.permutation(n_items)
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

def stratified_train_test_split(y, test_size = 0.2, random_state=None):
    """ 
    Given the labels of a dataset, split it into train and test sets, maintaining
    proportion of each class (return indices, not data).

    y: labels of the dataset (sequence or ndarray).
    test_size: fraction of the samples of every class sent to the test set; the
        per-class test count is int(class_count * test_size), i.e. rounded down.
    random_state: optional seed making the split reproducible. When None, the
        global NumPy random state is used (so np.random.seed still applies).

    Returns (train_idx, test_idx): two integer index arrays. Indices are grouped
    by class, in the order given by np.unique(y).
    """
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    # Draw from the global state unless a seed was requested explicitly.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # np.intp is NumPy's indexing dtype, so the result is a valid index array
    # even when y is empty and nothing gets appended.
    train_idx = np.zeros((0,), np.intp)
    test_idx = np.zeros((0,), np.intp)

    for label in np.unique(y):
        idx = np.where(y == label)[0]
        n_test = int(idx.shape[0] * test_size)
        test_idx_aux = rng.choice(idx, n_test, replace=False)
        test_idx = np.concatenate((test_idx, test_idx_aux))
        # Whatever was not drawn for the test set goes to the train set.
        train_idx = np.concatenate((train_idx, np.setdiff1d(idx, test_idx_aux, assume_unique=True)))

    return train_idx, test_idx

In [9]:
# Whether the notebook runs on Google Colab (True) or elsewhere, e.g. Kaggle (False)
COLAB = True

# ============== Path to data ==============
if not COLAB:
    # Root data folder outside Colab
    ROOT = '/kaggle/input'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    # Root data folder inside the mounted Drive
    ROOT = '/content/drive/My Drive/Machine learning/data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# =========== Path to images folder ===========
preprocessed = f'{ROOT}/preprocessed-nii'

# Classes in the dataset; the position of a class in this list is its integer label
CLASSES = ['NOR', 'AD', 'MCI']

In [11]:
# Sub-folder, inside every class folder, that holds each image modality
MODALITY_SUBDIRS = {'pet': 'PET', 'mri_grey': 'MRI/grey', 'mri_white': 'MRI/white'}

# Paths and labels are gathered in plain lists (cheap appends) and converted to
# arrays once at the end, instead of re-allocating an array for every file.
found_paths = {modality: [] for modality in MODALITY_SUBDIRS}
found_labels = {modality: [] for modality in MODALITY_SUBDIRS}

for modality, subdir in MODALITY_SUBDIRS.items():
    # The label of an image is the position of its class in CLASSES
    for label, c in enumerate(CLASSES):
        aux = os.path.join(preprocessed, c, subdir)
        for f in os.listdir(aux):
            if f.endswith('.nii'):
                found_paths[modality].append(os.path.join(aux, f))
                found_labels[modality].append(label)

# Path to PET images
pet_paths = np.array(found_paths['pet'], dtype=str)
pet_labels = np.array(found_labels['pet'], dtype=np.int64)

# Path to MRI, grey matter images
mri_grey_paths = np.array(found_paths['mri_grey'], dtype=str)
mri_grey_labels = np.array(found_labels['mri_grey'], dtype=np.int64)

# Path to MRI, white matter images
mri_white_paths = np.array(found_paths['mri_white'], dtype=str)
mri_white_labels = np.array(found_labels['mri_white'], dtype=np.int64)

In [12]:
# Sort every dataset by path so that each position in the three datasets corresponds to the same patient.
# This is not needed for now; it will be useful in a future ensemble model with MRI and PET.
idx = pet_paths.argsort()
pet_paths = pet_paths[idx]
pet_labels = pet_labels[idx]

idx = mri_grey_paths.argsort()
mri_grey_paths = mri_grey_paths[idx]
mri_grey_labels = mri_grey_labels[idx]

idx = mri_white_paths.argsort()
mri_white_paths = mri_white_paths[idx]
mri_white_labels = mri_white_labels[idx]

In [13]:
# Generating datasets with tfrecords
# The train/test split is computed once, on the PET labels, and reused for every
# modality so that a patient always falls on the same side of the split.
train_idx, test_idx = stratified_train_test_split(pet_labels, test_size=0.2)

# Output root (named so that it does not shadow the builtin `dir`)
tfrecords_root = ROOT + '/preprocessed-tfrecords-20skf/'

# Output sub-folder -> (paths, labels). Every modality is written under the
# folder of the same name (grey matter in MRI/grey, white matter in MRI/white).
datasets = {
    'PET': (pet_paths, pet_labels),
    'MRI/grey': (mri_grey_paths, mri_grey_labels),
    'MRI/white': (mri_white_paths, mri_white_labels),
}

for subdir, (modality_paths, modality_labels) in datasets.items():
    # Reusing the PET indices is only valid if the datasets are aligned.
    assert np.array_equal(modality_labels, pet_labels), \
        f'{subdir} labels are not aligned with the PET labels'

    train_dir = tfrecords_root + subdir + '/train'
    test_dir = tfrecords_root + subdir + '/test'

    generate_tfrecords(modality_paths[train_idx], modality_labels[train_idx], train_dir,
                       train_dir + '/tfrec_metadata.csv', num_folds=20, stratify=True,
                       shuffle=True, create_folders=True)
    generate_tfrecords(modality_paths[test_idx], modality_labels[test_idx], test_dir,
                       test_dir + '/tfrec_metadata.csv', num_folds=16, stratify=False,
                       shuffle=False, create_folders=True)

In [None]:
# shutil.rmtree('PET')
# shutil.rmtree('MRI')
# shutil.rmtree('cloned_repo')