### <font color=orange> Installs </font>

pip install tensorflow

pip install keras

pip install imblearn

pip install matplotlib

pip install seaborn

pip install scikit-learn

pip install numpy pandas scipy nibabel pillow scikit-image

### <font color=orange> Importing Libraries </font>

In [1]:
import os
import shutil
import glob
import numpy as np
import re
import pandas as pd
from scipy.stats import entropy
from PIL import Image
from skimage.util import img_as_ubyte
import nibabel as nib

### <font color=orange> Define directory of dataset & Classes names </font>

In [4]:
# Define your data paths and classes

# ADNI raw data (3D MRI volumes)
source_dir = 'C:\\Users\\pedro\\iCloudDrive\\Academics\\UNICAMP\\2023\\IA901\\Projeto\\ADNI\\MPRAGE_all\\Data\\'
# Selected ADNI slices, separated by label and into one subfolder per MRI session
interim_dir0 = 'C:\\Users\\pedro\\Project\\Datasets\\ADNI0'
# Selected ADNI slices, separated by label only
interim_dir1 = 'C:\\Users\\pedro\\Project\\Datasets\\ADNI1'
# Selected ADNI slices, separated by label and into train, test, val (unique subjects)
final_dir = 'C:\\Users\\pedro\\Project\\Datasets\\ADNI3'
classes = ['AD', 'CN', 'EMCI', 'LMCI', 'MCI']

# Fractions used by the train / validation / test split
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

### <font color=orange> Slice Selection and Label Separation </font>

Takes the ADNI dataset CSV table and the raw ADNI data (3D MRI volumes), selects the 10 slices with the highest entropy values from each volume, and reorganizes them into label-based folders: ['AD', 'CN', 'EMCI', 'LMCI', 'MCI'].

In [None]:
# Load the ADNI table and keep its 'Group' column at hand
df = pd.read_csv('MPRAGE_all.csv')
groupLabels = df['Group']
groupNames = groupLabels.unique()

# Print one "<group>: <number of rows>" line per distinct value of 'Group'
for groupName in groupNames:
    in_group = groupLabels == groupName
    count = in_group.sum()
    print(f'{groupName}: {count}')

In [None]:
# Slice-selection parameters
FIRST_SLICE = 120  # only slices with index >= FIRST_SLICE along the third axis are candidates
N_SLICES = 10      # number of highest-entropy slices exported per volume

subject_ids = [name for name in os.listdir(source_dir) if os.path.isdir(os.path.join(source_dir, name))]

for i, subject_id in enumerate(subject_ids):  # Looping over all subjects
    print(f'{(i/len(subject_ids))*100:.2f}%')

    # Getting the group of the current subject; skip subjects missing from the table
    subject_rows = df.loc[df['Subject'] == subject_id, 'Group']
    if subject_rows.empty:
        print(f'Subject {subject_id} not found in the ADNI table, skipping.')
        continue
    subjectGroup = subject_rows.values[0]

    fileList = [os.path.join(dp, f) for dp, dn, filenames in os.walk(os.path.join(source_dir, subject_id, 'MPRAGE')) for f in filenames if os.path.splitext(f)[1] == '.nii']

    for file in fileList:  # Looping over all image files
        try:
            X = nib.load(file).get_fdata()  # Extract the image data from the structure
        except Exception as e:
            print(f'Problem using nib.load for file: {file}. \nError: {str(e)}')
            continue

        # Drop singleton axes (e.g. a trailing axis of length 1) and require a 3D volume
        X = np.squeeze(X)
        if X.ndim != 3:
            print(f'Unexpected volume shape {X.shape} for file: {file}, skipping.')
            continue

        candidates = X[:, :, FIRST_SLICE:]
        if candidates.shape[2] == 0:
            print(f'No slices at index >= {FIRST_SLICE} for file: {file}, skipping.')
            continue

        # Get one scalar entropy value per slice along the third axis.
        # The slice is flattened so that scipy's entropy returns a scalar
        # instead of one value per column.
        # NOTE(review): this treats normalized pixel intensities as the
        # distribution (as the original expression did), not the grey-level
        # histogram -- confirm this is the intended entropy measure.
        val = []
        for k in range(candidates.shape[2]):
            candidate = candidates[:, :, k]
            peak = candidate.max()
            val.append(entropy(candidate.ravel() / peak) if peak > 0 else 0)

        # Get indices of slices sorted by entropy (highest first); argsort
        # positions are relative to FIRST_SLICE, so shift them back to
        # indices into the full volume.
        Xind = np.argsort(val)[::-1] + FIRST_SLICE

        imagefoldername = os.path.join(interim_dir0, subjectGroup, 'slices_' + os.path.basename(file))
        os.makedirs(imagefoldername, exist_ok=True)

        # Generate jpg images of the N_SLICES highest entropy slices
        # (fewer if the volume does not have that many candidates)
        for n, slice_index in enumerate(Xind[:N_SLICES]):
            img = X[:, :, slice_index]
            peak = img.max()
            # img_as_ubyte only accepts floats in [-1, 1], so rescale the
            # raw intensities to [0, 1] first
            scaled = np.clip(img / peak, 0, 1) if peak > 0 else np.zeros_like(img)
            img = img_as_ubyte(scaled)
            Image.fromarray(img).save(os.path.join(imagefoldername, f'{n:02d}.jpg'))

### <font color=orange> Extracting and renaming the slices from their respective subfolders </font>

In [None]:
# For each class, copy every image out of its per-session subfolder
# (interim_dir0/<class>/<subfolder>/) into one flat folder per class
# (interim_dir1/<class>/), prefixing the file name with the subfolder name
# so that files from different subfolders do not collide.
for cls in classes:
    # Get the path to the current class folder in both source and destination
    src_cls_path = os.path.join(interim_dir0, cls)
    dst_cls_path = os.path.join(interim_dir1, cls)

    if not os.path.isdir(src_cls_path):
        print(f'No source folder for class {cls}: {src_cls_path}, skipping.')
        continue

    # shutil.copy2 does not create missing parent folders
    os.makedirs(dst_cls_path, exist_ok=True)

    # For each subfolder in the current class folder
    for subfolder in sorted(os.listdir(src_cls_path)):
        src_subfolder_path = os.path.join(src_cls_path, subfolder)
        if not os.path.isdir(src_subfolder_path):
            continue  # ignore stray files next to the subfolders

        # Copy all files of the subfolder, in sorted order
        for image_file in sorted(os.listdir(src_subfolder_path)):
            src_image_path = os.path.join(src_subfolder_path, image_file)

            # New name: the subfolder's name prepended to the original image name
            new_image_name = f"{subfolder}_{image_file}"
            dst_image_path = os.path.join(dst_cls_path, new_image_name)

            # Copy (not move) the image, preserving file metadata
            shutil.copy2(src_image_path, dst_image_path)


### <font color=orange> Testing, validation and training split </font>


Separating test, val and train sets making sure there is no subject overlap, i.e., there is no subject whose images are in two sets of data. 

In [5]:
# Pattern capturing the run of digits that follows the first "S_" in a name
subject_id_regex = re.compile(r"S_(\d+)")


def get_subject_id(filename):
    """Return the digits following the first "S_" in *filename*, or None.

    The digits are returned as a string (leading zeros are kept). None is
    returned when *filename* contains no "S_<digits>" sequence.
    """
    found = subject_id_regex.search(filename)
    if found is None:
        return None
    return found.group(1)

In [6]:
# Fixed seed so that re-running the cell (e.g. after an interruption)
# assigns every subject to the same split instead of leaking a subject's
# images into a second split.
SPLIT_SEED = 42

for cls in classes:
    # Destination folder of each split for the current class
    split_dirs = {
        split_name: os.path.join(final_dir, split_name, cls)
        for split_name in ('train', 'val', 'test')
    }
    for split_dir in split_dirs.values():
        os.makedirs(split_dir, exist_ok=True)

    # Get a list of all the images of this class (flat per-class folder)
    images = glob.glob(os.path.join(interim_dir1, cls, '*.jpg'))

    # Extract the unique subject IDs from the file names only (not the full
    # path), dropping files without a recognizable ID. Sorting first makes
    # the seeded shuffle independent of set/glob ordering.
    subject_ids = sorted(
        {get_subject_id(os.path.basename(image)) for image in images} - {None}
    )
    np.random.default_rng(SPLIT_SEED).shuffle(subject_ids)

    # Split into training, validation, and test datasets (by subject)
    train_end = int(len(subject_ids) * train_ratio)
    val_end = int(len(subject_ids) * (train_ratio + val_ratio))
    train_subject_ids = set(subject_ids[:train_end])
    val_subject_ids = set(subject_ids[train_end:val_end])
    test_subject_ids = set(subject_ids[val_end:])

    # Map each subject to its destination folder once, so every image needs
    # a single dictionary lookup instead of scans over the ID arrays
    target_dir_of = {}
    for split_name, ids in (('train', train_subject_ids),
                            ('val', val_subject_ids),
                            ('test', test_subject_ids)):
        for sid in ids:
            target_dir_of[sid] = split_dirs[split_name]

    # Copy the files
    for img in images:
        subject_id = get_subject_id(os.path.basename(img))
        target_dir = target_dir_of.get(subject_id)
        if target_dir is not None:
            shutil.copy2(img, target_dir)

KeyboardInterrupt: 

In [7]:
# Same subject-wise split as above, restricted to one class and without the
# copy step; leaves `cls`, `images` and the *_subject_ids arrays defined for
# the inspection cells that follow.
for cls in ['AD']:
    for split_name in ('train', 'val', 'test'):
        os.makedirs(os.path.join(final_dir, split_name, cls), exist_ok=True)

    # Get a list of all the images of this class (flat per-class folder)
    images = glob.glob(os.path.join(interim_dir1, cls, '*.jpg'))

    # Extract the unique subject IDs, dropping files without a recognizable
    # ID; sort before the seeded shuffle so the result is reproducible
    subject_ids = sorted({get_subject_id(image) for image in images} - {None})
    np.random.default_rng(42).shuffle(subject_ids)

    # Split into training, validation, and test datasets
    train_subject_ids, val_subject_ids, test_subject_ids = np.split(
        np.array(subject_ids),
        [int(len(subject_ids) * train_ratio), int(len(subject_ids) * (train_ratio + val_ratio))])

    

In [10]:
# Display the train and validation subject IDs and the first ten image paths
(train_subject_ids, val_subject_ids, images[:10])


(array(['1059', '1137', '0010', '1037', '4280', '4733', '0341', '4730',
        '0724', '0682', '0404', '4110', '4894', '1253', '0805', '1205',
        '1082', '5074', '0083', '0753', '5149', '4172', '5184', '5016',
        '1296', '0029', '4672', '4792', '0400', '0733', '0053', '0183',
        '0335', '1144', '5240', '0689', '5163', '0007', '4615', '5210',
        '1391', '5162', '4526', '0003', '4859', '1263', '4912', '0535',
        '1101', '4772', '4770', '0219', '5275', '5146', '0228', '4994',
        '1368', '0543', '4827', '4589', '5090', '0286', '5120', '4696',
        '0487', '4124', '0816', '0777', '0084', '4755', '0372', '1373',
        '0696', '0790', '1289', '0474', '0110', '0216', '4024', '5112',
        '5087', '1377', '4982', '1157', '5205', '4001', '4990', '5013',
        '4719', '5017', '1257', '4993', '4905', '4783', '5252', '1079',
        '4971', '1024', '4223', '5019', '5231', '0147', '4980', '0565',
        '0853', '0720', '5015', '5018', '0699', '5106', '0492', 

In [12]:
# Sanity check: extract the subject ID from the second image path
img = images[1]
subject_id = get_subject_id(img)
subject_id  # bare expression so the notebook displays the value

'0816'

In [None]:
# Copy the files of the current class into their split folders.
# Relies on `cls`, `images` and the *_subject_ids collections left defined
# by the split cell above.
target_dir_of = {}
# Filled in reverse priority so that, should an ID appear in more than one
# collection, 'train' wins over 'val' over 'test' (as in an if/elif chain).
for split_name, ids in (('test', test_subject_ids),
                        ('val', val_subject_ids),
                        ('train', train_subject_ids)):
    split_dir = os.path.join(final_dir, split_name, cls)
    os.makedirs(split_dir, exist_ok=True)
    for sid in ids:
        target_dir_of[str(sid)] = split_dir

for img in images:
    subject_id = get_subject_id(img)
    # One dictionary lookup per image instead of scanning the ID arrays
    target_dir = target_dir_of.get(subject_id)
    if target_dir is not None:
        shutil.copy2(img, target_dir)