# Import modules

In [1]:
import os
import cv2
import h5py
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Get dataset images
* Dataset downloaded from Kaggle: [Brain Tumor Classification (MRI)](https://www.kaggle.com/datasets/sartajbhuvaji/brain-tumor-classification-mri)
* Unzip the downloaded archive.

# Store images in hdf5 file structure

## Define the size of hdf5 structure
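This function counts every `.jpg` image under a directory (sub-folders are searched recursively and the extension is matched case-insensitively). The total tells us how many images the HDF5 structure has to hold.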

In [2]:
def count_images_in_directory(directory):
    """Count the ``.jpg`` files found anywhere under ``directory``.

    The tree is traversed with ``os.walk``, so files inside nested
    sub-folders are counted too. The extension is compared on the
    lower-cased file name, so ``.JPG`` also matches.

    Args:
        directory: Path of the root folder to scan.

    Returns:
        int: Number of files whose lower-cased name ends with ``.jpg``.
    """
    return sum(
        1
        for _, _, filenames in os.walk(directory)
        for filename in filenames
        if filename.lower().endswith('.jpg')
    )

## Store images in hdf5 structure
* This function receives the path to the folder with the data and the name under which the HDF5 file is going to be saved.
* The HDF5 file stores the images in random order in an `images` dataset.
* The class associated with each image is saved at the same position in a `labels` dataset.

In [3]:
def image_hdf5_from_directory(pathfile, name_hdf5_file = 'dataset.h5', seed = None):
    """Store the ``.jpg`` images of a class-per-folder directory in an HDF5 file.

    Every immediate sub-folder of ``pathfile`` is treated as one class. The
    sub-folders are taken in sorted order and numbered 0, 1, 2, ... Each
    ``.jpg`` file located directly inside a class folder (extension compared
    case-insensitively) is read with OpenCV, converted from BGR to RGB,
    resized to 256x256 and written to a randomly chosen row of the ``images``
    dataset; its class number is written to the same row of the ``labels``
    dataset.

    Files that OpenCV cannot decode are skipped. When that happens the
    datasets are compacted and shortened, so the file never contains
    unwritten (all-zero) rows, and a single warning lists the skipped files.

    Args:
        pathfile: Folder that contains one sub-folder per class.
        name_hdf5_file: Path of the HDF5 file to create. It is opened in
            ``'w'`` mode, so an existing file is overwritten.
        seed: Optional seed for the row shuffle. ``None`` (the default)
            shuffles with NumPy's global random state; any other value makes
            the row order reproducible.

    Returns:
        None. The result is the HDF5 file written to ``name_hdf5_file``.
    """
    import warnings

    image_side = 256  # every image is stored as image_side x image_side RGB

    # Sorted so class numbers do not depend on the OS listing order; entries
    # that are not folders (stray files in the root) are ignored.
    class_folders = sorted(
        os.path.join(pathfile, name)
        for name in os.listdir(pathfile)
        if os.path.isdir(os.path.join(pathfile, name))
    )

    # Select the files once, up front, so the dataset size and the storing
    # loop are guaranteed to agree on which files are images.
    files_per_class = [
        [
            os.path.join(folder_path, filename)
            for filename in os.listdir(folder_path)
            if filename.lower().endswith('.jpg')
        ]
        for folder_path in class_folders
    ]
    ammount_images = sum(len(file_paths) for file_paths in files_per_class)

    with h5py.File(name_hdf5_file, 'w') as hdf5_file:
        # maxshape makes the datasets resizable so they can be shortened
        # if some files turn out to be unreadable.
        image_dataset = hdf5_file.create_dataset(
            "images",
            (ammount_images, image_side, image_side, 3),
            maxshape=(None, image_side, image_side, 3),
            chunks=(1, image_side, image_side, 3),
            dtype='uint8',
        )
        label_dataset = hdf5_file.create_dataset(
            "labels", (ammount_images,), maxshape=(None,), dtype='int'
        )

        # Random destination row for each image that will be stored
        if seed is None:
            image_indices = np.arange(ammount_images)
            np.random.shuffle(image_indices)
        else:
            image_indices = np.random.default_rng(seed).permutation(ammount_images)

        stored = 0
        unreadable = []
        for class_label, file_paths in enumerate(files_per_class):
            for file_path in tqdm(file_paths, desc = f'storing images class {class_label}'):
                image = cv2.imread(file_path)
                # cv2.imread signals failure by returning None, so this check
                # has to happen before any further OpenCV call.
                if image is None:
                    unreadable.append(file_path)
                    continue
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                image = cv2.resize(image, (image_side, image_side))

                row = image_indices[stored]
                image_dataset[row] = image
                label_dataset[row] = class_label
                stored += 1

        if stored < ammount_images:
            # Some rows were never written. Move the entries that live at or
            # beyond the new end into the empty rows below it, then cut the
            # datasets down to the number of images actually stored.
            used_rows = image_indices[:stored]
            free_rows = image_indices[stored:]
            holes = free_rows[free_rows < stored]
            movers = used_rows[used_rows >= stored]
            for hole, source in zip(holes, movers):
                image_dataset[hole] = image_dataset[source]
                label_dataset[hole] = label_dataset[source]
            image_dataset.resize(stored, axis=0)
            label_dataset.resize(stored, axis=0)
            warnings.warn(
                f'{len(unreadable)} image(s) could not be read and were skipped: {unreadable}'
            )

## Store images
**Note:** the testing folder in the Kaggle dataset is mislabeled, so we only work with the training folder.

In [4]:
# Folder with one sub-folder per class, and the HDF5 file to create from it
training_dir = r'C:\Users\edfli\Downloads\brain_tumor_dataset\Training'
full_dataset_h5 = r'C:\Users\edfli\Downloads\brain_tumor_dataset\brain_tumor_dataset.h5'

image_hdf5_from_directory(training_dir, name_hdf5_file=full_dataset_h5)

storing images class 0: 100%|██████████| 826/826 [00:11<00:00, 72.16it/s] 
storing images class 1: 100%|██████████| 822/822 [00:01<00:00, 565.64it/s]
storing images class 2: 100%|██████████| 395/395 [00:00<00:00, 636.06it/s]
storing images class 3: 100%|██████████| 827/827 [00:01<00:00, 538.50it/s]


# Split data into train and test splits

## Open the file we've just created

In [5]:
# Read both datasets fully into memory; the file is closed when the block ends
dataset_h5_path = r'C:\Users\edfli\Downloads\brain_tumor_dataset\brain_tumor_dataset.h5'
with h5py.File(dataset_h5_path, 'r') as hdf5_file:
    images, labels = hdf5_file["images"][:], hdf5_file["labels"][:]
print(images.shape, labels.shape)

(2870, 256, 256, 3) (2870,)


## Split data into train and test splits

In [6]:
# Hold out 20% of the images for testing.
# random_state pins the shuffle so that re-running the notebook reproduces the
# same train/test membership; stratify=labels keeps the class proportions the
# same in both splits (one class has roughly half as many images as the others).
images_training, images_testing, labels_training, labels_testing = train_test_split(
    images, labels, test_size=0.2, random_state=42, stratify=labels
)
print(images_training.shape, labels_training.shape)
print(images_testing.shape, labels_testing.shape)

(2296, 256, 256, 3) (2296,)
(574, 256, 256, 3) (574,)


## Store train split into train.h5

In [9]:
# Write the training split to its own HDF5 file (overwrites an existing file)
train_h5_path = r'C:\Users\edfli\Downloads\brain_tumor_dataset\brain_tumor_dataset_train.h5'
with h5py.File(train_h5_path, 'w') as train_file:
    for dataset_name, split in (('images', images_training), ('labels', labels_training)):
        train_file.create_dataset(dataset_name, data=split)

## Store test split into test.h5

In [10]:
# Write the testing split to its own HDF5 file (overwrites an existing file)
test_h5_path = r'C:\Users\edfli\Downloads\brain_tumor_dataset\brain_tumor_dataset_test.h5'
with h5py.File(test_h5_path, 'w') as test_file:
    for dataset_name, split in (('images', images_testing), ('labels', labels_testing)):
        test_file.create_dataset(dataset_name, data=split)