Load module for work with hdf5 files

In [11]:
import os
import cv2
import h5py
import numpy as np

Download dataset from web and unzip

In [2]:
#Dataset from
#https://www.kaggle.com/datasets/thomasdubail/brain-tumors-256x256/
#https://www.kaggle.com/datasets/sartajbhuvaji/brain-tumor-classification-mri/data

This function check in each folder of the classes anf get the total amount of images
Necessary to predefine the size of the hdf5 file

In [3]:
def count_images_in_directory(directory):
    image_count = 0
    for root, _, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.jpg'):
                image_count += 1
    return image_count

 This function receives the path to the folder with the data, also receives the name in which the hdf5 file is going to be saved
 The hdf5 file store the images in a random way in a dataset of images
 Also saves its associated class in a dataset of labels
 Notice that the output h5 file is going to be saved in the folder Dataset

In [5]:
def image_hdf5_from_directory(pathfile, name_hdf5_file = 'dataset.h5'):
    output_hdf5 = 'Dataset/'+name_hdf5_file

    path_to_each_folder = [os.path.join(pathfile, name) for name in os.listdir(pathfile)]

    # Initialize the HDF5 file
    with h5py.File(output_hdf5, 'w') as hdf5_file:

      ammount_images = count_images_in_directory(pathfile)

      # Create datasets for images and labels
      image_dataset = hdf5_file.create_dataset("images", (ammount_images, 256, 256, 3), dtype='uint8')
      label_dataset = hdf5_file.create_dataset("labels", (ammount_images,), dtype='int')

      # Create lists to keep track of the order of images
      #image_indices = list(range(4000))
      image_indices = np.arange(ammount_images)

      # Shuffle the order in which images will be added
      #random.shuffle(image_indices)
      np.random.shuffle(image_indices)

      # Function to add image and label to the dataset
      def add_image(image, label, index):
          image_dataset[index] = image
          label_dataset[index] = label

      i = 0
      for class_label, folder_path in enumerate(path_to_each_folder):
        for filename in os.listdir(folder_path):
            if filename.endswith(('.jpg')):
                file_path = os.path.join(folder_path, filename)
                image = cv2.imread(file_path)
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                image = cv2.resize(image, (256, 256))

                if image is not None:
                  index = image_indices[i]
                  add_image(image, class_label, index)
                  print('Saved image', i, 'to h5 file in', index, 'position')
                  i+=1

Call the previous function, we specify the path to the training data folder and also specify the name 'dataset_training.h5'

In [12]:
image_hdf5_from_directory('Dataset/Training', name_hdf5_file = 'dataset_training.h5')

Saved image 0 to h5 file in 1547 position
Saved image 1 to h5 file in 1250 position
Saved image 2 to h5 file in 819 position
Saved image 3 to h5 file in 2215 position
Saved image 4 to h5 file in 1148 position
Saved image 5 to h5 file in 1402 position
Saved image 6 to h5 file in 805 position
Saved image 7 to h5 file in 2152 position
Saved image 8 to h5 file in 1654 position
Saved image 9 to h5 file in 887 position
Saved image 10 to h5 file in 1443 position
Saved image 11 to h5 file in 1156 position
Saved image 12 to h5 file in 1753 position
Saved image 13 to h5 file in 606 position
Saved image 14 to h5 file in 167 position
Saved image 15 to h5 file in 1995 position
Saved image 16 to h5 file in 7 position
Saved image 17 to h5 file in 943 position
Saved image 18 to h5 file in 2012 position
Saved image 19 to h5 file in 2596 position
Saved image 20 to h5 file in 2232 position
Saved image 21 to h5 file in 1269 position
Saved image 22 to h5 file in 1397 position
Saved image 23 to h5 file in 2

Call the previous function, we specify the path to the testing data folder and also specify the name 'dataset_testing.h5'

In [13]:
image_hdf5_from_directory('Dataset/Testing', name_hdf5_file = 'dataset_testing.h5')

Saved image 0 to h5 file in 76 position
Saved image 1 to h5 file in 351 position
Saved image 2 to h5 file in 124 position
Saved image 3 to h5 file in 185 position
Saved image 4 to h5 file in 158 position
Saved image 5 to h5 file in 217 position
Saved image 6 to h5 file in 380 position
Saved image 7 to h5 file in 212 position
Saved image 8 to h5 file in 390 position
Saved image 9 to h5 file in 49 position
Saved image 10 to h5 file in 70 position
Saved image 11 to h5 file in 159 position
Saved image 12 to h5 file in 108 position
Saved image 13 to h5 file in 63 position
Saved image 14 to h5 file in 153 position
Saved image 15 to h5 file in 292 position
Saved image 16 to h5 file in 116 position
Saved image 17 to h5 file in 84 position
Saved image 18 to h5 file in 23 position
Saved image 19 to h5 file in 3 position
Saved image 20 to h5 file in 271 position
Saved image 21 to h5 file in 60 position
Saved image 22 to h5 file in 281 position
Saved image 23 to h5 file in 55 position
Saved image 