In [1]:
import h5py
from IPython.display import display

# HDF5 files read by the cells below; their names differ only in the
# trailing index (0..7).
DATA_DIR = "/media/external"
N_FILES = 8

files = [f"{DATA_DIR}/clothing_300x300_{idx}.h5" for idx in range(N_FILES)]

In [2]:
%%time
# Add a 'batches' index dataset to every file that does not have one yet.
# Each row of the index holds the names of one batch's two datasets:
# ["<id>-x", "<id>-y"]. Files that already contain 'batches' are left alone,
# so the cell can be re-run safely.
for f_path in files:
    # 'a' = read/write on the existing file, so the index can be added to it.
    with h5py.File(f_path, 'a') as f:
        dataset_names = set(f.keys())
        if 'batches' in dataset_names:
            continue  # index already built for this file

        # A batch id is a dataset name without its "-x" suffix. Only ids whose
        # "-y" counterpart also exists are indexed, so the index never points
        # at a missing dataset. Sorting makes the stored row order
        # reproducible: plain set iteration order for strings can change
        # from one interpreter run to the next.
        batch_ids = sorted(
            name[:-2]
            for name in dataset_names
            if name.endswith('-x') and f"{name[:-2]}-y" in dataset_names
        )

        batches = [
            [f"{batch_id}-x".encode("ascii", "ignore"),
             f"{batch_id}-y".encode("ascii", "ignore")]
            for batch_id in batch_ids
        ]

        f.create_dataset(name='batches',
                         shape=(len(batches), 2),
                         data=batches,
                         dtype=h5py.special_dtype(vlen=str),
                         compression='gzip',
                         compression_opts=9)

CPU times: user 372 ms, sys: 235 ms, total: 607 ms
Wall time: 29 s


In [3]:
%%time
# Sanity-check the 'batches' index of every file and accumulate global totals:
# how many batches there are and how many images they contain (first axis of
# each batch's x dataset).
total_batches = 0
total_images = 0

for f_path in files:
    with h5py.File(f_path, 'r') as f:
        dataset_names = list(f.keys())
        known_names = set(dataset_names)  # O(1) membership for the checks below

        # Load the whole index with a single read instead of one HDF5 read per
        # row. References are normalised to str because, depending on the
        # h5py version, variable-length strings are returned as bytes, which
        # would never compare equal to the str keys in `known_names`.
        batch_refs = [
            [ref.decode() if isinstance(ref, bytes) else ref for ref in row]
            for row in f['batches'][()]
        ]
        n_batches = len(batch_refs)
        first = batch_refs[0]
        last = batch_refs[-1]

        # Number of images in this file = sum of the first dimension of every
        # x dataset; only the shape is inspected, no image data is loaded.
        partial_images = sum(f[x_ref].shape[0] for x_ref, _ in batch_refs)

        total_images += partial_images
        total_batches += n_batches

        print('file', f_path, 'has', len(dataset_names), 'datasets')
        # Expected dataset count: an x and a y per batch, plus 'batches' itself.
        print('The image dataset has', n_batches, 'batches',
              f'({n_batches} x 2) + 1 =', (n_batches*2 + 1))
        print('This dataset has a total of', partial_images, 'images')

        print('  the first image x ref in', f_path, "is present?",
              (first[0] in known_names))
        print('  the first image y ref in', f_path, "is present?",
              (first[1] in known_names))
        print('   the last image x ref in', f_path, "is present?",
              (last[0] in known_names))
        print('   the last image y ref in', f_path, "is present?",
              (last[1] in known_names), end='\n\n')


print(f'A total of {total_batches} batches are going to be used in train')
print(f'there are {total_images} images among this batches')

file /media/external/clothing_300x300_0.h5 has 2953 datasets
The image dataset has 1476 batches (1476 x 2) + 1 = 2953
This dataset has a total of 15956 images
  the first image x ref in /media/external/clothing_300x300_0.h5 is present? True
  the first image y ref in /media/external/clothing_300x300_0.h5 is present? True
   the last image x ref in /media/external/clothing_300x300_0.h5 is present? True
   the last image y ref in /media/external/clothing_300x300_0.h5 is present? True

file /media/external/clothing_300x300_1.h5 has 2931 datasets
The image dataset has 1465 batches (1465 x 2) + 1 = 2931
This dataset has a total of 15916 images
  the first image x ref in /media/external/clothing_300x300_1.h5 is present? True
  the first image y ref in /media/external/clothing_300x300_1.h5 is present? True
   the last image x ref in /media/external/clothing_300x300_1.h5 is present? True
   the last image y ref in /media/external/clothing_300x300_1.h5 is present? True

file /media/external/clo

In [4]:
# Show the shapes of the two datasets referenced by row 272 of the 'batches'
# index in the fifth file: column 0 is the x (images) dataset name, column 1
# the y (targets) dataset name.
# NOTE(review): the second message mentions 39 classes + 1 + 4 box numbers,
# while the recorded output for the target shape ends in 6 — confirm how the
# targets are actually encoded.
with h5py.File(files[4], 'r') as f:
    print('''
        An example row, has N images 300x300 pixels with 3 color layers
    ''')
    batch_index = f['batches']
    images_key = batch_index[272][0]
    display(f[images_key].shape)

    print('''
        and each of their targets has 8732 positions of possible bboxes
        predicting 39 classes + 1 no class and 4 numbers of anchors box
    ''')
    targets_key = batch_index[272][1]
    display(f[targets_key].shape)


        An example row, has N images 300x300 pixels with 3 color layers
    


(11, 300, 300, 3)


        and each of their targets has 8732 positions of possible bboxes
        predicting 39 classes + 1 no class and 4 numbers of anchors box
    


(11, 8732, 6)