In [1]:
import os
os.sys.path.append('../..')

import h5py

from tqdm import tqdm

from IPython.display import display

import project.download_content as content

# Where the HDF5 shards live and how many of them there are.  Edit these two
# values to point the notebook at another copy of the dataset instead of
# touching every path by hand.
DATA_DIR = "/media/external"
N_SHARDS = 8

# One path per shard: all_classes_300x300_0.h5 ... all_classes_300x300_7.h5.
files = [
    f"{DATA_DIR}/all_classes_300x300_{shard}.h5"
    for shard in range(N_SHARDS)
]

In [10]:
# Reset step: remove the 'batches' entry from every file that has one, so the
# index can be rebuilt from scratch.  Files without the entry are left as is.
for f_path in files:
    with h5py.File(f_path, 'a') as h5_file:
        if 'batches' in h5_file:
            del h5_file['batches']

In [11]:
%%time
# Build the 'batches' index for every file that does not have one yet.
# Each row of the index stores the names of two datasets of the same file:
# "<id>-x" and "<id>-y".
for f_path in tqdm(files):
    with h5py.File(f_path, 'a') as f:
        keys = f.keys()
        if 'batches' not in keys:
            # Dataset names are assumed to be "<id>" followed by a
            # two-character suffix ("-x" / "-y"); dropping the suffix and
            # de-duplicating leaves one id per pair.  The ids are sorted so
            # the row order is reproducible: the iteration order of a set of
            # strings can change from one kernel start to the next.
            unique_ids = sorted({key[:-2] for key in keys})

            # A comprehension (rather than a top-level `for id in ...` loop)
            # keeps the builtin `id` from being rebound in the notebook
            # namespace.
            batches = [[f"{batch_id}-x".encode("ascii", "ignore"),
                        f"{batch_id}-y".encode("ascii", "ignore")]
                       for batch_id in unique_ids]

            f.create_dataset(name='batches',
                             shape=(len(batches), 2),
                             data=batches,
                             dtype=h5py.special_dtype(vlen=str),
                             compression='gzip',
                             compression_opts=1)


  0%|          | 0/8 [00:00<?, ?it/s][A
 38%|███▊      | 3/8 [00:40<01:07, 13.60s/it][A
 50%|█████     | 4/8 [01:21<01:27, 21.76s/it][A
 62%|██████▎   | 5/8 [02:02<01:22, 27.54s/it][A
 75%|███████▌  | 6/8 [02:31<00:55, 27.82s/it][A
 88%|████████▊ | 7/8 [02:59<00:28, 28.14s/it][A
100%|██████████| 8/8 [03:30<00:00, 26.29s/it][A

CPU times: user 1.94 s, sys: 2.66 s, total: 4.59 s
Wall time: 3min 30s





In [None]:
%%time
# Sanity-check the 'batches' index of every file and accumulate the number of
# batches and images across all files.
total_batches = 0
total_images = 0

for f_path in tqdm(files):
    with h5py.File(f_path, 'r') as f:
        keys = f.keys()

        # Read the whole index into memory once.  Indexing or iterating the
        # h5py dataset directly issues a separate read for every access.
        batches = f['batches'][:]
        n_batches = len(batches)
        first = batches[0]
        last = batches[-1]

        # The image count of a batch is taken from the first axis of its
        # "x" dataset; the "y" reference is not needed for counting.
        partial_images = sum(f[x_ref].shape[0] for x_ref, _ in batches)
        total_images += partial_images
        total_batches += n_batches

        # Expected dataset count is two per batch plus one (presumably the
        # 'batches' index itself); compare it by eye with len(keys).
        print('file', f_path, 'has', len(keys), 'datasets')
        print('The image dataset has', n_batches, 'batches',
              f'({n_batches} x 2) + 1 =', (n_batches*2 + 1))
        print('This dataset has a total of', partial_images, 'images')

        # Spot-check that the first and last index rows point at datasets
        # that really exist in this file.
        print('  the first image x ref in', f_path, "is present?",
              (first[0] in keys))
        print('  the first image y ref in', f_path, "is present?",
              (first[1] in keys))
        print('   the last image x ref in', f_path, "is present?",
              (last[0] in keys))
        print('   the last image y ref in', f_path, "is present?",
              (last[1] in keys), end='\n\n')


print(f'A total of {total_batches} batches are going to be used in train')
print(f'there are {total_images} images among this batches')


  0%|          | 0/8 [00:00<?, ?it/s][A
 12%|█▎        | 1/8 [01:54<13:24, 114.99s/it][A

file /media/external/all_classes_300x300_0.h5 has 14583 datasets
The image dataset has 7291 batches (7291 x 2) + 1 = 14583
This dataset has a total of 318361 images
  the first image x ref in /media/external/all_classes_300x300_0.h5 is present? True
  the first image y ref in /media/external/all_classes_300x300_0.h5 is present? True
   the last image x ref in /media/external/all_classes_300x300_0.h5 is present? True
   the last image y ref in /media/external/all_classes_300x300_0.h5 is present? True




 25%|██▌       | 2/8 [03:48<11:27, 114.50s/it][A

file /media/external/all_classes_300x300_1.h5 has 14563 datasets
The image dataset has 7281 batches (7281 x 2) + 1 = 14563
This dataset has a total of 319633 images
  the first image x ref in /media/external/all_classes_300x300_1.h5 is present? True
  the first image y ref in /media/external/all_classes_300x300_1.h5 is present? True
   the last image x ref in /media/external/all_classes_300x300_1.h5 is present? True
   the last image y ref in /media/external/all_classes_300x300_1.h5 is present? True




 38%|███▊      | 3/8 [05:41<09:29, 113.97s/it][A

file /media/external/all_classes_300x300_2.h5 has 14543 datasets
The image dataset has 7271 batches (7271 x 2) + 1 = 14543
This dataset has a total of 319568 images
  the first image x ref in /media/external/all_classes_300x300_2.h5 is present? True
  the first image y ref in /media/external/all_classes_300x300_2.h5 is present? True
   the last image x ref in /media/external/all_classes_300x300_2.h5 is present? True
   the last image y ref in /media/external/all_classes_300x300_2.h5 is present? True




 50%|█████     | 4/8 [07:33<07:34, 113.63s/it][A

file /media/external/all_classes_300x300_3.h5 has 14535 datasets
The image dataset has 7267 batches (7267 x 2) + 1 = 14535
This dataset has a total of 318689 images
  the first image x ref in /media/external/all_classes_300x300_3.h5 is present? True
  the first image y ref in /media/external/all_classes_300x300_3.h5 is present? True
   the last image x ref in /media/external/all_classes_300x300_3.h5 is present? True
   the last image y ref in /media/external/all_classes_300x300_3.h5 is present? True




 62%|██████▎   | 5/8 [09:27<05:40, 113.64s/it][A

file /media/external/all_classes_300x300_4.h5 has 14519 datasets
The image dataset has 7259 batches (7259 x 2) + 1 = 14519
This dataset has a total of 319485 images
  the first image x ref in /media/external/all_classes_300x300_4.h5 is present? True
  the first image y ref in /media/external/all_classes_300x300_4.h5 is present? True
   the last image x ref in /media/external/all_classes_300x300_4.h5 is present? True
   the last image y ref in /media/external/all_classes_300x300_4.h5 is present? True




 75%|███████▌  | 6/8 [10:49<03:28, 104.18s/it][A

file /media/external/all_classes_300x300_5.h5 has 14509 datasets
The image dataset has 7254 batches (7254 x 2) + 1 = 14509
This dataset has a total of 318354 images
  the first image x ref in /media/external/all_classes_300x300_5.h5 is present? True
  the first image y ref in /media/external/all_classes_300x300_5.h5 is present? True
   the last image x ref in /media/external/all_classes_300x300_5.h5 is present? True
   the last image y ref in /media/external/all_classes_300x300_5.h5 is present? True




 88%|████████▊ | 7/8 [12:14<01:38, 98.41s/it] [A

file /media/external/all_classes_300x300_6.h5 has 14505 datasets
The image dataset has 7252 batches (7252 x 2) + 1 = 14505
This dataset has a total of 317831 images
  the first image x ref in /media/external/all_classes_300x300_6.h5 is present? True
  the first image y ref in /media/external/all_classes_300x300_6.h5 is present? True
   the last image x ref in /media/external/all_classes_300x300_6.h5 is present? True
   the last image y ref in /media/external/all_classes_300x300_6.h5 is present? True



In [None]:
# Show the shapes of one example batch: row 272 of the index stored in the
# fifth file.  Column 0 of the row names the image dataset, column 1 the
# target dataset.
with h5py.File(files[4], 'r') as h5_file:
    index = h5_file['batches']
    print('''
        An example row, has N images 300x300 pixels with 3 color layers
    ''')
    x_name = index[272][0]
    display(h5_file[x_name].shape)
    print('''
        and each of their targets has 8732 positions of possible bboxes
        predicting 39 classes + 1 no class and 4 numbers of anchors box
    ''')
    y_name = index[272][1]
    display(h5_file[y_name].shape)