In [1]:
import numpy as np
import pandas as pd

import os
os.sys.path.append('../..')

import h5py
import ray

from tqdm import tqdm

from IPython.display import display

import project.download_content as content

# Directory named 'METADATA' under the project's data path; count_file below
# writes its per-batch .npy results beneath it.
METAPATH = os.path.join(content.DATAPATH, 'METADATA')

# Eight HDF5 shards, numbered 0-7, that differ only by their trailing index.
files_path = [
    f"/media/external/all_classes_300x300_{shard}.h5"
    for shard in range(8)
]

In [39]:
%%time

def count(y):
    """Reduce one batch of encoded labels to a flat vector of counts.

    ``y`` is indexed as a 3-D array. Judging by the original comments, axis 0
    is the image within the batch and axis 1 the bounding-box slot -- TODO
    confirm against the label encoder. Along the last axis, column 0 is
    subtracted from the batch size (presumably a "background / no object"
    flag), columns ``1:-4`` are summed as class indicators, and the final
    four columns are ignored.

    Returns:
        1-D array made of two parts, concatenated in this order:
        * for every column in ``1:-4``, its sum over axes 0 and 1;
        * for every index along axis 1, ``y.shape[0]`` minus the sum of
          column 0 over axis 0.
    """
    batch_size = y.shape[0]
    background_flags = y[:, :, 0]
    class_flags = y[:, :, 1:-4]

    # Total occurrences of each class over the whole batch.
    per_class = class_flags.sum(axis=(0, 1))
    # For each box slot: how many batch entries have it not flagged in column 0.
    per_box = batch_size - background_flags.sum(axis=0)

    return np.concatenate((per_class, per_box))


@ray.remote
def count_file(path):
    """Ray task: write the ``count`` summary of every batch in one HDF5 file.

    Reads the ``'batches'`` dataset of the file at ``path``, takes the second
    element of each entry as the name of a label dataset in the same file,
    loads that dataset, and saves ``count(labels)`` to
    ``METAPATH/dataaug/<file stem>/<name without its last two characters>.npy``.

    Changes from the previous version:
    * an unused, empty DataFrame (index ``file``/``batch``, 599 ``c_*`` and
      8732 ``b_*`` columns) was built on every call and has been removed;
      those sizes presumably mirror the two parts of the ``count`` output --
      TODO confirm;
    * the output directory name is derived with ``os.path`` instead of the
      fixed slice ``path[-24:-3]``, which was only correct for file names of
      exactly 24 characters (identical result for the current inputs);
    * the output directory is created if it does not exist yet.

    Args:
        path: Path to an HDF5 file containing a ``'batches'`` dataset.

    Returns:
        None. Results are written to disk; progress is printed every 250
        batches.
    """
    # Directory named after the input file without its extension.
    stem = os.path.splitext(os.path.basename(path))[0]
    out_dir = f"{METAPATH}/dataaug/{stem}"
    os.makedirs(out_dir, exist_ok=True)

    with h5py.File(path, 'r') as f_temp:
        batches = f_temp['batches'][:]

        # Only the second item of every entry (the label reference) is needed.
        y_refs = [y for _, y in batches]

        print(path, "has", len(y_refs), "batches")

        for i, ref in enumerate(y_refs, 1):
            if i % 250 == 0:
                print(path, i)

            y = f_temp[ref][:]

            # ref[:-2] drops the last two characters of the dataset name
            # (presumably a "_y"-style suffix -- TODO confirm).
            np.save(f"{out_dir}/{ref[:-2]}.npy", count(y))

# Start a local Ray runtime with 8 CPUs, submit one count_file task per entry
# in files_path, and block until all of them finish. The runtime is shut down
# even if a task raises.
ray.init(num_cpus=8)
try:
    futures = list(map(count_file.remote, files_path))
    ray.get(futures)
finally:
    ray.shutdown()

2020-01-04 11:21:13,249	INFO node.py:498 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2020-01-04_11-21-13_246078_5391/logs.
2020-01-04 11:21:13,357	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:59810 to respond...
2020-01-04 11:21:13,492	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:40126 to respond...
2020-01-04 11:21:13,499	INFO services.py:809 -- Starting Redis shard with 10.0 GB max memory.
2020-01-04 11:21:13,544	INFO node.py:512 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2020-01-04_11-21-13_246078_5391/logs.
2020-01-04 11:21:13,547	INFO services.py:1475 -- Starting the Plasma object store with 15.15 GB memory using /dev/shm.


[2m[36m(pid=11173)[0m /media/external/all_classes_300x300_1.h5 has 7281 batches
[2m[36m(pid=11170)[0m /media/external/all_classes_300x300_3.h5 has 7267 batches
[2m[36m(pid=11166)[0m /media/external/all_classes_300x300_4.h5 has 7259 batches
[2m[36m(pid=11167)[0m /media/external/all_classes_300x300_0.h5 has 7291 batches
[2m[36m(pid=11172)[0m /media/external/all_classes_300x300_2.h5 has 7271 batches
[2m[36m(pid=11171)[0m /media/external/all_classes_300x300_6.h5 has 7252 batches
[2m[36m(pid=11174)[0m /media/external/all_classes_300x300_7.h5 has 7248 batches
[2m[36m(pid=11169)[0m /media/external/all_classes_300x300_5.h5 has 7254 batches
[2m[36m(pid=11170)[0m /media/external/all_classes_300x300_3.h5 250
[2m[36m(pid=11167)[0m /media/external/all_classes_300x300_0.h5 250
[2m[36m(pid=11166)[0m /media/external/all_classes_300x300_4.h5 250
[2m[36m(pid=11174)[0m /media/external/all_classes_300x300_7.h5 250
[2m[36m(pid=11169)[0m /media/external/all_classes_300

[2m[36m(pid=11169)[0m /media/external/all_classes_300x300_5.h5 3500
[2m[36m(pid=11170)[0m /media/external/all_classes_300x300_3.h5 3500
[2m[36m(pid=11166)[0m /media/external/all_classes_300x300_4.h5 3500
[2m[36m(pid=11173)[0m /media/external/all_classes_300x300_1.h5 3500
[2m[36m(pid=11171)[0m /media/external/all_classes_300x300_6.h5 3500
[2m[36m(pid=11167)[0m /media/external/all_classes_300x300_0.h5 3750
[2m[36m(pid=11174)[0m /media/external/all_classes_300x300_7.h5 3750
[2m[36m(pid=11172)[0m /media/external/all_classes_300x300_2.h5 3750
[2m[36m(pid=11169)[0m /media/external/all_classes_300x300_5.h5 3750
[2m[36m(pid=11171)[0m /media/external/all_classes_300x300_6.h5 3750
[2m[36m(pid=11166)[0m /media/external/all_classes_300x300_4.h5 3750
[2m[36m(pid=11173)[0m /media/external/all_classes_300x300_1.h5 3750
[2m[36m(pid=11170)[0m /media/external/all_classes_300x300_3.h5 3750
[2m[36m(pid=11167)[0m /media/external/all_classes_300x300_0.h5 4000
[2m[

[2m[36m(pid=11166)[0m /media/external/all_classes_300x300_4.h5 7000
[2m[36m(pid=11167)[0m /media/external/all_classes_300x300_0.h5 7250
[2m[36m(pid=11170)[0m /media/external/all_classes_300x300_3.h5 7250
[2m[36m(pid=11171)[0m /media/external/all_classes_300x300_6.h5 7250
[2m[36m(pid=11169)[0m /media/external/all_classes_300x300_5.h5 7250
[2m[36m(pid=11173)[0m /media/external/all_classes_300x300_1.h5 7250
[2m[36m(pid=11172)[0m /media/external/all_classes_300x300_2.h5 7250
[2m[36m(pid=11166)[0m /media/external/all_classes_300x300_4.h5 7250
CPU times: user 6min 9s, sys: 1min, total: 7min 10s
Wall time: 8h 15min 34s


In [43]:
# NOTE(review): `y` is not assigned at notebook level in any cell visible here
# (it only exists as a local inside count_file). This cell presumably relied on
# kernel state from a cell that is no longer shown -- confirm before re-running.
# Displays every last-axis column of `y` except the final four.
y[:,:,0:-4]

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0.