In [4]:
import h5py
import numpy as np

In [1]:
# def merge_dataset(f_in, f_out, chunk_size=20):
#     # Copy 'train' datasets if not exist in output
#     for data_name in f_in['train']:
#         if data_name not in f_out:
#             data = f_in['train'][data_name]
#             # Create chunked dataset
#             f_out.create_dataset(data_name, data=data, chunks=(min(chunk_size, data.shape[0]),) + data.shape[1:])

#     for group_name in ['test', 'valid']:
#         for data_name in f_in[group_name]:
#             ds_in = f_in[group_name][data_name]
#             if data_name in f_out:
#                 # Get existing dataset
#                 ds_out = f_out[data_name]
#                 # Determine the name for a temporary dataset
#                 temp_name = data_name + "_temp"
#                 # If a temporary dataset already exists, delete it
#                 if temp_name in f_out:
#                     del f_out[temp_name]
#                 # Create a new dataset with chunking and expanded size
#                 new_shape = (ds_out.shape[0] + ds_in.shape[0],) + ds_out.shape[1:]
#                 chunked_shape = (min(chunk_size, new_shape[0]),) + new_shape[1:]
#                 temp_ds = f_out.create_dataset(temp_name, new_shape, chunks=chunked_shape)
#                 # Copy old data to new dataset
#                 temp_ds[:ds_out.shape[0]] = ds_out[:]
#                 # Append new data in chunks
#                 for i in range(0, ds_in.shape[0], chunk_size):
#                     chunk = ds_in[i:i + chunk_size]
#                     temp_ds[ds_out.shape[0] + i:ds_out.shape[0] + i + chunk.shape[0]] = chunk
#                 # Delete the old dataset
#                 del f_out[data_name]
#                 # Create a new dataset with the original name and copy the data from the temporary dataset
#                 f_out.create_dataset(data_name, data=temp_ds, chunks=chunked_shape)
#                 # Delete the temporary dataset
#                 del f_out[temp_name]
#             else:
#                 # Create new dataset if not exist
#                 f_out.create_dataset(data_name, data=ds_in, chunks=(min(chunk_size, ds_in.shape[0]),) + ds_in.shape[1:])

In [8]:
def merge_dataset_chunked(f_in, f_out, chunk_size=20):
    # Copy 'train' datasets if not exist in output, in chunks
    for data_name in f_in['train']:
        if data_name not in f_out:
            ds_in = f_in['train'][data_name]
            max_shape = ds_in.shape
            chunk_shape = (min(chunk_size, max_shape[0]),) + max_shape[1:]
            ds_out = f_out.create_dataset(data_name, shape=max_shape, dtype=ds_in.dtype, chunks=chunk_shape)
            for i in range(0, ds_in.shape[0], chunk_size):
                ds_out[i:i + chunk_size] = ds_in[i:i + chunk_size]
    
    for group_name in ['test', 'valid']:
        if group_name in f_in:
            for data_name in f_in[group_name]:
                ds_in = f_in[group_name][data_name]
                if data_name in f_out:
                    ds_out = f_out[data_name]
                    new_shape = (ds_out.shape[0] + ds_in.shape[0],) + ds_out.shape[1:]
                    temp_name = data_name + "_temp"
                    temp_ds = f_out.create_dataset(temp_name, new_shape, dtype=ds_in.dtype, chunks=ds_out.chunks)
                    
                    # Copy data from the old dataset in chunks
                    for i in range(0, ds_out.shape[0], chunk_size):
                        end_index = min(i + chunk_size, ds_out.shape[0])
                        temp_ds[i:end_index] = ds_out[i:end_index]
                    
                    del f_out[data_name]  # Remove old dataset
                    
                    # Append new data in chunks
                    for i in range(0, ds_in.shape[0], chunk_size):
                        end_index = ds_out.shape[0] + i + chunk_size
                        temp_ds[ds_out.shape[0] + i:end_index] = ds_in[i:i + chunk_size]
                    
                    f_out.move(temp_name, data_name)
                else:
                    max_shape = ds_in.shape
                    chunk_shape = (min(chunk_size, max_shape[0]),) + max_shape[1:]
                    f_out.create_dataset(data_name, shape=max_shape, dtype=ds_in.dtype, chunks=chunk_shape, data=ds_in)


In [9]:
def copy_datasets(source_file_path, target_file_path):
    with h5py.File(source_file_path, 'r') as source_file, h5py.File(target_file_path, 'w') as target_file:
        def copy_dataset(name, obj):
            if isinstance(obj, h5py.Dataset):
                target_file.create_dataset(name, data=obj[()])
            elif isinstance(obj, h5py.Group):
                target_file.create_group(name)
        
        source_file.visititems(copy_dataset)

In [11]:
input_file = "../data/ns_contextual/ns_random_forces.h5"
output_file = "../data/ns_contextual/ns_random_forces_v1.h5"

with h5py.File(input_file, 'r') as f_in:
    with h5py.File(output_file, 'a') as f_out:
        merge_dataset_chunked(f_in, f_out)

        # Read the data into memory
        a_data = f_out['a'][:]
        u_data = f_out['u'][:]

        # Process the data
        expanded_a_data = np.expand_dims(a_data, axis=-1)
        concatenated_data = np.concatenate((expanded_a_data, u_data), axis=-1)

        # Replace the 'u' dataset
        del f_out['u']  # Delete the existing 'u' dataset
        f_out.create_dataset('u', data=concatenated_data)

        # Optionally, delete the 'a' dataset if no longer needed
        del f_out['a']

MemoryError: Unable to allocate 68.4 GiB for an array with shape (1400, 256, 256, 200) and data type float32

In [None]:
source_h5_path = "../data/ns_contextual/ns_random_forces_v1.h5"
target_h5_path = "../data/ns_contextual/ns_random_forces_v0.h5"
copy_datasets(source_h5_path, target_h5_path)

In [None]:
!rm "../data/ns_contextual/ns_random_forces_v1.h5"