# In [15]:
import h5py
import json
import os
import numpy as np
import openpyxl
import pandas as pd

# Define paths to data files.
# os.path.join is used instead of a hard-coded '\\' so the same script
# resolves the files correctly on Windows, Linux and macOS.
fdir = os.path.normpath(os.getcwd() + os.sep)
scdir = os.path.join(fdir, 'Paterson5_CarraraTestRun_new2.json')  # schema file name
data_folder = os.path.join(fdir, 'Paterson_Carrara_Test_2')

# Path to output HDF5 file
hdf5_file = os.path.join(fdir, 'HDF5_data.h5')

# Open JSON file and load metadata.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open(scdir, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Start from a clean output file: remove a previous HDF5 file if there is one.
# Attempting the removal directly avoids the gap between an exists() check
# and the remove() call.
try:
    os.remove(hdf5_file)
except FileNotFoundError:
    pass

# Function to create nested groups based on directory structure
def create_nested_groups(f, path):
    """Create the chain of nested groups that mirrors a relative path.

    Groups that already exist are reused, so the function can be called
    repeatedly with overlapping paths.

    Args:
        f: Group-like object to create the groups under (e.g. an open
            ``h5py.File``). It must support ``name in f``, ``f[name]`` and
            ``f.create_group(name)``.
        path: Relative directory path whose components are separated by
            ``os.sep``.

    Returns:
        The group for the last component of ``path``, or ``f`` itself when
        ``path`` names no sub-directory (empty string or ``'.'``).
    """
    group = f
    for group_name in path.split(os.sep):
        # Empty components come from leading/trailing/doubled separators and
        # '.' is what os.path.relpath() returns for the top directory itself;
        # neither names a real sub-group.
        if not group_name or group_name == os.curdir:
            continue
        if group_name not in group:
            group.create_group(group_name)
        group = group[group_name]
    return group

# Open HDF5 file in append mode
with h5py.File(hdf5_file, 'a') as f:
    # Save metadata JSON as an attribute of the root group
    f.attrs['Schema_json'] = json.dumps(metadata, indent=4)

    # Mirror the directory tree below data_folder:
    # one group per directory, one dataset per file.
    for root, dirs, files in os.walk(data_folder):
        group_path = os.path.relpath(root, data_folder)

        # relpath() returns '.' for data_folder itself. That directory maps
        # to the root group, so no group is created or looked up for it.
        group = f
        if group_path != '.':
            create_nested_groups(f, group_path)
            for sub_group_name in group_path.split(os.sep):
                group = group[sub_group_name]

        for file_name in files:
            file_path = os.path.join(root, file_name)

            # Skip anything that is not a regular file
            if not os.path.isfile(file_path):
                continue

            # Replace a dataset of the same name instead of failing on it
            if file_name in group:
                del group[file_name]

            # Every file type is stored the same way: as its raw bytes.
            with open(file_path, 'rb') as src:
                binary_data = src.read()

            # Store the payload as a 1-D uint8 array rather than as a byte
            # string: byte-string scalars lose their trailing NUL bytes when
            # read back, which truncates binary files (e.g. zip-based .xlsx
            # files normally end in NUL bytes). Read the original bytes back
            # with ``dataset[()].tobytes()``.
            group.create_dataset(
                file_name,
                data=np.frombuffer(binary_data, dtype=np.uint8),
            )