In [331]:
import base64
import copy
import gzip
import json
import os
from functools import reduce

import ale
import brotli
import h5py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from osgeo import gdal
from sozipfile import sozipfile as szip

In [333]:
# Read the ten sample ISDs (./lroc_isd0.json ... ./lroc_isd9.json) into a list of dicts.
filenames = ['./lroc_isd%d.json' % index for index in range(10)]
isds = []
for fn in filenames:
    with open(fn, 'r') as f:
        isds.append(json.load(f))

In [334]:
# Deep-copy the ISDs: list.copy() is shallow, so the nested dicts would be shared
# with `isds` and the in-place perturbation applied to `isds_altered` would also
# change the originals.
isds_altered = copy.deepcopy(isds)

In [335]:
# Scale the instrument-pointing data of the i-th ISD by (1 + i / 1e8) so that every
# ISD differs slightly from the others (the first one, i == 0, is left unchanged).
# NOTE: the dicts are rewritten in place, so re-running this cell scales them again.
for i, altered in enumerate(isds_altered):
    diff = (1 + (i/100000000))
    pointing = altered['instrument_pointing']
    pointing['ephemeris_times'] = [diff * num for num in pointing['ephemeris_times']]
    # Both of these fields are lists of lists; scale every component.
    for field in ('quaternions', 'angular_velocities'):
        pointing[field] = [[diff * num for num in row] for row in pointing[field]]


In [402]:
def compress_multispectral_isd(outfile, json_str_list, alg='hdf5'):
    """Write a list of ISDs to ``outfile`` using the selected storage backend.

    Parameters
    ----------
    outfile : str
        Destination path.  For ``alg="gdal"`` the file is opened with
        ``gdal.GA_Update`` and the ISDs are attached to it as metadata items.
    json_str_list : list of dict
        Parsed ISD dictionaries.  For ``"hdf5"`` and ``"gdal"`` the first entry
        defines the key layout: every key path found there is looked up in all
        entries and the values are stored together, one element per ISD.
        For ``"brotli"`` and ``"sozip"`` the whole list is serialised as a
        single JSON document (a ``str`` is taken to be JSON text already).
    alg : str
        One of ``"hdf5"``, ``"brotli"``, ``"sozip"`` or ``"gdal"``.

    Returns
    -------
    bool
        ``True`` once the data has been written, ``False`` when ``alg`` is not
        one of the supported names.
    """

    def split_path(path):
        """Split a '/'-joined key path into its non-empty components."""
        return [part for part in path.split('/') if part]

    def gather(chain):
        """Collect the value at ``chain`` from every ISD; missing keys yield a placeholder dict."""
        return [reduce(lambda d, k: d.get(k, {'empty': []}), chain, isd)
                for isd in json_str_list]

    def gather_strings(chain):
        """Collect the string at ``chain`` from every ISD.

        If the key path is missing in any ISD, a placeholder string is used for
        all of them.  Should probably use data where it's available instead of
        replacing all values with the placeholder.
        """
        try:
            return [reduce(lambda d, k: d[k], chain, isd) for isd in json_str_list]
        except KeyError:
            return ["empty"] * len(json_str_list)

    def serialized():
        """Return the ISDs as one JSON document (a str input is passed through unchanged)."""
        if isinstance(json_str_list, str):
            return json_str_list
        return json.dumps(json_str_list)

    if alg == "hdf5":
        with h5py.File(outfile, 'w') as f:
            def store_recursive(key, val, group):
                # Key path from the file root down to this entry.
                chain = split_path(group.name + '/' + key)
                if isinstance(val, dict):
                    # Create nested group in hdf5, i.e. f['group/subgroup']
                    subgroup = group.create_group(key)
                    for k, v in val.items():
                        store_recursive(k, v, subgroup)
                elif isinstance(val, str):
                    # Encode every string as a one-element byte-string array.
                    data = [np.array([text], dtype='S') for text in gather_strings(chain)]
                    group.create_dataset(key, data=data)
                elif np.array([val]).size > 1:
                    # Multi-element values are stored gzip-compressed.
                    group.create_dataset(key, data=gather(chain),
                                         compression='gzip', compression_opts=9)
                elif '/' not in key:
                    # Single values are stored uncompressed; keys containing '/'
                    # are skipped because they would be split into a bogus path.
                    group.create_dataset(key, data=gather(chain))
            for k, v in json_str_list[0].items():
                store_recursive(k, v, f)
        return True
    elif alg == "brotli":
        with open(outfile, 'wb') as f:
            f.write(brotli.compress(serialized().encode('utf-8')))
        return True
    elif alg == "sozip":
        with szip.ZipFile(outfile, 'w', compression=szip.ZIP_DEFLATED) as f:
            # The archive member is named after the archive file itself.
            f.writestr(os.path.basename(outfile), serialized())
        return True
    elif alg == "gdal":
        # Open the requested file (not a hard-coded name) in update mode; this
        # relies on the dataset object supporting the context-manager protocol.
        with gdal.Open(outfile, gdal.GA_Update) as df:
            def store_recursive(key, val, keyword):
                chain = split_path(keyword + '/' + key)
                if isinstance(val, dict):
                    # Nested dicts only extend the '/'-joined metadata key.
                    for k, v in val.items():
                        store_recursive(k, v, keyword + '/' + key)
                    return
                if isinstance(val, str):
                    data = gather_strings(chain)
                elif np.array([val]).size > 10 or '/' not in key:
                    # Values with more than 10 elements are always stored;
                    # smaller ones only when the key itself contains no '/'.
                    # (Large arrays could be packed to bytes and base64-encoded
                    # here before dumping if smaller metadata is needed.)
                    data = gather(chain)
                else:
                    return
                # One JSON-encoded list per key path, one element per ISD.
                df.SetMetadataItem('/'.join(chain), json.dumps(data))
            for k, v in json_str_list[0].items():
                store_recursive(k, v, '')
        return True
    else:
        return False



        

In [403]:
def read_hdf5(hdf5_file):
    """Read an HDF5 file into a nested dict of plain Python values.

    Groups become nested dicts and datasets are decoded as follows:

    * string arrays: the first element, converted to ``str``;
    * numeric arrays: the scalar itself when there is exactly one element,
      otherwise only the first entry along the leading axis, as a list;
    * anything else: the whole array as a list.

    The keys ``"ephemeris_times"`` and ``"constant_frames"`` are wrapped in a
    one-element list when their decoded value is not iterable.
    """
    always_list_keys = ("ephemeris_times", "constant_frames")

    def to_python(array):
        """Convert NumPy arrays to raw Python data types."""
        if not isinstance(array, np.ndarray):
            return None
        if array.dtype.char in ('S', 'U'):
            return array.astype(str)[0]
        if not np.issubdtype(array.dtype, np.number):
            return array.tolist()
        return array.item() if array.size == 1 else array[0].tolist()

    def walk(group):
        contents = {}
        for name, node in group.items():
            if isinstance(node, h5py.Group):
                contents[name] = walk(node)
                continue
            if not isinstance(node, h5py.Dataset):
                continue
            value = to_python(node[:])
            needs_wrapping = name in always_list_keys and not hasattr(value, '__iter__')
            contents[name] = [value] if needs_wrapping else value
        return contents

    # Walk the whole tree starting at the root group of the file.
    with h5py.File(hdf5_file, 'r') as f:
        data = walk(f)

    return data

In [404]:
def read_brotli(compressed_json_file):
    """Decompress a Brotli-compressed JSON file and return the parsed content."""
    with open(compressed_json_file, 'rb') as handle:
        compressed = handle.read()
    return json.loads(brotli.decompress(compressed))

In [405]:
def read_sozip(zip_file, member=None):
    """Read a JSON document stored as a member of a zip archive.

    Parameters
    ----------
    zip_file : str
        Path of the archive to open.
    member : str, optional
        Name of the archive member holding the JSON text.  When omitted, the
        archive's own base name is tried first (the writer in this notebook
        names the member after the output file); if no such member exists the
        previously hard-coded name ``"lroc.zip"`` is used.

    Returns
    -------
    The parsed JSON content.

    Raises
    ------
    KeyError
        If the selected member is not present in the archive.
    """
    with szip.ZipFile(zip_file, 'r') as archive:
        if member is None:
            member = os.path.basename(zip_file)
            if member not in archive.namelist():
                # Keep working for archives read by the earlier hard-coded version.
                member = "lroc.zip"
        data = archive.read(member)
    return json.loads(data.decode('utf-8'))

In [411]:
# Store the perturbed ISDs with the "gdal" backend, targeting "out.tif".
# This cell originally tested that compression/decompression yields identical isds;
# those round-trip checks (and the timing runs) are the commented-out lines in this cell.
# NOTE(review): the gdal backend opens its file in update mode, so "out.tif"
# presumably has to exist before this cell runs — confirm.
#%timeit compress_isd("lroc.h5", original_isd, "hdf5")
#compress_multispectral_isd("lroc_transpose.h5", isds, "hdf5")

compress_multispectral_isd("out.tif", isds_altered, "gdal")


#print("compress brotli: ")
#%timeit compress_isd("lroc.br", json.dumps(original_isd), "brotli")
#print("compress sozip: ")
#%timeit compress_isd("lroc.zip", filename, "sozip")
#%timeit hdf5_isd = read_hdf5("lroc.h5")

#print("decompress brotli: ")
#%timeit brotli_isd = read_brotli("lroc.br")

#print("decompress sozip: ")
#%timeit sozip_isd = read_sozip("lroc.zip")


#print(f"HDF5: {hdf5_isd == original_isd}")
#print(f"brotli: {brotli_isd == original_isd}")
#print(f"sozip: {sozip_isd == original_isd}")


In [356]:
#compress_multispectral_isd("lroc_altered_transpose.h5", isds_altered, "hdf5")

In [357]:
# Load the transposed HDF5 file back into a nested dict.
# NOTE(review): the call that writes 'lroc_transpose.h5' is commented out in an
# earlier cell, so on a fresh kernel this presumably relies on a file left over
# from a previous run — confirm.
hdf5_isd = read_hdf5('lroc_transpose.h5')

In [414]:
# Spot-check a few entries of the transposed HDF5 file.
with h5py.File('lroc_transpose.h5', 'r') as f:
    top_level_keys = f.keys()
    print(top_level_keys)
    body_rotation = f['body_rotation']
    print(body_rotation['angular_velocities'][:])
    # No slice here, so this prints the dataset object itself rather than its values.
    print(f['isis_camera_version'])
    print(f['line_scan_rate'][:])

<KeysViewHDF5 ['body_rotation', 'center_ephemeris_time', 'detector_center', 'detector_line_summing', 'detector_sample_summing', 'focal2pixel_lines', 'focal2pixel_samples', 'focal_length_model', 'image_lines', 'image_samples', 'instrument_pointing', 'instrument_position', 'interpolation_method', 'isis_camera_version', 'line_scan_rate', 'naif_keywords', 'name_model', 'name_platform', 'name_sensor', 'optical_distortion', 'radii', 'reference_height', 'starting_detector_line', 'starting_detector_sample', 'starting_ephemeris_time', 'sun_position']>
[[[ 6.22675585e-08 -1.02491422e-06  2.45531633e-06]
  [ 6.22675607e-08 -1.02491422e-06  2.45531633e-06]]

 [[ 6.22675585e-08 -1.02491422e-06  2.45531633e-06]
  [ 6.22675607e-08 -1.02491422e-06  2.45531633e-06]]

 [[ 6.22675585e-08 -1.02491422e-06  2.45531633e-06]
  [ 6.22675607e-08 -1.02491422e-06  2.45531633e-06]]

 [[ 6.22675585e-08 -1.02491422e-06  2.45531633e-06]
  [ 6.22675607e-08 -1.02491422e-06  2.45531633e-06]]

 [[ 6.22675585e-08 -1.02491