In [29]:
import os
import json
import ujson

import fsspec
import earthaccess

import xarray as xr
from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
from virtualizarr import open_virtual_dataset

In [35]:
# Get Earthdata creds (log in to NASA Earthdata via earthaccess; this must run
# before the calls below that search for and open granules).
earthaccess.login()

# Get AWS creds: filesystem object for the PO.DAAC archive. Its
# `.storage_options` are reused later in this notebook when building and
# opening the reference files.
fs_data = earthaccess.get_s3_filesystem(daac="PODAAC")

In [45]:
# Locate file information / metadata for the OSTIA collection.
# `count=50` caps the search at 50 granules, so every later step works on
# at most 50 files.
granule_info = earthaccess.search_data(
    short_name="OSTIA-UKMO-L4-GLOB-REP-v2.0",
    count=50
    )

In [46]:
# Open every granule found above; the resulting objects are what
# single_ref_earthaccess_kerchunk() consumes further down.
fobjs = earthaccess.open(granule_info)

QUEUEING TASKS | :   0%|          | 0/50 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/50 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/50 [00:00<?, ?it/s]

## Create reference file with kerchunk

In [47]:
def single_ref_earthaccess_kerchunk(fobj):
    """
    Build the kerchunk reference for one data file.

    "fobj" is a file object as returned by earthaccess.open()
    (earthaccess.store.EarthAccessFile). Its "full_name" attribute is handed
    to kerchunk as the file endpoint alongside the object itself.

    Returns whatever SingleHdf5ToZarr(...).translate() produces for that file.
    """
    # inline_threshold=0 is forwarded to kerchunk unchanged.
    translator = SingleHdf5ToZarr(fobj, fobj.full_name, inline_threshold=0)
    return translator.translate()

In [50]:
%%time
# One kerchunk reference per opened granule, in the same order as `fobjs`.
single_refs_kerchunk = list(map(single_ref_earthaccess_kerchunk, fobjs))

CPU times: user 918 ms, sys: 3.42 ms, total: 921 ms
Wall time: 920 ms


In [51]:
%%time
# Combined reference file: merge the per-granule references along "time".
# The S3 credentials come from `fs_data` (created in the credentials cell);
# the previous `fs.storage_options` referred to a name that is never defined
# in this notebook and failed with NameError on a fresh kernel.
kwargs_mzz = {
    'remote_protocol': "s3",
    'remote_options': fs_data.storage_options,
    'concat_dims': ["time"],
}
combined_ref_kerchunk = MultiZarrToZarr(single_refs_kerchunk, **kwargs_mzz).translate()

# Save reference info to JSON (serialized with ujson, written as bytes):
with open("ref_combined_kerchunk.json", 'wb') as outf:
    outf.write(ujson.dumps(combined_ref_kerchunk).encode())

CPU times: user 327 ms, sys: 9.39 ms, total: 337 ms
Wall time: 3.02 s


## Create reference file with virtualizarr

In [52]:
%%time

# First direct-access (S3) link of every granule:
data_s3links = [granule.data_links(access="direct")[0] for granule in granule_info]

# One virtual dataset per S3 link, reading with the credentials held by fs_data:
singe_refs_virtualizarr = [
    open_virtual_dataset(
        s3link,
        indexes={},
        reader_options={"storage_options": fs_data.storage_options},
    )
    for s3link in data_s3links
]

# Concatenate the virtual datasets along "time" and write the result as a
# kerchunk-style JSON reference file:
combined_ref_virtualizarr = xr.combine_nested(
    singe_refs_virtualizarr,
    concat_dim='time',
    coords='minimal',
    compat='override',
    combine_attrs='drop_conflicts',
)
combined_ref_virtualizarr.virtualize.to_kerchunk("ref_combined_virtualizarr.json", format='json')

CPU times: user 5.02 s, sys: 974 ms, total: 6 s
Wall time: 20.2 s


## Compare opening data with the two ref files

In [53]:
def opendf_withref(ref, fs_data):
    """
    Open a dataset with xarray through a reference file.

    "ref" is a reference file or object, passed to fsspec's 'reference'
    filesystem as "fo". "fs_data" is a filesystem with credentials to access
    the data files; its storage_options are forwarded as "remote_options"
    with "s3" as the remote protocol.

    Returns the xarray Dataset opened with the zarr engine, chunks={} and
    consolidated=False.
    """
    reference_fs = fsspec.filesystem(
        'reference',
        fo=ref,
        remote_protocol="s3",
        remote_options=fs_data.storage_options,
    )
    store = reference_fs.get_mapper('')
    return xr.open_dataset(
        store,
        engine="zarr",
        chunks={},
        backend_kwargs={"consolidated": False},
    )

In [54]:
%%time
# Open the data through the kerchunk-generated combined reference file
# and print the dataset summary.
data_kerchunk = opendf_withref("ref_combined_kerchunk.json", fs_data)
print(data_kerchunk)

<xarray.Dataset> Size: 36GB
Dimensions:           (time: 50, lat: 3600, lon: 7200)
Coordinates:
  * lat               (lat) float32 14kB -89.97 -89.93 -89.88 ... 89.93 89.97
  * lon               (lon) float32 29kB -180.0 -179.9 -179.9 ... 179.9 180.0
  * time              (time) datetime64[ns] 400B 1982-01-01T12:00:00 ... 1982...
Data variables:
    analysed_sst      (time, lat, lon) float64 10GB dask.array<chunksize=(1, 1200, 2400), meta=np.ndarray>
    analysis_error    (time, lat, lon) float64 10GB dask.array<chunksize=(1, 1200, 2400), meta=np.ndarray>
    mask              (time, lat, lon) float32 5GB dask.array<chunksize=(1, 1800, 3600), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float64 10GB dask.array<chunksize=(1, 1800, 3600), meta=np.ndarray>
Attributes: (12/47)
    Conventions:                CF-1.4, ACDD-1.3
    Metadata_Conventions:       Unidata Observation Dataset v1.0
    acknowledgment:             Please acknowledge the use of these data with...
    cdm_d

In [55]:
%%time
# Open the data through the virtualizarr-generated combined reference file
# and print the dataset summary for comparison with the kerchunk result.
data_virtualizarr = opendf_withref("ref_combined_virtualizarr.json", fs_data)
print(data_virtualizarr)

<xarray.Dataset> Size: 36GB
Dimensions:           (time: 50, lat: 3600, lon: 7200)
Coordinates:
  * lat               (lat) float32 14kB -89.97 -89.93 -89.88 ... 89.93 89.97
  * lon               (lon) float32 29kB -180.0 -179.9 -179.9 ... 179.9 180.0
  * time              (time) datetime64[ns] 400B 1982-01-01T12:00:00 ... 1982...
Data variables:
    analysed_sst      (time, lat, lon) float64 10GB dask.array<chunksize=(1, 1200, 2400), meta=np.ndarray>
    analysis_error    (time, lat, lon) float64 10GB dask.array<chunksize=(1, 1200, 2400), meta=np.ndarray>
    mask              (time, lat, lon) float32 5GB dask.array<chunksize=(1, 1800, 3600), meta=np.ndarray>
    sea_ice_fraction  (time, lat, lon) float64 10GB dask.array<chunksize=(1, 1800, 3600), meta=np.ndarray>
Attributes: (12/42)
    Conventions:                CF-1.4, ACDD-1.3
    Metadata_Conventions:       Unidata Observation Dataset v1.0
    acknowledgment:             Please acknowledge the use of these data with...
    cdm_d