# Exploring building virtual datasets with kerchunk

The idea: build a test intake catalogue of kerchunk virtual datasets from the COSIMA outputs and see how it performs

In [1]:
import dask

import glob

import fsspec

import ujson

from distributed import Client

from kerchunk.hdf import SingleHdf5ToZarr

In [2]:
client = Client(n_workers=8)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 8,Total memory: 32.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:36631,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 32.00 GiB

0,1
Comm: tcp://127.0.0.1:36137,Total threads: 1
Dashboard: http://127.0.0.1:38633/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:34875,
Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-j22wi9vg,Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-j22wi9vg

0,1
Comm: tcp://127.0.0.1:32861,Total threads: 1
Dashboard: http://127.0.0.1:36529/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:38601,
Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-_mux_150,Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-_mux_150

0,1
Comm: tcp://127.0.0.1:46169,Total threads: 1
Dashboard: http://127.0.0.1:42829/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:35277,
Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-rbocs9vm,Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-rbocs9vm

0,1
Comm: tcp://127.0.0.1:33203,Total threads: 1
Dashboard: http://127.0.0.1:34967/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:41593,
Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-8mucxcfp,Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-8mucxcfp

0,1
Comm: tcp://127.0.0.1:33121,Total threads: 1
Dashboard: http://127.0.0.1:40611/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:35117,
Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-av_nfvma,Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-av_nfvma

0,1
Comm: tcp://127.0.0.1:40949,Total threads: 1
Dashboard: http://127.0.0.1:36409/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:43185,
Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-r7_u05u6,Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-r7_u05u6

0,1
Comm: tcp://127.0.0.1:44167,Total threads: 1
Dashboard: http://127.0.0.1:33823/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:33281,
Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-c_r4lmq1,Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-c_r4lmq1

0,1
Comm: tcp://127.0.0.1:40951,Total threads: 1
Dashboard: http://127.0.0.1:33153/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:32881,
Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-s_ffd8k8,Local directory: /jobfs/61001291.gadi-pbs/dask-worker-space/worker-s_ffd8k8


# First try build a virtual dataset from `access-om2-025` `025deg_jra55_iaf_omip2_cycle1` `ocean_month` data (~2.2TB, 61 netcdf files)

## Write single file jsons in parallel

In [3]:
exp_root = "/g/data/ik11/outputs/access-om2-025/025deg_jra55_iaf_omip2_cycle1"

fs = fsspec.filesystem('file')
flist = fs.glob(f"{exp_root}/output*/ocean/ocean_month.nc")

In [6]:
@dask.delayed
def gen_json(file):
    with fs.open(file) as infile:
        h5chunks = SingleHdf5ToZarr(infile, file)
        outf = f"{'.'.join(file.split('/')[5:])}.json"
        with open(outf, 'wb') as f:
            f.write(ujson.dumps(h5chunks.translate()).encode());

In [7]:
%%time

# This would take well over an hour without dask
_ = dask.compute(*[gen_json(file) for file in flist])

CPU times: user 1min 10s, sys: 48 s, total: 1min 58s
Wall time: 13min 55s


## Combine into multi-file json

In [8]:
from kerchunk.combine import MultiZarrToZarr

In [11]:
%%time
json_list = fs.glob("./access-om2-025.025deg_jra55_iaf_omip2_cycle1.*.ocean.ocean_month.nc.json")

mzz = MultiZarrToZarr(
    json_list[:2],
    concat_dims=['time'],
    identical_dims=[
        "xt_ocean", 
        "yt_ocean", 
        "st_ocean", 
        "xu_ocean", 
        "yu_ocean", 
        "sw_ocean", 
        "grid_xt_ocean", 
        "grid_yt_ocean", 
        "grid_xu_ocean", 
        "grid_yu_ocean", 
        "potrho", 
        "neutral", 
        "nv"
     ],
)

d = mzz.translate()

with open('access-om2-025.025deg_jra55_iaf_omip2_cycle1.ocean_month.json', 'wb') as f:
    f.write(ujson.dumps(d).encode())

CPU times: user 2.91 s, sys: 1.47 s, total: 4.38 s
Wall time: 3.08 s


In [18]:
import xarray as xr

ds = xr.open_dataset(fs, engine="zarr", backend_kwargs={
                    "consolidated": False,
                    "storage_options": {"fo": 'access-om2-025.025deg_jra55_iaf_omip2_cycle1.ocean_month.json'}
                    })
print(ds)

ValueError: Starting with Zarr 2.11.0, stores must be subclasses of BaseStore, if your store exposes the MutableMapping interface wrap it in Zarr.storage.KVStore. Got <fsspec.implementations.local.LocalFileSystem object at 0x145cf1b596a0>