# How does kerchunk handle missing data? 

For example, what happens when a variable is not present in every netCDF file?

In [1]:
# Work from the project directory so the relative ./test_data paths used in later cells resolve
%cd /g/data/tm70/ds0092/projects/dev_data_querying/cosima_intake

/g/data/tm70/ds0092/projects/dev_data_querying/cosima_intake


In [2]:
import os
import glob

import ujson

import fsspec

import zarr

import xarray as xr

from kerchunk import hdf, df, combine

import matplotlib.pyplot as plt

In [3]:
# Need https://github.com/fsspec/kerchunk/pull/311
# NOTE(review): the PR linked above is in the kerchunk repository, but the
# version displayed here is fsspec's — confirm which package's dev build is required.
fsspec.__version__

'2023.1.0+20.ga50899e'

## Use some files from `/g/data/ik11/outputs/access-om2/1deg_jra55v14_ryf` as a simple test case

Note that these data have an issue that requires a work-around: the `time_bounds` variable has units that are (I think) not CF-compliant (`days` rather than `days since ...`).

In [4]:
# Each tuple lists the per-variable files that are merged into one test dataset.
files_1900 = (
    "ocean-2d-mld-1-monthly-mean-ym_1900_01.nc",
    "ocean-2d-surface_pot_temp-1-monthly-mean-ym_1900_01.nc",
)

# The mld file for 1901 is left out so that the variable is missing from the
# middle dataset:
#   "ocean-2d-mld-1-monthly-mean-ym_1901_01.nc"
files_1901 = (
    "ocean-2d-surface_pot_temp-1-monthly-mean-ym_1901_01.nc",
)

files_1902 = (
    "ocean-2d-mld-1-monthly-mean-ym_1902_01.nc",
    "ocean-2d-surface_pot_temp-1-monthly-mean-ym_1902_01.nc",
)

dss = [files_1900, files_1901, files_1902]

In [5]:
# Merge each group of per-variable files into a single netCDF file
# (./test_data/ds0.nc, ./test_data/ds1.nc, ...) to use as kerchunk inputs.
for idx, group in enumerate(dss):
    # The context manager closes the source files once the merged copy has
    # been written, instead of leaving them open for the rest of the session.
    with xr.open_mfdataset(
        [f"./test_data/{name}" for name in group],
        # Needed: work-around for the `time_bounds` units issue (plain `days`)
        decode_timedelta=False,
    ) as merged:
        merged.to_netcdf(f"./test_data/ds{idx}.nc")

## Build kerchunk reference dataset

In [6]:
# Local-filesystem handle; later cells reuse it to glob and open files
fs = fsspec.filesystem("file")

# Merged netCDF inputs, in sorted path order
nc_matches = fs.glob("./test_data/ds*.nc")
files = sorted(nc_matches)

In [7]:
def gen_json(file):
    """Write a kerchunk reference JSON for a single HDF5/netCDF file.

    The references are written next to the input, at the same path with the
    extension replaced by ``.json``.

    Parameters
    ----------
    file : str
        Path of the file to scan; opened through the module-level ``fs``.
    """
    with fs.open(file) as source:
        translator = hdf.SingleHdf5ToZarr(source, file)
        stem, _ext = os.path.splitext(file)
        json_path = stem + ".json"
        with open(json_path, mode="wb") as sink:
            # translate() runs while the source file is still open
            references = translator.translate()
            sink.write(ujson.dumps(references).encode())


# One reference file per merged netCDF file
for file in files:
    gen_json(file)

In [8]:
# Only pick up the per-file references written by gen_json (ds0.json, ds1.json, ...).
# A bare "*.json" would also match any unrelated JSON in ./test_data, which would
# then be fed to the combiner and removed by the clean-up loop over json_list.
# Sorted for consistency with how the netCDF inputs are listed.
json_list = sorted(fs.glob("./test_data/ds*.json"))

# Combine the per-file references along `time`; the dimensions listed in
# identical_dims are declared to be the same in every input.
mzz = combine.MultiZarrToZarr(
    json_list,
    concat_dims='time',
    identical_dims=[
        "xt_ocean",
        "yt_ocean",
        "nv",
    ],
).translate()

ValueError: Found chunk size mismatch:
                        at prefix time_bounds in iteration 1 (file file:///g/data/tm70/ds0092/projects/dev_data_querying/cosima_intake/test_data/ds1.json)
                        new chunk: [12, 2]
                        chunks so far: [1, 2]

## Write to a json file

In [None]:
# Persist the combined references so they can be opened as a dataset
with open("./intermittent.json", mode="wb") as out:
    encoded = ujson.dumps(mzz).encode()
    out.write(encoded)

# Remove the per-file reference JSONs listed in json_list
for ref_path in json_list:
    os.remove(ref_path)

## Compute on the reference dataset

In [None]:
%%time

m = fsspec.get_mapper(
    'reference://', 
    fo="./intermittent.json", 
    remote_protocol="file"
)

ds = xr.open_dataset(
    m,
    engine='zarr', 
    backend_kwargs={"consolidated": False},
    chunks="auto",
    # decode_times=False
)

In [None]:
def plot_vars(ds):
    """Plot the horizontal means of ``mld`` and ``surface_pot_temp``.

    Both variables are averaged over ``xt_ocean`` and ``yt_ocean`` and drawn
    on the current matplotlib axes, with a legend and an empty y-label.

    Parameters
    ----------
    ds : dataset-like
        Must provide ``mld`` and ``surface_pot_temp`` with ``xt_ocean`` and
        ``yt_ocean`` dimensions.
    """
    horizontal_mean = ds.mean(["xt_ocean", "yt_ocean"])

    mld_series = horizontal_mean["mld"]
    mld_series.plot(label="mld")

    # 273.15 offset: presumably Kelvin -> degrees Celsius — confirm units
    temp_series = horizontal_mean["surface_pot_temp"] - 273.15
    temp_series.plot(label="surface_pot_temp")

    # The two curves share one axis, so no single y-label applies
    plt.ylabel("")
    plt.legend()

In [None]:
# Plot both variables from the kerchunk-backed dataset
plot_vars(ds)