# Using a Kerchunk catalog to open the GAMAR archive

In this example, we use a Kerchunk catalog generated on the HPC infrastructure. The same workflow can then open the data either directly on the HPC filesystem, or over HTTPS using the DATARMOR export service that exposes /scale/project/lops-oh-fair2adapt/ at https://data-fair2adapt.ifremer.fr/



Here is the list of Kerchunk files created by Fred:

#open_virtual_dataset version using ds.virtualize.to_kerchunk(outfile, format="json")
#KERCHUNK_CATALOG='/scale/project/lops-oh-fair2adapt/fpaul/tmp/riomar.json'
#KERCHUNK_CATALOG='/scale/project/lops-oh-fair2adapt/fpaul/tmp/riomar_3months.json'
#KERCHUNK_CATALOG='https://data-fair2adapt.ifremer.fr/fpaul/tmp/riomar_3months.json'

#or, with HTTP URLs inside (warning: this times out when accessed from Ifremer...):
#KERCHUNK_CATALOG='/scale/project/lops-oh-fair2adapt/fpaul/tmp/riomar_sed_web.json'  # timeout

#or, kerchunk multizarr version: broken, the plot is EMPTY if time_counter != 0 !!
#KERCHUNK_CATALOG='/scale/project/lops-oh-fair2adapt/fpaul/tmp/riomar.kerchunk_multizarr.json'


In [None]:
%%time
import json
import fsspec
from pathlib import Path

#import fsspec
import xarray as xr

# Root of the project on the HPC filesystem, and the HTTPS endpoint that
# exposes the same tree.
HPC_PREFIX = "/scale/project/lops-oh-fair2adapt/"
HTTPS_PREFIX = "https://data-fair2adapt.ifremer.fr/"
# Catalog location, relative to either prefix.
CATALOG_PATH = "fpaul/tmp/riomar_3months.json"
OUT_PARQUET = "riomar_3months_.parq"  # local parquet refs cache


def patch_kc_refs_inplace(kc, hpc_prefix=HPC_PREFIX, https_prefix=HTTPS_PREFIX):
    """Rewrite HPC file targets of a kerchunk reference dict to HTTPS URLs.

    The reference mapping is read from ``kc["refs"]`` (falling back to
    ``kc["references"]``) and modified in place. Two kinds of entries are
    touched:

    * a non-empty list whose first item is a string: only that first item
      is rewritten, the remaining items are kept as they are;
    * a plain string: the string itself is rewritten.

    A string is rewritten only when it starts with ``hpc_prefix``; that
    prefix is then replaced by ``https_prefix``. Every other entry is left
    untouched.

    Parameters
    ----------
    kc : dict
        Parsed kerchunk JSON document.
    hpc_prefix : str
        Leading path to replace.
    https_prefix : str
        Replacement put in front of the remainder of the path.

    Returns
    -------
    dict
        The same ``kc`` object, with the patched mapping stored under
        ``"refs"``.

    Raises
    ------
    KeyError
        If neither ``"refs"`` nor ``"references"`` yields a mapping.
    """
    references = kc.get("refs", kc.get("references"))
    if references is None:
        raise KeyError("Can't find 'refs' (or 'references') in kerchunk JSON")

    def _to_https(target):
        # Anything that is not an HPC path string passes through unchanged.
        if not (isinstance(target, str) and target.startswith(hpc_prefix)):
            return target
        return https_prefix + target[len(hpc_prefix):]

    # Iterate over a snapshot because values are reassigned along the way.
    for key, entry in tuple(references.items()):
        if isinstance(entry, str):
            references[key] = _to_https(entry)
        elif isinstance(entry, list) and entry and isinstance(entry[0], str):
            references[key] = [_to_https(entry[0])] + entry[1:]

    kc["refs"] = references
    return kc


# ------------------------------
# 1) HPC mode: open directly
# ------------------------------
# Switch for the local parquet fast path. It is left off because, in the
# current configuration, patching the JSON on the fly is faster than loading
# the parquet refs (the reason still has to be investigated).
USE_PARQUET_CACHE = False

if Path(HPC_PREFIX).exists():
    # The HPC tree is present on this machine: read the catalog from disk.
    KERCHUNK_CATALOG = HPC_PREFIX + CATALOG_PATH
    print("Running in HPC mode:", KERCHUNK_CATALOG)

    ds = xr.open_dataset(KERCHUNK_CATALOG, engine="kerchunk", chunks={})

# ------------------------------
# 2) HTTPS mode: prefer local parquet cache if present
# ------------------------------
else:
    KERCHUNK_CATALOG = HTTPS_PREFIX + CATALOG_PATH
    print("Running in HTTPS mode:", KERCHUNK_CATALOG)

    # Fast path: reuse parquet refs written by a previous run.
    if USE_PARQUET_CACHE and Path(OUT_PARQUET).exists():
        print(f"✅ Found local parquet refs: ./{OUT_PARQUET} -> opening that")
        # The result must be bound to `ds`; otherwise this branch would leave
        # `ds` undefined for the rest of the notebook.
        ds = xr.open_dataset(OUT_PARQUET, engine="kerchunk", chunks={})

    # Else: fetch JSON, patch refs to https, open, AND write parquet refs cache
    else:
        print(f"ℹ️ No local parquet refs found at ./{OUT_PARQUET} -> creating them from JSON")

        with fsspec.open(KERCHUNK_CATALOG, "rt") as f:
            kc = json.load(f)

        # The catalog was generated on the HPC, so its targets are HPC paths;
        # rewrite them to the HTTPS export before opening.
        kc = patch_kc_refs_inplace(kc)

        # open now (from in-memory dict)
        ds = xr.open_dataset(kc, engine="kerchunk", chunks={})

        # write parquet refs cache for next time
        # (imported here because it is only needed in this branch)
        import kerchunk.df as kcdf
        kcdf.refs_to_dataframe(kc, OUT_PARQUET)
        print("✅ Wrote kerchunk parquet refs to:", OUT_PARQUET)

# Last expression of the cell: displays the dataset in the notebook.
ds

Running in HTTPS mode: https://data-fair2adapt.ifremer.fr/fpaul/tmp/riomar_3months.json
ℹ️ No local parquet refs found at ./riomar_3months_.parq -> creating them from JSON


## Create the region of interest for the demo
The demo requires the region `x_rho=slice(390, 452), y_rho=slice(400, 430)` with the variables `['temp', 'salt', 'zeta']`.
We create a small slice of it and save it locally to run some regridding tests in the next notebook, 'simple_regrid.ipynb'.


In [None]:
# Margin, in grid cells, added on every side of the demo window.
n = 5

# Cut the padded demo window out of the first time step (index 0) and keep
# only the three variables used by the demo. A slice such as slice(0, 10)
# could be given for time_counter instead to keep several time steps.
small_ds = ds.isel(
    {
        "x_rho": slice(390 - n, 452 + n),
        "y_rho": slice(400 - n, 430 + n),
        "time_counter": 0,
    }
)[["temp", "salt", "zeta"]]

In [None]:
%%time
# Compute the subset once and keep the result, so later cells reuse it
# (the %%time magic above reports how long this takes).
small_ds = small_ds.persist()

In [None]:
# Save the subset as a Zarr store; mode="w" overwrites an existing store.
# NOTE(review): the destination is an absolute path on one specific machine;
# adjust it before running this notebook elsewhere.
small_ds.to_zarr(store="/Users/todaka/data/RIOMAR/small.zarr", mode="w")

In [None]:
#small_ds=ds.isel(x_rho =slice(390,452), y_rho =slice(400,430) )[['temp','salt','zeta']].chunk(chunks={"time_counter":"10M"}).compute()
# Rebind small_ds to the full-domain `temp` field at time index 1 and
# s_rho index 0, loaded into memory, then display it.
small_ds = (
    ds.isel({"time_counter": 1, "s_rho": 0})[["temp"]]
    .compute()
)
small_ds

In [None]:
# Plot temperature against the longitude/latitude coordinates, with the
# latitude axis limited to 42-52.
small_ds["temp"].plot(x="nav_lon_rho", y="nav_lat_rho", ylim=(42, 52))

In [None]:
# Same plot restricted to the demo window (without margin), with the
# latitude axis limited to 47-47.5.
small_ds["temp"].isel(
    {"x_rho": slice(390, 452), "y_rho": slice(400, 430)}
).plot(x="nav_lon_rho", y="nav_lat_rho", ylim=(47, 47.5))