# Using virtualizarr in the cloud

In [17]:
import earthaccess
import xarray as xr
import fsspec

In [2]:
from virtualizarr import open_virtual_dataset

In [3]:
earthaccess.login()

Enter your Earthdata Login username:  deanh808
Enter your Earthdata password:  ········


<earthaccess.auth.Auth at 0x7fac617922a0>

## Try with the *MUR25-JPL-L4-GLOB-v04.2* collection

In [4]:
granule_metadata = earthaccess.search_data(short_name="MUR25-JPL-L4-GLOB-v04.2", count=100, cloud_hosted=True)

In [5]:
fileobjs = earthaccess.open(granule_metadata)

QUEUEING TASKS | :   0%|          | 0/100 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/100 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
s3paths = [f.full_name for f in fileobjs]
s3paths[0]

's3://podaac-ops-cumulus-protected/MUR25-JPL-L4-GLOB-v04.2/20020901090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc'

**dmrpp file paths can be obtained by appending "dmrpp" to the granule paths**

In [7]:
paths_dmrpp = [p+".dmrpp" for p in s3paths]
paths_dmrpp[0]

's3://podaac-ops-cumulus-protected/MUR25-JPL-L4-GLOB-v04.2/20020901090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc.dmrpp'

In [31]:
fs_s3 = earthaccess.get_s3fs_session(daac="PODAAC")

In [34]:
%%time
virtual_datasets = [
    open_virtual_dataset(p, indexes={}, filetype="dmrpp", reader_options={"storage_options": fs_s3.storage_options})
    for p in paths_dmrpp
]

CPU times: user 582 ms, sys: 20.8 ms, total: 602 ms
Wall time: 6.16 s


In [35]:
virtual_datasets[0]

In [36]:
%%time
virtual_ds_combined = xr.combine_nested(virtual_datasets, concat_dim=['time'], coords='minimal', compat='override', combine_attrs='drop_conflicts')

CPU times: user 27.8 ms, sys: 0 ns, total: 27.8 ms
Wall time: 27.4 ms


In [37]:
virtual_ds_combined

In [38]:
virtual_ds_combined.virtualize.to_kerchunk('combined.json', format='json')

In [39]:
fs_local = fsspec.filesystem('reference', fo='combined.json')
m = fs_local.get_mapper('')

ds = xr.open_dataset(
    m, engine='kerchunk', 
    chunks={}, 
    backend_kwargs={
        "storage_options": {
            "remote_protocol": "s3",
            "remote_options": fs_s3.storage_options,
            }
    })

ReferenceNotReachable: Reference "analysed_sst/0.0.0" failed to fetch target ['s3://podaac-ops-cumulus-protected/MUR25-JPL-L4-GLOB-v04.2/20020901090000-JPL-L4_GHRSST-SSTfnd-MUR25-GLOB-v02.0-fv04.2.nc', 60292, 826371]

In [40]:
data = xr.open_dataset(
    "reference://",
    engine="zarr",
    chunks={},
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": "combined.json",
            "remote_protocol": "s3",
            "remote_options": fs_s3.storage_options,
        }
    },
)

ValueError: Shuffle buffer is not an integer multiple of elementsize

## Opening netcdf files for comparison

In [26]:
%%time
data_nc = xr.open_mfdataset(earthaccess.open(granule_metadata[:100]))

QUEUEING TASKS | :   0%|          | 0/100 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/100 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/100 [00:00<?, ?it/s]

CPU times: user 5.34 s, sys: 673 ms, total: 6.01 s
Wall time: 20.7 s


In [5]:
data_nc

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 39.55 MiB 3.96 MiB Shape (10, 720, 1440) (1, 720, 1440) Dask graph 10 chunks in 21 graph layers Data type float32 numpy.ndarray",1440  720  10,

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 39.55 MiB 3.96 MiB Shape (10, 720, 1440) (1, 720, 1440) Dask graph 10 chunks in 21 graph layers Data type float32 numpy.ndarray",1440  720  10,

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 39.55 MiB 3.96 MiB Shape (10, 720, 1440) (1, 720, 1440) Dask graph 10 chunks in 21 graph layers Data type float32 numpy.ndarray",1440  720  10,

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 39.55 MiB 3.96 MiB Shape (10, 720, 1440) (1, 720, 1440) Dask graph 10 chunks in 21 graph layers Data type float32 numpy.ndarray",1440  720  10,

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 39.55 MiB 3.96 MiB Shape (10, 720, 1440) (1, 720, 1440) Dask graph 10 chunks in 21 graph layers Data type float32 numpy.ndarray",1440  720  10,

Unnamed: 0,Array,Chunk
Bytes,39.55 MiB,3.96 MiB
Shape,"(10, 720, 1440)","(1, 720, 1440)"
Dask graph,10 chunks in 21 graph layers,10 chunks in 21 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
