# Virtual data set (VDS) reference file for ECCO_L4_TEMP_SALINITY_05DEG_DAILY_V4R4 using VirtualiZarr

Saves the VDS as a JSON file. ECCO_L4_TEMP_SALINITY_05DEG_DAILY_V4R4 (https://doi.org/10.5067/ECG5D-OTS44) is an L4 regularly gridded data set.

Note this was run using Zarr v3.

In [1]:
# Built-in packages
import os
import sys
import shutil

# Filesystem management 
import fsspec
import earthaccess

# Data handling
import numpy as np
import xarray as xr
from virtualizarr import open_virtual_dataset
import pandas as pd

# Parallel computing 
import multiprocessing
from dask import delayed
import dask.array as da
from dask.distributed import Client

# Other
import matplotlib.pyplot as plt

## 1. Get Data File S3 endpoints in Earthdata Cloud

In [2]:
# Log in to Earthdata. When run interactively this prompts for the Earthdata Login
# username and password; the returned Auth object is displayed as the cell output.
earthaccess.login()

Enter your Earthdata Login username:  deanh808
Enter your Earthdata password:  ········


<earthaccess.auth.Auth at 0x72a03d2c7a10>

In [3]:
# Get AWS S3 credentials for PO.DAAC. Note that if you spend more than 1 hour in the
# notebook, you may have to re-run this cell to refresh them.
fs = earthaccess.get_s3_filesystem(daac="PODAAC")

In [4]:
# Locate granule information / metadata for the ECCO data set (searched by short name):
granule_info = earthaccess.search_data(
    short_name="ECCO_L4_TEMP_SALINITY_05DEG_DAILY_V4R4",
    )

In [5]:
# Collect the first direct-access (S3) link of every granule found by the search:
data_s3links = [granule.data_links(access="direct")[0] for granule in granule_info]
print(f"Number of granules found = {len(data_s3links)}")
print("First few granules:")
data_s3links[:3]

Number of granules found = 9497
First few granules:


['s3://podaac-ops-cumulus-protected/ECCO_L4_TEMP_SALINITY_05DEG_DAILY_V4R4/OCEAN_TEMPERATURE_SALINITY_day_mean_1992-01-01_ECCO_V4r4_latlon_0p50deg.nc',
 's3://podaac-ops-cumulus-protected/ECCO_L4_TEMP_SALINITY_05DEG_DAILY_V4R4/OCEAN_TEMPERATURE_SALINITY_day_mean_1992-01-02_ECCO_V4r4_latlon_0p50deg.nc',
 's3://podaac-ops-cumulus-protected/ECCO_L4_TEMP_SALINITY_05DEG_DAILY_V4R4/OCEAN_TEMPERATURE_SALINITY_day_mean_1992-01-03_ECCO_V4r4_latlon_0p50deg.nc']

## 2. Generate single-granule reference files

This is a gridded daily product with one file (granule) per day, so one reference file is generated per daily granule.

In [6]:
# Names of the coordinate variables in this data set. The list is passed as
# 'loadable_variables' to open_virtual_dataset() below and needs to be modified per the
# specific coord names of each data set:
coord_vars = ["time", "latitude", "longitude", "Z"]

In [7]:
# Report how many CPUs this machine exposes (useful when sizing the Dask cluster):
print(f"CPU count = {multiprocessing.cpu_count()}")

CPU count = 16


In [8]:
# Options passed as 'reader_options' to open_virtual_dataset() below:
reader_opts = {"storage_options": fs.storage_options} # S3 filesystem creds from previous section.

In [9]:
# Start up a local Dask cluster and print some information about it. The number of
# workers follows the machine's CPU count (one single-threaded worker per CPU) instead
# of a hard-coded value, so the notebook adapts to machines with a different core count.
client = Client(n_workers=multiprocessing.cpu_count(), threads_per_worker=1)
print(client.cluster)
print("View any work being done on the cluster here", client.dashboard_link)

LocalCluster(ff75e981, 'tcp://127.0.0.1:38279', workers=16, threads=16, memory=60.77 GiB)
View any work being done on the cluster here https://cluster-zzkxl.dask.host/jupyter/proxy/8787/status


In [10]:
%%time
# Create individual references:
open_vds_par = delayed(open_virtual_dataset)
tasks = [
    open_vds_par(p, indexes={}, reader_options=reader_opts, loadable_variables=coord_vars) 
    for p in data_s3links[:]
    ]
virtual_ds_list = list(da.compute(*tasks)) # The xr.combine_nested() function below needs a list rather than a tuple.

CPU times: user 3min 13s, sys: 33.6 s, total: 3min 47s
Wall time: 13min 24s


In [11]:
# Sanity check: number of individual virtual data sets created (one per granule link).
len(virtual_ds_list)

9497

## 3. Generate combined reference file

In [12]:
%%time
# Create the combined reference
virtual_ds_combined = xr.combine_nested(virtual_ds_list, concat_dim='time', coords='minimal', compat='override', combine_attrs='drop_conflicts')

CPU times: user 7.78 s, sys: 209 ms, total: 7.99 s
Wall time: 7.82 s


In [13]:
# Save the combined reference to disk in JSON format (PARQUET is the alternative format):
fname_combined_json = 'ECCO_L4_TEMP_SALINITY_05DEG_DAILY_V4R4.json'
virtual_ds_combined.virtualize.to_kerchunk(fname_combined_json, format='json')

## 4. Test combined reference files

In [16]:
def opends_withref(ref, fs_data):
    """
    Open a data set through a reference, using the zarr engine.

    Parameters
    ----------
    ref : reference file or object, handed to fsspec's 'reference' filesystem as "fo".
    fs_data : filesystem with credentials to access the actual data files. Its
        `storage_options` are forwarded as the remote (S3) options.

    Returns
    -------
    The data set returned by xr.open_dataset(), opened with chunks={} and
    unconsolidated metadata.
    """
    fs_ref = fsspec.filesystem(
        'reference',
        fo=ref,
        remote_protocol="s3",
        remote_options=fs_data.storage_options,
    )
    return xr.open_dataset(
        fs_ref.get_mapper(''),
        engine="zarr",
        chunks={},
        backend_kwargs={"consolidated": False},
    )

### 4.1 JSON reference file

In [14]:
%%time
data = xr.open_dataset(
    fname_combined_json,
    engine="kerchunk",
    chunks={},
    backend_kwargs={
        "storage_options": {
            "remote_protocol": "s3",
            "remote_options": fs.storage_options
        }
    }
)
data



Unnamed: 0,Array,Chunk
Bytes,148.39 kiB,16 B
Shape,"(9497, 2)","(1, 2)"
Dask graph,9497 chunks in 2 graph layers,9497 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 148.39 kiB 16 B Shape (9497, 2) (1, 2) Dask graph 9497 chunks in 2 graph layers Data type datetime64[ns] numpy.ndarray",2  9497,

Unnamed: 0,Array,Chunk
Bytes,148.39 kiB,16 B
Shape,"(9497, 2)","(1, 2)"
Dask graph,9497 chunks in 2 graph layers,9497 chunks in 2 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.81 kiB,2.81 kiB
Shape,"(360, 2)","(360, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.81 kiB 2.81 kiB Shape (360, 2) (360, 2) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",2  360,

Unnamed: 0,Array,Chunk
Bytes,2.81 kiB,2.81 kiB
Shape,"(360, 2)","(360, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.62 kiB,5.62 kiB
Shape,"(720, 2)","(720, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.62 kiB 5.62 kiB Shape (720, 2) (720, 2) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",2  720,

Unnamed: 0,Array,Chunk
Bytes,5.62 kiB,5.62 kiB
Shape,"(720, 2)","(720, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,400 B,400 B
Shape,"(50, 2)","(50, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 400 B 400 B Shape (50, 2) (50, 2) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",2  50,

Unnamed: 0,Array,Chunk
Bytes,400 B,400 B
Shape,"(50, 2)","(50, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,458.51 GiB,6.18 MiB
Shape,"(9497, 50, 360, 720)","(1, 25, 180, 360)"
Dask graph,75976 chunks in 2 graph layers,75976 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 458.51 GiB 6.18 MiB Shape (9497, 50, 360, 720) (1, 25, 180, 360) Dask graph 75976 chunks in 2 graph layers Data type float32 numpy.ndarray",9497  1  720  360  50,

Unnamed: 0,Array,Chunk
Bytes,458.51 GiB,6.18 MiB
Shape,"(9497, 50, 360, 720)","(1, 25, 180, 360)"
Dask graph,75976 chunks in 2 graph layers,75976 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,458.51 GiB,6.18 MiB
Shape,"(9497, 50, 360, 720)","(1, 25, 180, 360)"
Dask graph,75976 chunks in 2 graph layers,75976 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 458.51 GiB 6.18 MiB Shape (9497, 50, 360, 720) (1, 25, 180, 360) Dask graph 75976 chunks in 2 graph layers Data type float32 numpy.ndarray",9497  1  720  360  50,

Unnamed: 0,Array,Chunk
Bytes,458.51 GiB,6.18 MiB
Shape,"(9497, 50, 360, 720)","(1, 25, 180, 360)"
Dask graph,75976 chunks in 2 graph layers,75976 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [21]:
# Bounds of the test subset: a 10 x 10 degree box between 2010-01-01 and 2011-01-01.
lat_range = (35, 45)
lon_range = (-135, -125)
time_range = (np.datetime64("2010-01-01"), np.datetime64("2011-01-01"))

# Build the label-based selection once, then apply it in a single .sel() call.
subset_bounds = {
    "latitude": slice(*lat_range),
    "longitude": slice(*lon_range),
    "time": slice(*time_range),
}
data_subset = data.sel(**subset_bounds)
data_subset

Unnamed: 0,Array,Chunk
Bytes,5.70 kiB,16 B
Shape,"(365, 2)","(1, 2)"
Dask graph,365 chunks in 3 graph layers,365 chunks in 3 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 5.70 kiB 16 B Shape (365, 2) (1, 2) Dask graph 365 chunks in 3 graph layers Data type datetime64[ns] numpy.ndarray",2  365,

Unnamed: 0,Array,Chunk
Bytes,5.70 kiB,16 B
Shape,"(365, 2)","(1, 2)"
Dask graph,365 chunks in 3 graph layers,365 chunks in 3 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,160 B,160 B
Shape,"(20, 2)","(20, 2)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 160 B 160 B Shape (20, 2) (20, 2) Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray",2  20,

Unnamed: 0,Array,Chunk
Bytes,160 B,160 B
Shape,"(20, 2)","(20, 2)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,160 B,160 B
Shape,"(20, 2)","(20, 2)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 160 B 160 B Shape (20, 2) (20, 2) Dask graph 1 chunks in 3 graph layers Data type float32 numpy.ndarray",2  20,

Unnamed: 0,Array,Chunk
Bytes,160 B,160 B
Shape,"(20, 2)","(20, 2)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,400 B,400 B
Shape,"(50, 2)","(50, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 400 B 400 B Shape (50, 2) (50, 2) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",2  50,

Unnamed: 0,Array,Chunk
Bytes,400 B,400 B
Shape,"(50, 2)","(50, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,27.85 MiB,39.06 kiB
Shape,"(365, 50, 20, 20)","(1, 25, 20, 20)"
Dask graph,730 chunks in 3 graph layers,730 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 27.85 MiB 39.06 kiB Shape (365, 50, 20, 20) (1, 25, 20, 20) Dask graph 730 chunks in 3 graph layers Data type float32 numpy.ndarray",365  1  20  20  50,

Unnamed: 0,Array,Chunk
Bytes,27.85 MiB,39.06 kiB
Shape,"(365, 50, 20, 20)","(1, 25, 20, 20)"
Dask graph,730 chunks in 3 graph layers,730 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,27.85 MiB,39.06 kiB
Shape,"(365, 50, 20, 20)","(1, 25, 20, 20)"
Dask graph,730 chunks in 3 graph layers,730 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 27.85 MiB 39.06 kiB Shape (365, 50, 20, 20) (1, 25, 20, 20) Dask graph 730 chunks in 3 graph layers Data type float32 numpy.ndarray",365  1  20  20  50,

Unnamed: 0,Array,Chunk
Bytes,27.85 MiB,39.06 kiB
Shape,"(365, 50, 20, 20)","(1, 25, 20, 20)"
Dask graph,730 chunks in 3 graph layers,730 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [None]:
%%time
theta_mean_map = data_subset["THETA"].isel(Z=0).mean(dim="time").compute()

In [None]:
# Plot the time-mean THETA map of the subset region.
theta_mean_map.plot()