# Virtual data set (VDS) reference file for SWOT_L4_DAWG_SOS_DISCHARGE using Virtualizarr

Saves the VDS as JSON and Parquet files. SWOT_L4_DAWG_SOS_DISCHARGE (https://doi.org/10.5067/SWOT-SOS-V1) is an L4 data set with non-standard dimensions (e.g. not cross_track, along_track). It also contains a handful of files on the 10 GB scale.

In [2]:
# Built-in packages
import os
import sys
import shutil

# Filesystem management 
import fsspec
import earthaccess

# Data handling
import numpy as np
import xarray as xr
from virtualizarr import open_virtual_dataset
import pandas as pd

# Parallel computing 
import multiprocessing
from dask import delayed
import dask.array as da
from dask.distributed import Client
import coiled

# Other
#import matplotlib.pyplot as plt

## 1. Get Data File S3 endpoints in Earthdata Cloud

In [3]:
# Get Earthdata creds: log in with earthaccess, which prompts for an Earthdata Login
# username and password. As the last expression of the cell, the returned Auth object
# is displayed.
earthaccess.login()

Enter your Earthdata Login username:  deanh808
Enter your Earthdata password:  ········


<earthaccess.auth.Auth at 0x7fcb4cc34440>

In [4]:
# Get AWS creds as an S3 filesystem object for the PODAAC DAAC; its storage_options are
# passed to the virtual dataset reader later in the notebook.
# Note that if you spend more than 1 hour in the notebook, you may have to re-run this line!
fs = earthaccess.get_s3_filesystem(daac="PODAAC")

In [6]:
# Locate SWOT_L4_DAWG_SOS_DISCHARGE granule information / metadata
# (no temporal or spatial filter is given, only the collection short name):
granule_info = earthaccess.search_data(
    short_name="SWOT_L4_DAWG_SOS_DISCHARGE",
    )

In [7]:
# Build the list of S3 endpoints, taking the first direct-access link of each granule:
data_s3links = [
    granule.data_links(access="direct")[0]
    for granule in granule_info
]
print("Number of granules found =", len(data_s3links))
print("First few granules:")
data_s3links[:3]

Number of granules found = 12
First few granules:


['s3://podaac-ops-cumulus-protected/SWOT_L4_DAWG_SOS_DISCHARGE/na_sword_v16_SOS_unconstrained_0001_20240611T010141_results.nc',
 's3://podaac-ops-cumulus-protected/SWOT_L4_DAWG_SOS_DISCHARGE/na_sword_v16_SOS_unconstrained_0001_20240726T123358_results.nc',
 's3://podaac-ops-cumulus-protected/SWOT_L4_DAWG_SOS_DISCHARGE/eu_sword_v16_SOS_unconstrained_0001_20240726T123345_results.nc']

## 2. Generate single reference file

In [9]:
%%time
reader_opts = {"storage_options": fs.storage_options} # S3 filesystem creds from previous section.

# Create reference for the first data file:
virtual_ds_example = open_virtual_dataset(
    data_s3links[0], indexes={}, 
    reader_options=reader_opts, #loadable_variables=coord_vars
    )
print(virtual_ds_example)

Object of type int32 is not JSON serializable
Traceback (most recent call last):
  File "/opt/coiled/env/lib/python3.13/site-packages/kerchunk/hdf.py", line 474, in _translator
    za = self._zroot.require_dataset(
        h5obj.name,
    ...<7 lines>...
        **kwargs,
    )
  File "/opt/coiled/env/lib/python3.13/site-packages/zarr/hierarchy.py", line 1152, in require_dataset
    return self._write_op(
           ~~~~~~~~~~~~~~^
        self._require_dataset_nosync, name, shape=shape, dtype=dtype, exact=exact, **kwargs
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/opt/coiled/env/lib/python3.13/site-packages/zarr/hierarchy.py", line 952, in _write_op
    return f(*args, **kwargs)
  File "/opt/coiled/env/lib/python3.13/site-packages/zarr/hierarchy.py", line 1190, in _require_dataset_nosync
    return self._create_dataset_nosync(name, shape=shape, dtype=dtype, **kwargs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^

ValueError: Multiple HDF Groups found. Must specify group= keyword to select one of ['', 'hivdi/', 'metroman/', 'moi/', 'moi/geobam/', 'moi/hivdi/', 'moi/metroman/', 'moi/momma/', 'moi/sad/', 'moi/sic4dvar/', 'momma/', 'neobam/', 'neobam/q/', 'nodes/', 'offline/', 'postdiagnostics/', 'postdiagnostics/basin/', 'postdiagnostics/reach/', 'prediagnostics/', 'prediagnostics/node/', 'prediagnostics/reach/', 'reaches/', 'sad/', 'sic4dvar/', 'validation/']

In [2]:
# Get Earthdata creds: log in with earthaccess, which prompts for an Earthdata Login
# username and password. As the last expression of the cell, the returned Auth object
# is displayed.
# NOTE(review): this repeats an earlier login cell of this notebook — confirm it is still needed.
earthaccess.login()

Enter your Earthdata Login username:  deanh808
Enter your Earthdata password:  ········


<earthaccess.auth.Auth at 0x7ff904b550a0>

In [3]:
# Get AWS creds as an S3 filesystem object for the PODAAC DAAC.
# Note that if you spend more than 1 hour in the notebook, you may have to re-run this line!
fs = earthaccess.get_s3_filesystem(daac="PODAAC")

In [4]:
# Locate CCMP file information / metadata:
# NOTE(review): this searches the CCMP_WINDS_10M6HR_L4_V3.1 collection, not
# SWOT_L4_DAWG_SOS_DISCHARGE, and rebinds granule_info from the earlier SWOT search —
# confirm this cell belongs in this notebook.
granule_info = earthaccess.search_data(
    short_name="CCMP_WINDS_10M6HR_L4_V3.1",
    )

In [5]:
# Build the list of S3 endpoints, taking the first direct-access link of each granule:
data_s3links = [
    granule.data_links(access="direct")[0]
    for granule in granule_info
]
print("Number of granules found =", len(data_s3links))
print("First few granules:")
data_s3links[:3]

Number of granules found = 11674
First few granules:


['s3://podaac-ops-cumulus-protected/CCMP_WINDS_10M6HR_L4_V3.1/CCMP_Wind_Analysis_19930102_V03.1_L4.nc',
 's3://podaac-ops-cumulus-protected/CCMP_WINDS_10M6HR_L4_V3.1/CCMP_Wind_Analysis_19930103_V03.1_L4.nc',
 's3://podaac-ops-cumulus-protected/CCMP_WINDS_10M6HR_L4_V3.1/CCMP_Wind_Analysis_19930105_V03.1_L4.nc']