# Pangeo Forge Exploration

Learning how to use Pangeo Forge by following the basic NetCDF -> Zarr tutorial: https://pangeo-forge.readthedocs.io/en/latest/tutorials/netcdf_zarr_sequential.html

In [None]:
import s3fs, tempfile, logging

from tqdm.notebook import tqdm

# from dask.distributed import Client

from fsspec.implementations.local import LocalFileSystem

from pangeo_forge_recipes.patterns import pattern_from_file_sequence
from pangeo_forge_recipes.recipes import XarrayZarrRecipe
from pangeo_forge_recipes.storage import FSSpecTarget, CacheFSSpecTarget, MetadataTarget, StorageConfig

import pandas as pd
import numpy as np
import xarray as xr

In [None]:
# S3 filesystem handle shared by the cells below; anon=False requests a
# non-anonymous (credentialed) connection.
s3 = s3fs.S3FileSystem(anon=False)

Enable logging so we can monitor progress:

In [None]:
# Set to True to stream pangeo_forge_recipes log records (INFO and above)
# into the notebook output.
debug = False

if debug:
    logger = logging.getLogger('pangeo_forge_recipes')
    logger.setLevel(logging.INFO)

    # Re-running this cell must not stack another StreamHandler on the logger;
    # otherwise every message would be printed once per execution of the cell.
    if not any(isinstance(h, logging.StreamHandler) for h in logger.handlers):
        formatter = logging.Formatter('%(name)s:%(levelname)s - %(message)s')
        handler = logging.StreamHandler()
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

Glob a list of input file URLs:

In [None]:
# protocol prefix used for every URL built below
protocol = 's3://'

# bucket that holds the source data
bucket = 'eis-dh-hydro'

# location of the NetCDF files inside the bucket
netcdf_dir = 'LIS_NETCDF'
ds_dir = 'DELTA_2km/7CONST_RA_LAKE/ROUTING'

# glob pattern matching the LIS_HIST*.nc files at any depth below ds_dir
path_parts = (bucket, netcdf_dir, ds_dir, '**/LIS_HIST*.nc')
url_pattern = f"{protocol}{'/'.join(path_parts)}"

# prepend the protocol to each globbed path to obtain the input URLs
input_urls = [f"{protocol}{path}" for path in s3.glob(url_pattern)]

# show the first URL as a sanity check
input_urls[0]

Create a Pangeo Forge `pattern`:

In [None]:
# build the recipe's file pattern from the URL list: files are sequenced along
# 'time', with one item per file
pattern = pattern_from_file_sequence(
    file_list=input_urls,
    concat_dim='time',
    nitems_per_file=1,
)

pattern

Inspect the data in the pattern:

In [None]:
# A pattern is designed to be iterated over; take only its first key instead
# of looping. An empty pattern fails right here (StopIteration) rather than
# with a NameError on the next line.
key = next(iter(pattern))
key

Inspect the pattern key:

In [None]:
# index the pattern with the first key to see the entry it maps to
pattern[key]

Define the preprocessing function:

In [None]:
# define preprocessing function

def add_latlon_coords(ds: xr.Dataset) -> xr.Dataset:
    """Swap the grid-index dimensions for 1-D ``lat``/``lon`` coordinates.

    Coordinate values are computed from the dataset's global attributes
    ``DX``, ``DY``, ``SOUTH_WEST_CORNER_LAT`` and ``SOUTH_WEST_CORNER_LON``
    (each rounded to three decimals) and from the lengths of the
    ``east_west`` and ``north_south`` dimensions.  The existing ``lat`` and
    ``lon`` variables are kept as ``orig_lat``/``orig_lon``, and their
    attributes are copied onto the new coordinates.

    Args:
        ds: Dataset with ``north_south``/``east_west`` dimensions, ``lat`` and
            ``lon`` variables, and the global attributes listed above.

    Returns:
        The dataset with ``north_south``/``east_west`` renamed to
        ``lat``/``lon`` and the computed coordinate values assigned.
    """

    def rounded_attr(name):
        # global attribute parsed as a float and rounded to 3 decimals
        return round(float(ds.attrs[name]), 3)

    # grid spacing along x and y
    step_x = rounded_attr('DX')
    step_y = rounded_attr('DY')

    # number of grid cells along x and y
    n_cols = len(ds['east_west'])
    n_rows = len(ds['north_south'])

    # lower-left (south-west) corner of the grid
    south = rounded_attr('SOUTH_WEST_CORNER_LAT')
    west = rounded_attr('SOUTH_WEST_CORNER_LON')

    # upper-right corner; used as the excluded endpoint of the ranges below
    north = south + (step_y * n_rows)
    east = west + (step_x * n_cols)

    # one coordinate value per grid cell, starting at the lower-left corner
    lat_values = np.linspace(south, north, num=n_rows, endpoint=False, dtype=np.float32)
    lon_values = np.linspace(west, east, num=n_cols, endpoint=False, dtype=np.float32)

    # remember the metadata of the original variables before renaming them
    original_lon_attrs = ds.lon.attrs
    original_lat_attrs = ds.lat.attrs

    renamed = (
        ds
        # free up the names 'lon'/'lat' by moving the original variables aside
        .rename({'lon': 'orig_lon', 'lat': 'orig_lat'})
        # the grid dimensions take over the names 'lat'/'lon'
        .rename({'north_south': 'lat', 'east_west': 'lon'})
        # attach the computed values as coordinates
        .assign_coords({'lat': lat_values, 'lon': lon_values})
    )

    # carry the original metadata over to the new coordinates
    renamed['lon'].attrs = original_lon_attrs
    renamed['lat'].attrs = original_lat_attrs

    return renamed

In [None]:
# the local filesystem backs both the input cache and the recipe metadata
fs_local = LocalFileSystem()

# cache target rooted at the scratch directory
temp_dir = '/home/jovyan/efs/tmp'
fs_temp = CacheFSSpecTarget(fs=fs_local, root_path=temp_dir)

# the Zarr store is written to S3, mirroring the input directory layout
zarr_dir = 'TEMP'
zarr_name = 'rechunk_test.zarr'

target_path = '/'.join(('eis-dh-sealevel', zarr_dir, ds_dir, zarr_name))
fs_target = FSSpecTarget(fs=s3, root_path=target_path)

# metadata goes into a fresh temporary directory below the scratch directory;
# meta_dir must stay referenced so the directory is not cleaned up early
meta_dir = tempfile.TemporaryDirectory(dir=temp_dir)
fs_meta = MetadataTarget(fs=fs_local, root_path=meta_dir.name)

# bundle target, cache and metadata storage for the recipe
storage_config = StorageConfig(
    target=fs_target,
    cache=fs_temp,
    metadata=fs_meta,
)

Define the chunking scheme for the target Zarr store:

In [None]:
# chunk length along each dimension of the target Zarr store
target_chunks = dict(time=100, lon=100, lat=100)

Create the `XarrayZarrRecipe`:

In [None]:
recipe = XarrayZarrRecipe(
    # file URL pattern
    file_pattern=pattern,
    # input files per chunk
    inputs_per_chunk=100,
    # storage configuration for caches and target
    storage_config=storage_config,
    # preprocessing function applied to each chunk
    process_chunk=add_latlon_coords,
    # read inputs directly from S3 instead of caching them first
    cache_inputs=False,
    # chunking scheme for the output store
    target_chunks=target_chunks,
)

Inspect the recipe:

In [None]:
# display the recipe's repr to check the configured options
recipe

In [None]:
# materialize every input key of the recipe and count them
all_inputs = list(recipe.iter_inputs())
len(all_inputs)

In [None]:
# materialize every chunk key of the recipe and count them;
# all_chunks is reused by later cells
all_chunks = list(recipe.iter_chunks())
len(all_chunks)

In [None]:
# for input_file in recipe.inputs_for_chunk(all_chunks[0]):
#     recipe.cache_input(input_file)

In [None]:
# with recipe.open_chunk(all_chunks[0]) as ds:
#     display(ds)
#     ds.load()

In [None]:
# IPython magic: switch exception reporting to the minimal mode
%xmode minimal
# open the first chunk as `ds` and render it in the notebook
with recipe.open_chunk(all_chunks[0]) as ds:
    display(ds)

In [None]:
# set up the target store before any chunk is written
# NOTE(review): presumably this initializes the Zarr store at fs_target — confirm
recipe.prepare_target()

In [None]:
import zarr

# open the target store on S3 through its consolidated metadata and print
# the hierarchy it contains
store_url = 's3://' + target_path
zgroup = zarr.open_consolidated(store_url)
print(zgroup.tree())

In [None]:
# store the chunks one at a time, with a progress bar sized by the chunk
# count computed earlier
progress = tqdm(recipe.iter_chunks(), total=len(all_chunks))
for chunk in progress:
    recipe.store_chunk(chunk)

# called once, after every chunk has been stored
recipe.finalize_target()