# Basic test of Coiled processing NASA Earthdata using the xarray and s3fs libraries with kerchunk

In [7]:
# Access libraries
import s3fs
import requests
import earthaccess

# Parallel computing libraries
import coiled
from dask.distributed import Client, LocalCluster

# Data analysis libraries
import xarray as xr

# Visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline

## Start cluster

In [10]:
# Settings for the Coiled-managed Dask cluster. The commented-out entries are
# optional overrides kept for reference.
# NOTE(review): region is presumably chosen to sit next to the S3-hosted data — confirm.
cluster_options = dict(
    n_workers=10,
    account="podaac-science",
    region="us-west-2",
    # worker_memory="64 GiB",
    # name='podaac-science-c0a69b8d-e',
)

# Start the cluster, then obtain a Dask client connected to it.
cluster = coiled.Cluster(**cluster_options)
client = cluster.get_client()

Output()

Output()

In [11]:
# Display the client summary: connection method, dashboard URL, and worker/thread/memory totals.
client

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: https://cluster-vgtwb.dask.host/mjLag4L621GzYUaN/status,

0,1
Dashboard: https://cluster-vgtwb.dask.host/mjLag4L621GzYUaN/status,Workers: 10
Total threads: 40,Total memory: 148.33 GiB

0,1
Comm: tls://10.0.53.213:8786,Workers: 10
Dashboard: http://10.0.53.213:8787/status,Total threads: 40
Started: 1 minute ago,Total memory: 148.33 GiB

0,1
Comm: tls://10.0.55.40:38561,Total threads: 4
Dashboard: http://10.0.55.40:8787/status,Memory: 14.83 GiB
Nanny: tls://10.0.55.40:33399,
Local directory: /scratch/dask-scratch-space/worker-kr79ve50,Local directory: /scratch/dask-scratch-space/worker-kr79ve50

0,1
Comm: tls://10.0.56.86:35751,Total threads: 4
Dashboard: http://10.0.56.86:8787/status,Memory: 14.84 GiB
Nanny: tls://10.0.56.86:40249,
Local directory: /scratch/dask-scratch-space/worker-pv7n9fnk,Local directory: /scratch/dask-scratch-space/worker-pv7n9fnk

0,1
Comm: tls://10.0.52.38:33487,Total threads: 4
Dashboard: http://10.0.52.38:8787/status,Memory: 14.84 GiB
Nanny: tls://10.0.52.38:38353,
Local directory: /scratch/dask-scratch-space/worker-5xx9mvd4,Local directory: /scratch/dask-scratch-space/worker-5xx9mvd4

0,1
Comm: tls://10.0.50.226:42511,Total threads: 4
Dashboard: http://10.0.50.226:8787/status,Memory: 14.84 GiB
Nanny: tls://10.0.50.226:43205,
Local directory: /scratch/dask-scratch-space/worker-4t1a8rel,Local directory: /scratch/dask-scratch-space/worker-4t1a8rel

0,1
Comm: tls://10.0.55.203:41025,Total threads: 4
Dashboard: http://10.0.55.203:8787/status,Memory: 14.83 GiB
Nanny: tls://10.0.55.203:37029,
Local directory: /scratch/dask-scratch-space/worker-1266kpxf,Local directory: /scratch/dask-scratch-space/worker-1266kpxf

0,1
Comm: tls://10.0.59.58:44161,Total threads: 4
Dashboard: http://10.0.59.58:8787/status,Memory: 14.83 GiB
Nanny: tls://10.0.59.58:46355,
Local directory: /scratch/dask-scratch-space/worker-affm8qlg,Local directory: /scratch/dask-scratch-space/worker-affm8qlg

0,1
Comm: tls://10.0.58.95:33949,Total threads: 4
Dashboard: http://10.0.58.95:8787/status,Memory: 14.82 GiB
Nanny: tls://10.0.58.95:39165,
Local directory: /scratch/dask-scratch-space/worker-klb1o_t7,Local directory: /scratch/dask-scratch-space/worker-klb1o_t7

0,1
Comm: tls://10.0.63.41:34519,Total threads: 4
Dashboard: http://10.0.63.41:8787/status,Memory: 14.84 GiB
Nanny: tls://10.0.63.41:37007,
Local directory: /scratch/dask-scratch-space/worker-f9fze2r0,Local directory: /scratch/dask-scratch-space/worker-f9fze2r0

0,1
Comm: tls://10.0.48.187:46187,Total threads: 4
Dashboard: http://10.0.48.187:8787/status,Memory: 14.84 GiB
Nanny: tls://10.0.48.187:46661,
Local directory: /scratch/dask-scratch-space/worker-nkw8_gkp,Local directory: /scratch/dask-scratch-space/worker-nkw8_gkp

0,1
Comm: tls://10.0.55.221:44291,Total threads: 4
Dashboard: http://10.0.55.221:8787/status,Memory: 14.83 GiB
Nanny: tls://10.0.55.221:42971,
Local directory: /scratch/dask-scratch-space/worker-wrym2wgr,Local directory: /scratch/dask-scratch-space/worker-wrym2wgr


## Locate files using `earthaccess`

In [12]:
# Authenticate with NASA Earthdata Login (EDL) using the interactive strategy.
earthaccess.login(strategy="interactive")

We are already authenticated with NASA EDL


<earthaccess.auth.Auth at 0x12075ae50>

In [13]:
# Query parameters for the granule search. The commented-out entries are
# optional spatial/temporal filters kept for reference.
search_options = dict(
    short_name="MUR-JPL-L4-GLOB-v4.1",
    cloud_hosted=True,
    # bounding_box=bounding_box,
    # temporal=("2021-04-01", "2021-05-6"),
    count=100,  # limit on the number of granules returned
)

# Find the matching cloud-hosted granules.
results = earthaccess.search_data(**search_options)

Granules found: 7743


## Load data with kerchunk and xarray

In [14]:
# Consolidated metadata file
# kerchunk combine settings: concatenate the granules along "time" and treat
# the lat/lon coordinates as identical across granules.
combine_options = {
    "concat_dims": "time",
    "identical_dims": ['lat', 'lon'],
}

metadata = earthaccess.consolidate_metadata(
    results,
    kerchunk_options=combine_options,
    # outfile="some S3 path"
)

In [15]:
# S3 filesystem session for the PODAAC DAAC; its storage options are forwarded
# below so the referenced S3 objects can be read.
fs = earthaccess.get_s3fs_session(daac="PODAAC")

# Options for the fsspec "reference://" filesystem: the consolidated kerchunk
# metadata plus how to reach the remote (S3) objects it points at.
reference_options = {
    "fo": metadata,
    "remote_protocol": "s3",
    "remote_options": fs.storage_options,
}

# Open the combined references as a single dataset through the zarr engine.
# chunks={} yields dask-backed variables instead of eagerly loaded arrays.
ds = xr.open_dataset(
    "reference://",
    engine="zarr",
    chunks={},
    backend_kwargs={
        "consolidated": False,
        "storage_options": reference_options,
    },
)

In [25]:
# Inspect the dask-backed SST variable: total size, shape, and chunk layout.
ds['analysed_sst']

Unnamed: 0,Array,Chunk
Bytes,241.39 GiB,7.99 MiB
Shape,"(100, 17999, 36000)","(1, 1023, 2047)"
Dask graph,32400 chunks in 2 graph layers,32400 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 241.39 GiB 7.99 MiB Shape (100, 17999, 36000) (1, 1023, 2047) Dask graph 32400 chunks in 2 graph layers Data type float32 numpy.ndarray",36000  17999  100,

Unnamed: 0,Array,Chunk
Bytes,241.39 GiB,7.99 MiB
Shape,"(100, 17999, 36000)","(1, 1023, 2047)"
Dask graph,32400 chunks in 2 graph layers,32400 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## Computations

In [26]:
def regional_spatial_mean(sstdata, latbnds, lonbnds):
    """Return the spatial mean of ``sstdata`` over a lat/lon box.

    Meant to be run with dask.

    Parameters
    ----------
    sstdata : xarray.DataArray
        Data with ``lat`` and ``lon`` dimensions.
    latbnds : 2-tuple
        Latitude bounds, unpacked into ``slice(*latbnds)``.
    lonbnds : 2-tuple
        Longitude bounds, unpacked into ``slice(*lonbnds)``.

    Returns
    -------
    The mean over ``lat`` and ``lon`` of the subsetted region (NaNs skipped),
    loaded via ``.load()``.
    """
    lat_window = slice(*latbnds)
    lon_window = slice(*lonbnds)
    regional_mean = (
        sstdata
        .sel(lat=lat_window, lon=lon_window)      # spatial subset
        .mean(dim=['lat', 'lon'], skipna=True)    # average over the box
    )
    # .load() triggers the actual computation and returns the loaded result.
    return regional_mean.load()

In [27]:
## Bounding boxes for two regions to perform computations over.
## Each region is a pair of 2-tuples: latitude bounds, then longitude bounds.

# Sub-tropical western Atlantic region:
latbnds1, lonbnds1 = (20, 30), (-82, -72)

# Mid-latitude eastern Pacific region:
latbnds2, lonbnds2 = (45, 55), (-137, -127)

In [28]:
%%time

# Computations: regional mean SST for each bounding box. Each call subsets
# ds['analysed_sst'], averages over lat/lon, and loads the result, so the two
# regions are computed one after the other.
# NOTE(review): the recorded run of this cell raised `KeyError: 20`, i.e. the
# label-based slice on `lat` failed at its lower bound. Presumably the `lat`
# coordinate index is not monotonic (or holds invalid values) after the
# kerchunk consolidation — inspect ds['lat'].values to confirm before re-running.
region1_mean_sst = regional_spatial_mean(ds['analysed_sst'], latbnds1, lonbnds1)
region2_mean_sst = regional_spatial_mean(ds['analysed_sst'], latbnds2, lonbnds2)

KeyError: 20

In [None]:
## Plot results

# One figure with a single axes, created through the explicit object interface
# rather than the implicit pyplot state machine.
fig, axes = plt.subplots(figsize=(8, 4))

# One time series per region: regional mean SST against time.
axes.plot(region1_mean_sst['time'], region1_mean_sst.values, label='western sub-tropical Atlantic')
axes.plot(region2_mean_sst['time'], region2_mean_sst.values, label='eastern mid-latitude Pacific')

# Label both axes and title the figure so it can be read on its own.
axes.legend()
axes.set_xlabel('Time', fontsize=14)
axes.set_ylabel('SST [K]', fontsize=14)
axes.set_title('Regional mean SST', fontsize=14);  # trailing ';' hides the Text repr

In [29]:
# Shut down the Coiled cluster now that the work is finished.
cluster.shutdown()