In [1]:
import xarray as xr
import glob

from dask.distributed import Client, LocalCluster
from dask_jobqueue import SLURMCluster

# ERA5 Land data: from NetCDF3 to Zarr

In this notebook we load ERA5 Land data with global coverage, stored in a collection of NetCDF3 files, and we save it as a single (chunked) Zarr store.

## Input variables

In [2]:
# Year of the ERA5 Land data to convert; it selects the input/output directories.
year = 2012

In [3]:
# Base directory of the project storage.
ROOT_DIR = '/gpfs/work2/0/ttse0619'

# Directory holding the ERA5 Land NetCDF input files for the selected year.
# No trailing slash: paths are built as f'{ERA5_DIR}/<name>', so a trailing
# slash here would yield a double slash ('.../era5land//era5-land_*.nc').
ERA5_DIR = (
    f'{ROOT_DIR}/qianqian/global_data_Qianqian/'
    f'1input_data/{year}global/era5land'
)

# The Zarr store is written next to the input files.
OUT_DIR = ERA5_DIR

## Setup Dask cluster

NOTE: when working with NetCDF files (and the netcdf4 library) it is much better to work with many processes and few threads per process: netcdf4 can only read from one thread per process.

In [4]:
# Alternative for a single machine (no SLURM): uncomment to use a local cluster instead.
# cluster = LocalCluster(n_workers=16, threads_per_worker=1)

In [5]:
# Template for one SLURM job: 16 cores split over 16 worker processes
# (i.e. one thread per worker), sharing 120 GiB of memory.
cluster = SLURMCluster(
    # Job identification and scheduling
    name='dask-worker',
    queue='fat',
    walltime='3:00:00',
    # Resources per job
    cores=16,
    processes=16,
    memory='120GiB',
    # Node-local directory used by the workers
    local_directory='$TMPDIR',
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36437 instead


We request 4 jobs of 16 single-threaded workers each, i.e. 4 * 16 = 64 workers and 4 * 120 GiB = ~480 GiB of memory in total:

In [6]:
# Submit 4 SLURM jobs, each running the workers configured for the cluster.
cluster.scale(jobs=4)

In [7]:
# Connect this notebook session to the cluster.
client = Client(cluster)
# Display the client summary (dashboard link, number of workers, memory).
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/36437/status,

0,1
Dashboard: /proxy/36437/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://145.136.63.1:43843,Workers: 0
Dashboard: /proxy/36437/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Converting the dataset 

In [4]:
# Collect every ERA5 Land NetCDF file in the input directory, sorted by path.
era5_paths = glob.glob(f'{ERA5_DIR}/era5-land_*.nc')
era5_paths.sort()

In [5]:
# Sanity check: number of input files matched by the glob pattern.
len(era5_paths)

96

In [6]:
# Open all files as one Dask-backed dataset chunked in space (250 x 250),
# then rechunk along time in blocks of 750 steps.
ds = (
    xr.open_mfdataset(era5_paths, chunks=dict(longitude=250, latitude=250))
    .chunk(dict(time=750))
)

The resulting dataset has 8 variables, with chunks of approximately 180 MiB:

In [7]:
# Show the dataset summary (variables, shapes and Dask chunk layout).
ds

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 212.16 GiB 178.81 MiB Shape (8784, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8784,

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 212.16 GiB 178.81 MiB Shape (8784, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8784,

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 212.16 GiB 178.81 MiB Shape (8784, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8784,

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 212.16 GiB 178.81 MiB Shape (8784, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8784,

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 212.16 GiB 178.81 MiB Shape (8784, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8784,

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 212.16 GiB 178.81 MiB Shape (8784, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8784,

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 212.16 GiB 178.81 MiB Shape (8784, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8784,

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 212.16 GiB 178.81 MiB Shape (8784, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8784,

Unnamed: 0,Array,Chunk
Bytes,212.16 GiB,178.81 MiB
Shape,"(8784, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [14]:
%%time
zarr_path = f'{OUT_DIR}/era5land.zarr'
ds.to_zarr(zarr_path, mode='w')

CPU times: user 26min 17s, sys: 1min 4s, total: 27min 22s
Wall time: 2h 16min 25s


<xarray.backends.zarr.ZarrStore at 0x1547b9fd6120>

When done with the conversion we switch off the cluster to release resources:

In [15]:
# Shut down the Dask scheduler and workers this client is connected to.
client.shutdown()

The overall Zarr archive is ~200 GB:

In [4]:
zarr_path = f'{OUT_DIR}/era5land.zarr'
!du -h $zarr_path

31G	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2012global/era5land/era5land.zarr/sp
18G	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2012global/era5land/era5land.zarr/ssrd
30G	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2012global/era5land/era5land.zarr/v10
2.0K	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2012global/era5land/era5land.zarr/longitude
2.0K	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2012global/era5land/era5land.zarr/latitude
31G	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2012global/era5land/era5land.zarr/d2m
11G	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2012global/era5land/era5land.zarr/tp
29G	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2012global/era5land/era5land.zarr/u10
2.0K	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2012global/era5land/era5land.zarr/time
30G	/gpfs/work2/0/ttse0