In [1]:
import xarray as xr
import glob

from dask.distributed import Client, LocalCluster
from dask_jobqueue import SLURMCluster

# ERA5 Land data: from NetCDF3 to Zarr

In this notebook we load global ERA5-Land data stored in a collection of NetCDF3 files and save it as a (chunked) Zarr store.

## Input variables

In [15]:
# Year to convert; it is interpolated into both the input and output paths.
year = 2015

In [16]:
# Root directory under which both the input and the output trees live.
ROOT_DIR = '/gpfs/work2/0/ttse0619'

# Directory holding the ERA5-Land NetCDF files for the selected year.
# No trailing slash (consistent with OUT_DIR), so that building paths as
# f'{ERA5_DIR}/<name>' does not produce a double slash.
ERA5_DIR = (
    f'{ROOT_DIR}/qianqian/global_data_Qianqian/'
    f'1input_data/{year}global/era5land'
)

# Directory where the converted Zarr store is written.
OUT_DIR = (
    f'{ROOT_DIR}/francesco/Projects/EcoExtreML/Data/'
    f'1input_data/{year}global/era5land'
)

## Setup Dask cluster

NOTE: when working with NetCDF files (and the netcdf4 library) it is much better to work with many processes and few threads per process: netcdf4 can only read from one thread per process.

In [3]:
# Alternative for running on a single node (uncomment to use instead of SLURMCluster):
# cluster = LocalCluster(n_workers=16, threads_per_worker=1)

In [5]:
# Each SLURM job gets 16 cores split over 16 worker processes,
# i.e. one thread per worker.
cluster = SLURMCluster(
    name='dask-worker',
    # Scheduling: partition and maximum run time of each job.
    queue='fat',
    walltime='5:00:00',
    # Resources per job.
    cores=16,
    processes=16,
    memory='120GiB',
    # Scratch directory used by the workers.
    local_directory='$TMPDIR',
)

We use in total 4 * 16 = 64 single-threaded workers, and ~480 GiB total memory: 

In [9]:
# Request 4 SLURM jobs, each configured as defined for the cluster.
cluster.scale(jobs=4)

In [10]:
# Connect a client to the cluster; the trailing expression displays its summary.
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://145.136.56.49:33215,Workers: 0
Dashboard: /proxy/8787/status,Total threads: 0
Started: 1 minute ago,Total memory: 0 B


## Converting the dataset 

In [18]:
# Collect the input NetCDF files in a deterministic (sorted) order.
era5_paths = sorted(
    glob.glob(f'{ERA5_DIR}/era5-land_*.nc')
)
# Fail early with a clear message if nothing matches
# (e.g. wrong `year` or a missing input directory).
assert era5_paths, f'no ERA5-Land files found in {ERA5_DIR}'

In [19]:
# Open all files as a single dataset with 250 x 250 spatial chunks,
# then rechunk along time so that each chunk spans 750 time steps.
ds = xr.open_mfdataset(
    era5_paths,
    chunks={'longitude': 250, 'latitude': 250},
).chunk({'time': 750})

The resulting dataset has 8 variables, with chunks of approximately 180 MiB:

In [22]:
# Display the dataset summary (variables, shapes and chunking).
ds

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 211.58 GiB 178.81 MiB Shape (8760, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8760,

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 211.58 GiB 178.81 MiB Shape (8760, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8760,

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 211.58 GiB 178.81 MiB Shape (8760, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8760,

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 211.58 GiB 178.81 MiB Shape (8760, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8760,

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 211.58 GiB 178.81 MiB Shape (8760, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8760,

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 211.58 GiB 178.81 MiB Shape (8760, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8760,

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 211.58 GiB 178.81 MiB Shape (8760, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8760,

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 211.58 GiB 178.81 MiB Shape (8760, 1801, 3600) (750, 250, 250) Dask graph 1440 chunks in 26 graph layers Data type float32 numpy.ndarray",3600  1801  8760,

Unnamed: 0,Array,Chunk
Bytes,211.58 GiB,178.81 MiB
Shape,"(8760, 1801, 3600)","(750, 250, 250)"
Dask graph,1440 chunks in 26 graph layers,1440 chunks in 26 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [24]:
%%time
# Write the dataset to a Zarr store under OUT_DIR and time the whole cell.
# mode='w' overwrites an existing store at the same path.
zarr_path = f'{OUT_DIR}/era5land.zarr'
ds.to_zarr(zarr_path, mode='w')

CPU times: user 6min 45s, sys: 18.8 s, total: 7min 4s
Wall time: 1h 12min 54s


<xarray.backends.zarr.ZarrStore at 0x14eb12e96270>

When done with the conversion we switch off the cluster to release resources:

In [25]:
# Shut down the scheduler and workers this client is connected to.
client.shutdown()

The overall Zarr archive is ~200 GB:

In [29]:
# Disk usage of the Zarr store in human-readable units.
!du -h $zarr_path

31G	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/era5land/era5land.zarr/sp
18G	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/era5land/era5land.zarr/ssrd
30G	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/era5land/era5land.zarr/v10
2.0K	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/era5land/era5land.zarr/longitude
2.0K	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/era5land/era5land.zarr/latitude
30G	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/era5land/era5land.zarr/d2m
11G	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/era5land/era5land.zarr/tp
29G	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/era5land/era5land.zarr/u10
2.0K	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/era5la