In [1]:
# import dask.config as dc
import dask.array as da
import numpy as np
import rioxarray
import xarray as xr
import glob

from dask.distributed import Client, LocalCluster
from dask_jobqueue import SLURMCluster

# LAI data: from NetCDF to Zarr

The Leaf Area Index (LAI) dataset is provided as chunked NetCDF4/HDF5 data. We reproject the data so that its spatial extent matches the ERA5-Land dataset using rioxarray, which, unfortunately, does not support Dask.

## Input variables

In [2]:
# Year of the data to convert; it is interpolated into the input and output paths.
year = 2015

In [3]:
# Base directory shared by all input and output paths.
ROOT_DIR = '/gpfs/work2/0/ttse0619'
# Directory that is searched for the input LAI NetCDF files.
LAI_DIR = (
    f'{ROOT_DIR}/qianqian/global_data_Qianqian/'
    f'1input_data/{year}global/lai/'
)
# Directory in which the output Zarr store is created.
OUT_DIR = (
    f'{ROOT_DIR}/francesco/Projects/EcoExtreML/Data/'
    f'1input_data/{year}global/lai'
)

In [4]:
# Path to the ERA5-Land Zarr store whose grid is used as the reprojection target.
ERA5_PATH = (
    f'{ROOT_DIR}/francesco/Projects/EcoExtreML/Data/'
    f'1input_data/{year}global/era5land/era5land.zarr'
)

## Setup Dask cluster

NOTE: when working with NetCDF files (and the netcdf4 library) it is much better to work with many processes and few threads per process: netcdf4 can only read from one thread per process.

In [5]:
# Alternative for running on a single machine (uncomment to use it instead of SLURM):
# cluster = LocalCluster(n_workers=4, threads_per_worker=1)

In [6]:
# Describe a single SLURM job: 16 cores split over 16 worker processes,
# i.e. one thread per process, which suits reading NetCDF files.
cluster = SLURMCluster(
    # scheduler-side settings
    name='dask-worker',
    queue='fat',
    walltime='5:00:00',
    # resources of each job
    cores=16,
    processes=16,
    memory='120GiB',
    # scratch space used by the workers
    local_directory='$TMPDIR',
)

In [7]:
# Request 4 SLURM jobs, each with the resources configured on the cluster object.
cluster.scale(jobs=4)

In [8]:
# Connect this session to the cluster's scheduler.
client = Client(cluster)
client  # display the connection summary

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://145.136.57.179:43287,Workers: 0
Dashboard: /proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Converting the dataset

We start by loading the ERA5-land Zarr dataset, which we will use as the target for reprojection: 

In [9]:
# Open the ERA5-Land Zarr store; the data is only loaded when needed.
ds = xr.open_zarr(ERA5_PATH)

In [10]:
# Attach the CRS so that rioxarray can use the dataset as a reprojection target.
ds = ds.rio.write_crs('WGS84')
# Take the first time step of variable 'sp' as a 2D grid; drop=True removes
# the then-scalar time coordinate.
target = ds['sp'].isel(time=0, drop=True)

We now open the LAI NetCDF files. NOTE: we do not split the spatial dimensions into chunks because every time step has to be loaded in full anyway to perform the reprojection.

In [23]:
# Collect the LAI NetCDF files found in LAI_DIR, sorted by path.
lai_paths = sorted(glob.iglob(f'{LAI_DIR}/c_gls_LAI_*_GLOBE_PROBAV_V1.5.1.nc'))

In [24]:
# Open all files lazily as one dataset. The chunks are set explicitly so that
# every block holds one time step over the full spatial extent, matching the
# chunking of the template used for the block-wise reprojection. `chunks={}`
# would instead adopt the backend's preferred chunking (typically the on-disk
# chunks of the NetCDF4/HDF5 files), which may split the spatial grid.
lai = xr.open_mfdataset(lai_paths, chunks={'time': 1, 'lat': -1, 'lon': -1})
lai = lai['LAI']  # keep only one variable
# Register NaN as the nodata value for rioxarray.
lai.rio.write_nodata(np.nan, inplace=True)

We fix the longitude ordering and make sure the CRS is in place:

In [25]:
# Attach the CRS and switch to the 'longitude'/'latitude' coordinate names.
lai = (
    lai.rio.write_crs('WGS84')
    .rename(lon='longitude', lat='latitude')
)
# Wrap the longitudes into [0, 360); latitude is re-assigned unchanged.
lai = lai.assign_coords(
    longitude=(lai.longitude + 360) % 360,
    latitude=lai.latitude,
)
# Count the longitudes that are >= 180 after wrapping and shift the array
# (data and coordinates) left by that many positions.
nroll = np.count_nonzero(lai.longitude.values >= 180)
lai = lai.roll(longitude=-nroll, roll_coords=True)

We define the template for reprojection and run it!

In [26]:
# Lazy all-zero array describing the expected result of the reprojection:
# the time axis of the LAI data combined with the latitude/longitude axes of
# the target grid, stored as one chunk per time step.
template = xr.DataArray(
    da.zeros(
        (len(lai.time), len(target.latitude), len(target.longitude)),
        chunks=(1, -1, -1),
    ),
    coords=dict(
        time=lai.time,
        latitude=target.latitude,
        longitude=target.longitude,
    ),
    dims=('time', 'latitude', 'longitude'),
)

In [27]:
def reproject(source):
    """Reproject one block of LAI data onto the grid of ``target``.

    Intended to be applied block-wise via ``xr.map_blocks``.

    Parameters
    ----------
    source : xarray.DataArray
        Block of the LAI array to reproject.

    Returns
    -------
    xarray.DataArray
        ``source`` resampled with average resampling (NaN as nodata) to
        match the notebook-level ``target`` array, with the 'crs' variable
        dropped and the 'x'/'y' dimensions renamed to
        'longitude'/'latitude'.
    """
    # Import here so that the names are defined on the workers as well.
    # `Resampling` comes from rasterio (a rioxarray dependency) and is not
    # imported at notebook level.
    import rioxarray  # noqa: F401 -- provides the `.rio` accessor
    from rasterio.enums import Resampling

    reprojected = source.rio.reproject_match(
        target,
        nodata=np.nan,
        resampling=Resampling.average,
    )
    reprojected = reprojected.drop_vars('crs')
    return reprojected.rename(x='longitude', y='latitude')

# Lazily apply `reproject` to every block of `lai`; `template` describes the
# shape, coordinates and chunking of the result.
lai_reprojected = xr.map_blocks(reproject, lai, template=template)

We rechunk and save it as Zarr:

In [28]:
# Rechunk to full time series over 250 x 250 spatial tiles, then wrap the
# array in a dataset with a single variable named 'LAI'.
lai_reprojected = (
    lai_reprojected
    .chunk({'time': -1, 'longitude': 250, 'latitude': 250})
    .to_dataset(name='LAI')
)

In [29]:
%%time
# Location of the output Zarr store.
zarr_path = f'{OUT_DIR}/lai.zarr'
# mode='w' overwrites any existing store at this path.
lai_reprojected.to_zarr(zarr_path, mode='w')

CPU times: user 5.2 s, sys: 263 ms, total: 5.47 s
Wall time: 39.5 s


<xarray.backends.zarr.ZarrStore at 0x14a917625ba0>

When done with the conversion we switch off the cluster to release resources:

In [30]:
# Shut down the scheduler and the workers this client is connected to.
client.shutdown()

In [31]:
# Report the disk usage of the Zarr store at `zarr_path`.
!du -h $zarr_path

149M	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/lai/lai.zarr/LAI
2.0K	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/lai/lai.zarr/longitude
2.0K	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/lai/lai.zarr/latitude
2.0K	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/lai/lai.zarr/time
149M	/gpfs/work2/0/ttse0619/francesco/Projects/EcoExtreML/Data/1input_data/2015global/lai/lai.zarr
