In [1]:
# import dask.config as dc
import dask.array as da
import numpy as np
import rioxarray
import xarray as xr
import glob

from dask.distributed import Client, LocalCluster
from dask_jobqueue import SLURMCluster

# LAI data: from NetCDF to Zarr

The Leaf Area Index (LAI) dataset is provided as chunked NetCDF4/HDF5 data. We reproject the spatial extend to match the ERA5 land dataset using rioxarray, which, unfortunately does not support Dask. It is global.

## Input variables

In [50]:
year = 2018

In [51]:
ROOT_DIR = '/gpfs/work2/0/ttse0619'
LAI_DIR = (
    f'{ROOT_DIR}/qianqian/global_data_Qianqian/'
    f'1input_data/{year}global/lai_v2/'
)
OUT_DIR = (
    f'{ROOT_DIR}/qianqian/global_data_Qianqian/'
    f'1input_data/{year}global/lai_v2' #put the result of 2014 and 2015 in 2015 folder
)

In [52]:
LAI_DIR_before = (
    f'{ROOT_DIR}/qianqian/global_data_Qianqian/'
    f'1input_data/{year-1}global/lai_v2/'
)
LAI_DIR_after = (
    f'{ROOT_DIR}/qianqian/global_data_Qianqian/'
    f'1input_data/{year+1}global/lai_v2/'
)

In [53]:
ERA5_PATH = (
    f'{ROOT_DIR}/francesco/Projects/EcoExtreML/Data/'
    f'1input_data/{2015}global/era5land/era5land.zarr'
)

## Setup Dask cluster

NOTE: when working with NetCDF files (and the netcdf4 library) it is much better to work with many processes and few threads per process: netcdf4 can only read from one thread per process.

In [30]:
# cluster = LocalCluster(n_workers=4, threads_per_worker=1)

In [7]:
cluster = SLURMCluster(
    name='dask-worker',
    cores=16,
    processes=16,
    queue='fat',
    memory='120GiB',
    local_directory='$TMPDIR',
    walltime='1:00:00'
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46315 instead


In [8]:
cluster.scale(jobs=4)

In [9]:
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/46315/status,

0,1
Dashboard: /proxy/46315/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://145.136.63.41:40417,Workers: 0
Dashboard: /proxy/46315/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Converting the dataset

We start by loading the ERA5-land Zarr dataset, which we will use as the target for reprojection: 

In [54]:
ds = xr.open_zarr(ERA5_PATH)

In [55]:
ds = ds.rio.write_crs('WGS84')
target = ds['sp'].isel(time=0, drop=True)

In [56]:
LAI_DIR

'/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2018global/lai_v2/'

In [57]:
lai_paths_year = sorted(
    glob.glob(f'{LAI_DIR}/c_gls_LAI*_GLOBE_*.nc') #different from LAI_V1
)
lai_paths_before = sorted(
    glob.glob(f'{LAI_DIR_before}/c_gls_LAI*_GLOBE_*.nc')
)[-1]
lai_paths_after = sorted(
    glob.glob(f'{LAI_DIR_after}/c_gls_LAI*_GLOBE_*.nc')
)[0]

We now open the LAI NetCDF files. NOTE: we do not set chunks in the spatial dimension because we need to load all values anyway to perform the reprojection. 

In [58]:
lai_paths = []
lai_paths.append(lai_paths_before)
lai_paths.extend(lai_paths_year)
lai_paths.append(lai_paths_after)

In [59]:
len(lai_paths)

38

In [60]:
lai = xr.open_mfdataset(lai_paths, chunks={})
lai = lai['LAI']  # keep only one variable
lai.rio.write_nodata(np.nan, inplace=True)

Unnamed: 0,Array,Chunk
Bytes,89.50 GiB,2.36 GiB
Shape,"(38, 15680, 40320)","(1, 15680, 40320)"
Dask graph,38 chunks in 77 graph layers,38 chunks in 77 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 89.50 GiB 2.36 GiB Shape (38, 15680, 40320) (1, 15680, 40320) Dask graph 38 chunks in 77 graph layers Data type float32 numpy.ndarray",40320  15680  38,

Unnamed: 0,Array,Chunk
Bytes,89.50 GiB,2.36 GiB
Shape,"(38, 15680, 40320)","(1, 15680, 40320)"
Dask graph,38 chunks in 77 graph layers,38 chunks in 77 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


We fix the longitude ordering and make sure the CRS is in place:

In [61]:
lai = lai.rio.write_crs('WGS84')
lai = lai.rename(
    lon='longitude',
    lat='latitude',
)
lai = lai.assign_coords(
    longitude=(lai.longitude + 360) % 360,
    latitude=lai.latitude
)
# roll array to reorder coordinates 
nroll = (lai.longitude.values >= 180 ).sum()
lai = lai.roll(longitude=-nroll, roll_coords=True)

We define the template for reprojection and run it!

In [62]:
template = xr.DataArray(
    data=da.zeros(
        (len(lai.time), len(target.latitude), len(target.longitude)),
        chunks=(1, -1, -1),
    ),
    dims=('time', 'latitude', 'longitude'),
    coords={
        'time': lai.time, 
        'latitude': target.latitude, 
        'longitude': target.longitude,
    },
)

In [63]:
def reproject(source):
    # import here, otherwise not seen by workers
    import rioxarray
    from rasterio.warp import reproject, Resampling
    reprojected = source.rio.reproject_match(
        target,
        nodata=np.nan,
        resampling=Resampling.average
    )
    reprojected = reprojected.drop_vars('crs')
    return reprojected.rename(x='longitude', y='latitude')

lai_reprojected = xr.map_blocks(
    reproject,
    lai,
    template=template,
)

We rechunk and save it as Zarr:

In [64]:
lai_reprojected = lai_reprojected.chunk(
    time=-1, 
    longitude=250, 
    latitude=250
)
lai_reprojected = lai_reprojected.to_dataset(name='LAI')

In [65]:
lai_reprojected

Unnamed: 0,Array,Chunk
Bytes,1.84 GiB,18.12 MiB
Shape,"(38, 1801, 3600)","(38, 250, 250)"
Dask graph,120 chunks in 86 graph layers,120 chunks in 86 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.84 GiB 18.12 MiB Shape (38, 1801, 3600) (38, 250, 250) Dask graph 120 chunks in 86 graph layers Data type float64 numpy.ndarray",3600  1801  38,

Unnamed: 0,Array,Chunk
Bytes,1.84 GiB,18.12 MiB
Shape,"(38, 1801, 3600)","(38, 250, 250)"
Dask graph,120 chunks in 86 graph layers,120 chunks in 86 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [66]:
%%time
zarr_path = f'{OUT_DIR}/lai_v2.zarr'
lai_reprojected.to_zarr(zarr_path, mode='w')

CPU times: user 11.3 s, sys: 524 ms, total: 11.8 s
Wall time: 47.1 s


<xarray.backends.zarr.ZarrStore at 0x14bb44274ac0>

When done with the conversion we switch off the cluster to release resources:

In [71]:
client.shutdown()

In [76]:
zarr_path = f'{OUT_DIR}/lai_v2.zarr'
!du -h $zarr_path

237M	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2018global/lai_v2/lai_v2.zarr/LAI
2.0K	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2018global/lai_v2/lai_v2.zarr/longitude
2.0K	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2018global/lai_v2/lai_v2.zarr/latitude
2.0K	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2018global/lai_v2/lai_v2.zarr/time
237M	/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2018global/lai_v2/lai_v2.zarr


In [48]:
v2 = xr.open_zarr(f'{OUT_DIR}/lai_v2.zarr').LAI

In [57]:
# v2[:,400,400].plot()

In [58]:
# v1[:,400,400].plot()

In [53]:
v1 = xr.open_zarr("/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2014global/lai/lai0213.zarr").LAI