In [65]:
# NOTE(review): the `.rio` accessor used in the cells below comes from
# rioxarray, which is not imported explicitly in the visible cells -- consider
# adding `import rioxarray` here.
import netCDF4
import numpy as np
import xarray as xr
from rasterio.warp import reproject, Resampling

# Canopy height data: from GeoTIFF to Zarr

The canopy height dataset is read here from a GeoTIFF file. It was reprojected and downloaded from Google Earth Engine (GEE), and is then reprojected onto the ERA5-Land grid in CRIB. It currently covers only Europe; other areas still need to be downloaded from GEE.

## Input variables

In [66]:
# Year used to build the `{year}global` component of the input/output paths.
year = 2015

In [67]:
# Base directory from which the data paths are built.
ROOT_DIR = '/gpfs/work2/0/ttse0619'

# Input: canopy height raster (Europe) stored under the selected year.
hc_PATH = ''.join([
    f'{ROOT_DIR}/qianqian/global_data_Qianqian/',
    f'1input_data/{year}global/canopy_height/canopy_height_11kmEurope20230921.tif',
])

# Output: Zarr store written at the end of this notebook.
OUT_PATH = ''.join([
    f'{ROOT_DIR}/qianqian/global_data_Qianqian/',
    f'1input_data/{year}global/canopy_height/hc.zarr',
])

In [68]:
# ERA5-Land Zarr store for the same year; its grid is the reprojection target.
ERA5_PATH = (
    f'{ROOT_DIR}/francesco/Projects/EcoExtreML/Data/'
    + f'1input_data/{year}global/era5land/era5land.zarr'
)

## Converting the dataset

We start by loading the ERA5-land Zarr dataset, which we will use as the target for reprojection: 

In [69]:
# Open the ERA5-Land Zarr store whose grid is used as the reprojection target.
ds = xr.open_zarr(ERA5_PATH)

In [77]:
# Attach a CRS to the dataset so the `.rio` accessor can georeference it.
ds = ds.rio.write_crs('WGS84')

# Take the first time step of the `sp` variable as a 2-D template grid;
# `drop=True` also removes the now-scalar `time` coordinate.
target = ds.sp.isel({'time': 0}, drop=True)

In [78]:
# Inspect the template grid (shape and chunking) before converting longitudes.
target

Flushing oldest 4 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


Unnamed: 0,Array,Chunk
Bytes,24.73 MiB,244.14 kiB
Shape,"(1801, 3600)","(250, 250)"
Dask graph,120 chunks in 3 graph layers,120 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 24.73 MiB 244.14 kiB Shape (1801, 3600) (250, 250) Dask graph 120 chunks in 3 graph layers Data type float32 numpy.ndarray",3600  1801,

Unnamed: 0,Array,Chunk
Bytes,24.73 MiB,244.14 kiB
Shape,"(1801, 3600)","(250, 250)"
Dask graph,120 chunks in 3 graph layers,120 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [83]:
# Convert longitudes from [0, 360] to [-180, 180], then put the columns in
# ascending longitude order so the grid is monotonic again.
target = target.assign_coords(
    longitude=(target['longitude'] + 180) % 360 - 180
).sortby('longitude')

In [84]:
# Inspect the template grid again after the longitude conversion and sort.
target

Unnamed: 0,Array,Chunk
Bytes,24.73 MiB,244.14 kiB
Shape,"(1801, 3600)","(250, 250)"
Dask graph,128 chunks in 4 graph layers,128 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 24.73 MiB 244.14 kiB Shape (1801, 3600) (250, 250) Dask graph 128 chunks in 4 graph layers Data type float32 numpy.ndarray",3600  1801,

Unnamed: 0,Array,Chunk
Bytes,24.73 MiB,244.14 kiB
Shape,"(1801, 3600)","(250, 250)"
Dask graph,128 chunks in 4 graph layers,128 chunks in 4 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [17]:
# Show the resolved input path as a quick sanity check.
hc_PATH

'/gpfs/work2/0/ttse0619/qianqian/global_data_Qianqian/1input_data/2015global/canopy_height/canopy_height_11kmEurope20230921.tif'

We now open the canopy height GeoTIFF, drop its singleton band dimension, and sort it by its x/y coordinates:

In [45]:
# Load the raster's `band_data` variable, collapse the `band` dimension, and
# sort by x then y so both spatial coordinates are ascending.
hc = (
    xr.open_dataset(hc_PATH)['band_data']
    .squeeze('band')  # drop band dimension
    .sortby(['x', 'y'])
)

In [46]:
# Inspect the canopy height array as loaded.
hc

In [47]:
# Declare NaN as the nodata value and tag both rasters with the same
# geographic CRS so that rioxarray can match one grid to the other.
hc = hc.rio.write_nodata(np.nan).rio.write_crs('EPSG:4326')
target = target.rio.write_crs('EPSG:4326')

# Resample canopy height onto the target grid using average resampling.
hc_reprojected = hc.rio.reproject_match(
    target, resampling=Resampling.average, nodata=np.nan
)

# Snap the output coordinates to the target's own values. Re-assigning
# hc_reprojected's coordinates to themselves (as done previously) is a no-op;
# taking them from the target guards against floating-point mismatches
# between the two grids.
hc_reprojected = hc_reprojected.assign_coords(
    x=target['longitude'].values,
    y=target['latitude'].values,
)

# fix naming of coordinates
hc_reprojected = hc_reprojected.rename(x='longitude', y='latitude')
hc_reprojected

<xarray.DataArray 'band_data' (latitude: 1801, longitude: 3600)>
array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=float32)
Coordinates:
  * longitude    (longitude) float32 -180.0 -179.9 -179.8 ... 179.7 179.8 179.9
  * latitude     (latitude) float32 90.0 89.9 89.8 89.7 ... -89.8 -89.9 -90.0
    band         int64 1
    spatial_ref  int64 0
Attributes:
    AREA_OR_POINT:  Area
    long_name:      b1
    _FillValue:     nan


In [48]:
# Mean of the reprojected field, as a quick sanity check.
hc_reprojected.mean()

In [49]:
# (-180;180) -> (0;360)
# Only the longitude labels change here; the columns keep their current order
# and are reordered separately afterwards.
# NOTE(review): the longitudes are float32, so this round trip may not
# reproduce the original ERA5-Land values bit-for-bit -- confirm before
# relying on exact coordinate alignment.
hc_reprojected = hc_reprojected.assign_coords(
    {'longitude': (hc_reprojected['longitude'] + 360) % 360}
)

In [50]:
# Inspect the array after relabelling the longitudes.
hc_reprojected

In [51]:
# Roll the array (data and coordinates together) so that the smallest
# longitude comes first. The shift is taken from the position of the minimum
# longitude rather than from a count of values >= 180, so re-running this
# cell on an already ordered array is a no-op instead of rolling a second time.
nroll = int(hc_reprojected.longitude.values.argmin())
hc_reprojected = hc_reprojected.roll(longitude=-nroll, roll_coords=True)

In [52]:
# Mean after the roll; presumably expected to equal the mean computed before
# the longitude shift, since only the ordering changed -- confirm.
hc_reprojected.mean()


Flushing oldest 4 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


In [53]:
# Inspect the array after the roll.
hc_reprojected

In [54]:
# Mean of the source raster, for comparison with the reprojected mean.
hc.mean().compute()

In [62]:
# Wrap the array in a Dataset with a single variable named 'hc'.
# NOTE(review): this rebinds `hc_reprojected` from a DataArray to a Dataset,
# so re-running this cell on its own will presumably fail -- confirm, and
# consider a separate name for the Dataset.
hc_reprojected = hc_reprojected.to_dataset(name='hc')

In [63]:
# Chunk in 250 x 250 tiles (the chunk shape shown for the ERA5-Land target
# above), then write the store, overwriting any existing one at OUT_PATH.
hc_reprojected = hc_reprojected.chunk({'longitude': 250, 'latitude': 250})
hc_reprojected.to_zarr(OUT_PATH, mode='w')

<xarray.backends.zarr.ZarrStore at 0x1544e751af90>

In [64]:
# Read the store back and compute the mean of `hc` to check the write.
xr.open_zarr(OUT_PATH).hc.mean().compute()