In [1]:
%matplotlib inline
import xarray as xr
import numpy as np 
import os 

import intake
import zarr 
import gcsfs
from rechunker import rechunk

In [2]:
import dask
import dask.array as da
import dask.distributed as dd
import rhg_compute_tools.kubernetes as rhgk
from dask.diagnostics import ProgressBar

In [3]:
# request a cluster through the rhg_compute_tools kubernetes helper; the call
# returns a pair that is unpacked into the client and the cluster object
client, cluster = rhgk.get_standard_cluster()

In [4]:
# display the cluster object (rendered as a widget in the notebook output)
cluster

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

Pull in some CMIP6 data.

In [5]:
# open the Pangeo CMIP6 catalog and narrow it down to daily tasmax
# from the ScenarioMIP ssp370 experiment
col = intake.open_esm_datastore(
    "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"
)
cat = col.search(
    activity_id='ScenarioMIP',
    experiment_id='ssp370',
    table_id='day',
    variable_id='tasmax',
)

In [6]:
# access the data and do some cleanup: keep the first ensemble member and drop
# the leftover `height` and `member_id` coordinates
ds_model = (
    cat['ScenarioMIP.NOAA-GFDL.GFDL-ESM4.ssp370.day.gr1']
    .to_dask()
    .isel(member_id=0)
    .squeeze(drop=True)
    .drop_vars(['height', 'member_id'])
)

# convert longitudes from the 0..360 convention to -180..180.
# The values are replaced through assign_coords rather than by mutating
# `lon.values` in place, and the axis is then sorted so that it is monotonic
# whatever the grid size is (a fixed roll only yields a monotonic axis when the
# offset equals the number of wrapped points, which is not 72 on a 288-point grid).
lon_wrapped = ds_model.lon.where(ds_model.lon <= 180, ds_model.lon - 360)
ds_model = ds_model.assign_coords(lon=lon_wrapped).sortby('lon')

In [7]:
# GCS filesystem handle, authenticated with the token file below
fs = gcsfs.GCSFileSystem(token='/opt/gcsfuse_tokens/impactlab-data.json')

# location of the zarr copy of the model output that rechunker reads from
store_filename = 'gs://impactlab-data/climate/source_data/GFDL-ESM4.ssp370.zarr'
store = fs.get_mapper(store_filename, check=False)

# only write the store when it is not there yet, so re-running the cell
# does not redo the upload
store_already_written = fs.exists(store_filename)
if not store_already_written:
    ds_model.to_zarr(store, mode="w", consolidated=True)

In [None]:
# load the CMIP6 zarr store as an xarray dataset for inspection
# (note: MUST use `store`, the mapper built from `store_filename` in the cell above)
ds_reloaded = xr.open_zarr(store)

In [8]:
# load the CMIP6 zarr store as a zarr group; this is the version that rechunker needs.
# It is opened read-only and expects consolidated metadata, which the
# to_zarr(..., consolidated=True) call writes when the store is created.
source_group = zarr.open_consolidated(store, mode='r')
# print the group layout (array names, shapes and dtypes)
print(source_group.tree())

/
 ├── lat (180,) float64
 ├── lon (288,) float64
 ├── tasmax (31390, 180, 288) float32
 └── time (31390,) int64


In [9]:
# pick the tasmax array out of the group; this is the array passed to rechunk below
source_array = source_group['tasmax']

Use the `rechunker` package to rechunk the CMIP6 model output from chunks along time to chunks in space (each target chunk spans the full time axis).

In [38]:
# target layout: each chunk spans the full time axis and a 30 x 30 lat/lon tile
target_chunks = {'time': len(source_group['time']), 'lat': 30, 'lon': 30}

# memory budget handed to rechunker, as an integer number of bytes (specifying it
# as a string was not working, reason unknown). It is set to the size of exactly
# one target chunk and derived from the chunk shape and dtype rather than
# hard-coded; for this store that is 31390 * 30 * 30 * 4 bytes = 113004000.
target_chunk_elems = target_chunks['time'] * target_chunks['lat'] * target_chunks['lon']
max_mem = target_chunk_elems * source_array.dtype.itemsize

# intermediate store used by rechunker and the final rechunked store
tmp_storename = 'gs://impactlab-data/climate/source_data/GFDL-ESM4.ssp370_tmpchunked.zarr'
target_storename = 'gs://impactlab-data/climate/source_data/GFDL-ESM4.ssp370_spacechunks.zarr'

temp_store = fs.get_mapper(tmp_storename, create=True)
target_store = fs.get_mapper(target_storename, create=True)

# build the rechunking plan (nothing is copied until the plan is executed)
# and display it
array_plan = rechunk(source_array, target_chunks, max_mem, target_store, temp_store=temp_store)
array_plan

In [39]:
# execute the rechunking plan inside dask's ProgressBar context
# NOTE(review): dask.diagnostics.ProgressBar is documented for the local
# schedulers; with the distributed client created above, progress may only be
# visible on the cluster dashboard - confirm.
with ProgressBar():
    result = array_plan.execute()

Check the target array to confirm that the output was rechunked.

In [40]:
# open the target array read-only so its chunking can be checked against
# what we specified above
source_array_rechunked = zarr.open(target_store, mode='r')
# print the array summary (type, shape, dtype, access mode)
print(source_array_rechunked)

<zarr.core.Array (31390, 180, 288) float32 read-only>


In [41]:
# check chunks: expected to match target_chunks, i.e. (full time axis, 30, 30)
source_array_rechunked.chunks

(31390, 30, 30)

In [42]:
# save as a zarr group (rechunking as a group didn't work, so here we create a group
# from the array so that the array can be read in by xarray)
grp_filename = 'gs://impactlab-data/climate/source_data/GFDL-ESM4.ssp370_spacechunks_group.zarr'
grp_store = fs.get_mapper(grp_filename, check=False)

if not fs.exists(grp_filename):
    # A zarr Array has no `to_zarr` method, so the rechunked array is wrapped
    # lazily with dask (keeping its on-disk chunking) and given the dimension
    # names, coordinates and attributes of the source variable; xarray then
    # writes the group together with the metadata it needs to read it back.
    tasmax_source = xr.open_zarr(store)['tasmax']
    tasmax_rechunked = xr.DataArray(
        da.from_zarr(source_array_rechunked),
        dims=tasmax_source.dims,
        coords=tasmax_source.coords,
        attrs=tasmax_source.attrs,
        name='tasmax',
    )
    tasmax_rechunked.to_dataset().to_zarr(grp_store, consolidated=True, mode="w")

In [43]:
# check to be sure that xarray can read it
# (a fresh mapper is built for grp_filename rather than reusing grp_store)
ds_rechunked = xr.open_zarr(fs.get_mapper(grp_filename, check=False))

In [44]:
# display the reloaded object to inspect its shape and chunking
ds_rechunked

Unnamed: 0,Array,Chunk
Bytes,6.51 GB,113.00 MB
Shape,"(31390, 180, 288)","(31390, 30, 30)"
Count,61 Tasks,60 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.51 GB 113.00 MB Shape (31390, 180, 288) (31390, 30, 30) Count 61 Tasks 60 Chunks Type float32 numpy.ndarray",288  180  31390,

Unnamed: 0,Array,Chunk
Bytes,6.51 GB,113.00 MB
Shape,"(31390, 180, 288)","(31390, 30, 30)"
Count,61 Tasks,60 Chunks
Type,float32,numpy.ndarray
