# This Tutorial covers speeding up dataset loading with the intake catalog.
### It assumes you are using a large (or larger) ARE instance. If you are using a smaller ARE instance, some things may not work due to lack of computational resources

In [4]:
import intake # For the catalog
from dask.distributed import Client # Dask client config
import datetime # We'll use this to time some slow operations
catalog = intake.cat.access_nri
catalog

Unnamed: 0_level_0,model,description,realm,frequency,variable
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01deg_jra55_ryf_Control,{ACCESS-OM2-01},"{0.1° ACCESS-OM2 repeat year forcing control run for the simulations performed in Huguenin et al. (2024, GRL)}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, neutral, sfc_hflux_pme_on_nrho, total_ocean_evap, alvdf_ai_m, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, area_t, sea_levelsq, shear_m, grid_xt_ocean, area_u, total_ocean_melt, flat..."
01deg_jra55_ryf_ENFull,{ACCESS-OM2},"{0.1° ACCESS-OM2 El Níño run for the simulations performed in Huguenin et al. (2024, GRL)}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, area_t, sea_levelsq, shear_m, grid_xt_ocean, area_u, total_ocean_melt, flatn_ai_m, tau_y, grid_yu_..."
01deg_jra55_ryf_LNFull,{ACCESS-OM2},"{0.1° ACCESS-OM2 La Níña run for the simulations performed in Huguenin et al. (2024, GRL)}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, area_t, sea_levelsq, shear_m, grid_xt_ocean, area_u, total_ocean_melt, flatn_ai_m, tau_y, grid_yu_..."
01deg_jra55v13_ryf9091,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991)},"{ocean, seaIce}","{1mon, 1day, fx, 3mon, 3hr}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_easterlies_down10,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991) and zonal/meridional wind speed around Antarctica decreased by 10%.},"{ocean, seaIce}","{1day, fx, 1mon}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_easterlies_up10,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991) and zonal/meridional wind speed around Antarctica increased by 10%.},"{ocean, seaIce}","{1day, fx, 1mon}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_easterlies_up10_meridional,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991) and meridional wind speed around Antarctica increased by 10%.},"{ocean, seaIce}","{1day, fx, 1mon}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_easterlies_up10_zonal,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991) and zonal wind speed around Antarctica increased by 10%.},"{ocean, seaIce}","{1day, fx, 1mon}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_qian_wthmp,{ACCESS-OM2},"{Future perturbations with wind, thermal and meltwater forcing, branching off 01deg_jra55v13_ryf9091, as described in Li et al. 2023, https://www.nature.com/articles/s41586-023-05762-w}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean_melt, fl..."
01deg_jra55v13_ryf9091_qian_wthp,{ACCESS-OM2},"{Future perturbation with wind and thermal forcing, branching off 01deg_jra55v13_ryf9091, as described in Li et al. 2023, https://www.nature.com/articles/s41586-023-05762-w}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean_melt, fl..."


In [46]:
client = Client(threads_per_worker = 1)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45845 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /proxy/45845/status,

0,1
Dashboard: /proxy/45845/status,Workers: 7
Total threads: 7,Total memory: 32.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:36799,Workers: 7
Dashboard: /proxy/45845/status,Total threads: 7
Started: Just now,Total memory: 32.00 GiB

0,1
Comm: tcp://127.0.0.1:37055,Total threads: 1
Dashboard: /proxy/35593/status,Memory: 4.57 GiB
Nanny: tcp://127.0.0.1:39911,
Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-zoctoljv,Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-zoctoljv

0,1
Comm: tcp://127.0.0.1:34799,Total threads: 1
Dashboard: /proxy/43207/status,Memory: 4.57 GiB
Nanny: tcp://127.0.0.1:46265,
Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-flq_t4iz,Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-flq_t4iz

0,1
Comm: tcp://127.0.0.1:36089,Total threads: 1
Dashboard: /proxy/33231/status,Memory: 4.57 GiB
Nanny: tcp://127.0.0.1:43177,
Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-aipvcvfw,Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-aipvcvfw

0,1
Comm: tcp://127.0.0.1:41219,Total threads: 1
Dashboard: /proxy/42447/status,Memory: 4.57 GiB
Nanny: tcp://127.0.0.1:40349,
Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-fs1a5x7s,Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-fs1a5x7s

0,1
Comm: tcp://127.0.0.1:35049,Total threads: 1
Dashboard: /proxy/40919/status,Memory: 4.57 GiB
Nanny: tcp://127.0.0.1:35927,
Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-xhsmwtcb,Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-xhsmwtcb

0,1
Comm: tcp://127.0.0.1:33905,Total threads: 1
Dashboard: /proxy/41675/status,Memory: 4.57 GiB
Nanny: tcp://127.0.0.1:41625,
Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-ef6rmcs6,Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-ef6rmcs6

0,1
Comm: tcp://127.0.0.1:40563,Total threads: 1
Dashboard: /proxy/43293/status,Memory: 4.57 GiB
Nanny: tcp://127.0.0.1:44597,
Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-t6jym_ap,Local directory: /jobfs/136291823.gadi-pbs/dask-scratch-space/worker-t6jym_ap


In [3]:
datastore = catalog['01deg_jra55v13_ryf9091'].search(frequency='1mon',variable='u')
datastore

Unnamed: 0,unique
filename,1
file_id,1
path,920
filename_timestamp,0
frequency,1
start_date,920
end_date,920
variable,46
variable_long_name,42
variable_standard_name,11


In [4]:
%%timeit 
# Please don't run this - it can be very slow!
datastore.to_dask(xarray_open_kwargs = {'decode_timedelta' : False}) # We need this to avoid a bunch of annoying warnings.

17.5 s ± 399 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
ds = datastore.to_dask(xarray_open_kwargs = {'decode_timedelta' : False}) 

ds['u']

Unnamed: 0,Array,Chunk
Bytes,7.32 TiB,3.20 MiB
Shape,"(2760, 75, 2700, 3600)","(1, 7, 300, 400)"
Dask graph,2459160 chunks in 1841 graph layers,2459160 chunks in 1841 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 7.32 TiB 3.20 MiB Shape (2760, 75, 2700, 3600) (1, 7, 300, 400) Dask graph 2459160 chunks in 1841 graph layers Data type float32 numpy.ndarray",2760  1  3600  2700  75,

Unnamed: 0,Array,Chunk
Bytes,7.32 TiB,3.20 MiB
Shape,"(2760, 75, 2700, 3600)","(1, 7, 300, 400)"
Dask graph,2459160 chunks in 1841 graph layers,2459160 chunks in 1841 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## This is a pretty big dataset, but it would be nice if we could open it in less than 17-18 seconds. Lets see if we can do better.

- Step 1: Inspecting chunking.

__Chunking__

Chunking is core to how Dask, and by extension Xarray work. If we choose good chunks, we can often reduce the amount of work needed to do load an array.

Further Reading: 
- [Xarray Documentation](https://docs.xarray.dev/en/stable/user-guide/dask.html#dask-chunks)
- [Choosing good chunk sizes in Dask](https://blog.dask.org/2021/11/02/choosing-dask-chunk-sizes?utm_source=xarray-docs)


In [7]:
ds['u']

Unnamed: 0,Array,Chunk
Bytes,7.32 TiB,3.20 MiB
Shape,"(2760, 75, 2700, 3600)","(1, 7, 300, 400)"
Dask graph,2459160 chunks in 1841 graph layers,2459160 chunks in 1841 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 7.32 TiB 3.20 MiB Shape (2760, 75, 2700, 3600) (1, 7, 300, 400) Dask graph 2459160 chunks in 1841 graph layers Data type float32 numpy.ndarray",2760  1  3600  2700  75,

Unnamed: 0,Array,Chunk
Bytes,7.32 TiB,3.20 MiB
Shape,"(2760, 75, 2700, 3600)","(1, 7, 300, 400)"
Dask graph,2459160 chunks in 1841 graph layers,2459160 chunks in 1841 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


___
### We can see from the output above that we have _lots_ (2459160) chunks, each of which are _very_ small (3.20 MiB).
### What this means is that dask is almost certainly going to be spending lots of time concatenating very small chunks together, for no good reason.

- Since our xarray dataarray is 7.32TiB in total, and we have ~32GiB available, there is no way we can load the datasets without chunks.
- However, our dataarray is probably stored over a number of files. As a first pass, lets check how our dataset is structured on disk, and then try to load each file into a single chunk.
- We can see above we have 2760 timestamps. If we look at our dataset again, we see that it has 920 files - so one file for every 3 timestamps. This probably means we have 4 files per year of model output.

### As a rule of thumb, chunks of ~300MiB are a good starting place. Lets seem what happens if we tell dask to make each file into a single chunk.

### How can we tell dask to load a single chunk per file?

[`xr.open_dataset`](https://docs.xarray.dev/en/stable/generated/xarray.open_dataset.html) has a `chunks` argument, which lets us tell dask what chunking scheme to use for loading files.
`intake-esm` lets us access this with `xaray_open_kwargs`

- In this instance, we want to specify chunks *on a dimension by dimension basis* - so we'll need the dimension names.


In [8]:
ds['u'].dims

('time', 'st_ocean', 'yu_ocean', 'xu_ocean')

- We know we want one chunk per time slice, and we have 408 chunks, so we'll specify 408 chunks for time
- How do we know how to set the chunk size for all the other dimensions?

It turns out it's not necessary - we can use `-1` to represent 'the entire dimension'. So our chunking dict will look like this:

In [20]:
%%timeit
import datetime
chunks_dict = {
    'time' : 920,
    'st_ocean' : -1,
    'yu_ocean' : -1,
    'xu_ocean' : -1,
}

datastore.to_dask(xarray_open_kwargs={'chunks' : chunks_dict, 'decode_timedelta' : False})

14.8 s ± 345 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
chunks_dict = {
    'time' : 920,
    'st_ocean' : -1,
    'yu_ocean' : -1,
    'xu_ocean' : -1,
}
ds_chunked = datastore.to_dask(xarray_open_kwargs={'chunks' : chunks_dict, 'decode_timedelta' : False})

ds_chunked['u']

Unnamed: 0,Array,Chunk
Bytes,7.32 TiB,8.15 GiB
Shape,"(2760, 75, 2700, 3600)","(3, 75, 2700, 3600)"
Dask graph,920 chunks in 1841 graph layers,920 chunks in 1841 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 7.32 TiB 8.15 GiB Shape (2760, 75, 2700, 3600) (3, 75, 2700, 3600) Dask graph 920 chunks in 1841 graph layers Data type float32 numpy.ndarray",2760  1  3600  2700  75,

Unnamed: 0,Array,Chunk
Bytes,7.32 TiB,8.15 GiB
Shape,"(2760, 75, 2700, 3600)","(3, 75, 2700, 3600)"
Dask graph,920 chunks in 1841 graph layers,920 chunks in 1841 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [22]:
%%timeit 
# That made things a bit faster - but now our chunks are much too big. Lets split up on `xu_ocean` too - our chunks are about ~30x too big. 
chunks_dict = {
    'time' : 920,
    'st_ocean' : -1,
    'yu_ocean' : -1,
    'xu_ocean' : 120,
}

datastore.to_dask(xarray_open_kwargs={'chunks' : chunks_dict, 'decode_timedelta' : False})

  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)
  ds = xr.open_dataset(url, **xarray_open_kwargs)


15.2 s ± 401 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
# Note the warnings above - if you specify chunks and don't make a good choice, you can make the performance worse, not better 
chunks_dict = {
    'time' : 920,
    'st_ocean' : -1,
    'yu_ocean' : -1,
    'xu_ocean' : 120,
}

ds_chunked = datastore.to_dask(xarray_open_kwargs={'chunks' : chunks_dict, 'decode_timedelta' : False})

ds_chunked['u']

Unnamed: 0,Array,Chunk
Bytes,7.32 TiB,278.09 MiB
Shape,"(2760, 75, 2700, 3600)","(3, 75, 2700, 120)"
Dask graph,27600 chunks in 1841 graph layers,27600 chunks in 1841 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 7.32 TiB 278.09 MiB Shape (2760, 75, 2700, 3600) (3, 75, 2700, 120) Dask graph 27600 chunks in 1841 graph layers Data type float32 numpy.ndarray",2760  1  3600  2700  75,

Unnamed: 0,Array,Chunk
Bytes,7.32 TiB,278.09 MiB
Shape,"(2760, 75, 2700, 3600)","(3, 75, 2700, 120)"
Dask graph,27600 chunks in 1841 graph layers,27600 chunks in 1841 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [30]:
# That sped things up a little bit - but now lets look at what happens when we take a mean. For brevity, we'll just look at the first time step.
# We could make this faster by selecting the time step when we look at the datastore.

t0 = datetime.datetime.utcnow()
mean_nochunks = datastore.to_dask(xarray_open_kwargs={'decode_timedelta' : False}).isel(time=0).mean(dim=['st_ocean','yu_ocean','xu_ocean']).compute()
t1 = datetime.datetime.utcnow()

dt = t1 - t0

nochunk_str = f"No chunking: took ~= {dt.seconds // 60} minutes, {dt.seconds % 60 } seconds to compute a mean = {mean_nochunks}"

t0 = datetime.datetime.utcnow()
mean_chunks = datastore.to_dask(xarray_open_kwargs={'chunks' : chunks_dict, 'decode_timedelta' : False}).isel(time=0).mean(dim=['st_ocean','yu_ocean','xu_ocean']).compute()
t1 = datetime.datetime.utcnow()

dt = t1 - t0

chunk_str = f"~300MiB chunks: took ~= {dt.seconds // 60} minutes, {dt.seconds % 60 } seconds to compute a mean = {mean_chunks}"

print(nochunk_str)
print(chunk_str)

  ds = xr.open_dataset(url, **xarray_open_kwargs)


No chunking: took ~= 0 minutes, 29 seconds to compute a mean = <xarray.Dataset> Size: 12B
Dimensions:  ()
Coordinates:
    time     object 8B 1950-01-16 12:00:00
Data variables:
    u        float32 4B 0.007727
~300MiB chunks: took ~= 0 minutes, 47 seconds to compute a mean = <xarray.Dataset> Size: 12B
Dimensions:  ()
Coordinates:
    time     object 8B 1950-01-16 12:00:00
Data variables:
    u        float32 4B 0.007727


# As you can see, getting the chunks right can be more of an art than a science.

- We followed the 300MiB chunk rule of thumb above, and slowed down loading our dataset - so the warnings about degrading performance were right.
- You can also try `'chunks' : 'auto'` to let xarray decide - there's also a chance this will speed things up.


In [32]:
%%timeit 
mean_chunks = datastore.to_dask(xarray_open_kwargs={'chunks' : 'auto', 'decode_timedelta' : False}).isel(time=0).mean(dim=['st_ocean','yu_ocean','xu_ocean']).compute()

24.3 s ± 347 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Can you improve on what we've done here? Things to try:
1. Select the data we want to open at the datastore stage, not after opening the dataset.
2. Play around with chunking. What's the best you can do? For more info on chunking, see [here](https://docs.xarray.dev/en/stable/generated/xarray.open_dataset.html#xarray-open-dataset) and [here](https://docs.dask.org/en/latest/array-chunks.html)

In [None]:
# Exercise 1.
%%timeit
datastore.search(...).to_dask(xarray_open_kwargs={'chunks' : 'auto', 'decode_timedelta' : False}).isel(time=0).mean(dim=['st_ocean','yu_ocean','xu_ocean']).compute()

In [None]:
# Exercise 2.
%%timeit
datastore.to_dask(xarray_open_kwargs={'chunks' : ... 'decode_timedelta' : False}).isel(time=0).mean(dim=['st_ocean','yu_ocean','xu_ocean']).compute()

## What about a dataset where chunking isn't really the problem?

In [5]:
datastore = catalog['025deg_era5_ryf'].search(frequency='1mon',file_id='iceh_XXXX_XX',variable='aicen_m')
datastore

Unnamed: 0,unique
filename,408
file_id,1
path,408
filename_timestamp,408
frequency,1
start_date,408
end_date,408
variable,88
variable_long_name,88
variable_standard_name,1


In [6]:
# Please don't run this - it can be super slow!
# I've used the datetime module rather than %time or %%timeit as they take even longer!
# However, profiling like this can be very inaccurate: see eg. https://github.com/Kai-Striega/EuroSciPy-2023-Speech/blob/main/EuroSciPy_Speech.pdf
# for a detailed discussion on profiling.
t0 = datetime.datetime.utcnow()
ds = datastore.to_dask()
t1 = datetime.datetime.utcnow()

dt = t1 - t0

print(f"took ~= {dt.seconds // 60} minutes, {dt.seconds % 60 } seconds to load")
ds

took ~= 4 minutes, 13 seconds to load


Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,1.48 MiB
Shape,"(1080, 1440)","(540, 720)"
Dask graph,4 chunks in 2037 graph layers,4 chunks in 2037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 1.48 MiB Shape (1080, 1440) (540, 720) Dask graph 4 chunks in 2037 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,1.48 MiB
Shape,"(1080, 1440)","(540, 720)"
Dask graph,4 chunks in 2037 graph layers,4 chunks in 2037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,1.48 MiB
Shape,"(1080, 1440)","(540, 720)"
Dask graph,4 chunks in 2037 graph layers,4 chunks in 2037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 1.48 MiB Shape (1080, 1440) (540, 720) Dask graph 4 chunks in 2037 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,1.48 MiB
Shape,"(1080, 1440)","(540, 720)"
Dask graph,4 chunks in 2037 graph layers,4 chunks in 2037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,1.48 MiB
Shape,"(1080, 1440)","(540, 720)"
Dask graph,4 chunks in 2037 graph layers,4 chunks in 2037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 1.48 MiB Shape (1080, 1440) (540, 720) Dask graph 4 chunks in 2037 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,1.48 MiB
Shape,"(1080, 1440)","(540, 720)"
Dask graph,4 chunks in 2037 graph layers,4 chunks in 2037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,1.48 MiB
Shape,"(1080, 1440)","(540, 720)"
Dask graph,4 chunks in 2037 graph layers,4 chunks in 2037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 1.48 MiB Shape (1080, 1440) (540, 720) Dask graph 4 chunks in 2037 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,1.48 MiB
Shape,"(1080, 1440)","(540, 720)"
Dask graph,4 chunks in 2037 graph layers,4 chunks in 2037 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20 B 20 B Shape (5,) (5,) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",5  1,

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,1.48 MiB
Shape,"(408, 5, 1080, 1440)","(1, 1, 540, 720)"
Dask graph,8160 chunks in 817 graph layers,8160 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.82 GiB 1.48 MiB Shape (408, 5, 1080, 1440) (1, 1, 540, 720) Dask graph 8160 chunks in 817 graph layers Data type float32 numpy.ndarray",408  1  1440  1080  5,

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,1.48 MiB
Shape,"(408, 5, 1080, 1440)","(1, 1, 540, 720)"
Dask graph,8160 chunks in 817 graph layers,8160 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [7]:
# Like we did above, lets try to set one chunk per file to speed things up.
t0 = datetime.datetime.utcnow()
ds = datastore.to_dask(xarray_open_kwargs={'chunks' : {'time' :408 , 'nc' : -1, 'ni' : 1440, 'nj' : 1080}})
t1 = datetime.datetime.utcnow()

dt = t1 - t0

print(f"took ~= {dt.seconds // 60} minutes, {dt.seconds % 60 } seconds to load")
ds

took ~= 4 minutes, 55 seconds to load


Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20 B 20 B Shape (5,) (5,) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",5  1,

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,29.66 MiB
Shape,"(408, 5, 1080, 1440)","(1, 5, 1080, 1440)"
Dask graph,408 chunks in 817 graph layers,408 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.82 GiB 29.66 MiB Shape (408, 5, 1080, 1440) (1, 5, 1080, 1440) Dask graph 408 chunks in 817 graph layers Data type float32 numpy.ndarray",408  1  1440  1080  5,

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,29.66 MiB
Shape,"(408, 5, 1080, 1440)","(1, 5, 1080, 1440)"
Dask graph,408 chunks in 817 graph layers,408 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


### Unfortunately, that didn't seem to help much - it might have even made things a bit slower. 
- So what is the issue?

It turns our that xarray is checking that all our coordinates are consistent. Doing that with the 2D arrays `(ni,nj)` can be really quite slow. Fortunately, we have options to turn these checks off too, if we are confident we don't need them. In this instance, they come from a consistent model grid, so we know we can get rid of them.

#### We don't use `xarray_open_kwargs` for this: we use `xarray_combine_by_kwargs`

Lets see if we can beat four minutes...
___
Step 1: Lets concatenate together the minimal set of variables

In [8]:
t0 = datetime.datetime.utcnow()
datastore.to_dask(
    xarray_combine_by_coords_kwargs={ 
        'data_vars': 'minimal',
        'coords': 'minimal'
    }
)
t1 = datetime.datetime.utcnow()

dt = t1 - t0

print(f"took ~= {dt.seconds // 60} minutes, {dt.seconds % 60 } seconds to load")
ds

took ~= 11 minutes, 16 seconds to load


Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20 B 20 B Shape (5,) (5,) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",5  1,

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,29.66 MiB
Shape,"(408, 5, 1080, 1440)","(1, 5, 1080, 1440)"
Dask graph,408 chunks in 817 graph layers,408 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.82 GiB 29.66 MiB Shape (408, 5, 1080, 1440) (1, 5, 1080, 1440) Dask graph 408 chunks in 817 graph layers Data type float32 numpy.ndarray",408  1  1440  1080  5,

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,29.66 MiB
Shape,"(408, 5, 1080, 1440)","(1, 5, 1080, 1440)"
Dask graph,408 chunks in 817 graph layers,408 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


#### So this actually slowed things down pretty substantially - that's not ideal!

Step 2: Let's set the `compat` flag to `override`. This skips a bunch of checks that slow things down a bunch.
Note however: if we don't set `'datavars' : 'minimal'` and `'coords' : 'minimal'`, this can throw an error.


In [9]:
t0 = datetime.datetime.utcnow()
datastore.to_dask(
    xarray_combine_by_coords_kwargs={ 
        'compat' : 'override',
        'data_vars': 'minimal',
        'coords': 'minimal'
    }
)
t1 = datetime.datetime.utcnow()

dt = t1 - t0

print(f"took ~= {dt.seconds // 60} minutes, {dt.seconds % 60 } seconds to load")
ds

took ~= 0 minutes, 12 seconds to load


Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20 B 20 B Shape (5,) (5,) Dask graph 1 chunks in 2035 graph layers Data type float32 numpy.ndarray",5  1,

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2035 graph layers,1 chunks in 2035 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,29.66 MiB
Shape,"(408, 5, 1080, 1440)","(1, 5, 1080, 1440)"
Dask graph,408 chunks in 817 graph layers,408 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.82 GiB 29.66 MiB Shape (408, 5, 1080, 1440) (1, 5, 1080, 1440) Dask graph 408 chunks in 817 graph layers Data type float32 numpy.ndarray",408  1  1440  1080  5,

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,29.66 MiB
Shape,"(408, 5, 1080, 1440)","(1, 5, 1080, 1440)"
Dask graph,408 chunks in 817 graph layers,408 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


## That made a huge difference - we've gone down from 4 minutes to 12 seconds. Can we do better by setting the chunking too now?

In [11]:
%%timeit
# Finally, lets combine it all, and see how fast we can get!

chunks_dict = {
    'time' : 408,
    'nc' : -1,
    'nj' : -1,
    'ni' : -1,
}

datastore.to_dask(
    xarray_open_kwargs={'chunks' : chunks_dict},
    xarray_combine_by_coords_kwargs={ 'compat' : 'override', 'data_vars': 'minimal', 'coords': 'minimal'}
)

10.1 s ± 292 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
chunks_dict = {
    'time' : 408,
    'nc' : -1,
    'nj' : -1,
    'ni' : -1,
}

ds = datastore.to_dask(
    xarray_open_kwargs={'chunks' : chunks_dict},
    xarray_combine_by_coords_kwargs={ 'compat' : 'override', 'data_vars': 'minimal', 'coords': 'minimal'}
)

ds['aicen_m']

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,29.66 MiB
Shape,"(408, 5, 1080, 1440)","(1, 5, 1080, 1440)"
Dask graph,408 chunks in 817 graph layers,408 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.82 GiB 29.66 MiB Shape (408, 5, 1080, 1440) (1, 5, 1080, 1440) Dask graph 408 chunks in 817 graph layers Data type float32 numpy.ndarray",408  1  1440  1080  5,

Unnamed: 0,Array,Chunk
Bytes,11.82 GiB,29.66 MiB
Shape,"(408, 5, 1080, 1440)","(1, 5, 1080, 1440)"
Dask graph,408 chunks in 817 graph layers,408 chunks in 817 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 5.93 MiB 5.93 MiB Shape (1080, 1440) (1080, 1440) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",1440  1080,

Unnamed: 0,Array,Chunk
Bytes,5.93 MiB,5.93 MiB
Shape,"(1080, 1440)","(1080, 1440)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 20 B 20 B Shape (5,) (5,) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",5  1,

Unnamed: 0,Array,Chunk
Bytes,20 B,20 B
Shape,"(5,)","(5,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


### So, for this dataset, we can go from 4 minutes to ten seconds with some thought - or 4 minutes to 12 seconds using the `xarray_combine_by_coords_kwargs`.

TLDR; 
- If your dataset is taking a long time to load, start by adding `xarray_combine_by_coords_kwargs={ 'compat' : 'override', 'data_vars': 'minimal', 'coords': 'minimal'}` to your `datastore.to_dask()` call.
- Chunking may be able to improve things further - but it might also make it worse. It is more likely to be a source of issues once you start working with, rather than just loading, the data.
- Subsetting to the minimal dataset you want to open, *before* you open it in xarray with `.to_dask()`, will make a massive difference to load times.
- By using `xarray_combine_by_coords_kwargs` and `xarray_open_kwargs`, you can achieve a lot of control over how xarray opens your dataset - see [combine by coords](https://docs.xarray.dev/en/stable/generated/xarray.combine_by_coords.html#xarray.combine_by_coords) and [open dataset](https://docs.xarray.dev/en/stable/generated/xarray.open_dataset.html#xarray-open-dataset) for all the options.
___

## Exercises

Lets go back to our original dataset, and try to efficiently load some data.

1. Lets try loading the daily data for the first year: first by selecting only the data for the first year, and secondly by opening all the data as efficiently as possible. Which works better?
2. Lets plot the average of top grid cell temperature over the whole dataset. Now, can we make it faster using chunks?

In [25]:
catalog

Unnamed: 0_level_0,model,description,realm,frequency,variable
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01deg_jra55_ryf_Control,{ACCESS-OM2-01},"{0.1° ACCESS-OM2 repeat year forcing control run for the simulations performed in Huguenin et al. (2024, GRL)}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, neutral, sfc_hflux_pme_on_nrho, total_ocean_evap, alvdf_ai_m, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, area_t, sea_levelsq, shear_m, grid_xt_ocean, area_u, total_ocean_melt, flat..."
01deg_jra55_ryf_ENFull,{ACCESS-OM2},"{0.1° ACCESS-OM2 El Níño run for the simulations performed in Huguenin et al. (2024, GRL)}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, area_t, sea_levelsq, shear_m, grid_xt_ocean, area_u, total_ocean_melt, flatn_ai_m, tau_y, grid_yu_..."
01deg_jra55_ryf_LNFull,{ACCESS-OM2},"{0.1° ACCESS-OM2 La Níña run for the simulations performed in Huguenin et al. (2024, GRL)}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, area_t, sea_levelsq, shear_m, grid_xt_ocean, area_u, total_ocean_melt, flatn_ai_m, tau_y, grid_yu_..."
01deg_jra55v13_ryf9091,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991)},"{ocean, seaIce}","{1mon, 1day, fx, 3mon, 3hr}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_easterlies_down10,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991) and zonal/meridional wind speed around Antarctica decreased by 10%.},"{ocean, seaIce}","{1day, fx, 1mon}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_easterlies_up10,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991) and zonal/meridional wind speed around Antarctica increased by 10%.},"{ocean, seaIce}","{1day, fx, 1mon}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_easterlies_up10_meridional,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991) and meridional wind speed around Antarctica increased by 10%.},"{ocean, seaIce}","{1day, fx, 1mon}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_easterlies_up10_zonal,{ACCESS-OM2-01},{0.1 degree ACCESS-OM2 global model configuration with JRA55-do v1.3 RYF9091 repeat year forcing (May 1990 to Apr 1991) and zonal wind speed around Antarctica increased by 10%.},"{ocean, seaIce}","{1day, fx, 1mon}","{kmt, uocn_m, neutral, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean..."
01deg_jra55v13_ryf9091_qian_wthmp,{ACCESS-OM2},"{Future perturbations with wind, thermal and meltwater forcing, branching off 01deg_jra55v13_ryf9091, as described in Li et al. 2023, https://www.nature.com/articles/s41586-023-05762-w}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean_melt, fl..."
01deg_jra55v13_ryf9091_qian_wthp,{ACCESS-OM2},"{Future perturbation with wind and thermal forcing, branching off 01deg_jra55v13_ryf9091, as described in Li et al. 2023, https://www.nature.com/articles/s41586-023-05762-w}","{ocean, seaIce}","{1mon, fx}","{kmt, uocn_m, alvdf_ai_m, total_ocean_evap, total_ocean_swflx, alvdr_ai_m, tx_trans_rho, surface_salt, area_t, sea_levelsq, shear_m, grid_xt_ocean, u_dot_grad_vert_pv, area_u, total_ocean_melt, fl..."


In [None]:
datastore = catalog['01deg_jra55v13_ryf9091'].search(frequency='1day',variable='u')

In [None]:
t0 = datetime.datetime.utcnow()

datastore.search(...).to_dask() ### Make cahnges here

t1 = datetime.datetime.utcnow()

dt = t1 - t0

print(f"took ~= {dt.seconds // 60} minutes, {dt.seconds % 60 } seconds to load")

Unnamed: 0,unique
filename,13
file_id,2
path,473
filename_timestamp,12
frequency,1
start_date,472
end_date,472
variable,11
variable_long_name,11
variable_standard_name,2


In [None]:
t0 = datetime.datetime.utcnow()

datastore.to_dask(...) ### Make changes here

t1 = datetime.datetime.utcnow()

dt = t1 - t0

print(f"took ~= {dt.seconds // 60} minutes, {dt.seconds % 60 } seconds to load")

In [None]:
datastore = catalog['01deg_jra55v13_ryf9091'].search(variable='temp',frequency='1mon')
datastore

# Now let's use chunking and combining cordinates to try to speed up our plot


# datastore.to_dask(xarray_open_kwargs={'decode_timedelta' : False, 'chunks' :  ...}, xarray_combine_by_kwargs = { ...} ).mean(dim='time').isel(
datastore.to_dask(xarray_open_kwargs={'decode_timedelta' : False})['temp'].mean(dim='time').isel(st_ocean=0).plot()
# ^ This will take forever, but by being clever about how we load the data and using the tricks above, we can make it quite a bit faster.

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
