In [1]:
import xarray as xr

In [2]:
cd ACS/Tools/Dataset_Finder

/home/565/ag0738/ACS/Tools/Dataset_Finder


In [3]:
# The preceding cd line depends on your working directory - adjust it as required to make the notebook run.
# NOTE(review): the wildcard import hides which names come from dataset_finder; the cells below
# use at least get_datasets and year_range from it.
from dataset_finder import *

# Basic Usage

## Finding Data
Display every dataset from the "ACS_DD" (dynamically downscaled - i.e. before bias correction) collection. The path format is defined within paths.yml and more can be added freely (described later). Multiple paths can be given for any given collection - for example, "ACS_DD" draws on both kj66 and ia39 seamlessly.

This tool was specifically designed with "ACS_DD" and "ACS_BC" (bias corrected ACS datasets) in mind, though I have supplied a few more paths that may be of use.

Using this without any filters applied is a good starting point to see the available options.

The table that is displayed is divided vertically in two:
- The terms to the left of the dividing line are the properties that define the dataset, and each dataset defined this way may only have one term per column.
- The terms to the right of the dividing line describe the contents of the dataset - here, there will almost certainly be multiple elements per column, summarising the available data.

In [4]:
%%time
# No filters applied: list every dataset in the "ACS_DD" collection as a starting point.
all_data = get_datasets("ACS_DD")
all_data

# Note that even after the wall time output appears, the cell needs extra time to display the table (about 5 seconds the first time this table is loaded, for example).

CPU times: user 5.63 s, sys: 1.66 s, total: 7.29 s
Wall time: 15.3 s


Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,timescale,Unnamed: 9,Unnamed: 10,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
1,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
2,AUST-05i,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
3,AUST-05i,BOM,ACCESS-ESM1-5,historical,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
4,AUST-05i,BOM,ACCESS-ESM1-5,ssp126,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
5,AUST-05i,BOM,ACCESS-ESM1-5,ssp370,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
6,AUST-05i,BOM,CESM2,historical,r11i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
7,AUST-05i,BOM,CESM2,ssp126,r11i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
8,AUST-05i,BOM,CESM2,ssp370,r11i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
9,AUST-05i,BOM,CMCC-ESM2,historical,r1i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014


The object returned by the get_datasets command is a "dataset_info_collection" that contains many "dataset_info" objects (each corresponding to a row). 

We can apply ".filter" to this "all_data" object to see more specific results. By default, filter will match substrings (i.e. "ACCESS" will match both "ACCESS-CM2" and "ACCESS-ESM1-5"). It also only matches on the dataset properties (left of the dividing line).

In [5]:
# Substring match on the gcm property: "ACCESS" keeps both ACCESS-CM2 and ACCESS-ESM1-5.
filtered_data = all_data.filter(gcm = "ACCESS")
filtered_data

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,timescale,Unnamed: 9,Unnamed: 10,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
1,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
2,AUST-05i,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
3,AUST-05i,BOM,ACCESS-ESM1-5,historical,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
4,AUST-05i,BOM,ACCESS-ESM1-5,ssp126,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
5,AUST-05i,BOM,ACCESS-ESM1-5,ssp370,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
6,AUST-05i,CSIRO,ACCESS-CM2,historical,r4i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1951 to 2014
7,AUST-05i,CSIRO,ACCESS-CM2,ssp126,r4i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2099
8,AUST-05i,CSIRO,ACCESS-CM2,ssp370,r4i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2099
9,AUST-05i,CSIRO,ACCESS-ESM1-5,historical,r6i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1951 to 2014


We can apply multiple filters at once. The order they are supplied in does not matter. We can also offer multiple options to match against.

In [6]:
# Several filters at once (keyword order does not matter); the tuple offers multiple options to match for org.
filtered_data = all_data.filter(gcm = "ACCESS", org = ("BOM", "CSIRO"), timescale = "day")
filtered_data

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,timescale,Unnamed: 9,Unnamed: 10,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
1,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
2,AUST-05i,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
3,AUST-05i,BOM,ACCESS-ESM1-5,historical,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
4,AUST-05i,BOM,ACCESS-ESM1-5,ssp126,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
5,AUST-05i,BOM,ACCESS-ESM1-5,ssp370,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
6,AUST-05i,CSIRO,ACCESS-CM2,historical,r4i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1951 to 2014
7,AUST-05i,CSIRO,ACCESS-CM2,ssp126,r4i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2099
8,AUST-05i,CSIRO,ACCESS-CM2,ssp370,r4i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2099
9,AUST-05i,CSIRO,ACCESS-ESM1-5,historical,r6i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1951 to 2014


Now, let's look at what's contained within the datasets to narrow down our results. If we apply ".select", we can look at specific variables and rule out datasets that are missing them. In this case, it will rule out almost all of the UQ-DEC runs (which were at the bottom of the table), as they did not supply sfcWindmax.

In [7]:
# select narrows by dataset contents: keep only sfcWindmax and rule out datasets that do not supply it.
selected_data = all_data.select(var = "sfcWindmax")
selected_data

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,timescale,Unnamed: 9,Unnamed: 10,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",1960 to 2014
1,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",2015 to 2100
2,AUST-05i,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",2015 to 2100
3,AUST-05i,BOM,ACCESS-ESM1-5,historical,r6i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",1960 to 2014
4,AUST-05i,BOM,ACCESS-ESM1-5,ssp126,r6i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",2015 to 2100
5,AUST-05i,BOM,ACCESS-ESM1-5,ssp370,r6i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",2015 to 2100
6,AUST-05i,BOM,CESM2,historical,r11i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",1960 to 2014
7,AUST-05i,BOM,CESM2,ssp126,r11i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",2015 to 2100
8,AUST-05i,BOM,CESM2,ssp370,r11i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",2015 to 2100
9,AUST-05i,BOM,CMCC-ESM2,historical,r1i1p1f1,BARPA-R,v1-r1,day,,,sfcWindmax,"latest, v20241216",1960 to 2014


Alternatively, to save space, we can supply keyword arguments to the original get_datasets call to do the filtering and selecting off the bat. Note the use of year_range for year selection - this will match inclusively on both ends.

In [8]:
# Filtering and selecting done directly in the get_datasets call via keyword arguments.
# year_range(1970, 1974) matches inclusively on both ends.
specific_data = get_datasets(
    "ACS_DD",
    gcm="ACCESS",
    timescale="day",
    scenario="historical",
    var=("sfcWindmax", "psl"),
    year=year_range(1970, 1974),
)
specific_data

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,timescale,Unnamed: 9,Unnamed: 10,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,day,,,"psl, sfcWindmax","latest, v20241216, v20250311",1970 to 1974
1,AUST-05i,BOM,ACCESS-ESM1-5,historical,r6i1p1f1,BARPA-R,v1-r1,day,,,"psl, sfcWindmax","latest, v20241216, v20250311",1970 to 1974
2,AUST-05i,CSIRO,ACCESS-CM2,historical,r4i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"psl, sfcWindmax","latest, v20241216, v20250311",1970 to 1974
3,AUST-05i,CSIRO,ACCESS-ESM1-5,historical,r6i1p1f1,CCAM-v2203-SN,v1-r1,day,,,"psl, sfcWindmax","latest, v20241216, v20250311",1970 to 1974
4,AUST-05i,NSW-Government,ACCESS-ESM1-5,historical,r6i1p1f1,NARCliM2-0-WRF412R3,v1-r1,day,,,"psl, sfcWindmax","v20241216, v20250311",1970 to 1974
5,AUST-05i,NSW-Government,ACCESS-ESM1-5,historical,r6i1p1f1,NARCliM2-0-WRF412R5,v1-r1,day,,,"psl, sfcWindmax","v20241216, v20250311",1970 to 1974
6,AUST-05i,UQ-DEC,ACCESS-CM2,historical,r2i1p1f1,CCAMoc-v2112,v1-r1,day,,,psl,v20250311,1966 to 1975
7,AUST-05i,UQ-DEC,ACCESS-ESM1-5,historical,r20i1p1f1,CCAMoc-v2112,v1-r1,day,,,psl,v20250311,1966 to 1975
8,AUST-05i,UQ-DEC,ACCESS-ESM1-5,historical,r40i1p1f1,CCAMoc-v2112,v1-r1,day,,,"psl, sfcWindmax","v20241216, v20250311",1966 to 1975
9,AUST-05i,UQ-DEC,ACCESS-ESM1-5,historical,r6i1p1f1,CCAM-v2105,v1-r1,day,,,psl,v20250311,1966 to 1975


We can get a list of values for a given property.

In [9]:
# List the values that the "rcm" property takes across the datasets in specific_data.
specific_data.get_all("rcm")

['BARPA-R',
 'CCAM-v2203-SN',
 'NARCliM2-0-WRF412R3',
 'NARCliM2-0-WRF412R5',
 'CCAMoc-v2112',
 'CCAM-v2105']

## Loading data

Once we're happy with the selection of data, we can easily load it by choosing an index from the table and passing that dataset to xarray's open_mfdataset. As kj66 has both a "latest" and a "v20241216" folder for each dataset, the tool will print a short message saying that it has chosen to load "latest" - this preference is specified in paths.yml.

In [10]:
%%time
# Index 0 picks the first row of the specific_data table; that dataset is passed straight to open_mfdataset.
sample_data = xr.open_mfdataset(specific_data[0])
sample_data

INFO: Clash on date_created: Chose "latest" over "v20241216" for var = sfcWindmax; year = 1970 to 1974
CPU times: user 6.58 s, sys: 469 ms, total: 7.05 s
Wall time: 8.17 s


Unnamed: 0,Array,Chunk
Bytes,4.16 GiB,2.34 MiB
Shape,"(1826, 691, 886)","(1, 691, 886)"
Dask graph,1826 chunks in 11 graph layers,1826 chunks in 11 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 4.16 GiB 2.34 MiB Shape (1826, 691, 886) (1, 691, 886) Dask graph 1826 chunks in 11 graph layers Data type float32 numpy.ndarray",886  691  1826,

Unnamed: 0,Array,Chunk
Bytes,4.16 GiB,2.34 MiB
Shape,"(1826, 691, 886)","(1, 691, 886)"
Dask graph,1826 chunks in 11 graph layers,1826 chunks in 11 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,19.25 MiB,3.86 MiB
Shape,"(1826, 691, 2)","(366, 691, 2)"
Dask graph,5 chunks in 21 graph layers,5 chunks in 21 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 19.25 MiB 3.86 MiB Shape (1826, 691, 2) (366, 691, 2) Dask graph 5 chunks in 21 graph layers Data type float64 numpy.ndarray",2  691  1826,

Unnamed: 0,Array,Chunk
Bytes,19.25 MiB,3.86 MiB
Shape,"(1826, 691, 2)","(366, 691, 2)"
Dask graph,5 chunks in 21 graph layers,5 chunks in 21 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,24.69 MiB,4.95 MiB
Shape,"(1826, 886, 2)","(366, 886, 2)"
Dask graph,5 chunks in 21 graph layers,5 chunks in 21 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 24.69 MiB 4.95 MiB Shape (1826, 886, 2) (366, 886, 2) Dask graph 5 chunks in 21 graph layers Data type float64 numpy.ndarray",2  886  1826,

Unnamed: 0,Array,Chunk
Bytes,24.69 MiB,4.95 MiB
Shape,"(1826, 886, 2)","(366, 886, 2)"
Dask graph,5 chunks in 21 graph layers,5 chunks in 21 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,28.53 kiB,5.72 kiB
Shape,"(1826, 2)","(366, 2)"
Dask graph,5 chunks in 16 graph layers,5 chunks in 16 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 28.53 kiB 5.72 kiB Shape (1826, 2) (366, 2) Dask graph 5 chunks in 16 graph layers Data type datetime64[ns] numpy.ndarray",2  1826,

Unnamed: 0,Array,Chunk
Bytes,28.53 kiB,5.72 kiB
Shape,"(1826, 2)","(366, 2)"
Dask graph,5 chunks in 16 graph layers,5 chunks in 16 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.16 GiB,2.34 MiB
Shape,"(1826, 691, 886)","(1, 691, 886)"
Dask graph,1826 chunks in 11 graph layers,1826 chunks in 11 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 4.16 GiB 2.34 MiB Shape (1826, 691, 886) (1, 691, 886) Dask graph 1826 chunks in 11 graph layers Data type float32 numpy.ndarray",886  691  1826,

Unnamed: 0,Array,Chunk
Bytes,4.16 GiB,2.34 MiB
Shape,"(1826, 691, 886)","(1, 691, 886)"
Dask graph,1826 chunks in 11 graph layers,1826 chunks in 11 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


Alternatively, we can call ".get_files" to see exactly which files will be supplied to open_mfdataset.

In [11]:
# Inspect the file paths that would be supplied to open_mfdataset for this dataset.
specific_data[0].get_files()

INFO: Clash on date_created: Chose "latest" over "v20241216" for var = sfcWindmax; year = 1970 to 1974


['/g/data/kj66/CORDEX/output/CMIP6/DD/AUST-05i/BOM/ACCESS-CM2/historical/r4i1p1f1/BARPA-R/v1-r1/day/sfcWindmax/latest/sfcWindmax_AUST-05i_ACCESS-CM2_historical_r4i1p1f1_BOM_BARPA-R_v1-r1_day_19700101-19701231.nc',
 '/g/data/kj66/CORDEX/output/CMIP6/DD/AUST-05i/BOM/ACCESS-CM2/historical/r4i1p1f1/BARPA-R/v1-r1/day/sfcWindmax/latest/sfcWindmax_AUST-05i_ACCESS-CM2_historical_r4i1p1f1_BOM_BARPA-R_v1-r1_day_19710101-19711231.nc',
 '/g/data/kj66/CORDEX/output/CMIP6/DD/AUST-05i/BOM/ACCESS-CM2/historical/r4i1p1f1/BARPA-R/v1-r1/day/sfcWindmax/latest/sfcWindmax_AUST-05i_ACCESS-CM2_historical_r4i1p1f1_BOM_BARPA-R_v1-r1_day_19720101-19721231.nc',
 '/g/data/kj66/CORDEX/output/CMIP6/DD/AUST-05i/BOM/ACCESS-CM2/historical/r4i1p1f1/BARPA-R/v1-r1/day/sfcWindmax/latest/sfcWindmax_AUST-05i_ACCESS-CM2_historical_r4i1p1f1_BOM_BARPA-R_v1-r1_day_19730101-19731231.nc',
 '/g/data/kj66/CORDEX/output/CMIP6/DD/AUST-05i/BOM/ACCESS-CM2/historical/r4i1p1f1/BARPA-R/v1-r1/day/sfcWindmax/latest/sfcWindmax_AUST-05i_ACCESS

# More Functionality

## Condense scenarios into one row

There are a few extra features that should be helpful. For example, "condense" can merge datasets together by moving a certain property across the dividing line (from the defining properties on the left to the contents on the right) - this is particularly useful for "scenario".

In [12]:
%%time
# NOTE: this rebinds all_data - previously the unfiltered "ACS_DD" collection from In [4] - to the
# BOM / day / hursmax subset with "scenario" condensed into each row. The cells below index into
# this condensed collection; re-run In [4] to get the unfiltered collection back.
all_data = get_datasets("ACS_DD", org = "BOM", timescale = "day", var = "hursmax").condense("scenario")
all_data

CPU times: user 2.35 s, sys: 286 ms, total: 2.63 s
Wall time: 3.28 s


Unnamed: 0,grid,org,gcm,mdl_run,rcm,ver,timescale,Unnamed: 8,Unnamed: 9,var,date_created,scenario,year
0,AUST-05i,BOM,ACCESS-CM2,r4i1p1f1,BARPA-R,v1-r1,day,,,hursmax,"latest, v20241216","historical, ssp126, ssp370",1960 to 2100
1,AUST-05i,BOM,ACCESS-ESM1-5,r6i1p1f1,BARPA-R,v1-r1,day,,,hursmax,"latest, v20241216","historical, ssp126, ssp370",1960 to 2100
2,AUST-05i,BOM,CESM2,r11i1p1f1,BARPA-R,v1-r1,day,,,hursmax,"latest, v20241216","historical, ssp126, ssp370",1960 to 2100
3,AUST-05i,BOM,CMCC-ESM2,r1i1p1f1,BARPA-R,v1-r1,day,,,hursmax,"latest, v20241216","historical, ssp126, ssp370",1960 to 2100
4,AUST-05i,BOM,EC-Earth3,r1i1p1f1,BARPA-R,v1-r1,day,,,hursmax,"latest, v20241216","historical, ssp126, ssp370",1960 to 2100
5,AUST-05i,BOM,MPI-ESM1-2-HR,r1i1p1f1,BARPA-R,v1-r1,day,,,hursmax,"latest, v20241216","historical, ssp126, ssp370",1960 to 2100
6,AUST-05i,BOM,NorESM2-MM,r1i1p1f1,BARPA-R,v1-r1,day,,,hursmax,"latest, v20241216","historical, ssp126, ssp370",1960 to 2100
7,AUST-05i,BOM,ERA5,hres,BARRAR2,v1,day,,,hursmax,v20241216,historical,1979 to 2023


Note that if we try to load a dataset without specifying which ssp we want to use, it will fail as there will be an unresolved clash.

In [13]:
# Deliberate demonstration of the failure: the error is caught and printed (rather than left
# to propagate) so that the notebook can still be run top to bottom without halting here.
try:
    files = all_data[0].get_files()
except Exception as clash_error:
    print(clash_error)

Unresolved clash between ssp126 and ssp370 for scenario - please select one with ".select(scenario = ...)"


We can resolve this using either select or ".prioritise".

In [14]:
# Resolve the clash with select: keep only the historical and ssp370 scenarios before loading.
sample_data = xr.open_mfdataset(all_data[0].select(scenario = ("historical", "ssp370")))
sample_data

INFO: Clash on date_created: Chose "latest" over "v20241216" for var = hursmax; scenario = historical, ssp370; year = 1960 to 2100


Unnamed: 0,Array,Chunk
Bytes,117.46 GiB,2.34 MiB
Shape,"(51500, 691, 886)","(1, 691, 886)"
Dask graph,51500 chunks in 283 graph layers,51500 chunks in 283 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 117.46 GiB 2.34 MiB Shape (51500, 691, 886) (1, 691, 886) Dask graph 51500 chunks in 283 graph layers Data type float32 numpy.ndarray",886  691  51500,

Unnamed: 0,Array,Chunk
Bytes,117.46 GiB,2.34 MiB
Shape,"(51500, 691, 886)","(1, 691, 886)"
Dask graph,51500 chunks in 283 graph layers,51500 chunks in 283 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,543.01 MiB,3.86 MiB
Shape,"(51500, 691, 2)","(366, 691, 2)"
Dask graph,141 chunks in 424 graph layers,141 chunks in 424 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 543.01 MiB 3.86 MiB Shape (51500, 691, 2) (366, 691, 2) Dask graph 141 chunks in 424 graph layers Data type float64 numpy.ndarray",2  691  51500,

Unnamed: 0,Array,Chunk
Bytes,543.01 MiB,3.86 MiB
Shape,"(51500, 691, 2)","(366, 691, 2)"
Dask graph,141 chunks in 424 graph layers,141 chunks in 424 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,696.24 MiB,4.95 MiB
Shape,"(51500, 886, 2)","(366, 886, 2)"
Dask graph,141 chunks in 424 graph layers,141 chunks in 424 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 696.24 MiB 4.95 MiB Shape (51500, 886, 2) (366, 886, 2) Dask graph 141 chunks in 424 graph layers Data type float64 numpy.ndarray",2  886  51500,

Unnamed: 0,Array,Chunk
Bytes,696.24 MiB,4.95 MiB
Shape,"(51500, 886, 2)","(366, 886, 2)"
Dask graph,141 chunks in 424 graph layers,141 chunks in 424 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,804.69 kiB,5.72 kiB
Shape,"(51500, 2)","(366, 2)"
Dask graph,141 chunks in 283 graph layers,141 chunks in 283 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 804.69 kiB 5.72 kiB Shape (51500, 2) (366, 2) Dask graph 141 chunks in 283 graph layers Data type datetime64[ns] numpy.ndarray",2  51500,

Unnamed: 0,Array,Chunk
Bytes,804.69 kiB,5.72 kiB
Shape,"(51500, 2)","(366, 2)"
Dask graph,141 chunks in 283 graph layers,141 chunks in 283 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray


In [15]:
# Resolve the clash with prioritise instead: "ssp370" is chosen wherever scenarios clash
# (see the INFO message printed below).
sample_data = xr.open_mfdataset(all_data[1].prioritise("scenario", "ssp370"))
sample_data

INFO: Clash on date_created: Chose "latest" over "v20241216" for var = hursmax; scenario = historical, ssp126, ssp370; year = 1960 to 2100
INFO: Clash on scenario: Chose "ssp370" over "ssp126" for var = hursmax; date_created = latest; year = 2016 to 2100


Unnamed: 0,Array,Chunk
Bytes,117.46 GiB,2.34 MiB
Shape,"(51500, 691, 886)","(1, 691, 886)"
Dask graph,51500 chunks in 283 graph layers,51500 chunks in 283 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 117.46 GiB 2.34 MiB Shape (51500, 691, 886) (1, 691, 886) Dask graph 51500 chunks in 283 graph layers Data type float32 numpy.ndarray",886  691  51500,

Unnamed: 0,Array,Chunk
Bytes,117.46 GiB,2.34 MiB
Shape,"(51500, 691, 886)","(1, 691, 886)"
Dask graph,51500 chunks in 283 graph layers,51500 chunks in 283 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,543.01 MiB,3.86 MiB
Shape,"(51500, 691, 2)","(366, 691, 2)"
Dask graph,141 chunks in 424 graph layers,141 chunks in 424 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 543.01 MiB 3.86 MiB Shape (51500, 691, 2) (366, 691, 2) Dask graph 141 chunks in 424 graph layers Data type float64 numpy.ndarray",2  691  51500,

Unnamed: 0,Array,Chunk
Bytes,543.01 MiB,3.86 MiB
Shape,"(51500, 691, 2)","(366, 691, 2)"
Dask graph,141 chunks in 424 graph layers,141 chunks in 424 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,696.24 MiB,4.95 MiB
Shape,"(51500, 886, 2)","(366, 886, 2)"
Dask graph,141 chunks in 424 graph layers,141 chunks in 424 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 696.24 MiB 4.95 MiB Shape (51500, 886, 2) (366, 886, 2) Dask graph 141 chunks in 424 graph layers Data type float64 numpy.ndarray",2  886  51500,

Unnamed: 0,Array,Chunk
Bytes,696.24 MiB,4.95 MiB
Shape,"(51500, 886, 2)","(366, 886, 2)"
Dask graph,141 chunks in 424 graph layers,141 chunks in 424 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,804.69 kiB,5.72 kiB
Shape,"(51500, 2)","(366, 2)"
Dask graph,141 chunks in 283 graph layers,141 chunks in 283 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray
"Array Chunk Bytes 804.69 kiB 5.72 kiB Shape (51500, 2) (366, 2) Dask graph 141 chunks in 283 graph layers Data type datetime64[ns] numpy.ndarray",2  51500,

Unnamed: 0,Array,Chunk
Bytes,804.69 kiB,5.72 kiB
Shape,"(51500, 2)","(366, 2)"
Dask graph,141 chunks in 283 graph layers,141 chunks in 283 graph layers
Data type,datetime64[ns] numpy.ndarray,datetime64[ns] numpy.ndarray


## Comparing dataset collections

There are ".find_matches" and ".find_missing" methods to compare properties between different dataset collections. For example, we can find bias corrected datasets corresponding to a certain dynamically downscaled dataset.

In [16]:
# Dynamically downscaled ("ACS_DD") daily datasets from BOM whose gcm matches "ACCESS".
dd_data = get_datasets("ACS_DD", org = "BOM", gcm = "ACCESS", timescale = "day")
dd_data

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,timescale,Unnamed: 9,Unnamed: 10,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
1,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
2,AUST-05i,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
3,AUST-05i,BOM,ACCESS-ESM1-5,historical,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
4,AUST-05i,BOM,ACCESS-ESM1-5,ssp126,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
5,AUST-05i,BOM,ACCESS-ESM1-5,ssp370,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100


In [17]:
# Every dataset in the bias-corrected ("ACS_BC") collection, with no filters applied.
bc_data = get_datasets("ACS_BC")
bc_data

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,bc_org,bc,ref,bc_period,timescale,Unnamed: 13,Unnamed: 14,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",1960 to 2014
1,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, rsdsAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",1960 to 2014
2,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",1960 to 2014
3,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, pslAdjust, rsdsAdjust, sfcWindAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216, v20250311",1960 to 2014
4,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100
5,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, rsdsAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100
6,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100
7,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, pslAdjust, rsdsAdjust, sfcWindAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216, v20250311",2015 to 2100
8,AUST-05i,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100
9,AUST-05i,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, rsdsAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100


In [18]:
# Bias-corrected datasets that correspond to one dynamically downscaled dataset (row 0 of dd_data).
bc_data.find_matches(dd_data[0])

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,bc_org,bc,ref,bc_period,timescale,Unnamed: 13,Unnamed: 14,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",1960 to 2014
1,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, rsdsAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",1960 to 2014
2,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",1960 to 2014
3,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, pslAdjust, rsdsAdjust, sfcWindAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216, v20250311",1960 to 2014
4,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,NHP,MRNBC,BARRAR2,1980-2022,day,,,"hursAdjust, hussAdjust, prAdjust, prsnAdjust, psAdjust, pslAdjust, rldsAdjust, rsdsAdjust, sfcWindAdjust, tasAdjust, tasmaxAdjust, tasminAdjust",v20250311,1960 to 2014


We can also pass the whole collection instead to match multiple at a time.

In [19]:
# Same comparison, but against every dataset in dd_data at once.
bc_data.find_matches(dd_data)

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,bc_org,bc,ref,bc_period,timescale,Unnamed: 13,Unnamed: 14,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",1960 to 2014
1,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, rsdsAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",1960 to 2014
2,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",1960 to 2014
3,AUST-05i,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, pslAdjust, rsdsAdjust, sfcWindAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216, v20250311",1960 to 2014
4,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100
5,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, rsdsAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100
6,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100
7,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,ACS,QME,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, pslAdjust, rsdsAdjust, sfcWindAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216, v20250311",2015 to 2100
8,AUST-05i,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,AGCDv1,1960-2022,day,,,"prAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100
9,AUST-05i,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,ACS,MRNBC,BARRAR2,1980-2022,day,,,"hursmaxAdjust, hursminAdjust, prAdjust, rsdsAdjust, sfcWindmaxAdjust, tasmaxAdjust, tasminAdjust","latest, v20241216",2015 to 2100


The find_missing method will show the opposite - datasets that did _not_ match. Here we can use it to check which datasets have not been processed for NHP yet.

In [20]:
# Daily BOM/CSIRO "ACS_DD" datasets that do NOT match any of the bias corrected datasets with bc_org = "NHP".
(
    get_datasets("ACS_DD", org = ("BOM", "CSIRO"), timescale = "day")
    .find_missing(bc_data.filter(bc_org = "NHP"))
)

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,timescale,Unnamed: 9,Unnamed: 10,var,date_created,year
0,AUST-05i,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
1,AUST-05i,BOM,ACCESS-ESM1-5,ssp126,r6i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
2,AUST-05i,BOM,CESM2,ssp126,r11i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
3,AUST-05i,BOM,CMCC-ESM2,historical,r1i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
4,AUST-05i,BOM,CMCC-ESM2,ssp126,r1i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
5,AUST-05i,BOM,CMCC-ESM2,ssp370,r1i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
6,AUST-05i,BOM,EC-Earth3,historical,r1i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014
7,AUST-05i,BOM,EC-Earth3,ssp126,r1i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
8,AUST-05i,BOM,EC-Earth3,ssp370,r1i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",2015 to 2100
9,AUST-05i,BOM,MPI-ESM1-2-HR,historical,r1i1p1f1,BARPA-R,v1-r1,day,,,"hurs, hursmax, hursmin, huss, pr, prsn, ps, psl, rlds, rsds, sfcWind, sfcWindmax, tas, tasmax, tasmin","latest, v20241216, v20250311",1960 to 2014


## Multi year files

Files containing multiple years can be handled without much special attention. However, it should be noted that the extra years found within the files will not be removed automatically. For example, this table still has "1981 to 1995" displayed despite the selection being 1983 to 1992.

In [21]:
# Select 1983 to 1992 only. The files here each contain several years, and the extra
# years inside them are not trimmed, so the table may report a wider range.
all_data = get_datasets(
    "ACS_DD",
    org = "UQ-DEC",
    gcm = "CMCC",
    var = "psl",
    year = year_range(1983, 1992),
)
all_data

Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,timescale,Unnamed: 9,Unnamed: 10,var,date_created,year
0,AUST-05i,UQ-DEC,CMCC-ESM2,historical,r1i1p1f1,CCAM-v2105,v1-r1,day,,,psl,v20250311,1981 to 1995


Likewise, when opening with open_mfdataset, the extra years will be present.

In [22]:
# List the file paths of the first dataset in the selection; each file covers several years.
all_data[0].get_files()

['/g/data/ia39/australian-climate-service/release/CORDEX/output-Adjust/CMIP6/bias-adjusted-input/AUST-05i/UQ-DEC/CMCC-ESM2/historical/r1i1p1f1/CCAM-v2105/v1-r1/day/psl/v20250311/psl_AUST-05i_CMCC-ESM2_historical_r1i1p1f1_UQ-DEC_CCAM-v2105_v1-r1_day_19810101-19851231.nc',
 '/g/data/ia39/australian-climate-service/release/CORDEX/output-Adjust/CMIP6/bias-adjusted-input/AUST-05i/UQ-DEC/CMCC-ESM2/historical/r1i1p1f1/CCAM-v2105/v1-r1/day/psl/v20250311/psl_AUST-05i_CMCC-ESM2_historical_r1i1p1f1_UQ-DEC_CCAM-v2105_v1-r1_day_19860101-19901231.nc',
 '/g/data/ia39/australian-climate-service/release/CORDEX/output-Adjust/CMIP6/bias-adjusted-input/AUST-05i/UQ-DEC/CMCC-ESM2/historical/r1i1p1f1/CCAM-v2105/v1-r1/day/psl/v20250311/psl_AUST-05i_CMCC-ESM2_historical_r1i1p1f1_UQ-DEC_CCAM-v2105_v1-r1_day_19910101-19951231.nc']

In [23]:
# Open the first dataset of the selection; the years outside the requested range are still present.
# NOTE(review): the dataset object is passed directly instead of all_data[0].get_files() -
# presumably it is accepted by xarray as a sequence of paths; confirm against dataset_finder.
xr.open_mfdataset(all_data[0])

Unnamed: 0,Array,Chunk
Bytes,12.49 GiB,2.34 MiB
Shape,"(5475, 691, 886)","(1, 691, 886)"
Dask graph,5475 chunks in 7 graph layers,5475 chunks in 7 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 12.49 GiB 2.34 MiB Shape (5475, 691, 886) (1, 691, 886) Dask graph 5475 chunks in 7 graph layers Data type float32 numpy.ndarray",886  691  5475,

Unnamed: 0,Array,Chunk
Bytes,12.49 GiB,2.34 MiB
Shape,"(5475, 691, 886)","(1, 691, 886)"
Dask graph,5475 chunks in 7 graph layers,5475 chunks in 7 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,57.73 MiB,19.24 MiB
Shape,"(5475, 691, 2)","(1825, 691, 2)"
Dask graph,3 chunks in 10 graph layers,3 chunks in 10 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 57.73 MiB 19.24 MiB Shape (5475, 691, 2) (1825, 691, 2) Dask graph 3 chunks in 10 graph layers Data type float64 numpy.ndarray",2  691  5475,

Unnamed: 0,Array,Chunk
Bytes,57.73 MiB,19.24 MiB
Shape,"(5475, 691, 2)","(1825, 691, 2)"
Dask graph,3 chunks in 10 graph layers,3 chunks in 10 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,74.02 MiB,24.67 MiB
Shape,"(5475, 886, 2)","(1825, 886, 2)"
Dask graph,3 chunks in 10 graph layers,3 chunks in 10 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 74.02 MiB 24.67 MiB Shape (5475, 886, 2) (1825, 886, 2) Dask graph 3 chunks in 10 graph layers Data type float64 numpy.ndarray",2  886  5475,

Unnamed: 0,Array,Chunk
Bytes,74.02 MiB,24.67 MiB
Shape,"(5475, 886, 2)","(1825, 886, 2)"
Dask graph,3 chunks in 10 graph layers,3 chunks in 10 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,85.55 kiB,28.52 kiB
Shape,"(5475, 2)","(1825, 2)"
Dask graph,3 chunks in 7 graph layers,3 chunks in 7 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 85.55 kiB 28.52 kiB Shape (5475, 2) (1825, 2) Dask graph 3 chunks in 7 graph layers Data type object numpy.ndarray",2  5475,

Unnamed: 0,Array,Chunk
Bytes,85.55 kiB,28.52 kiB
Shape,"(5475, 2)","(1825, 2)"
Dask graph,3 chunks in 7 graph layers,3 chunks in 7 graph layers
Data type,object numpy.ndarray,object numpy.ndarray


# Custom Use

More paths can be added to the paths.yml. Here is the listing for ACS_DD which has been used throughout this notebook:

"format_dirs" describes the directory structure of the datasets. Any name enclosed with {} is considered a property of that dataset. There can be multiple supplied paths if the collection is split.

"format_file" describes the directory and file format of the contents of the dataset. Any name enclosed within {} that is not already a property in format_dirs is considered a selectable element of the dataset. Multiple formats can be supplied - the tool will start with the first one and move down the list if a file does not fit it.

The exact splitting point between format_dirs and format_file can be adjusted to suit user need - I chose a point where it could make sense to load everything within concurrently.

"unique" describes elements of the dataset in format_file that should not be loaded together, and options for resolving clashes. In this case it is simply "date_created" - since different versions of the same files are stored within these folders, it does not make sense to load them together.

There are two parameters beneath named unique elements: "preferences" and "default". Preferences is a given list of options that should be given precedence in that order. Default is the behaviour for choosing between two options that are not in the preferences list, and can be either "high" (prioritising the alphabetically higher option), "low" (prioritising the alphabetically lower option) or "error" (which creates an error, forcing the user to be more specific in their selection before loading). There will always be an info message printed when the dataset finder makes a choice on behalf of the user to let them know for future reference (as seen in certain examples earlier) when open_mfdataset or get_files is used.

We can also specify path strings in code if we do not wish to modify paths.yml. In this case, we must use "filter_all" instead of "get_datasets" (functionally the same - get_datasets will call filter_all with the relevant parameters it loads from paths.yml).

In [24]:
%%time
format_dir = "/g/data/lp01/CORDEX-CMIP6/CMIP6/DD/{grid}/{org}/{gcm}/{scenario}/{mdl_run}/{rcm}/{ver}/{timescale}/"
format_file = "/{var}/{date_created}/{var}_{grid}_{gcm}_{scenario}_{mdl_run}_{org}_{rcm}_{ver}_{timescale}_{year!start:4}01-{year!end:4}12.nc"
unique = {"date_created": {"default": "high"}}
all_data = filter_all(format_dir, format_file, unique)
all_data

CPU times: user 2.07 s, sys: 4.9 s, total: 6.96 s
Wall time: 16.4 s


Unnamed: 0,grid,org,gcm,scenario,mdl_run,rcm,ver,timescale,Unnamed: 9,Unnamed: 10,var,date_created,year
0,gr1.5,BOM,ACCESS-CM2,historical,r4i1p1f1,BARPA-R,v1-r1,mon,,,"pr, psl, tas, tasmax, tasmin, ts",v20231001,1960 to 2014
1,gr1.5,BOM,ACCESS-CM2,ssp126,r4i1p1f1,BARPA-R,v1-r1,mon,,,"pr, tas",v20231001,2015 to 2100
2,gr1.5,BOM,ACCESS-CM2,ssp370,r4i1p1f1,BARPA-R,v1-r1,mon,,,"pr, psl, tas, tasmax, tasmin, ts",v20231001,2015 to 2100
3,gr1.5,BOM,ACCESS-CM2,ssp585,r4i1p1f1,BARPA-R,v1-r1,mon,,,"pr, tas",v20231001,2015 to 2100
4,gr1.5,BOM,ACCESS-ESM1-5,historical,r6i1p1f1,BARPA-R,v1-r1,mon,,,"pr, psl, tas, tasmax, tasmin, ts",v20231001,1960 to 2014
5,gr1.5,BOM,ACCESS-ESM1-5,ssp126,r6i1p1f1,BARPA-R,v1-r1,mon,,,"pr, tas",v20231001,2015 to 2100
6,gr1.5,BOM,ACCESS-ESM1-5,ssp370,r6i1p1f1,BARPA-R,v1-r1,mon,,,"pr, psl, tas, tasmax, tasmin, ts",v20231001,2015 to 2100
7,gr1.5,BOM,CESM2,historical,r11i1p1f1,BARPA-R,v1-r1,mon,,,"pr, psl, tas, tasmax, tasmin, ts",v20231001,1960 to 2014
8,gr1.5,BOM,CESM2,ssp126,r11i1p1f1,BARPA-R,v1-r1,mon,,,"pr, tas",v20231001,2015 to 2100
9,gr1.5,BOM,CESM2,ssp370,r11i1p1f1,BARPA-R,v1-r1,mon,,,"pr, psl, tas, tasmax, tasmin, ts",v20231001,2015 to 2100
