# ACDtools dev sandbox 

#### Date: 11 October, 2024

Author = {"name": "Thomas Moore", "affiliation": "CSIRO", "email": "thomas.moore@csiro.au", "orcid": "0000-0003-3930-1946"}

# Install ACDtools locally

In [1]:
# this needs to be set via a custom edit per user at the moment
!pip install --user -e /g/data/es60/users/thomas_moore/code/ACDtools

Obtaining file:///g/data/es60/users/thomas_moore/code/ACDtools
  Installing build dependencies ... done
  Checking if build backend supports build_editable ... done
  Getting requirements to build editable ... done
  Preparing editable metadata (pyproject.toml) ... done
Building wheels for collected packages: ACDtools
  Building editable for ACDtools (pyproject.toml) ... done
[?25h  Created wheel for ACDtools: filename=ACDtools-0.1-0.editable-py3-none-any.whl size=3514 sha256=baa71eebb492acd94640e308401f1bbeeee657ccca29c97acaedf0f9f6439f87
  Stored in directory: /jobfs/127022066.gadi-pbs/pip-ephem-wheel-cache-62ilutmd/wheels/b6/a3/f2/6ce45fbdc116ad50e421d6a11cb060cc796e867501807af446
Successfully built ACDtools
Installing collected packages: ACDtools
  Attempting uninstall: ACDtools
    Found existing installation: ACDtools 0.1
    Uninstalling ACDtools-0.1:
      Successfully uninstalled ACDtools-0.1
Successfully installed ACDtools-0.1

In [2]:
# Enable autoreload in the notebook
%load_ext autoreload
%autoreload 1 
%aimport ACDtools.util
%aimport ACDtools.ard
# Importing from your local package util.py
from ACDtools.util import test_function
from ACDtools.util import detect_compute_platform
from ACDtools.util import load_config
from ACDtools.util import start_dask_cluster_from_config
from ACDtools.util import report_esm_unique
from ACDtools.util import var_name_info
from ACDtools.util import list_catalog_query_kwargs
from ACDtools.util import load_cmip6_fs38_datastore
from ACDtools.util import show_methods
# ard.py
from ACDtools.ard import load_ACCESS_ESM_ensemble
from ACDtools.ard import find_chunking_info

# Notebook settings

### filter warnings

In [3]:
import warnings
warnings.filterwarnings("ignore") # Suppress warnings

# Dask cluster from config
`client, cluster = start_dask_cluster_from_config('netcdf_work')`
<br>OR<br>
`client, cluster = start_dask_cluster_from_config('zarr_work')`

In [4]:
client, cluster = start_dask_cluster_from_config('netcdf_work')

Cluster started with 28 workers.
Dashboard available at: /proxy/8787/status


# Issue: write function to load ACCESS-ESM1.5 data object using intake catalogs at NCI
- https://github.com/Thomas-Moore-Creative/ACDtools/issues/1

## utilise CMIP6 data catalogs for NCI holdings

##### Information on climate data catalogs across Australian HPC

**ACCESS-NRI** https://access-nri-intake-catalog.readthedocs.io/en/latest/usage/how.html <br>
**NCI** https://opus.nci.org.au/pages/viewpage.action?pageId=213713098


##### $\bigstar$ Get inspiration from ACCESS-NRI intake catalog docs: ACCESS-ESM1-5 CMIP6 example
https://access-nri-intake-catalog.readthedocs.io/en/latest/usage/quickstart.html

# import packages

In [5]:
import intake
import xarray as xr
import numpy as np
import gc
import json

### import the ACCESS-NRI catalog

In [None]:
catalog = intake.cat.access_nri

### (1) "I know I want Australian CMIP6 data - so that's fs38 and I need access to that NCI project"

In [None]:
cmip6_fs38_datastore = catalog.search(name='cmip6_fs38').to_source()

### (2) "what are the realms covered by cmip6_fs38?"

In [None]:
report_esm_unique(cmip6_fs38_datastore,keep_list=['realm'])

### (3) I want to see what variables, over what frequencies, are available in both the 'ocean' & 'ocnBgchem' realms

In [None]:
cmip6_fs38_ocean_datastore = cmip6_fs38_datastore.search(realm=['ocean','ocnBgchem'])

In [None]:
# Ask report_esm_unique to hand back its results and unpack the two values.
sorted_unique_dict, table_data = report_esm_unique(
    cmip6_fs38_ocean_datastore,
    return_results=True,
)

# what is the long name of a particular variable?

In [None]:
var_name_info(cmip6_fs38_ocean_datastore,'intpp')

# filter catalog for final ACCESS-ESM1.5 dataset

In [None]:
# Narrow the ocean/ocnBgchem datastore to the historical ACCESS-ESM1-5 intpp entries.
final_search = cmip6_fs38_ocean_datastore.search(
    file_type='l',
    variable_id='intpp',
    source_id='ACCESS-ESM1-5',
    experiment_id='historical',
)

In [None]:
report_esm_unique(final_search)

# what is the chunking of the files in this final_search catalog?

In [None]:
final_search.df['path'].iloc[0]

In [None]:
find_chunking_info(final_search,'intpp',return_results=False)

# load without specifying any chunking

In [None]:
%%time
ds_ESM15_esorted = load_ACCESS_ESM_ensemble(final_search)

In [None]:
ds_ESM15_esorted

#### One still needs to know which dimensions the chunk shape (1, 300, 360) refers to, and something about the MB size per chunk, in order to set the `time` chunk to 220 ... these rules of thumb should live in the yaml settings file until more sophisticated heuristics can be coded

In [None]:
%%time
ds_ESM15_esorted = load_ACCESS_ESM_ensemble(
    final_search,
    # Explicit per-dimension chunk sizes, passed through as chunking_settings.
    chunking_settings={
        'chunks': {'member': 1, 'time': 220, 'j': 300, 'i': 360},
    },
)

In [None]:
ds_ESM15_esorted

In [None]:
%%time
ds_ESM15_esorted = load_ACCESS_ESM_ensemble(final_search,chunking_key='ACCESS_ESM15_2D')

In [None]:
ds_ESM15_esorted

In [None]:
ds_ESM15_esorted.isel(member=0).mean('time').intpp.plot()

# 3D dataset?

In [None]:
# Same filter as the intpp search, but for the thetao variable.
thetao_search = cmip6_fs38_ocean_datastore.search(
    file_type='l',
    variable_id='thetao',
    source_id='ACCESS-ESM1-5',
    experiment_id='historical',
)

In [None]:
report_esm_unique(thetao_search)

In [None]:
find_chunking_info(thetao_search,'thetao',return_results=False)

In [None]:
find_chunking_info(thetao_search,'thetao',return_results=True)

In [None]:
# Open one thetao file directly from its fs38 path (historical, member r3i1p1f1,
# 189001-189912) so the dataset can be inspected without going through the catalog.
xr.open_mfdataset('/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r3i1p1f1/Omon/thetao/gn/v20191203/thetao_Omon_ACCESS-ESM1-5_historical_r3i1p1f1_gn_189001-189912.nc')

In [None]:
%%time
ds_ESM15_esorted = load_ACCESS_ESM_ensemble(thetao_search)

In [None]:
ds_ESM15_esorted

In [None]:
%%time
ds_ESM15_esorted = load_ACCESS_ESM_ensemble(thetao_search,chunking_key='ACCESS_ESM15_3D')

In [None]:
ds_ESM15_esorted

# let's use the tools as they exist to try to start the workflow

## I want Australian CMIP6 data

In [6]:
cmip6_fs38_datastore = load_cmip6_fs38_datastore()

In [7]:
# Report on the catalog subset selected by the CSEPTA query stored in the config.
report_esm_unique(
    cmip6_fs38_datastore.search(
        **load_config()['catalog_search_query_dict']['ACCESS_ESM15']['CSEPTA']
    )
)

╒════════════════╤═════════════════╕
│ Category       │ Unique values   │
╞════════════════╪═════════════════╡
│ experiment_id  │ piControl       │
├────────────────┼─────────────────┤
│ file_type      │ l               │
├────────────────┼─────────────────┤
│ frequency      │ mon             │
├────────────────┼─────────────────┤
│ grid_label     │ gn              │
├────────────────┼─────────────────┤
│ institution_id │ CSIRO           │
├────────────────┼─────────────────┤
│ project_id     │ CMIP            │
├────────────────┼─────────────────┤
│ realm          │ ocnBgchem       │
├────────────────┼─────────────────┤
│ source_id      │ ACCESS-ESM1-5   │
├────────────────┼─────────────────┤
│ table_id       │ Omon            │
├────────────────┼─────────────────┤
│ variable_id    │ intpp           │
╘════════════════╧═════════════════╛


In [8]:
# Keep the catalog subset selected by the config's CSEPTA query for later cells.
CSEPTA_intpp_catalog = cmip6_fs38_datastore.search(
    **load_config()['catalog_search_query_dict']['ACCESS_ESM15']['CSEPTA']
)

In [9]:
CSEPTA_intpp_catalog

Unnamed: 0,unique
path,6
file_type,1
realm,1
frequency,1
table_id,1
project_id,1
institution_id,1
source_id,1
experiment_id,1
member_id,1


In [10]:
show_methods(CSEPTA_intpp_catalog)

keys
keys_info
nunique
search
serialize
to_dask
to_dataset_dict
to_datatree
unique


In [11]:
report_esm_unique(CSEPTA_intpp_catalog)

╒════════════════╤═════════════════╕
│ Category       │ Unique values   │
╞════════════════╪═════════════════╡
│ experiment_id  │ piControl       │
├────────────────┼─────────────────┤
│ file_type      │ l               │
├────────────────┼─────────────────┤
│ frequency      │ mon             │
├────────────────┼─────────────────┤
│ grid_label     │ gn              │
├────────────────┼─────────────────┤
│ institution_id │ CSIRO           │
├────────────────┼─────────────────┤
│ project_id     │ CMIP            │
├────────────────┼─────────────────┤
│ realm          │ ocnBgchem       │
├────────────────┼─────────────────┤
│ source_id      │ ACCESS-ESM1-5   │
├────────────────┼─────────────────┤
│ table_id       │ Omon            │
├────────────────┼─────────────────┤
│ variable_id    │ intpp           │
╘════════════════╧═════════════════╛


In [17]:
CSEPTA_intpp_catalog.unique()['path']

['/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Omon/intpp/gn/v20191214/intpp_Omon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-060012.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Omon/intpp/gn/v20210316/intpp_Omon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-060012.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Omon/intpp/gn/v20191112/intpp_Omon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_010101-060012.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Omon/intpp/gn/v20210316/intpp_Omon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_100101-110012.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Omon/intpp/gn/v20191214/intpp_Omon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_060101-100012.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/Omon/intpp/gn/v20210316/intpp_Omon_ACCESS-ESM1-5_piControl_r1i1p1f1_gn_060101-100012.nc']

In [53]:
# Search terms for the historical ACCESS-ESM1-5 intpp query, later unpacked
# into the datastore's search() call as keyword arguments.
search_dict = {
    'experiment_id': 'historical',
    'source_id': 'ACCESS-ESM1-5',
    'variable_id': ['intpp'],
    'realm': ['ocnBgchem'],
    'frequency': 'mon',
    'file_type': 'f',
}

In [54]:
search = cmip6_fs38_datastore.search(**search_dict)
search

Unnamed: 0,unique
path,40
file_type,1
realm,1
frequency,1
table_id,1
project_id,1
institution_id,1
source_id,1
experiment_id,1
member_id,40


In [55]:
search.unique()['path']

['/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r21i1p1f1/Omon/intpp/gn/files/d20200922/intpp_Omon_ACCESS-ESM1-5_historical_r21i1p1f1_gn_185001-201412.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r15i1p1f1/Omon/intpp/gn/files/d20200803/intpp_Omon_ACCESS-ESM1-5_historical_r15i1p1f1_gn_185001-201412.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r4i1p1f1/Omon/intpp/gn/files/d20200529/intpp_Omon_ACCESS-ESM1-5_historical_r4i1p1f1_gn_185001-201412.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r6i1p1f1/Omon/intpp/gn/files/d20200529/intpp_Omon_ACCESS-ESM1-5_historical_r6i1p1f1_gn_185001-201412.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r33i1p1f1/Omon/intpp/gn/files/d20210525/intpp_Omon_ACCESS-ESM1-5_historical_r33i1p1f1_gn_185001-201412.nc',
 '/g/data/fs38/publications/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/historical/r5i1p1f1/Omon/intpp/gn/files/d20200601/intpp

In [None]:
CSEPTA_datatree = CSEPTA_intpp_catalog.to_datatree(index=["experiment_id"],progressbar=False)

In [None]:
# Walk the datatree built with index=["experiment_id"]: each item pairs a key
# (printed below as the experiment name) with a tree node.
# NOTE(review): assumes every item yielded by .items() is a node exposing .ds — confirm.
for experiment_id, node in CSEPTA_datatree.items():
    # Dataset held by this node
    ds = node.ds
    print(f"Working with dataset for experiment: {experiment_id}")

    # Placeholder for real work on the dataset: for now just show its variables
    print(ds.variables)