# CCAM intake sandbox

In [1]:
import glob
import pathlib
import traceback
from datetime import datetime

import xarray as xr

from ecgtools import Builder
from ecgtools.builder import INVALID_ASSET, TRACEBACK

  warn("Couldn't import ipywidgets properly, progress bar will use console behavior")


In [39]:
import os
import dask.config
from dask.distributed import Client,LocalCluster
from dask_jobqueue import PBSCluster
walltime = '01:00:00'
cores = 48
memory = '192GB'
  
cluster = PBSCluster(walltime=str(walltime), cores=cores, memory=str(memory),processes=cores,
                     job_extra=['-q normal','-P xv83','-l ncpus='+str(cores),'-l mem='+str(memory),
                                '-l storage=gdata/xv83+gdata/v14+gdata/ux62+scratch/xv83+gdata/rt52+gdata/ik11+gdata/cj50+gdata/jk72+gdata/hh5'],
                     local_directory='$TMPDIR',
                     header_skip=["select"])
cluster.scale(jobs=1)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43963 instead


In [49]:
cluster

0,1
Dashboard: /proxy/43963/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.6.121.25:35937,Workers: 0
Dashboard: /proxy/43963/status,Total threads: 0
Started: 5 minutes ago,Total memory: 0 B


In [2]:
root = '/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1/'

In [3]:
files = sorted(glob.glob(root+'*/*/*'))
len(files)

32964

In [4]:
files[0]

'/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1/1hr/CAPE/CAPE_AUS-10i_NCC-NorESM2-MM_historical_r1i1p1f1_CSIRO-CCAM-2203_v1_1hr_19510101-19511231.nc'

In [None]:
ds = xr.open_mfdataset(files[0:11],parallel=True)
ds

In [None]:
path = pathlib.Path(files[0])
path.stem.split('_')

# NOTES from Marcus chat

## use this as an error check?

In [None]:
variable_list = [var for var in ds if 'long_name' in ds[var].attrs]
variable_list

In [None]:
split = path.stem.split('_')

In [None]:
split[0]

# do we really need to open all the files?  Given variable name is in the filename?

In [7]:
def parse_CCAM(file):
    """CCAM data stored in"""
    file = pathlib.Path(file)
    info = {}

    try:
        stem = file.stem
        split = stem.split('_')
        variable = split[0]
        domain = split[1]
        host_GCM = split[2]
        run_type = split[3]
        host_ensemble = split[4]
        downscale_model = split[5]
        downscale_version = split[6]
        period = split[7]
        if period == '1hr':
            time_period = 'hourly'
        elif period == '6hr':
            time_period = 'six_hourly'
        elif period == 'day':
            time_period = 'daily'
        elif period == 'mon':
            time_period = 'monthly'
        else:
            time_period = 'fixed'    

        
        info = {
            'variable': variable,
            'domain': domain,
            'host_GCM':host_GCM,
            'run_type':run_type,
            'host_ensemble':host_ensemble,
            'downscale_model':downscale_model,
            'downscale_version':downscale_version,
            'period': period,
            'time_period': time_period,
            'path': str(file),
        }
        return info
    
    except Exception:
        return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}

In [8]:
parse_CCAM(files[0])

{'variable': 'CAPE',
 'domain': 'AUS-10i',
 'host_GCM': 'NCC-NorESM2-MM',
 'run_type': 'historical',
 'host_ensemble': 'r1i1p1f1',
 'downscale_model': 'CSIRO-CCAM-2203',
 'downscale_version': 'v1',
 'period': '1hr',
 'time_period': 'hourly',
 'path': '/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1/1hr/CAPE/CAPE_AUS-10i_NCC-NorESM2-MM_historical_r1i1p1f1_CSIRO-CCAM-2203_v1_1hr_19510101-19511231.nc'}

# setup builder object

In [21]:
root2 = '/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1'

In [22]:
%%time
b = Builder([root2],depth=2)

CPU times: user 683 µs, sys: 571 µs, total: 1.25 ms
Wall time: 1.26 ms


In [23]:
b

Builder(paths=['/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1'], storage_options={}, depth=2, exclude_patterns=[], include_patterns=[], joblib_parallel_kwargs={})

In [24]:
%%time
b.build(parsing_func = parse_CCAM)

CPU times: user 3.01 s, sys: 1.1 s, total: 4.11 s
Wall time: 5.73 s


  ).clean_dataframe()


Builder(paths=['/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1'], storage_options={}, depth=2, exclude_patterns=[], include_patterns=[], joblib_parallel_kwargs={})

In [25]:
b.df

Unnamed: 0,variable,domain,host_GCM,run_type,host_ensemble,downscale_model,downscale_version,period,time_period,path
0,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
1,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
2,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
3,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
4,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
...,...,...,...,...,...,...,...,...,...,...
32959,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
32960,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
32961,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
32962,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...


In [14]:
dir(b)

['__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get_validators__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__post_init_post_parse__',
 '__pydantic_initialised__',
 '__pydantic_model__',
 '__pydantic_run_validation__',
 '__pydantic_validate_values__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__validate__',
 '__weakref__',
 '_root_dirs',
 'assets',
 'build',
 'clean_dataframe',
 'depth',
 'df',
 'entries',
 'exclude_patterns',
 'exclude_regex',
 'get_assets',
 'include_patterns',
 'include_regex',
 'invalid_assets',
 'joblib_parallel_kwargs',
 'parse',
 'paths',
 'save',
 'storage_options']

In [34]:
b.invalid_assets['INVALID_ASSET'].values

array([PosixPath('/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1/fx/orog/orog_AUS-10i_NCC-NorESM2-MM_historical_r1i1p1f1_CSIRO-CCAM-2203_v1.nc'),
       PosixPath('/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1/fx/sftlaf/sftlaf_AUS-10i_NCC-NorESM2-MM_historical_r1i1p1f1_CSIRO-CCAM-2203_v1.nc'),
       PosixPath('/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1/fx/sftlf/sftlf_AUS-10i_NCC-NorESM2-MM_historical_r1i1p1f1_CSIRO-CCAM-2203_v1.nc'),
       PosixPath('/g/data/xv83/mxt599/ccam_noresm2-mm_historical_aus-10i_12km/drs_cordex/CORDEX/output/AUS-10i/CSIRO/NCC-NorESM2-MM/historical/r1i1p1f1/CSIRO-CCAM-2203/v1/fx/sfturf/sfturf_AUS-10i_NCC-NorESM2-MM_historical_r1i1p1f1_CSIRO-C

## example from docs

In [38]:
b.save(
    # File path - could save as .csv (uncompressed csv) or .csv.gz (compressed csv)
    name = "/intake-catalogues/test-CCAM-catalogue",
    # Column name including filepath
    path_column_name='path',
    # Column name including variables
    variable_column_name='variable',
    # Data file format - could be netcdf or zarr (in this case, netcdf)
    data_format="netcdf",
    # Which attributes to groupby when reading in variables using intake-esm
    groupby_attrs=["domain", "host_GCM", "run_type","host_ensemble","downscale_model","downscale_version","period"],
    # Aggregations which are fed into xarray when reading in data using intake
    aggregations=[
        {
            "type": "join_existing",
            "attribute_name": "date",
            "options": {"dim": "time", "coords": "minimal", "compat": "override"},
        }
    ],
)

Successfully wrote ESM catalog json file to: file:///home/599/tm4888//intake-catalogues/test-CCAM-catalogue.json


  b.save(


OSError: Cannot save file into a non-existent directory: '/jobfs/72856130.gadi-pbs/intake-catalogues'

# Can I querry the catalogue? YES!

In [44]:
import intake

In [45]:
data_catalog = intake.open_esm_datastore('/g/data/v14/tm4888/code/ACS/data-catalogue/catalogues/test-CCAM-catalogue.json')
data_catalog

Unnamed: 0,unique
variable,176
domain,1
host_GCM,1
run_type,1
host_ensemble,1
downscale_model,1
downscale_version,1
period,4
time_period,4
path,32960


In [46]:
data_catalog.df

Unnamed: 0,variable,domain,host_GCM,run_type,host_ensemble,downscale_model,downscale_version,period,time_period,path
0,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
1,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
2,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
3,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
4,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,1hr,hourly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
...,...,...,...,...,...,...,...,...,...,...
32955,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
32956,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
32957,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
32958,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...


In [51]:
search1 = data_catalog.search(variable=['CAPE','zmla'],period='mon')
search1

Unnamed: 0,unique
variable,2
domain,1
host_GCM,1
run_type,1
host_ensemble,1
downscale_model,1
downscale_version,1
period,1
time_period,1
path,128


In [52]:
search1.df

Unnamed: 0,variable,domain,host_GCM,run_type,host_ensemble,downscale_model,downscale_version,period,time_period,path
0,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
1,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
2,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
3,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
4,CAPE,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
...,...,...,...,...,...,...,...,...,...,...
123,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
124,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
125,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...
126,zmla,AUS-10i,NCC-NorESM2-MM,historical,r1i1p1f1,CSIRO-CCAM-2203,v1,mon,monthly,/g/data/xv83/mxt599/ccam_noresm2-mm_historical...


In [53]:
dsets = search1.to_dataset_dict()
dsets.keys()


--> The keys in the returned dictionary of datasets are constructed as follows:
	'domain.host_GCM.run_type.host_ensemble.downscale_model.downscale_version.period'
 |████████████████████████████████████████| 100.00% [1/1 00:25<00:00]

dict_keys(['AUS-10i.NCC-NorESM2-MM.historical.r1i1p1f1.CSIRO-CCAM-2203.v1.mon'])

In [54]:
search_DS = dsets['AUS-10i.NCC-NorESM2-MM.historical.r1i1p1f1.CSIRO-CCAM-2203.v1.mon']

In [55]:
search_DS

Unnamed: 0,Array,Chunk
Bytes,4.77 kiB,4.77 kiB
Shape,"(611, 2)","(611, 2)"
Dask graph,1 chunks in 316 graph layers,1 chunks in 316 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 4.77 kiB 4.77 kiB Shape (611, 2) (611, 2) Dask graph 1 chunks in 316 graph layers Data type float32 numpy.ndarray",2  611,

Unnamed: 0,Array,Chunk
Bytes,4.77 kiB,4.77 kiB
Shape,"(611, 2)","(611, 2)"
Dask graph,1 chunks in 316 graph layers,1 chunks in 316 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.25 kiB,7.25 kiB
Shape,"(928, 2)","(928, 2)"
Dask graph,1 chunks in 316 graph layers,1 chunks in 316 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 7.25 kiB 7.25 kiB Shape (928, 2) (928, 2) Dask graph 1 chunks in 316 graph layers Data type float32 numpy.ndarray",2  928,

Unnamed: 0,Array,Chunk
Bytes,7.25 kiB,7.25 kiB
Shape,"(928, 2)","(928, 2)"
Dask graph,1 chunks in 316 graph layers,1 chunks in 316 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,25.96 MiB
Shape,"(768, 611, 928)","(12, 611, 928)"
Dask graph,64 chunks in 129 graph layers,64 chunks in 129 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.62 GiB 25.96 MiB Shape (768, 611, 928) (12, 611, 928) Dask graph 64 chunks in 129 graph layers Data type float32 numpy.ndarray",928  611  768,

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,25.96 MiB
Shape,"(768, 611, 928)","(12, 611, 928)"
Dask graph,64 chunks in 129 graph layers,64 chunks in 129 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,25.96 MiB
Shape,"(768, 611, 928)","(12, 611, 928)"
Dask graph,64 chunks in 129 graph layers,64 chunks in 129 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.62 GiB 25.96 MiB Shape (768, 611, 928) (12, 611, 928) Dask graph 64 chunks in 129 graph layers Data type float32 numpy.ndarray",928  611  768,

Unnamed: 0,Array,Chunk
Bytes,1.62 GiB,25.96 MiB
Shape,"(768, 611, 928)","(12, 611, 928)"
Dask graph,64 chunks in 129 graph layers,64 chunks in 129 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [56]:
mean_CAPE = search_DS.CAPE.mean(dim='time')
mean_CAPE

Unnamed: 0,Array,Chunk
Bytes,2.16 MiB,2.16 MiB
Shape,"(611, 928)","(611, 928)"
Dask graph,1 chunks in 133 graph layers,1 chunks in 133 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 2.16 MiB 2.16 MiB Shape (611, 928) (611, 928) Dask graph 1 chunks in 133 graph layers Data type float32 numpy.ndarray",928  611,

Unnamed: 0,Array,Chunk
Bytes,2.16 MiB,2.16 MiB
Shape,"(611, 928)","(611, 928)"
Dask graph,1 chunks in 133 graph layers,1 chunks in 133 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [57]:
%%time
mean_CAPE = mean_CAPE.compute()

CPU times: user 16.1 s, sys: 1.75 s, total: 17.8 s
Wall time: 13 s


In [61]:
mean_CAPE

In [63]:
cluster.close()

