# A database of databases

Exploring the functionality of an intake catalogue of intake-esm catalogues using COSIMA data as a test case.

Concept:

One intake-esm catalogue per "experiment" and another catalogue for finding relevant experiments.

Useful links:

- https://github.com/pangeo-data/pangeo-datastore

In [1]:
%cd /g/data/tm70/ds0092/projects/dev_data_querying/cosima_intake/intake_dbdb

/g/data/tm70/ds0092/projects/dev_data_querying/cosima_intake/intake_dbdb


In [2]:
import os

import pathlib

import intake

import xarray as xr

# Start simple with an intake catalogue of intake-esm catalogues. This won't really be searchable

As a "simple" case, let's use a few cycles of the access-om2-025 `025deg_jra55_iaf_omip2` runs

In [3]:
index_directories = [
    "/g/data/ik11/outputs/access-om2-025/025deg_jra55_iaf_omip2_cycle1",
    "/g/data/ik11/outputs/access-om2-025/025deg_jra55_iaf_omip2_cycle2",
    "/g/data/ik11/outputs/access-om2-025/025deg_jra55_iaf_omip2_cycle3"
]

## Build intake-esm catalogues

In [4]:
from ecgtools import Builder

# from ecgtools_parsers import cosima_parser

  warn("Couldn't import ipywidgets properly, progress bar will use console behavior")


In [5]:
b = Builder(
    index_directories,
    depth=3,
    exclude_patterns=["*/restart*/*", "*o2i.nc"],
    include_patterns=["*.nc"],
    joblib_parallel_kwargs={"n_jobs": 4},
    )

In [8]:
import pathlib
import traceback
import cftime
from ecgtools.builder import INVALID_ASSET, TRACEBACK

def cosima_parser(file):
    """Quick hacked parser for COSIMA datasets"""
    def _get_timeinfo(ds):
        """
        Stolen and slightly adapted from cosima cookbook 
        https://github.com/COSIMA/cosima-cookbook/blob/master/cosima_cookbook/database.py#L565
        """
        time_dim = "time" # TODO: this probably shouldn't be hardcoded
        if time_dim is None:
            return None

        time_var = ds[time_dim]
        has_bounds = hasattr(time_var, "bounds") and time_var.bounds in ds.variables
        
        def _todate(t):
            return cftime.num2date(t, time_var.units, calendar=time_var.calendar)
    
        if has_bounds:
            bounds_var = ds.variables[time_var.bounds]
            start_time = _todate(bounds_var[0, 0])
            end_time = _todate(bounds_var[-1, 1])
        else:
            start_time = _todate(time_var[0])
            end_time = _todate(time_var[-1])
        
        if len(time_var) > 1 or has_bounds:
            if has_bounds:
                next_time = _todate(bounds_var[0, 1])
            else:
                next_time = _todate(time_var[1])

            dt = next_time - start_time
            if dt.days >= 365:
                years = round(dt.days / 365)
                frequency = f"{years} yearly"
            elif dt.days >= 28:
                months = round(dt.days / 30)
                frequency = f"{months} monthly"
            elif dt.days >= 1:
                frequency = f"{dt.days} daily"
            else:
                frequency = f"{dt.seconds // 3600} hourly"
        else:
            # single time value in this file and no averaging
            frequency = "static"
            
        return start_time.strftime("%Y-%m-%d"), end_time.strftime("%Y-%m-%d"), frequency
        
    path = pathlib.Path(file)
    
    try:
        path_parts = path.parts
        filename = path.stem
        # TODO: this can be done better
        # First 5 parts are /,g,data,ik11,outputs,access-om2
        experiment = path_parts[6]
        output = path_parts[7]
        realm = path_parts[8]

        with xr.open_dataset(file, chunks={}, decode_times=False) as ds:
            variable_list = [var for var in ds if 'long_name' in ds[var].attrs]

        info = {
                "experiment": experiment,
                "output": output,
                "realm": realm,
                "variables": variable_list,
                "filename": filename,
                "path": str(file),
            }
        info["start_time"], info["end_time"], info["frequency"] = _get_timeinfo(ds)

        return info

    except Exception:
        return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}

In [9]:
%%time

b = b.build(parsing_func=cosima_parser)

CPU times: user 6.11 s, sys: 2.31 s, total: 8.42 s
Wall time: 4min 4s


In [36]:
%%time

for index_directory in index_directories:
    b = Builder(
        [test_directory],
        depth=3,
        exclude_patterns=["*/restart*/*", "*o2i.nc"],
        include_patterns=["*.nc"],
        joblib_parallel_kwargs={"n_jobs": 4},
    )

    cc_builder = cc_builder.build(parsing_func=parse_cc)

    cc_builder.save(
        f"{os.path.basename(index_directory)}.csv",
        path_column_name='path',
        variable_column_name='variables',
        data_format="netcdf",
        # Which attributes to groupby when reading in variables using intake-esm
        groupby_attrs=["experiment", "filename"],
        aggregations=[
            {
                "type": "join_existing",
                "attribute_name": "start_time",
                "options": {"dim": "time", "combine": "by_coords"},
            }
        ],
    )

CPU times: user 483 ms, sys: 1.35 s, total: 1.84 s
Wall time: 8.65 s
