# Notebook to create aggregated reference files (json) into monthly or yearly zarrs

In [1]:
%load_ext autoreload
%autoreload 2

## Step 0: Import the library of code

In [2]:
import sys
import os
sys.path.append('/home/jovyan/intake-aodn/')
import intake_aodn
import intake

from intake_aodn.utils import get_local_cluster, get_distributed_cluster
from intake_aodn.indexing import process_aggregate

In [3]:
# client = get_local_cluster()
client = get_distributed_cluster(worker_cores=1, worker_memory=2.0, min_workers=32, max_workers=32)

Creating new cluster. Please wait for this to finish.


VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [4]:
# requires a working installation of kerchunk in the notebook environment and on the workers if using distributed
from dask.distributed import PipInstall
plugin = PipInstall(packages=["kerchunk"], pip_options=["--upgrade"])
client.register_worker_plugin(plugin)

# If using a distributed cluster on EASI build eggs using "python setup.py bdist_egg" and upload to workers
# otherwise dask workers wont have code for imports
client.upload_file('/home/jovyan/intake-aodn/dist/intake_aodn-0+untagged.57.g0b7665b.dirty-py3.8.egg')

{'tls://10.0.33.65:38249': {'status': 'OK'},
 'tls://10.0.34.43:43119': {'status': 'OK'},
 'tls://10.0.36.67:41019': {'status': 'OK'},
 'tls://10.0.38.246:42045': {'status': 'OK'},
 'tls://10.0.40.91:39417': {'status': 'OK'},
 'tls://10.0.42.16:37669': {'status': 'OK'},
 'tls://10.0.44.133:46813': {'status': 'OK'},
 'tls://10.0.44.138:33229': {'status': 'OK'},
 'tls://10.0.46.67:38493': {'status': 'OK'},
 'tls://10.0.47.90:35651': {'status': 'OK'},
 'tls://10.0.48.102:33171': {'status': 'OK'},
 'tls://10.0.48.2:43067': {'status': 'OK'},
 'tls://10.0.48.83:35551': {'status': 'OK'},
 'tls://10.0.50.72:36101': {'status': 'OK'},
 'tls://10.0.51.130:37091': {'status': 'OK'},
 'tls://10.0.51.98:38459': {'status': 'OK'},
 'tls://10.0.54.12:41387': {'status': 'OK'},
 'tls://10.0.54.162:46145': {'status': 'OK'},
 'tls://10.0.54.17:37401': {'status': 'OK'},
 'tls://10.0.54.202:40315': {'status': 'OK'},
 'tls://10.0.55.183:40729': {'status': 'OK'},
 'tls://10.0.55.97:42449': {'status': 'OK'},
 't

# Unzip existing references

In [None]:
!cd ../../intake_aodn/catalogs/ && unzip -q aodn_refs.zip

# SST Data

In [6]:
def sst_preprocess(ds):
    variables = ['dt_analysis',
                 'l2p_flags',
                 'quality_level',
                 'satellite_zenith_angle',
                 'sea_surface_temperature',
                 'sses_bias',
                 'sses_count',
                 'sses_standard_deviation',
                 'sst_dtime']
    ds = ds[variables]
    return ds

In [7]:
%%time
# s3://imos-data-pixeldrill/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/2016/20161001152000-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc
kwargs = dict(root='imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/',
               year='2021',
               month='07',
               mask='{year}/{year}{month}',
               suffix='-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night',
               extension='nc',
               check_chunking='sea_surface_temperature',
               preprocess=sst_preprocess,
               storage_options=dict(anon=True),
               dest='../../intake_aodn/catalogs/',
               dask=True)
process_aggregate(**kwargs)

Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/2021/202107*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 23 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/202107-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night_a.json
CPU times: user 1.47 s, sys: 118 ms, total: 1.58 s
Wall time: 7.49 s


{'2021/202107': ['imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/202107-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night_a.json']}

In [8]:
import pandas as pd
dt = pd.date_range('2021-07-01',pd.Timestamp.now() + pd.DateOffset(months=1),freq='M')
# dt = pd.date_range('2022-02-01',pd.Timestamp.now() + pd.DateOffset(months=1),freq='M')
print(dt)

DatetimeIndex(['2021-07-31', '2021-08-31', '2021-09-30', '2021-10-31',
               '2021-11-30', '2021-12-31', '2022-01-31', '2022-02-28'],
              dtype='datetime64[ns]', freq='M')


In [9]:
results = []
for d in dt:
    kws = kwargs.copy()
    kws['year'] = d.strftime('%Y')
    kws['month'] = d.strftime('%m')
    results.append(process_aggregate(**kws))

Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/2021/202107*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 23 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/202107-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night_a.json
Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/2021/202108*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 30 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/202108-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night_a.json
Aggregating s3://imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/2021/202109*-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night.nc - 30 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/SST/ghrsst/L3S-1d/ngt/202109-ABOM-L3S_GHRSST-SSTskin-AVHRR_D-1d_night_a.json
A

# MODIS Ocean Colour

In [10]:
kwargs = dict(root='imos-data/IMOS/SRS/OC/gridded/aqua/P1D/',
              mask='{year}/{month}/A.P1D.{year}{month}',
              dest='../../intake_aodn/catalogs/',
              dask=True
             ) 

results = []

for d in dt:
    for var in ['K_490','chl_oc3','chl_oc3','chl_gsm']:
        kws = kwargs.copy()
        kws['year'] = d.strftime('%Y')
        kws['month'] = d.strftime('%m')
        kws['suffix'] = f'.aust.{var}'
        kws['check_chunking'] = var
        results.append(process_aggregate(**kws))

Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/2021/07/A.P1D.202107*.aust.K_490.nc - 31 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/OC/gridded/aqua/P1D/202107.aust.K_490_a.json
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/2021/07/A.P1D.202107*.aust.chl_oc3.nc - 31 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/OC/gridded/aqua/P1D/202107.aust.chl_oc3_a.json
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/2021/07/A.P1D.202107*.aust.chl_oc3.nc - 31 found.
Loading references...
... using dask ...
Checking chunk layout...
Aggregating into ../../intake_aodn/catalogs/imos-data/IMOS/SRS/OC/gridded/aqua/P1D/202107.aust.chl_oc3_a.json
Aggregating s3://imos-data/IMOS/SRS/OC/gridded/aqua/P1D/2021/07/A.P1D.202107*.aust.chl_gsm.nc - 31 found.
Loading references...
... using dask ...
Checking ch

## Zip references

In [11]:
!cd ../../intake_aodn/catalogs/ && rm aodn_refs.zip  && zip -r -q aodn_refs.zip imos-data && rm -rf ../../intake_aodn/catalogs/imos-data/