In [1]:
import argparse
import dask
import json
import netCDF4 as nc4
import numpy as np
import pandas as pd
from pathlib import Path
from pprint import pprint
import time
import warnings
import xarray as xr

warnings.filterwarnings('ignore')

In [2]:
import warnings
warnings.filterwarnings('ignore')
import dask
dask.config.set({'temporary_directory': '/mnt/intraid/ian1/ifenty/dask_tmp'})

# Works on Ian's Machine, but not necessary
from dask.distributed import Client, progress
#client = Client(processes=False, threads_per_worker=48)
client = Client(processes=False, n_workers=1, threads_per_worker=8,memory_limit='128GB')
client

0,1
Client  Scheduler: inproc://137.78.251.47/151581/1  Dashboard: http://137.78.251.47:8787/status,Cluster  Workers: 1  Cores: 8  Memory: 128.00 GB


In [3]:
def load_ecco_fields(data_dir, glob_name):
    time_start=time.time()

    ecco_fields = []
    # opening 312 monthly mean files takes about 40s using parallel & dask
    
    ecco_files = list(data_dir.glob(glob_name))
    print(ecco_files[0:5])
    ecco_fields = xr.open_mfdataset(ecco_files, parallel=True, data_vars='minimal',\
                                  coords='minimal',compat='override')
    
    tt = time.time() - time_start    
    print(tt / len(ecco_fields))
    print(time.time() - time_start)
    return ecco_fields

In [4]:
def get_groupings(base_dir, grid_type, time_type):
    groupings = dict()
    tmp = Path(f'{base_dir}/{grid_type}/{time_type}')
    print(tmp)
    if tmp.exists():
        for pi, p in enumerate(tmp.iterdir()):
            grouping = str(p).split('/')[-1]
            groupings[pi] = dict()
            groupings[pi]['name'] = grouping
            groupings[pi]['grid'] = grid_type
            groupings[pi]['time_type'] = time_type
            groupings[pi]['directory'] = p
            
    return groupings


In [5]:
def construct_DA(ecco_fields):
    t0 = time.time()
    results_da = dict()
    for dv in ecco_fields.data_vars:
        print(dv)
        results_da[dv] = dict()
        results_da[dv]['valid_minmax'] = dask.delayed([ecco_fields[dv].min(), ecco_fields[dv].max()])
        #results_da[dv]['valid_min'] = dask.delayed()

    return results_da   
   

In [6]:
def construct_DS(results, grouping_info):
    DAs = []
    for dv in results.keys():
        print(dv)
        valid_min, valid_max = results[dv]['valid_minmax']
        valid_min = valid_min.values
        valid_max = valid_max.values
        #valid_min = results[dv]['valid_min'].values
        print(valid_max, valid_min)
        tmp = xr.DataArray([valid_min, valid_max], dims=['valid_min_max'])
        tmp.name = dv
        DAs.append(tmp)

    DS = xr.merge(DAs)
    DS.attrs['title']     = ecco_fields.attrs['title']
    DS.attrs['name']      = grouping_info['name']
    DS.attrs['grid']      = grouping_info['grid']
    DS.attrs['time_type'] = grouping_info['time_type']
    DS.attrs['id']        = ecco_fields.attrs['id']
    DS.attrs['shortname'] = ecco_fields.attrs['id'].split('/')[1]
    DS.attrs['directory'] = str(grouping_info['directory'])
    
  
    return DS

## Inputs

In [7]:
dataset_base_dir = Path('/home/ifenty/ian1/ifenty/ECCOv4/Version4/Release4/podaac/')

In [8]:
grids = ['native','latlon']
time_avgs = ['day_inst', 'day_mean','mon_mean']

## Calc

In [9]:
gi = 3
time_type = time_avgs[2]
grid_type = grids[0]

groupings = get_groupings(dataset_base_dir, grid_type, time_type)
groupings[gi]

/home/ifenty/ian1/ifenty/ECCOv4/Version4/Release4/podaac/native/mon_mean


{'name': 'OCEAN_3D_TEMPERATURE_FLUX',
 'grid': 'native',
 'time_type': 'mon_mean',
 'directory': PosixPath('/home/ifenty/ian1/ifenty/ECCOv4/Version4/Release4/podaac/native/mon_mean/OCEAN_3D_TEMPERATURE_FLUX')}

In [10]:
ecco_fields = load_ecco_fields(groupings[gi]['directory'], '*ECCO*nc')

[PosixPath('/home/ifenty/ian1/ifenty/ECCOv4/Version4/Release4/podaac/native/mon_mean/OCEAN_3D_TEMPERATURE_FLUX/OCEAN_3D_TEMPERATURE_FLUX_mon_mean_2002-10_ECCO_V4r4_native_llc0090.nc'), PosixPath('/home/ifenty/ian1/ifenty/ECCOv4/Version4/Release4/podaac/native/mon_mean/OCEAN_3D_TEMPERATURE_FLUX/OCEAN_3D_TEMPERATURE_FLUX_mon_mean_2007-03_ECCO_V4r4_native_llc0090.nc'), PosixPath('/home/ifenty/ian1/ifenty/ECCOv4/Version4/Release4/podaac/native/mon_mean/OCEAN_3D_TEMPERATURE_FLUX/OCEAN_3D_TEMPERATURE_FLUX_mon_mean_1995-03_ECCO_V4r4_native_llc0090.nc'), PosixPath('/home/ifenty/ian1/ifenty/ECCOv4/Version4/Release4/podaac/native/mon_mean/OCEAN_3D_TEMPERATURE_FLUX/OCEAN_3D_TEMPERATURE_FLUX_mon_mean_1992-11_ECCO_V4r4_native_llc0090.nc'), PosixPath('/home/ifenty/ian1/ifenty/ECCOv4/Version4/Release4/podaac/native/mon_mean/OCEAN_3D_TEMPERATURE_FLUX/OCEAN_3D_TEMPERATURE_FLUX_mon_mean_2007-06_ECCO_V4r4_native_llc0090.nc')]
5.040842635290963
35.28609752655029


In [11]:
DA = construct_DA(ecco_fields)

ADVx_TH
DFxE_TH
ADVy_TH
DFyE_TH
ADVr_TH
DFrE_TH
DFrI_TH


In [12]:
start_time = time.time()
results_da_compute = dask.compute(DA)[0]
delta_time = time.time()-start_time
print(delta_time)

261.4717848300934


In [15]:
DS = construct_DS(results_da_compute, groupings[gi])
DS.attrs['calc_time_seconds'] = delta_time

ADVx_TH
36523468.0 -28231902.0
DFxE_TH
574455.7 -348717.34
ADVy_TH
43466144.0 -31236064.0
DFyE_TH
921333.6 -414038.94
ADVr_TH
60402470.0 -106447560.0
DFrE_TH
2275023.8 -2370699.5
DFrI_TH
3000093.5 -2385608.8


In [17]:
filename = f"valid_minmax_{DS.attrs['name']}_{DS.attrs['grid']}_{DS.attrs['time_type']}_{DS.attrs['shortname']}.nc"
filename

'valid_minmax_OCEAN_3D_TEMPERATURE_FLUX_native_mon_mean_ECL5M-3TF44.nc'

In [18]:
output_dir = Path('/home/ifenty/ian1/ifenty/ECCOv4/Version4/Release4/podaac/valid_minmax_a')
if not output_dir.exists():
    output_dir.mkdir()

In [19]:
DS.to_netcdf(output_dir / filename)