In [2]:
from dask.dot import dot_graph
import itertools
import logging
import netCDF4
import numpy
import dask.array as da
from dask import delayed
import time
from dask.distributed import Client
from urllib import request


# Connect to an already-running dask.distributed scheduler at this address.
client = Client('scheduler:8786')
# Alternative: start a local in-process cluster instead of using the remote scheduler.
# client = Client(processes=False)
# This value is discarded because it is not the last expression of the cell;
# the same call is displayed in its own cell further down.
client.ncores()

# Directory that download_file() writes into. It is opened with open(..., 'wb'),
# which does not create directories, so it must already exist and be writable.
# NOTE(review): '/temp' rather than '/tmp' — confirm this is intentional.
download_location = '/temp'
# Base address of the HTTP server the netCDF files are fetched from.
data_url = 'http://172.21.0.1:8080'
# Upper bound on download attempts per file (read by download_file()).
max_download_attempts = 5

# Full list of model names; immediately replaced by the single-model list below.
all_models = ['ACCESS1-0', 'BNU-ESM', 'CCSM4', 'CESM1-BGC', 'CNRM-CM5', 'CSIRO-Mk3-6-0', 'CanESM2', 'GFDL-CM3', 'GFDL-ESM2G', 'GFDL-ESM2M', 'IPSL-CM5A-LR', 'IPSL-CM5A-MR', 'MIROC-ESM-CHEM', 'MIROC-ESM', 'MIROC5', 'MPI-ESM-LR', 'MPI-ESM-MR', 'MRI-CGCM3', 'NorESM1-M', 'bcc-csm1-1', 'inmcm4']
# Override: only this one model is actually used by the rest of the notebook.
all_models = ['ACCESS1-0'] 
# Variables used by get_context() when the caller does not name one.
all_vars = ['tasmax', 'pr']
# Years per scenario; range(1971, 1973) yields 1971 and 1972.
all_years = {
     'historical': list(range(1971, 1973))
}

def get_dataset_url(variable, scenario, model, year, prefix = data_url):
    """Build the download URL of one daily BCSD netCDF file.

    Args:
        variable: Variable name used in the file name (e.g. 'tasmax', 'pr').
        scenario: Scenario name used in the file name (e.g. 'historical').
        model: Model name used in the file name.
        year: Year of the file; converted with str().
        prefix: Base address of the file server. Defaults to ``data_url``.

    Returns:
        '<prefix>/data/<variable>_day_BCSD_<scenario>_r1i1p1_<model>_<year>.nc'
    """
    # Files are served flat under '<prefix>/data'. The earlier code first built
    # a 'NEX-GDDP/BCSD/<scenario>/day/atmos/<variable>/r1i1p1/v1.0' path and then
    # overwrote it using the global data_url, which silently ignored `prefix`.
    # Honour the parameter instead; with the default value the URL is unchanged.
    base_url = prefix + '/data'
    filename = '_'.join([variable, 'day', 'BCSD', scenario, 'r1i1p1', model, str(year) + '.nc'])
    return '/'.join([base_url, filename])

def get_context(year, **kwargs):
    """List every [variable, scenario, model, year] combination for one year.

    Args:
        year: Value appended as the last element of every combination.
        **kwargs: Optional filters.
            variable: restrict to this single variable; when missing or falsy,
                every entry of ``all_vars`` is used.
            model: restrict to this single model; when missing or falsy,
                every entry of ``all_models`` is used.

    Returns:
        A list of 4-element lists ``[variable, scenario, model, year]``, in
        itertools.product order. The scenario is always 'historical'.
    """
    variable = kwargs.get('variable')
    model = kwargs.get('model')
    variables = [variable] if variable else all_vars
    scenarios = ['historical']
    models = [model] if model else all_models
    return [
        [*combination, year]
        for combination in itertools.product(variables, scenarios, models)
    ]

def get_year_ensemble(year, variable = 'tasmax'):
    """Return the dataset URLs of every model for one year and one variable.

    The combinations come from get_context(year, variable=variable); each one
    is turned into a URL string with get_dataset_url().
    """
    urls = []
    for combination in get_context(year, variable = variable):
        urls.append(str(get_dataset_url(*combination)))
    return urls

def to_dataset(filename):
    """Open ``filename`` with netCDF4 and return the Dataset handle.

    The handle is returned open; this function never closes it.
    """
    dataset = netCDF4.Dataset(filename)
    return dataset

def download_file(url):
    """Download ``url`` into ``download_location`` and return the local path.

    The local file name is the last path segment of the URL. Up to
    ``max_download_attempts`` attempts are made, sleeping ``2 ** attempt``
    seconds before each retry (no delay before the first attempt).

    Args:
        url: Address of the file to fetch.

    Returns:
        The path of the downloaded file, or "" when
        ``max_download_attempts`` allows no attempt at all.

    Raises:
        OSError: the error of the last attempt once every attempt has failed
            (urllib's URLError/HTTPError are OSError subclasses).
    """
    import shutil  # local import: only this function needs it

    print("url: " + url)
    filename = '/'.join([download_location, str(url.split('/')[-1])])
    last_error = None
    for attempt in range(max_download_attempts):
        if attempt:
            # Exponential back-off between retries only.
            time.sleep(2 ** attempt)
        print("Downloading file at " + filename)
        try:
            # Both handles are closed even if the transfer fails part-way, and
            # the body is streamed to disk instead of being read into memory.
            with request.urlopen(url) as response, open(filename, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)
            return filename
        except OSError as err:
            last_error = err
    if last_error is not None:
        raise last_error
    return ""

In [3]:
# Display the number of cores reported per connected worker address.
client.ncores()

{'tcp://172.21.0.2:35843': 4}

In [4]:
# One list of URLs per historical year, for each of the two variables.
datasets_tasmax = [get_year_ensemble(year) for year in all_years['historical']]
datasets_pr = [get_year_ensemble(year, variable = 'pr') for year in all_years['historical']]

In [5]:
# Per year: the open netCDF handles (dsets) and dask views of 'tasmax' (da_dsets).
dsets = []
da_dsets = []
for row in datasets_tasmax:
    print("Processing year")
    # Fetch every file of this year, then open each local copy.
    fnames = [download_file(url) for url in row]
    datasets_year = [netCDF4.Dataset(fname) for fname in fnames]
    dsets.append(datasets_year)

    # Wrap the 'tasmax' variable of each open file as a chunked dask array.
    dask_dsets_year = [
        da.from_array(dataset['tasmax'], chunks=(366, 120, 120))
        for dataset in datasets_year
    ]
    da_dsets.append(dask_dsets_year)


Processing year
url: http://172.21.0.1:8080/data/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1971.nc
Downloading file at /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1971.nc
Processing year
url: http://172.21.0.1:8080/data/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1972.nc
Downloading file at /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1972.nc


In [6]:
# Keep only the first array of each per-year list.
# The isinstance guard makes the cell safe to re-run: once da_dsets already
# holds arrays, row[0] would index into the array itself and silently drop an
# axis, so entries that are no longer lists are passed through unchanged.
da_dsets = [row[0] if isinstance(row, list) else row for row in da_dsets]
da_dsets

[dask.array<array, shape=(365, 720, 1440), dtype=float32, chunksize=(365, 120, 120)>,
 dask.array<array, shape=(366, 720, 1440), dtype=float32, chunksize=(366, 120, 120)>]

In [27]:
# Join the per-year dask arrays end to end along axis 0.
stack = da.concatenate(da_dsets, axis = 0)
# Rechunk so that every chunk spans the whole of axis 0 (120 x 120 tiles on the
# other two axes). The length is taken from the stacked array rather than a
# literal 731, so the cell keeps working when the year range changes.
stack_rechunked = stack.rechunk((stack.shape[0], 120, 120))

stack_rechunked

dask.array<rechunk-merge, shape=(731, 720, 1440), dtype=float32, chunksize=(731, 120, 120)>

In [28]:
@delayed
def custom_percentile(a, q = 99, **kwargs):
    """Lazily compute the q-th percentile of ``a`` with numpy.percentile.

    Extra keyword arguments (for example ``axis``) are forwarded unchanged.
    Because of ``@delayed``, calling this returns a Delayed object; the
    percentile is only evaluated on ``.compute()``.

    NOTE(review): a dask array passed as ``a`` is presumably materialised in
    full before numpy.percentile runs — confirm memory use on large stacks.
    """
    percentile_value = numpy.percentile(a, q, **kwargs)
    return percentile_value

@delayed
def custom_average(a, **kwargs):
    """Lazily compute the mean of ``a`` with numpy.mean.

    Extra keyword arguments (for example ``axis``) are forwarded unchanged.
    Because of ``@delayed``, calling this returns a Delayed object; the mean
    is only evaluated on ``.compute()``.
    """
    mean_value = numpy.mean(a, **kwargs)
    return mean_value

# Smoke test on a synthetic 100 x 100 x 100 cube holding 0..999999:
# 99th percentile along the last axis. Drop axis=2 to reduce over the whole cube.
custom_percentile(
    da.from_array(numpy.arange(1000000).reshape(100, 100, 100), chunks=10),
    axis=2
).compute()

array([[  9.80100000e+01,   1.98010000e+02,   2.98010000e+02, ...,
          9.79801000e+03,   9.89801000e+03,   9.99801000e+03],
       [  1.00980100e+04,   1.01980100e+04,   1.02980100e+04, ...,
          1.97980100e+04,   1.98980100e+04,   1.99980100e+04],
       [  2.00980100e+04,   2.01980100e+04,   2.02980100e+04, ...,
          2.97980100e+04,   2.98980100e+04,   2.99980100e+04],
       ..., 
       [  9.70098010e+05,   9.70198010e+05,   9.70298010e+05, ...,
          9.79798010e+05,   9.79898010e+05,   9.79998010e+05],
       [  9.80098010e+05,   9.80198010e+05,   9.80298010e+05, ...,
          9.89798010e+05,   9.89898010e+05,   9.89998010e+05],
       [  9.90098010e+05,   9.90198010e+05,   9.90298010e+05, ...,
          9.99798010e+05,   9.99898010e+05,   9.99998010e+05]])

In [29]:
# Build (but do not run) the percentile along axis 0 of the stacked array,
# using custom_percentile's default q; the result is a Delayed object.
op = custom_percentile(stack_rechunked, axis = 0)
op

Delayed('custom_percentile-3ce2568c-9122-487f-a08b-22612f4474a4')

In [None]:
# Evaluate the delayed percentile and display the result.
# NOTE(review): this cell has no execution count or recorded output, so it had
# not completed when the notebook was saved.
res = op.compute()
res

In [2]:
# Report the host and package versions seen by the client and the scheduler.
# NOTE(review): the recorded output shows the client on Python 3.5.4 and the
# scheduler on Python 3.6.3 — worth aligning the environments.
# NOTE(review): execution count [2] means this cell was run out of order.
client.get_versions(check=True) 

{'client': {'host': [('python', '3.5.4.final.0'),
   ('python-bits', 64),
   ('OS', 'Linux'),
   ('OS-release', '4.13.11-1-ARCH'),
   ('machine', 'x86_64'),
   ('processor', 'x86_64'),
   ('byteorder', 'little'),
   ('LC_ALL', 'en_US.UTF-8'),
   ('LANG', 'en_US.UTF-8'),
   ('LOCALE', 'en_US.UTF-8')],
  'packages': {'optional': [('numpy', '1.13.3'),
    ('pandas', '0.21.0'),
    ('bokeh', '0.12.10'),
    ('lz4', None),
    ('blosc', None)],
   'required': [('dask', '0.15.4'),
    ('distributed', '1.19.3'),
    ('msgpack', '0.4.8'),
    ('cloudpickle', '0.4.0'),
    ('tornado', '4.5.2'),
    ('toolz', '0.8.2')]}},
 'scheduler': {'host': [['python', '3.6.3.final.0'],
   ['python-bits', 64],
   ['OS', 'Linux'],
   ['OS-release', '4.13.11-1-ARCH'],
   ['machine', 'x86_64'],
   ['processor', ''],
   ['byteorder', 'little'],
   ['LC_ALL', 'C.UTF-8'],
   ['LANG', 'C.UTF-8'],
   ['LOCALE', 'en_US.UTF-8']],
  'packages': {'optional': [['numpy', '1.13.3'],
    ['pandas', '0.21.0'],
    ['bokeh', 