In [12]:
client.restart()

<Client: scheduler='tcp://172.22.0.2:8786' processes=0 cores=0>

In [2]:
%load_ext memory_profiler

In [13]:
%matplotlib inline

import matplotlib.pyplot as plt
from dask.dot import dot_graph
import itertools
import logging
import netCDF4
import numpy as np
import dask.array as da
from dask import delayed
import time
from dask.distributed import Client
from urllib import request
from multiprocessing import Pool


# Connect to the Dask scheduler addressed as 'scheduler:8786'.
client = Client('scheduler:8786')
# Alternative client configuration, kept for reference.
#client = Client(processes=False)

# Directory that download_file writes fetched files into.
download_location = '/temp'
# Base URL used as the default prefix by get_dataset_url.
data_url = 'http://172.22.0.1:8080'
# Maximum number of download attempts read by download_file.
max_download_attempts = 5

# Full list of model names. NOTE: this value is replaced by the two-model
# list on the very next line, so only that shorter list is in effect.
all_models = ['ACCESS1-0',  'BNU-ESM', 'CCSM4', 'CESM1-BGC', 'CNRM-CM5', 'CSIRO-Mk3-6-0', 'CanESM2', 'GFDL-CM3', 'GFDL-ESM2G', 'GFDL-ESM2M', 'IPSL-CM5A-LR', 'IPSL-CM5A-MR', 'MIROC-ESM-CHEM', 'MIROC-ESM', 'MIROC5', 'MPI-ESM-LR', 'MPI-ESM-MR', 'MRI-CGCM3', 'NorESM1-M', 'bcc-csm1-1', 'inmcm4']
all_models = ['ACCESS1-0', 'BNU-ESM'] 
# Variable names get_context falls back to when no 'variable' keyword is given.
all_vars = ['tasmax', 'pr']
# Years per scenario; range(1971, 2001) yields 1971 through 2000 inclusive.
all_years = {
     # 'historical': list(range(1971, 1976))
    'historical': list(range(1971, 2001))
}

def get_dataset_url(variable, scenario, model, year, prefix = data_url):
    """Build the URL of one dataset file.

    The URL is ``prefix`` followed by the fixed path components
    ``NEX-GDDP/BCSD/<scenario>/day/atmos/<variable>/r1i1p1/v1.0`` and a file
    name of the form ``<variable>_day_BCSD_<scenario>_r1i1p1_<model>_<year>.nc``.

    Parameters:
        variable: variable name placed in both the directory and the file name.
        scenario: scenario name placed in both the directory and the file name.
        model: model name placed in the file name.
        year: year placed in the file name (converted with ``str``).
        prefix: leading part of the URL; defaults to the module-level ``data_url``.

    Returns:
        The assembled URL as a string.
    """
    directory = '/'.join(
        (prefix, 'NEX-GDDP', 'BCSD', scenario, 'day', 'atmos', variable, 'r1i1p1', 'v1.0')
    )
    basename = '_'.join(
        (variable, 'day', 'BCSD', scenario, 'r1i1p1', model, str(year) + '.nc')
    )
    return '/'.join((directory, basename))

def get_context(year, **kwargs):
    """List every [variable, scenario, model, year] combination for one year.

    Parameters:
        year: value appended as the last element of every combination.
        **kwargs: optional ``variable`` and/or ``model``. When one is given
            (and truthy) only that value is used; otherwise the module-level
            ``all_vars`` / ``all_models`` list is used for that position.

    Returns:
        A list of 4-element lists ``[variable, scenario, model, year]`` in the
        order produced by ``itertools.product(variables, scenarios, models)``.
        The scenario is always ``'historical'``.
    """
    variable = kwargs.get('variable')
    model = kwargs.get('model')

    # A falsy keyword value (missing, None, '') means "use the full list".
    variables = [variable] if variable else all_vars
    scenarios = ['historical']
    models = [model] if model else all_models

    return [
        [*combination, year]
        for combination in itertools.product(variables, scenarios, models)
    ]

def get_year_ensemble(year, variable = 'tasmax'):
    """Return the dataset URLs for one year and one variable.

    Each combination returned by ``get_context(year, variable=variable)`` is
    unpacked into ``get_dataset_url`` and the result is converted to ``str``.

    Parameters:
        year: year forwarded to ``get_context``.
        variable: variable name forwarded to ``get_context``.

    Returns:
        A list of URL strings, one per combination.
    """
    return [
        str(get_dataset_url(*combination))
        for combination in get_context(year, variable = variable)
    ]

def download_file(url):
    """Download `url` into `download_location`, retrying on failure.

    The target file name is the last '/'-separated component of `url`, placed
    under the module-level ``download_location``. Up to
    ``max_download_attempts`` attempts are made; before each retry the
    function sleeps ``2 ** attempt`` seconds (no sleep before the first try).

    Parameters:
        url: address of the file to fetch.

    Returns:
        The path of the downloaded file, or an empty string when
        ``max_download_attempts`` is not positive (no attempt is made).

    Raises:
        OSError: the error from the last attempt when every attempt failed.
            ``urllib.error.URLError`` is a subclass of ``OSError``.
    """
    # Function-scope import so the block does not depend on the import cell.
    import shutil

    print("url: " + url)
    filename = '/'.join([download_location, str(url.split('/')[-1])])
    last_error = None

    for attempt in range(max_download_attempts):
        if attempt:
            # Exponential back-off, only between attempts.
            time.sleep(2 ** attempt)
        print("Downloading file at " + filename)
        try:
            # Both handles are closed even when the transfer fails midway,
            # and the payload is streamed instead of held fully in memory.
            with request.urlopen(url) as response, open(filename, 'wb') as target:
                shutil.copyfileobj(response, target)
            return filename
        except OSError as error:
            last_error = error
            print("Download attempt " + str(attempt + 1) + " failed: " + str(error))

    if last_error is not None:
        raise last_error
    return ""

def download_file_list(url_list):
    """Download every URL in `url_list` using a multiprocessing pool.

    Parameters:
        url_list: iterable of URLs, each passed to ``download_file``.

    Returns:
        The list returned by ``pool.map``: one ``download_file`` result per
        URL, in the same order as `url_list`.

    Raises:
        Whatever ``download_file`` raises in a worker; ``pool.map`` re-raises
        it in the calling process.
    """
    print("Starting download pool")
    # The context manager tears the pool down even when pool.map raises, so
    # worker processes are not leaked. pool.map blocks until every result is
    # available, so nothing is still running when the block exits.
    with Pool() as pool:
        res = pool.map(download_file, url_list)
        print("Jobs sent")
    print("Downloads finished")
    print(res)
    return res

"OK"

'OK'

In [44]:
def download_and_stack(year, variable):
    """Download one year's files for `variable` and stack them lazily.

    Parameters:
        year: year passed to ``get_year_ensemble``.
        variable: variable name; used both to select the files and to pick
            the variable read from each opened dataset.

    Returns:
        A dask array built with ``da.stack(..., axis=0)``: one entry per
        downloaded file, in the order returned by ``download_file_list``.
    """
    dsets_urls = get_year_ensemble(year, variable = variable)
    filenames = download_file_list(dsets_urls)
    # The datasets are left open on purpose: the dask arrays below wrap the
    # netCDF variables and read from them later.
    datasets = [ netCDF4.Dataset(filename) for filename in filenames ]
    # Index with `variable` rather than a fixed name so the requested
    # variable is the one actually read from each file.
    dask_arrays = [
        da.from_array(dset[variable], chunks = (366, 144, 144))
        for dset in datasets
    ]
    return da.stack(dask_arrays, axis = 0)

def avg_over_first_axis(darray):
    """Return ``np.average`` of `darray` taken along axis 0."""
    first_axis_average = np.average(darray, axis=0)
    return first_axis_average

# Download the 1971 'tasmax' files, stack them, then average over axis 0
# of the stack.
# NOTE(review): the displayed result is dominated by 5.0e19 values —
# presumably an unmasked fill value in the source files; confirm before
# trusting the average.
stack_1971 = download_and_stack(1971, variable='tasmax')
avg_stack_1971 = avg_over_first_axis(stack_1971)
avg_stack_1971

Starting download pool


url: http://172.22.0.1:8080/NEX-GDDP/BCSD/historical/day/atmos/tasmax/r1i1p1/v1.0/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1971.nc


url: http://172.22.0.1:8080/NEX-GDDP/BCSD/historical/day/atmos/tasmax/r1i1p1/v1.0/tasmax_day_BCSD_historical_r1i1p1_BNU-ESM_1971.nc


Downloading file at /temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1971.nc


Downloading file at /temp/tasmax_day_BCSD_historical_r1i1p1_BNU-ESM_1971.nc


Jobs sent
Downloads finished
['/temp/tasmax_day_BCSD_historical_r1i1p1_ACCESS1-0_1971.nc', '/temp/tasmax_day_BCSD_historical_r1i1p1_BNU-ESM_1971.nc']


array([[[  5.00000010e+19,   5.00000010e+19,   5.00000010e+19, ...,
           5.00000010e+19,   5.00000010e+19,   5.00000010e+19],
        [  5.00000010e+19,   5.00000010e+19,   5.00000010e+19, ...,
           5.00000010e+19,   5.00000010e+19,   5.00000010e+19],
        [  5.00000010e+19,   5.00000010e+19,   5.00000010e+19, ...,
           5.00000010e+19,   5.00000010e+19,   5.00000010e+19],
        ..., 
        [  5.00000010e+19,   5.00000010e+19,   5.00000010e+19, ...,
           5.00000010e+19,   5.00000010e+19,   5.00000010e+19],
        [  5.00000010e+19,   5.00000010e+19,   5.00000010e+19, ...,
           5.00000010e+19,   5.00000010e+19,   5.00000010e+19],
        [  5.00000010e+19,   5.00000010e+19,   5.00000010e+19, ...,
           5.00000010e+19,   5.00000010e+19,   5.00000010e+19]],

       [[  5.00000010e+19,   5.00000010e+19,   5.00000010e+19, ...,
           5.00000010e+19,   5.00000010e+19,   5.00000010e+19],
        [  5.00000010e+19,   5.00000010e+19,   5.00000010e+1

In [39]:
 da.from_array(stack_1971[0], chunks=(366, 144, 144))

dask.array<array, shape=(365, 720, 1440), dtype=float32, chunksize=(365, 144, 144)>

In [14]:
client.ncores()

{'tcp://172.22.0.3:34753': 8}

In [10]:
stack = da.concatenate(stack_1971, axis = 0)
stack

AttributeError: NetCDF: Attribute not found

In [26]:
test_array = da.from_array(stack_1971[0], chunks = (30, 72, 72))


array([[[ 273.59411621,  273.62255859,  273.62002563, ...,  273.63659668,
          273.60528564,  273.58834839],
        [ 273.95245361,  273.97991943,  273.98950195, ...,  273.9395752 ,
          273.93780518,  273.93704224],
        [ 274.27890015,  274.3119812 ,  274.32345581, ...,  274.23913574,
          274.24472046,  274.25942993],
        ..., 
        [ 236.15135193,  236.27227783,  236.36195374, ...,  235.9937439 ,
          236.01846313,  236.06578064],
        [ 236.26147461,  236.39093018,  236.48231506, ...,  236.14746094,
          236.16404724,  236.20040894],
        [ 236.39985657,  236.49971008,  236.55255127, ...,  236.31938171,
          236.33621216,  236.33799744]],

       [[ 275.32006836,  275.34921265,  275.37612915, ...,  275.40203857,
          275.37585449,  275.33001709],
        [ 275.73413086,  275.76617432,  275.78396606, ...,  275.78521729,
          275.75854492,  275.72753906],
        [ 276.09841919,  276.13531494,  276.15756226, ...,  276.12457275

In [None]:
dot_graph(stack.dask)

In [None]:
%%timeit

block = stack[:, 0:180, 0:180]
res = np.percentile(block, 99, axis = 0)
res

In [45]:
def per_ch_sz(chsize, n_time=10958):
    """Benchmark helper: 99th percentile along axis 0 of a random block.

    Parameters:
        chsize: length of the second and third axes of the random array.
        n_time: length of the first axis. Defaults to 10958, the value the
            timing and memory cells in this notebook were run with.

    Returns:
        Array of shape ``(chsize, chsize)`` holding the 99th percentile taken
        along axis 0 of an ``(n_time, chsize, chsize)`` array drawn with
        ``np.random.randn``. Values differ between calls because the global
        NumPy random state is not seeded here.
    """
    randarr = np.random.randn(n_time, chsize, chsize)
    return np.percentile(randarr, 99, axis = 0)

In [46]:
%memit per_ch_sz(144)

peak memory: 7697.44 MiB, increment: 3467.30 MiB


In [47]:
%timeit per_ch_sz(144)

14.2 s ± 140 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
%memit per_ch_sz(180)

peak memory: 9647.99 MiB, increment: 5417.47 MiB


In [49]:
%timeit per_ch_sz(180) # Not much difference

21.1 s ± 50.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [50]:
%memit per_ch_sz(240)

peak memory: 13860.95 MiB, increment: 9630.13 MiB


In [None]:
%timeit per_ch_sz(240)

In [None]:
result = per_ch_sz(144)
result

In [None]:
def get_stacks(a, chunksize):
    """Cut a 3-D array into blocks along its second and third axes.

    Parameters:
        a: object with a 3-element ``shape`` that supports
            ``a[:, i0:i1, j0:j1]`` slicing.
        chunksize: block edge length along the second and third axes.

    Returns:
        A list of slices ``a[:, latmin:latmax, lonmin:lonmax]``, ordered with
        the second-axis block index varying slowest. The slice bounds may run
        past the end of an axis for the last block. Each block's indices,
        bounds and the block itself are printed as it is produced.
    """
    _, lat, lon = a.shape
    lat_blocks = int(np.ceil(lat / chunksize))
    lon_blocks = int(np.ceil(lon / chunksize))

    stacks = []
    # product() iterates the second-axis index in the outer position, the
    # same order as two nested loops.
    for i, j in itertools.product(range(lat_blocks), range(lon_blocks)):
        latmin = i * chunksize
        latmax = (i + 1) * chunksize
        lonmin = j * chunksize
        lonmax = (j + 1) * chunksize
        print(i, j, '~>', latmin, latmax, lonmin, lonmax)
        block = a[:, latmin:latmax, lonmin:lonmax]
        print(block)
        stacks.append(block)
    return stacks
# Cut `stack` into blocks of edge length 144 along its second and third axes,
# then take the 99th percentile along axis 0 of every block.
stacks = get_stacks(stack, 144)
res = list(map(lambda x: np.percentile(x, 99, axis=0), stacks))
res

In [None]:
int(np.sqrt(len(res) / 2))

In [None]:
# Reassemble the per-block percentiles into one 2-D grid.
# get_stacks emits blocks with the second-axis (lat) index varying slowest,
# so each consecutive run of `n_lon_blocks` entries of `res` is one row of
# blocks. The run must start at index 0 and span the whole third axis.
n_lon_blocks = int(np.ceil(stack.shape[2] / 144))  # 144 = chunk size passed to get_stacks
slices = [res[i:i + n_lon_blocks] for i in range(0, len(res), n_lon_blocks)]
# Join each row of blocks side by side, then pile all rows on top of each other.
stacked_lons = list(map(np.hstack, slices))
print(list(map(np.shape, stacked_lons)))
stacked_lats = np.vstack(stacked_lons)
stacked_lats.shape

In [None]:
np.amax(stacked_lats)