In [None]:
from google.cloud import storage
import os
import itertools
import netCDF4
import numpy as np
import numpy.ma as ma
from dask import delayed
import dask.array as da
from dask.distributed import Client
import glob
import tempfile
import subprocess
import datetime, time
from urllib import request
from multiprocessing import Pool
import json
import gc

# Directory that download_file() writes the fetched NetCDF files into.
download_location = '/temp'
# Alternative (currently disabled) data source:
# data_url = 'http://nasanex.s3.amazonaws.com'
# Base URL that get_urls() prefixes to every file name.
data_url = 'http://172.19.0.1:8081'
# Upper bound on the attempt loop in download_file().
max_download_attempts = 5
# Connection to the dask.distributed scheduler; download_and_process()
# uses it to cancel work and restart the workers after each run.
client = Client('scheduler:8786')
#
# Google Cloud Storage bucket that download_and_process() uploads results to.
storage_client = storage.Client.from_service_account_json('/home/jovyan/work/credentials.json')
bucket = storage_client.get_bucket('nexgddp')


# Actual processing
# We load the temperature baseline from a numpy array on disk and wrap it
# as a dask array with 144x144 chunks; extreme_heat() subtracts it from its input.
baseline = da.from_array(np.load('/home/jovyan/work/baseline_tasmax_99p.npy'), chunks = (144, 144))

# And define the functions to be applied over each dataset
# This is for a single year, either one of the variables or the
# tasavg stack, which would be calculated on-the-fly

# Heating Degree Days - in C, transformation to F should not be problematic
def hdd(a, axis):
    """Return the heating degree days accumulated along ``axis``.

    For every element of ``a`` the shortfall below the base temperature
    291.483 is computed; elements at or above the base contribute zero.
    The shortfalls are then summed along ``axis``.

    NOTE(review): 291.483 equals 65 degrees F expressed in Kelvin, so ``a``
    is presumably in Kelvin - confirm against the input data.

    Args:
        a: Array of temperatures.
        axis: Axis to accumulate over (previously ignored; the sum was
            always taken over axis 0).

    Returns:
        Array of summed shortfalls with ``axis`` removed.
    """
    shortfall = 291.483 - a
    # Entries at or above the base temperature must not reduce the total.
    masked = ma.masked_where(shortfall <= 0, shortfall)
    heating = ma.filled(masked, fill_value=0)
    return np.sum(heating, axis=axis)

# Cooling degree days
def cdd(a, axis):
    """Return the cooling degree days accumulated along ``axis``.

    For every element of ``a`` the excess above the base temperature
    291.483 is computed; elements at or below the base contribute zero.
    Differences below -10000 are zeroed first so that huge input values
    are not counted (presumably missing-data fill values - confirm).

    Args:
        a: Array of temperatures.
        axis: Axis to accumulate over (previously ignored; the sum was
            always taken over axis 0).

    Returns:
        Array of summed excesses with ``axis`` removed.
    """
    a_to_baseline = 291.483 - a
    # Discard implausibly large inputs before they dominate the sum.
    a_to_baseline[a_to_baseline < -10000] = 0
    # Only negative differences (input warmer than the base) count.
    masked = ma.masked_where(a_to_baseline >= 0, a_to_baseline)
    cooling = np.abs(ma.filled(masked, fill_value=0))
    return np.sum(cooling, axis=axis)

# Number of days of the year with tasmax > 99 percentile from baseline 1971-2000
def extreme_heat(a, axis):
    """Count, along ``axis``, the entries of ``a`` that exceed ``baseline``.

    ``baseline`` is the module-level array loaded at import time. Entries
    that are not strictly above it are zeroed, and the remaining non-zero
    entries are counted.
    """
    excess = a - baseline
    # Keep only strictly positive exceedances; everything else becomes 0.
    exceedances = ma.filled(ma.masked_where(excess <= 0, excess), fill_value = 0)
    return np.count_nonzero(exceedances, axis = axis)

# Helper function, not to be applied directly on the worker
def longest_streak(diff):
    """Return the longest gap between a positive and the matching negative step.

    ``diff`` is a 1-D array of deltas; positive entries mark where a run
    starts and negative entries where it ends. Returns 0 when the
    subtraction or the maximum raises ``ValueError`` (e.g. no runs at all).
    """
    try:
        run_ends = np.array(np.where(diff < 0))
        run_starts = np.array(np.where(diff > 0))
        return np.amax(run_ends - run_starts)
    except ValueError:
        # Raised when there is nothing to take the maximum of.
        return 0

# Longest streak of days over freezing temperature (tasmin)
def frost_free_season(a, axis):
    """Return the longest run of consecutive values above 273.15 along ``axis``.

    The previous implementation padded the data with ``np.concatenate``
    without passing ``axis``, so it only worked for ``axis == 0``, and it
    looped over every cell in Python via ``np.apply_along_axis``. This
    version is vectorised and honours ``axis``.

    NOTE(review): very large inputs (e.g. missing-data fill values) also
    compare as above 273.15 and therefore count as frost free, exactly as
    before - confirm that this is intended.

    Args:
        a: Array of temperatures.
        axis: Axis along which consecutive entries are examined.

    Returns:
        Integer array with ``axis`` removed holding the longest run length
        (0 where no entry is above 273.15).
    """
    frost_free = np.asarray(a > 273.15)
    # Running count of frost-free entries seen so far along the axis.
    running = np.cumsum(frost_free, axis=axis)
    # Value of that count at the most recent frost entry (0 before any frost).
    at_last_frost = np.maximum.accumulate(
        np.where(frost_free, 0, running), axis=axis
    )
    # The difference is the length of the run ending at each position;
    # ``initial=0`` keeps the reduction defined for an empty axis.
    return np.max(running - at_last_frost, axis=axis, initial=0)
# Downloading files
def get_urls(year, model, scenario):
    """Return the download URLs for one year, tasmax first and tasmin second.

    Each URL is ``data_url`` joined with a file name of the form
    ``<var>_day_BCSD_<scenario>_r1i1p1_<model>_<year>.nc``.
    """
    # Disabled alternative prefix, kept for reference:
    # '/'.join([data_url, 'NEX-GDDP', 'BCSD', scenario, 'day', 'atmos', var, 'r1i1p1', 'v1.0'])
    def url_for(var):
        name = '_'.join([var, 'day', 'BCSD', scenario, 'r1i1p1', model, str(year) + '.nc'])
        return '/'.join([data_url, name])

    return [url_for(var) for var in ('tasmax', 'tasmin')]

def download(year, model, scenario):
    """Download the tasmax and tasmin files for one year in parallel.

    Args:
        year: Year to fetch.
        model: Model name used in the file names.
        scenario: Scenario name used in the file names.

    Returns:
        List of local file paths in the order produced by ``get_urls``
        (tasmax first, then tasmin).
    """
    urls = get_urls(year, model, scenario)
    # One worker per file is enough; the work is network bound.
    pool = Pool(processes=len(urls))
    try:
        return pool.map(download_file, urls)
    finally:
        # Always release the worker processes, even if a download failed.
        pool.close()
        pool.join()

def download_file(url):
    """Download ``url`` into ``download_location`` and return the local path.

    The transfer is attempted up to ``max_download_attempts`` times, waiting
    1, 2, 4, ... seconds between attempts. The previous loop never retried:
    any error escaped on the first attempt, and it slept before even trying.

    Args:
        url: Address of the file; its last path segment becomes the file name.

    Returns:
        Path of the downloaded file, or ``""`` if ``max_download_attempts``
        is not positive (unchanged from the original behaviour).

    Raises:
        OSError: (including ``urllib.error.URLError``) if the last attempt fails.
        http.client.HTTPException: if the last attempt fails mid-transfer.
    """
    import shutil
    from http.client import HTTPException

    filename = '/'.join([download_location, str(url.split('/')[-1])])
    for attempt in range(max_download_attempts):
        try:
            # Stream to disk so the whole file is never held in memory, and
            # let the context managers close both handles on any error.
            with request.urlopen(url) as response, open(filename, 'wb') as target:
                shutil.copyfileobj(response, target)
            return filename
        except (OSError, HTTPException):
            if attempt == max_download_attempts - 1:
                raise
            # Exponential backoff before the next attempt.
            time.sleep(2 ** attempt)
    return ""

def download_and_process(year, model, scenario):
    """Download one year of tasmax/tasmin data, derive indicators and upload them.

    The result is a stack of seven layers, in this order: mean tasmax,
    mean tasmin, mean of the tasmax/tasmin average, heating degree days,
    cooling degree days, frost free season and extreme heat count. It is
    saved as ``<year>_<model>_processed_temperatures.npy`` under ``/temp/``
    and uploaded to the module-level ``bucket`` under the same name.

    Args:
        year: Year to process.
        model: Model name used in the input and output file names.
        scenario: Scenario name used in the input file names.

    Returns:
        The output file name (without directory).
    """
    filenames = download(year, model, scenario)
    # filenames[0] is read as tasmax and filenames[1] as tasmin, matching
    # the order in which get_urls() builds the URLs.
    # NOTE(review): the netCDF4.Dataset handles are never closed explicitly.
    da_arrays = [
              da.from_array(netCDF4.Dataset(filenames[0])['tasmax'], chunks = (366, 144, 144)),
              da.from_array(netCDF4.Dataset(filenames[1])['tasmin'], chunks = (366, 144, 144))
    ]

    # Stacking adds a leading axis of length 2 (tasmax, tasmin).
    base_stack = da.stack(da_arrays)
    tasmax_stack = da_arrays[0]
    tasmin_stack = da_arrays[1]

    # Element-wise average of tasmax and tasmin (mean over the stacking axis).
    tasavg = np.mean(base_stack, axis = 0)
    # Means over the first axis of each variable
    # (presumably the time axis - TODO confirm).
    avg_tasmin = np.mean(tasmin_stack, axis = 0).compute()
    avg_tasmax = np.mean(tasmax_stack, axis = 0).compute()
    avg_tasavg = np.mean(tasavg, axis = 0).compute()

    # Replace implausibly large means with -1
    # (presumably cells holding missing-data fill values - TODO confirm).
    avg_tasmin[avg_tasmin > 1000] = -1
    avg_tasmax[avg_tasmax > 1000] = -1
    avg_tasavg[avg_tasavg > 1000] = -1

    # Each indicator function is wrapped with dask.delayed and computed immediately.
    hdds = delayed(hdd)(tasavg, axis = 0).compute()
    cdds = delayed(cdd)(tasavg, axis = 0).compute()
    ffs = delayed(frost_free_season)(tasmin_stack, axis = 0).compute()
    xs = delayed(extreme_heat)(tasmax_stack, axis = 0).compute()

    # The layer order below defines the layout of the saved array.
    results = np.stack((
            avg_tasmax,
            avg_tasmin,
            avg_tasavg,
            hdds,
            cdds,
            ffs,
            xs
    ))
    # NOTE(review): '/temp/' is hard-coded here instead of using download_location.
    output_filename = f'{year}_{model}_processed_temperatures.npy'
    np.save('/temp/' + output_filename, results)
    blob = bucket.blob(output_filename)
    blob.upload_from_filename('/temp/' + output_filename)

    # Cancel outstanding work, drop every local reference, force a garbage
    # collection and restart the dask workers
    # (presumably to release memory between requests - TODO confirm).
    client.cancel(base_stack)
    del base_stack
    del tasmax_stack
    del tasmin_stack
    del tasavg
    del avg_tasmin
    del avg_tasmax
    del avg_tasavg
    del hdds
    del cdds
    del ffs
    del xs
    gc.collect()
    client.restart()
    return output_filename

In [None]:
# GET /process/:year/:model/:scenario
# NOTE(review): the line above looks like a route annotation read by the
# hosting runtime (e.g. Jupyter Kernel Gateway), and REQUEST looks like the
# JSON string that runtime injects - confirm before editing either.
req = json.loads(REQUEST)
# The three path parameters arrive under the 'path' key.
year = int(req['path']['year'])
model = req['path']['model']
scenario = req['path']['scenario']

# Remove NetCDF files left in /temp before downloading new ones.
for ncdf in glob.glob('/temp/*.nc*'):
    os.remove(ncdf)

result = download_and_process(year, model, scenario)
# NOTE(review): this prints a Python dict repr (single quotes), not JSON -
# confirm that consumers of the response do not expect valid JSON.
print({'output': result})