In [1]:
%matplotlib inline
import xarray as xr
import datetime
import os
import matplotlib.pyplot as plt
from dask.dot import dot_graph
import itertools
import logging
import netCDF4
import numpy as np
import numpy.ma as ma
import dask.array as da
from dask import delayed
import time
from dask.distributed import Client
from urllib import request
from multiprocessing import Pool
import glob
import tempfile
import subprocess
import glob

In [2]:
# We define some important data
# Connect to the Dask scheduler at scheduler:8786; this runs as soon as the cell executes.
client = Client('scheduler:8786')
# NOTE(review): the download/stack helpers visible in this notebook hard-code
# '/temp' and a retry count of 5 instead of reading download_location and
# max_download_attempts -- keep them in sync if either value changes.
download_location = '/temp'
data_url = 'http://nasanex.s3.amazonaws.com'
# This second assignment overrides the S3 URL above, so every dataset URL is
# built against this host (presumably a local mirror -- TODO confirm).
data_url = 'http://172.21.0.1:8080'
max_download_attempts = 5
# Model names used to build the dataset file names (one file per model/variable/year).
all_models = ['ACCESS1-0',  'BNU-ESM', 'CCSM4', 'CESM1-BGC', 'CNRM-CM5', 'CSIRO-Mk3-6-0', 'CanESM2', 'GFDL-CM3', 'GFDL-ESM2G', 'GFDL-ESM2M', 'IPSL-CM5A-LR', 'IPSL-CM5A-MR', 'MIROC-ESM-CHEM', 'MIROC-ESM', 'MIROC5', 'MPI-ESM-LR', 'MPI-ESM-MR', 'MRI-CGCM3', 'NorESM1-M', 'bcc-csm1-1', 'inmcm4']
# all_models = ['ACCESS1-0',  'BNU-ESM', 'bcc-csm1-1']
# Variable names; get_context falls back to this list when no variable is given.
all_vars = ['tasmax', 'pr', 'tasmin']
# Years per scenario: 1971..2000 inclusive for 'historical'.
all_years = {
    'historical': list(range(1971, 2001))
}

# Days per calendar month for a 365-day year and for a 366-day (leap) year.
year_days = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
year_leap_days = [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]


In [3]:
# And some functions to deal with loading data into the cluster
def get_dataset_url(variable, scenario, model, year, prefix = data_url):
    """Build the URL of one daily NEX-GDDP BCSD netCDF file.

    The result is `prefix` followed by the fixed directory layout
    NEX-GDDP/BCSD/<scenario>/day/atmos/<variable>/r1i1p1/v1.0/ and a file
    name of the form <variable>_day_BCSD_<scenario>_r1i1p1_<model>_<year>.nc.

    Note that `prefix` defaults to the value `data_url` had when this
    function was defined, not its value at call time.
    """
    name_parts = (variable, 'day', 'BCSD', scenario, 'r1i1p1', model, str(year) + '.nc')
    path_parts = (prefix, 'NEX-GDDP', 'BCSD', scenario, 'day', 'atmos', variable, 'r1i1p1', 'v1.0',
                  '_'.join(name_parts))
    return '/'.join(path_parts)

def get_context(year, **kwargs):
    """List every [variable, scenario, model, year] combination for one year.

    Keyword arguments:
        variable: restrict the result to this single variable; when missing
            or falsy, every entry of `all_vars` is used.
        model: restrict the result to this single model; when missing or
            falsy, every entry of `all_models` is used.

    The scenario is always 'historical'. Returns a list of 4-element lists,
    ordered by variable first and model second (itertools.product order).
    """
    variables = [kwargs.get('variable')] if kwargs.get('variable') else all_vars
    scenarios = ['historical']
    models = [kwargs.get('model')] if kwargs.get('model') else all_models
    return [[*comb, year] for comb in itertools.product(variables, scenarios, models)]

def get_year_ensemble(year, variable):
    """Return the dataset URLs (as strings) for `variable` in `year`.

    One URL is produced per context returned by get_context for that
    year/variable, in the same order.
    """
    return [str(get_dataset_url(*ctx)) for ctx in get_context(year, variable = variable)]

def days_to_ranges(years):
    """Turn a list of period lengths into consecutive (start, stop) index pairs.

    Parameters:
        years: despite the name, a sequence of period lengths in days, e.g.
            the twelve month lengths in `year_days` or `year_leap_days`.

    Returns a list with one (start, stop) tuple per period; each pair is a
    half-open slice range, so the first period is (0, years[0]) and every
    following period starts where the previous one stopped.

    The ranges are computed from the argument itself. The previous version
    ignored it and always used the global `year_days`, which produced
    365-day ranges even when the leap-year month lengths were passed in.
    """
    ranges = []
    start = 0
    for length in years:
        stop = start + length
        ranges.append((start, stop))
        start = stop
    return ranges

def stack_to_months(stack):
    """Yield one slice of `stack` per calendar month.

    `stack` is indexed with five axes and the day axis is axis 2. When that
    axis has length 365 the `year_days` month lengths are used; any other
    length falls back to `year_leap_days`.
    """
    month_lengths = year_days if stack.shape[2] == 365 else year_leap_days
    for first_day, end_day in days_to_ranges(month_lengths):
        yield stack[:, :, first_day:end_day, :, :]

def stack_to_models(stack):
    """Yield the sub-array at each index of axis 0 of `stack`, in order.

    `stack` is indexed with four axes; each yielded item has the leading
    axis removed.
    """
    yield from (stack[index, :, :, :] for index in range(stack.shape[0]))

def stack_from_disk(year, chunksize):
    """Open the downloaded tasmin/tasmax files for `year` as one dask stack.

    For each variable, the file names are taken from the last path segment
    of the URLs returned by get_year_ensemble, and each file is opened from
    '/temp/'. Every variable is wrapped as a dask array with chunks of
    (365, chunksize, chunksize), the per-file arrays are stacked along a
    new leading axis, and finally the two variables are stacked.

    Axis 0 of the result is the variable: index 0 is tasmin and index 1 is
    tasmax. Axis 1 follows the order of the files from get_year_ensemble.
    The netCDF4 datasets are left open because the dask arrays read from
    them lazily.
    """
    def load_variable(variable):
        # One lazily-read dask array per file, stacked along a new first axis.
        basenames = [url.split('/')[-1] for url in get_year_ensemble(year, variable = variable)]
        handles = [netCDF4.Dataset('/temp/' + basename) for basename in basenames]
        lazy_arrays = [
            da.from_array(handle[variable], chunks = (365, chunksize, chunksize))
            for handle in handles
        ]
        return da.stack(lazy_arrays)

    tasmin_models = load_variable('tasmin')
    tasmax_models = load_variable('tasmax')
    return da.stack((tasmin_models, tasmax_models))

def download(year):
    """Start an aria2c download of every tasmin and tasmax file for `year`.

    The URLs (tasmin first, then tasmax) are written one per line to a
    temporary list file, which aria2c reads through `-i`. Files are saved
    to /temp and aria2c writes its own log next to the list file
    (`<list file>.log`, warnings and above).

    Returns:
        (process, filenames): the running `subprocess.Popen` object -- the
        caller is expected to `wait()` on it -- and the list of local paths
        ('/temp/<file name>') the downloads are expected to produce.

    The list file is created with delete=False and is not removed here,
    because aria2c is still reading it when this function returns.
    """
    contexts = [*get_context(year, variable = 'tasmin'), *get_context(year, variable = 'tasmax')]
    urls = [get_dataset_url(*ctx) for ctx in contexts]
    filenames = ['/temp/' + url.split('/')[-1] for url in urls]

    with tempfile.NamedTemporaryFile(mode = 'w', delete = False) as download_list:
        download_list.writelines(url + '\n' for url in urls)
    # The list file is closed (and therefore flushed) before aria2c starts;
    # launching inside the `with` block could let aria2c read an empty or
    # partially written list.

    # Argument list instead of a shell string: no shell parsing of the
    # temporary file name.
    download_command = [
        'aria2c',
        '-i', download_list.name,
        f'--log={download_list.name}.log',
        '--log-level=warn',
        '--dir', '/temp',
        '--max-tries', '5',
        '--retry-wait', '5',
    ]
    print("Downloading files")
    # stdout is discarded rather than piped: nobody reads the pipe, and a
    # full pipe buffer would block aria2c forever while the caller sits in
    # wait(). Warnings and errors still end up in the aria2c log file.
    download_result = subprocess.Popen(download_command, stdout = subprocess.DEVNULL)
    return download_result, filenames

def download_and_stack(year):
    """Download the files for `year`, wait for completion and stack them.

    Blocks until the download process exits, prints its exit code, and
    raises Exception('Downloads failed') when that code is non-zero.
    Returns (stack, filenames): the dask stack built by stack_from_disk
    with a chunk size of 144 and the local file paths reported by download.
    """
    process, filenames = download(year)
    exit_code = process.wait()
    print(exit_code)
    if exit_code != 0:
        raise Exception('Downloads failed')
    print("Finished the downloads")
    return stack_from_disk(year, 144), filenames

In [4]:
# Actual processing
# We load the temperature baseline from a numpy array on disk
baseline = np.load('./baseline_tasmax_99p.npy')
# NOTE(review): the future returned by scatter is not kept, and extreme_heat
# reads the module-level `baseline` array directly, so this call presumably
# has no lasting effect on the workers -- confirm against the
# dask.distributed documentation before relying on it.
client.scatter(baseline)

# And define the functions to be applied over each dataset
# This is for a single year, either one of the variables or the
# tasavg stack, which would be calculated on-the-fly

# Heating Degree Days - in C, transformation to F should not be problematic
def hdd(a, axis):
    """Heating degree days: sum along `axis` of how far `a` falls below 291.483.

    Parameters:
        a: array of temperatures; the threshold 291.483 equals 65 degF
            expressed in kelvin, so `a` is expected to be in kelvin.
        axis: axis to sum over (the day axis in this notebook).

    Entries at or above the threshold contribute 0. Masked entries of a
    masked-array input are also filled with 0.

    The sum now honours `axis`; it used to be hard-coded to axis 0, which is
    the value every caller in this notebook passes.
    """
    a_to_baseline = 291.483 - a
    # Keep only the positive differences (days colder than the threshold).
    masked = ma.masked_where(a_to_baseline <= 0, a_to_baseline)
    intermediate_matrix = ma.filled(masked, fill_value = 0)
    return np.sum(intermediate_matrix, axis = axis)

# Cooling degree days
def cdd(a, axis):
    """Cooling degree days: sum along `axis` of how far `a` exceeds 291.483.

    Parameters:
        a: array of temperatures; the threshold 291.483 equals 65 degF
            expressed in kelvin, so `a` is expected to be in kelvin.
        axis: axis to sum over (the day axis in this notebook).

    Entries at or below the threshold contribute 0. Differences below
    -10000 are zeroed first, which drops absurdly large inputs (presumably
    netCDF fill values -- TODO confirm) instead of adding them to the sum.

    The sum now honours `axis`; it used to be hard-coded to axis 0, which is
    the value every caller in this notebook passes.
    """
    a_to_baseline = 291.483 - a
    # `a_to_baseline` is a fresh array, so this does not modify the caller's `a`.
    a_to_baseline[a_to_baseline < -10000] = 0
    # Keep only the negative differences (days warmer than the threshold).
    masked = ma.masked_where(a_to_baseline >= 0, a_to_baseline)
    intermediate_matrix = ma.filled(masked, fill_value = 0)
    return np.sum(np.abs(intermediate_matrix), axis = axis)

# Number of days of the year with tasmax > 99 percentile from baseline 1971-2000
def extreme_heat(a, axis):
    """Count, along `axis`, the entries of `a` strictly above `baseline`.

    `baseline` is the module-level array loaded from disk; it is subtracted
    from `a` with normal numpy broadcasting (assumes the shapes are
    compatible -- TODO confirm). Entries at or below the baseline, and
    masked entries of a masked-array input, are not counted.
    """
    excess = a - baseline
    above_only = ma.filled(ma.masked_where(excess <= 0, excess), fill_value = 0)
    return np.count_nonzero(above_only, axis = axis)

# Helper function, not to be applied directly on the worker
def longest_streak(diff):
    """Return the largest gap between a positive and the matching negative step.

    `diff` holds first differences of a zero-padded 0/1 series: positive
    entries mark where a streak starts and negative entries where it ends.
    The k-th negative position minus the k-th positive position is the
    length of the k-th streak; the maximum of those is returned. When the
    computation raises ValueError (for example when there are no streaks at
    all) the result is 0.
    """
    try:
        streak_ends = np.array(np.where(diff < 0))
        streak_starts = np.array(np.where(diff > 0))
        return np.amax(streak_ends - streak_starts)
    except ValueError:
        # np.amax raises on an empty array, i.e. no streak was found
        return 0

# Longest streak of days over freezing temperature (tasmin)
def frost_free_season(a, axis):
    """Length of the longest run of values above 273.15 along `axis`.

    Parameters:
        a: array of temperatures; 273.15 is the freezing point in kelvin,
            so `a` is expected to be in kelvin.
        axis: axis along which consecutive entries are examined (the day
            axis in this notebook).

    Returns an array with `axis` removed, holding the longest streak length
    for every 1-d series along that axis (0 when no entry is above 273.15).

    The zero padding is now concatenated along `axis`. It used to be
    concatenated along the default axis 0, which only worked for axis == 0.
    """
    # 1 where the value is above freezing, 0 otherwise
    frost_free_days = (a > 273.15) * 1
    # A single all-zero slice to put at both ends of `axis`, so that every
    # streak has both a rising and a falling edge in the differences.
    pad_shape = list(a.shape)
    pad_shape[axis] = 1
    zero_pad = np.zeros(pad_shape)
    padded = np.concatenate((zero_pad, frost_free_days, zero_pad), axis = axis)
    # +1 marks the start of a streak, -1 the position just after its end
    diff = np.diff(padded, axis = axis)
    # np.where doesn't operate over axes, so each 1-d series is handled
    # separately; apply_along_axis is slow but straightforward here.
    return np.apply_along_axis(longest_streak, axis, diff)

In [5]:
# Start from a clean download directory: delete every file in /temp whose
# name contains ".nc" (this pattern also matches suffixed names such as *.nc.aria2).
print("Removing old files")
for netcdf in glob.glob("/temp/*.nc*"):
    os.remove(netcdf)
print("Done.")

# Restart the Dask workers before the main loop.
# NOTE(review): a restart presumably also discards data scattered earlier -- confirm.
client.restart()    

Removing old files
Done.


<Client: scheduler='tcp://172.21.0.2:8786' processes=1 cores=4>

In [None]:
# Main loop: for every year, download the tasmin/tasmax files, compute the
# four per-model yearly indexes and save each one under /results as .npy.
# A failure in one year is reported and the loop moves on to the next year;
# the downloaded files are removed after every year either way.
for year in [1971, 1973, 1974, 1975, 1977, 1978, 1979, 1981, 1982, 1983, 1985, 1986, 1987, 1999, 2000]:
    try:
        print(f"processing year {year}")
        year_stack, _ = download_and_stack(year)
        print(year_stack)
        
        # Yearly indexes - we define the datasets as slices of the in-worker dataset.
        # stack_from_disk stacks (tasmin, tasmax) in that order, so index 0 is
        # tasmin and index 1 is tasmax. These two were swapped before, which
        # fed tasmax to frost_free_season and tasmin to extreme_heat.
        tasmin_stack = year_stack[0, :, :, :, :]
        tasmax_stack = year_stack[1, :, :, :, :]
        tasavg_stack = (tasmax_stack + tasmin_stack) / 2
        tasmin_per_model = list(stack_to_models(tasmin_stack))
        tasmax_per_model = list(stack_to_models(tasmax_stack))
        tasavg_per_model = list(stack_to_models(tasavg_stack))

        # Each index is computed one model at a time; axis 0 of a per-model
        # array is the day axis.
        hdds_per_model = [delayed(hdd)(arr, axis=0).compute() for arr in tasavg_per_model]
        hdds_final_stack = np.stack(hdds_per_model)
        np.save(f'/results/{year}_hdds_per_model.npy', hdds_final_stack)
        
        cdds_per_model = [delayed(cdd)(arr, axis=0).compute() for arr in tasavg_per_model]
        cdds_final_stack = np.stack(cdds_per_model)
        np.save(f'/results/{year}_cdds_per_model.npy', cdds_final_stack)
        
        ffs_per_model = [delayed(frost_free_season)(arr, axis=0).compute() for arr in tasmin_per_model]
        ffs_final_stack = np.stack(ffs_per_model)
        np.save(f'/results/{year}_ffs_per_model.npy', ffs_final_stack)
        
        xs_per_model = [delayed(extreme_heat)(arr, axis=0).compute() for arr in tasmax_per_model]
        xs_final_stack = np.stack(xs_per_model)
        np.save(f'/results/{year}_xs_per_model.npy', xs_final_stack)
        
    except Exception as e:
        # Best effort: report the failure and continue with the next year.
        print(f"There was some error for year {year}")
        print(str(e))
    finally:
        # Free the disk space used by this year's downloads.
        for netcdf in glob.glob("/temp/*.nc*"):
            os.remove(netcdf)
    print(f"Finished with year {year}")

processing year 1971
Downloading files
