In [22]:
import itertools
import sys
sys.path.insert(1, '../unseen')

import dask
import dask.bag as db
import numpy as np
from dask.distributed import Client, LocalCluster
import xclim.analog
import xarray as xr

import fileio

In [2]:
# Start a local dask cluster with default settings and connect a client to it.
cluster = LocalCluster()
client = Client(cluster)

In [3]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:46339  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 28.00 GiB


## Read data

In [2]:
# Open the CAFE forecast file with the project's fileio helper and display the result.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
cafe_file = '/g/data/xv83/dbi599/pr_cafe-c5-d60-pX-f6_awap-additive-correction_19900501-19931101_A-DEC-sum_cafe-grid-TAS-POINT.zarr.zip'
ds_cafe = fileio.open_file(cafe_file)
ds_cafe          

Unnamed: 0,Array,Chunk
Bytes,704 B,704 B
Shape,"(11, 8)","(11, 8)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 704 B 704 B Shape (11, 8) (11, 8) Count 2 Tasks 1 Chunks Type object numpy.ndarray",8  11,

Unnamed: 0,Array,Chunk
Bytes,704 B,704 B
Shape,"(11, 8)","(11, 8)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,66.00 kiB,66.00 kiB
Shape,"(8, 11, 96)","(8, 11, 96)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 66.00 kiB 66.00 kiB Shape (8, 11, 96) (8, 11, 96) Count 2 Tasks 1 Chunks Type float64 numpy.ndarray",96  11  8,

Unnamed: 0,Array,Chunk
Bytes,66.00 kiB,66.00 kiB
Shape,"(8, 11, 96)","(8, 11, 96)"
Count,2 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [29]:
# Note: passing use_cftime=True to xr.open_zarr breaks the string-based time
# slice selection below, so the store is opened without it.

awap_file = '/g/data/xv83/dbi599/pr_awap_cafe-grid-TAS-POINT.zarr.zip'
ds_awap = xr.open_zarr(awap_file, consolidated=True)
# Keep only the 1980-01-01 to 2000-12-31 portion of the time axis.
ds_awap = ds_awap.sel({'time': slice('1980-01-01', '2000-12-31')})
ds_awap

Unnamed: 0,Array,Chunk
Bytes,168 B,8 B
Shape,"(21,)","(1,)"
Count,143 Tasks,21 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 168 B 8 B Shape (21,) (1,) Count 143 Tasks 21 Chunks Type float64 numpy.ndarray",21  1,

Unnamed: 0,Array,Chunk
Bytes,168 B,8 B
Shape,"(21,)","(1,)"
Count,143 Tasks,21 Chunks
Type,float64,numpy.ndarray


## K-S test

In [8]:
# Collapse the ensemble, init_date and lead_time dimensions into a single 'sample' dimension.
fcst_stacked = ds_cafe.stack({'sample': ['ensemble', 'init_date', 'lead_time']})

In [13]:
fcst_stacked['pr']

Unnamed: 0,Array,Chunk
Bytes,66.00 kiB,66.00 kiB
Shape,"(8448,)","(8448,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 66.00 kiB 66.00 kiB Shape (8448,) (8448,) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",8448  1,

Unnamed: 0,Array,Chunk
Bytes,66.00 kiB,66.00 kiB
Shape,"(8448,)","(8448,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,66.00 kiB,66.00 kiB
Shape,"(8448,)","(8448,)"
Count,6 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 66.00 kiB 66.00 kiB Shape (8448,) (8448,) Count 6 Tasks 1 Chunks Type object numpy.ndarray",8448  1,

Unnamed: 0,Array,Chunk
Bytes,66.00 kiB,66.00 kiB
Shape,"(8448,)","(8448,)"
Count,6 Tasks,1 Chunks
Type,object,numpy.ndarray


In [8]:
#fcst_stacked['sample'] = np.arange(fcst_stacked['sample'].size)
#fcst_stacked

In [17]:
# Rename the AWAP time dimension to 'sample' so it carries the same dimension
# name as fcst_stacked.
awap_faux_stacked = ds_awap.rename(time='sample')
#awap_faux_stacked['sample'] = np.arange(awap_faux_stacked['sample'].size)
# Rechunk so that the whole 'sample' dimension sits in one dask chunk.
awap_faux_stacked = awap_faux_stacked.chunk({'sample': -1})
awap_faux_stacked['pr']

Unnamed: 0,Array,Chunk
Bytes,0.95 kiB,0.95 kiB
Shape,"(121,)","(121,)"
Count,123 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 0.95 kiB 0.95 kiB Shape (121,) (121,) Count 123 Tasks 1 Chunks Type float64 numpy.ndarray",121  1,

Unnamed: 0,Array,Chunk
Bytes,0.95 kiB,0.95 kiB
Shape,"(121,)","(121,)"
Count,123 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [18]:
# Compare the AWAP data (target) with the stacked forecast data (candidates)
# along 'sample' using xclim's spatial_analogs with the Kolmogorov-Smirnov method.
test = xclim.analog.spatial_analogs(target=awap_faux_stacked,
                                    candidates=fcst_stacked,
                                    dist_dim='sample',
                                    method='kolmogorov_smirnov')

In [19]:
test.values

array(0.10890152)

For a 2D KS-test I think you just include two variables in the Dataset instead of just one. For instance, at the moment `data_vars` only has one variable, but you could add another.

In [30]:
fcst_stacked.data_vars

Data variables:
    pr       (sample) float64 dask.array<chunksize=(8448,), meta=np.ndarray>

## Resampling (bootstrapping)

In [10]:
def random_resample(*objects, samples,
                    function=None, function_kwargs=None, replace=True):
    """
    Draw one random block resample from the given xarray objects and
    optionally pass each resampled object through a function.

    Parameters
    ----------
    *objects : xarray DataArray or Dataset
        Objects to resample. The length of each resampled dimension is taken
        from the first object, and the same randomly chosen blocks are applied
        to every object that has that dimension. Objects lacking the dimension
        are passed through unchanged for that dimension.
    samples : dictionary
        Maps each dimension to resample to a (n_samples, block_size) pair,
        e.g. {'dim1': (n_samples, block_size), 'dim2': (n_samples, block_size)}.
        int(n_samples / block_size) contiguous blocks of length block_size are
        drawn per dimension. The first object must contain every listed dimension.
    function : function object, optional
        Function applied to each resampled object
    function_kwargs : dictionary, optional
        Keyword arguments passed to function
    replace : boolean, optional
        Whether block start positions are drawn with replacement

    Returns
    -------
    sample : xarray DataArray or Dataset, or tuple of them
        The resampled objects (or the outputs of function applied to them).
        A single input object yields a single result; otherwise a tuple with
        one entry per input object is returned.
    """
    resampled = [obj.copy() for obj in objects]

    for dim, (n_samples, block_size) in samples.items():
        n_blocks = int(n_samples / block_size)
        # Only start positions that leave room for a full block are eligible.
        n_valid_starts = len(resampled[0][dim]) - block_size + 1
        block_starts = np.random.choice(n_valid_starts, size=n_blocks, replace=replace)
        blocks = [slice(start, start + block_size) for start in block_starts]

        updated = []
        for obj in resampled:
            if dim in obj.dims:
                pieces = [obj.isel({dim: block}) for block in blocks]
                obj = xr.concat(pieces, dim=dim)
            updated.append(obj)
        resampled = updated

    if not function:
        results = tuple(resampled)
    else:
        kwargs = function_kwargs if function_kwargs else {}
        results = tuple([function(obj, **kwargs) for obj in resampled])

    return results[0] if len(results) == 1 else results
    

def n_random_resamples(*objects, samples, n_repeats,
                       function=None, function_kwargs=None,
                       replace=True, with_dask=True):
    """
    Repeatedly call random_resample on the provided xarray objects and
    concatenate the results along a new 'k' dimension.

    Requires `itertools`, `dask` and `dask.bag` (as `db`) to be imported at
    module level when with_dask is True.

    Parameters
    ----------
    objects : xarray DataArray or Dataset
        Objects containing data to be resampled.
        The coordinates of the first object are used for resampling and
        the same resampling is applied to all objects
    samples : dictionary
        Dictionary containing the dimensions to subsample, the number of
        samples and the continuous block size within the sample. Of the form
        {'dim1': (n_samples, block_size), 'dim2': (n_samples, block_size)}
    n_repeats : int
        Number of times to repeat the resampling process
    function : function object, optional
        Function to reduce the subsampled data
    function_kwargs : dictionary, optional
        Keyword arguments to provide to function
    replace : boolean, optional
        Whether the sample is with or without replacement
    with_dask : boolean, optional
        If True, parallelize across n_repeats with dask: a dask bag when
        n_repeats > 1000, dask.delayed otherwise. If False, run a plain
        Python loop.

    Returns
    -------
    sample : xarray DataArray or Dataset, or tuple of them
        With a single input object, the n_repeats results concatenated along
        a new dimension 'k'. With several input objects, a tuple holding one
        such concatenated result per input object.
    """

    # Logical `and` rather than bitwise `&`, so any truthy with_dask acts as a flag.
    if with_dask and (n_repeats > 1000):
        # Many repeats: one bag element per repeat. Each element is the first
        # object; the remaining objects are passed as extra positional arguments.
        n_objects = itertools.repeat(objects[0], times=n_repeats)
        b = db.from_sequence(n_objects, npartitions=100)
        rs_list = b.map(random_resample, *(objects[1:]),
                        samples=samples,
                        function=function,
                        function_kwargs=function_kwargs,
                        replace=replace).compute()
    else:
        resample_ = dask.delayed(random_resample) if with_dask else random_resample
        rs_list = [resample_(*objects,
                             samples=samples,
                             function=function,
                             function_kwargs=function_kwargs,
                             replace=replace) for _ in range(n_repeats)]
        if with_dask:
            # dask.compute returns a tuple of results; the list is its only entry.
            rs_list = dask.compute(rs_list)[0]

    if len(objects) == 1:
        return xr.concat([r.unify_chunks() for r in rs_list], dim='k')
    else:
        # Each repeat produced a tuple (one entry per object); regroup by object.
        return tuple([xr.concat([r.unify_chunks() for r in rs], dim='k') for rs in zip(*rs_list)])

In [None]:
# Bootstrap settings.
alpha = 5        # NOTE(review): not used in this cell; presumably a significance level in percent, confirm
n_repeats = 100  # number of resamples to draw
n_block = 1      # block size used for the 'sample' dimension

# Resample the stacked forecast data to the length of the AWAP time axis.
# The original cell referenced `ds_stacked` and `da_awap`, which are never
# defined in this notebook; the objects created above are `fcst_stacked` and `ds_awap`.
# NOTE(review): get_first_four_moments is not defined in the cells shown here;
# define or import it before running this cell.
fcst_moments = n_random_resamples(fcst_stacked,
                                  samples={'sample': (len(ds_awap['time']), n_block)},
                                  n_repeats=n_repeats,
                                  function=get_first_four_moments,
                                  function_kwargs={'dim' : 'sample'})