### Pipeline outside of ML

This notebook shows some trial and error to create a `Pipeline` that can be used with `xarray_filters`.  

This is a continuation of goals in [Elm issue #149](https://github.com/ContinuumIO/elm/issues/149) to separate ML from GIS utils.

The goal is to be able to run something like this:
```
from xarray_filters.pipeline import Pipeline
from xarray_filters.steps import Generic, Serialize
def step_1(dset, **kw):
    return kw['a'] * dset.mean(dim=('x', 'y')) ** kw['b']

def step_2(dset, **kw):
    return kw['a'] + dset * kw['b']
    
steps = (('s1', Generic(step_1)),
         ('s2', Generic(step_2)),
         ('s3', Serialize('two_step_pipeline_out.nc')))
pipe = Pipeline(steps=steps)
pipe.set_params(s1__a=2,
                s1__b=3,
                s2__a=0,
                s2__b=0,
                s3__fname='file_with_zeros.nc')
pipe.fit_transform(X)
```
 * The example above uses the scikit-learn `set_params` style of setting parameters, where:
   * Steps in the `Pipeline` are named, `s1`, `s2`, and `s3` in this case
   * Double underscore notation is used to pass parameters to the `set_params` method of a given step.  Here:
     * `a` and `b` are parameters accepted by `step_1` and `step_2`
     * `fname` is accepted by `Serialize`
   * The `Dataset` or `MLDataset` `X` is run through the 3 steps
   * Note that the `xarray_filters` import statements at the top of the snippet are the target: the classes prototyped in this notebook still need to be moved into `xarray_filters`
* Classes formerly part of `elm.pipeline.steps` will now inherit from `sklearn.base.BaseEstimator`


In [1]:
from sklearn.base import BaseEstimator
from sklearn.utils.metaestimators import _BaseComposition
from sklearn.pipeline import Pipeline as _Pipeline
from xarray_filters import MLDataset
from xarray_filters.tests.test_data import new_test_dataset

class Pipeline(_Pipeline):
    """A scikit-learn ``Pipeline`` that threads a dataset through its steps.

    Parameters
    ----------
    dset : object, optional
        Default dataset used by ``fit_transform`` when none is passed.
    steps : sequence of (name, step) pairs, optional
        Forwarded to the scikit-learn ``Pipeline`` constructor.
    param_grid : mapping, optional
        Maps an id to a dict of ``set_params``-style parameters that
        ``fit_transform`` can select via ``param_grid_id``.
    memory : optional
        Forwarded to the scikit-learn ``Pipeline`` constructor.
    """

    def __init__(self, dset=None,
                       steps=None,
                       param_grid=None,
                       memory=None):
        _Pipeline.__init__(self, steps, memory=memory)
        self.dset = dset
        self.param_grid = param_grid

    def _get_params_by_id(self, param_grid_id, **params):
        """Return the parameters stored under ``param_grid_id`` merged with ``params``.

        Explicit ``params`` take precedence over the stored entry.  When a
        ``param_grid`` exists but ``param_grid_id`` is None, or when there is
        no ``param_grid`` at all, only ``params`` are returned.  The result is
        always a dict the caller may freely modify.
        """
        if self.param_grid:
            if param_grid_id is None:
                return params
            # Copy the stored entry: updating it in place would permanently
            # alter ``self.param_grid`` for every later lookup.
            params2 = dict(self.param_grid.get(param_grid_id, {}))
        else:
            params2 = {}
        params2.update(params)
        return params2

    def fit_transform(self, dset=None, param_grid_id=None):
        """Run ``dset`` through every step's ``fit_transform`` in order.

        Parameters
        ----------
        dset : object, optional
            Dataset to transform; falls back to ``self.dset`` when None.
        param_grid_id : hashable, optional
            Key into ``self.param_grid``; the matching parameters are applied
            with ``set_params`` before the steps run.

        Returns
        -------
        The value returned by the last step.
        """
        params = self._get_params_by_id(param_grid_id)
        self.set_params(**params)
        if dset is None:
            dset = self.dset
        for _name, step in self.steps:
            # Build the keyword arguments per step instead of writing 'dset'
            # into ``params``; ``dset`` overrides any 'dset' key in ``params``.
            step_kwargs = dict(params, dset=dset)
            dset = step.fit_transform(**step_kwargs)
        return dset

In [2]:
class StepBase(BaseEstimator):
    """Base class for pipeline steps that wrap a callable.

    Parameters
    ----------
    func : callable, optional
        Called as ``func(dset, **params)`` where ``params`` are this step's
        own parameters, excluding ``func`` itself.
    """
    def __init__(self, func=None):
        self.func = func

    def fit_transform(self, dset, **kw):
        """Apply ``self.func`` to ``dset`` and return its result.

        Extra keyword arguments in ``kw`` are accepted and ignored.

        Raises
        ------
        TypeError
            If ``self.func`` is not callable.
        """
        if not callable(self.func):
            raise TypeError('{} requires a callable func, got {!r}'
                            .format(type(self).__name__, self.func))
        # deep=False: only this step's own parameters are wanted.
        params = self.get_params(deep=False)
        # Do not hand the wrapped callable to itself as a keyword argument;
        # a function without **kw would reject the unexpected 'func'.
        params.pop('func', None)
        return self.func(dset, **params)
    def fit(self, *args, **kw):
        """Alias for ``fit_transform``; returns the transformed dataset."""
        return self.fit_transform(*args, **kw)
    def transform(self, *args, **kw):
        """Alias for ``fit_transform``; returns the transformed dataset."""
        return self.fit_transform(*args, **kw)

    
class Generic(StepBase):
    """Step that calls ``func(dset=dset, a=a, b=b)``.

    Parameters
    ----------
    func : callable, optional
        Function taking a ``dset`` keyword plus the keywords ``a`` and ``b``.
    a, b : optional
        Values forwarded to ``func``; settable through ``set_params``.
    """
    def __init__(self, func=None, a=None, b=None):
        self.a = a
        self.b = b
        super(Generic, self).__init__(func=func)

    def fit_transform(self, dset, **kw):
        """Apply ``self.func`` to ``dset`` with this step's parameters.

        Extra keyword arguments in ``kw`` are accepted and ignored.

        Raises
        ------
        TypeError
            If ``self.func`` is not callable.
        """
        if not callable(self.func):
            raise TypeError('{} requires a callable func, got {!r}'
                            .format(type(self).__name__, self.func))
        # deep=False: only this step's own parameters are wanted.
        params = self.get_params(deep=False)
        # Do not hand the wrapped callable to itself as a keyword argument;
        # a function without **kw would reject the unexpected 'func'.
        params.pop('func', None)
        return self.func(dset=dset, **params)
    # fit and transform are plain aliases: each returns the transformed dataset.
    fit = transform = fit_transform


class Serialize(StepBase):
    """Step that writes the dataset to disk and passes it through unchanged.

    Parameters
    ----------
    fname : str
        Output path handed to ``dset.to_netcdf``.
    as_netcdf : bool, default True
        Only the NetCDF serializer is implemented.
    """
    def __init__(self, fname, as_netcdf=True):
        self.fname = fname
        self.as_netcdf = as_netcdf

    def fit_transform(self, dset, **kw):
        """Write ``dset`` to ``self.fname`` and return ``dset``.

        Extra keyword arguments in ``kw`` are accepted and ignored, matching
        ``StepBase.fit_transform``.

        Raises
        ------
        NotImplementedError
            If ``as_netcdf`` is false; no other serializer exists yet, and
            silently returning None would drop the dataset.
        """
        if not self.as_netcdf:
            # TODO other serializers?
            raise NotImplementedError(
                'Serialize only supports as_netcdf=True at present')
        dset.to_netcdf(self.fname)
        return dset

In [3]:
# Build a test dataset with three data variables, then display it.
X = new_test_dataset(('wind', 'pressure', 'temperature',))
X

<xarray.MLDataset>
Dimensions:      (t: 48, x: 20, y: 15, z: 8)
Coordinates:
  * x            (x) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
  * y            (y) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
  * z            (z) int64 0 1 2 3 4 5 6 7
  * t            (t) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...
Data variables:
    wind         (x, y, z, t) float64 0.5595 0.1951 0.7879 0.167 0.2586 ...
    pressure     (x, y, z, t) float64 0.6193 0.9228 0.3936 0.9875 0.5534 ...
    temperature  (x, y, z, t) float64 0.1323 0.1877 0.1207 0.5448 0.787 ...

In [4]:
def step_1(dset, **kw):
    """Return ``a * mean ** b``, where ``mean`` is ``dset`` averaged over x and y.

    ``a`` and ``b`` are read from the keyword arguments; other keywords
    are ignored.
    """
    scale = kw['a']
    spatial_mean = dset.mean(dim=('x', 'y'))
    exponent = kw['b']
    return scale * spatial_mean ** exponent

def step_2(dset, **kw):
    """Return ``a + dset * b``.

    ``a`` and ``b`` are read from the keyword arguments; other keywords
    are ignored.
    """
    offset = kw['a']
    scaled = dset * kw['b']
    return offset + scaled

# Named (name, step) pairs: two Generic steps wrapping step_1 and step_2,
# followed by a Serialize step whose initial output file is
# 'two_step_pipeline_out.nc'.
steps = (('s1', Generic(step_1)),
         ('s2', Generic(step_2)),
         ('s3', Serialize('two_step_pipeline_out.nc')))

In [5]:
# Grab the first step object. Index directly rather than unpacking into `_`,
# which IPython uses for the most recent cell output.
s1 = steps[0][1]

In [6]:
# Exercise the first step on its own, outside the pipeline.
# NOTE: with a=0 the step computes 0 * mean ** 0, which is all zeros, so the
# name `ones` is a misnomer.
s1.set_params(a=0, b=0)
ones = s1.fit_transform(X)
# set_params mutates s1 in place; s1 is the same object as the 's1' entry of
# `steps`, so a=2, b=2 stay set on that step afterwards.
s1.set_params(a=2, b=2)
other = s1.fit_transform(X)
# The difference is therefore just 2 * mean ** 2 for the temperature variable.
other.temperature - ones.temperature

<xarray.DataArray 'temperature' (z: 8, t: 48)>
array([[ 0.485309,  0.535802,  0.478572, ...,  0.482235,  0.500192,  0.528511],
       [ 0.51283 ,  0.466976,  0.494062, ...,  0.516987,  0.534294,  0.489226],
       [ 0.501738,  0.497614,  0.547956, ...,  0.525608,  0.498681,  0.546163],
       ..., 
       [ 0.482539,  0.458152,  0.517765, ...,  0.533589,  0.517806,  0.484971],
       [ 0.517204,  0.540091,  0.587889, ...,  0.491527,  0.469517,  0.472836],
       [ 0.507857,  0.533116,  0.420788, ...,  0.472987,  0.490202,  0.463831]])
Coordinates:
  * z        (z) int64 0 1 2 3 4 5 6 7
  * t        (t) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 ...

In [7]:
# Assemble the named steps into a Pipeline; dset and param_grid keep their None defaults.
pipe = Pipeline(steps=steps)

In [8]:
# Display the pipeline repr to inspect each step's current parameters.
pipe

Pipeline(dset=None, memory=None, param_grid=None,
     steps=(('s1', Generic(a=2, b=2, func=<function step_1 at 0x10d33ed90>)), ('s2', Generic(a=None, b=None, func=<function step_2 at 0x10d33ee18>)), ('s3', Serialize(as_netcdf=True, fname='two_step_pipeline_out.nc'))))

In [9]:
# Set per-step parameters with the <step>__<param> notation, then run X through all steps.
# s2 has a=0, b=0, so it computes 0 + dset * 0 and every output value is zero.
pipe.set_params(s1__a=2, s1__b=3, s2__a=0, s2__b=0, s3__fname='file_with_zeros.nc')
pipe.fit_transform(X)

<xarray.MLDataset>
Dimensions:      (t: 48, z: 8)
Coordinates:
  * z            (z) int64 0 1 2 3 4 5 6 7
  * t            (t) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...
Data variables:
    wind         (z, t) float64 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ...
    pressure     (z, t) float64 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ...
    temperature  (z, t) float64 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ...

In [10]:
# Same s1 settings, but s2 now computes 1 + dset * 1; the result is written to 'file_nonzero.nc'.
pipe.set_params(s1__a=2, s1__b=3, s2__a=1, s2__b=1, s3__fname='file_nonzero.nc')
pipe.fit_transform(X)

<xarray.MLDataset>
Dimensions:      (t: 48, z: 8)
Coordinates:
  * z            (z) int64 0 1 2 3 4 5 6 7
  * t            (t) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...
Data variables:
    wind         (z, t) float64 1.3 1.278 1.252 1.241 1.239 1.26 1.308 1.248 ...
    pressure     (z, t) float64 1.273 1.255 1.269 1.25 1.224 1.247 1.255 ...
    temperature  (z, t) float64 1.239 1.277 1.234 1.242 1.229 1.217 1.193 ...

In [11]:
# List the NetCDF files in the working directory to confirm the Serialize step wrote its output.
! ls -l *.nc

-rw-r--r--  1 psteinberg  staff  18977 Sep  8 15:24 file_nonzero.nc
-rw-r--r--  1 psteinberg  staff  18977 Sep  8 15:24 file_with_zeros.nc


In [None]:
# Display the pipeline repr again to inspect the current step parameters.
pipe