In [None]:
from collections import OrderedDict
from functools import partial
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20, 14)
import os
import re
from urllib.request import urlopen

from distributed import Client
from earthio import load_array, LayerSpec
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import requests
import xarray as xr
import numpy as np
import pandas as pd
import xarray as xr


from dask_glm.datasets import make_regression
from earthio.landsat_util import landsat_metadata
from earthio.s3_landsat_util import SceneDownloader
from elm.mldataset.cv_cache import CVCacheSampleId, cv_split
from elm.model_selection import EaSearchCV
from elm.model_selection.ea_searchcv import EaSearchCV
from elm.pipeline import steps
from elm.pipeline.pipeline import Pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from xarray_filters import MLDataset
from xarray_filters.datasets import _make_base
from xarray_filters.pipeline import Generic, Step
from xarray_filters.pipe_utils import data_vars_func
from xarray_filters.pipe_utils import for_each_array

In [None]:
# One LayerSpec per band file "B1.TIF" ... "B7.TIF"; each resulting layer is
# named "layer_<n>" and uses an 800 x 800 buffer size.
LAYER_SPECS = [
    LayerSpec(
        # TODO: it should be guessed automatically that "name" is where to look;
        # also consider renaming "search_key" to "key".
        search_key='name',
        # TODO: this argument should have a shorter name such as "search".
        search_value='B{}.TIF'.format(layer),
        # TODO: this should be auto-guessed ("layer_0", "layer_1", etc.).
        name='layer_{}'.format(layer),
        buf_xsize=800,
        buf_ysize=800,
    )
    for layer in range(1, 8)
]

In [None]:
# Scene selection passed to the sampler: row, path, and the months 1..12.
ROW_PATH_MONTHS = {
    'row': 33,
    'path': 15,
    'months': tuple(range(1, 13)),
}

In [None]:
# Names of the normalized-difference layers produced by `normalized_diffs`.
NORMALIZED_DIFFS = ('nbr', 'ndsi', 'ndwi', 'ndvi')
# Layer names declared in LAYER_SPECS, in declaration order.
DEFAULT_LAYERS = [spec.name for spec in LAYER_SPECS]

In [None]:
class SamplerBase(Step):
    """Pipeline step that produces a sample by calling ``func`` with keywords.

    ``func`` defaults to ``load_array`` when it is left as ``None``.
    """
    func = None

    def transform(self, kwargs, y=None, **kw):
        """Call the sampling function and return whatever it returns.

        Parameters
        ----------
        kwargs : dict
            Keyword arguments for the sampling function (for example the
            row / path / months selection).  Previously this argument was
            accepted but never read, so these values never reached ``func``.
            A non-dict value is ignored, as before.
        y : ignored
        **kw
            Extra keyword arguments; they override entries of ``kwargs``.

        The step's own parameters (everything from ``get_params`` except
        ``func``) are applied last and therefore take precedence, matching
        the original ``kw.update(params)`` ordering.
        """
        params = self.get_params(deep=True).copy()
        func = params.pop('func')
        func = func or load_array
        # Only a plain dict is merged: dataset-like objects may also behave
        # like mappings but are not keyword-argument bundles.
        call_kw = dict(kwargs) if isinstance(kwargs, dict) else {}
        call_kw.update(kw)
        call_kw.update(params)
        return func(**call_kw)

In [None]:
def s3_landsat_sample(**params):
    """Download one Landsat scene and load it with its metadata attached.

    Parameters
    ----------
    layer_specs : list, optional
        Layer specifications handed to ``load_array``.  When missing or
        ``None`` the module-level ``LAYER_SPECS`` is used.  (Previously the
        argument was popped and then ignored in favour of ``LAYER_SPECS``.)
    **params
        All remaining keywords are forwarded unchanged to
        ``SceneDownloader.lowest_cloud_cover_image``.

    Returns
    -------
    The object returned by ``load_array`` for the downloaded scene directory,
    with the attributes of the parsed metadata file merged into ``attrs``.

    Raises
    ------
    IndexError
        If none of the downloaded files ends with ``.txt``.
    """
    params = params.copy()
    layer_specs = params.pop('layer_specs', None)
    if layer_specs is None:
        layer_specs = LAYER_SPECS

    downloader = SceneDownloader()
    clear_image = downloader.lowest_cloud_cover_image(**params)
    download_url = clear_image.download_url.values[0]

    local_files = downloader.download_all_layers(download_url)
    # The directory of the first downloaded file is the one handed to load_array.
    sample_dir = os.path.dirname(local_files[0])
    X = load_array(sample_dir, layer_specs=layer_specs)

    # The first ".txt" file is treated as the scene metadata file.
    meta_file = [f for f in local_files if f.endswith('.txt')][0]
    X.attrs.update(vars(landsat_metadata(meta_file)))
    return X

In [None]:
class Sampler(SamplerBase):
    # Sampling step with an extra `layer_specs` parameter next to `func`.
    # Presumably both are reported by get_params() and `layer_specs` is then
    # forwarded to `func` as a keyword by SamplerBase.transform - TODO confirm.
    layer_specs = None
    # TODO: setting the default at class level should work but currently does
    # not, so `func` is passed to the constructor instead:
    #func = s3_landsat_sample
    func = None

In [None]:
@for_each_array
def set_nans(arr):
    """Return a float32 copy of ``arr`` with sentinel values replaced by NaN.

    Values ``<= 1`` and values equal to ``2**16`` become NaN.  The input
    array is deep-copied first and is not modified.
    """
    arr = arr.copy(deep=True)
    # Mask on a local array and assign back once, so the result does not
    # depend on ``arr.values`` handing out a writable view on every access.
    values = arr.values.astype(np.float32)
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    values[values <= 1] = np.nan
    # NOTE(review): if the source rasters are uint16 the largest storable
    # value is 65535, so this comparison would never match - confirm whether
    # 2**16 - 1 was intended.
    values[values == 2**16] = np.nan
    arr.values = values
    return arr

class ForEachStep(Step):
    """Step that applies ``func`` to the input dataset ``X``.

    Parameters (class-level defaults):
      keep_attrs -- when true, copy ``X.attrs`` onto the result.
      func       -- callable invoked as ``func(X, **keywords)``.
      pass_attrs -- when true, pass ``X.attrs`` to ``func`` as ``attrs``.
    """
    keep_attrs = True
    func = None
    pass_attrs = False

    def transform(self, X, y=None, **kw):
        """Run ``func`` on ``X`` and return its result."""
        call_kw = dict(kw)
        call_kw.update(self.get_params(deep=True).copy())
        # TODO here should we filter args and kwargs (see func_signatures.py in xarray_filters)
        wants_attrs = call_kw.pop('pass_attrs')
        if wants_attrs:
            call_kw['attrs'] = X.attrs
        func = call_kw.pop('func')
        # NOTE(review): 'keep_attrs' is still among the keywords at this
        # point, so it is forwarded to func as well - confirm this is intended.
        result = func(X, **call_kw)
        if call_kw.pop('keep_attrs', True):
            result.attrs.update(X.attrs)
        return result

In [None]:
def normed_diff(a, b):
    """Return the normalized difference ``(a - b) / (a + b)``.

    Works for anything supporting ``-``, ``+`` and ``/``; no guard is applied
    when ``a + b`` is zero.
    """
    difference = a - b
    total = a + b
    return difference / total

@data_vars_func
def normalized_diffs(**dset):
    """Add the 'ndwi', 'ndvi', 'ndsi' and 'nbr' entries to ``dset``.

    Each entry is ``normed_diff`` of two existing layers.  The mapping is
    updated in place and returned.
    """
    print('Called with ', dset.keys())
    # (new entry, first layer, second layer) -> normed_diff(first, second)
    # NOTE(review): 'ndwi' uses the same two layers as 'ndvi' in swapped
    # order, so it is exactly -ndvi; confirm the band choices against the
    # sensor's band designations.
    band_pairs = (
        ('ndwi', 'layer_4', 'layer_5'),
        ('ndvi', 'layer_5', 'layer_4'),
        ('ndsi', 'layer_2', 'layer_6'),
        ('nbr', 'layer_4', 'layer_7'),
    )
    for index_name, first, second in band_pairs:
        dset[index_name] = normed_diff(dset[first], dset[second])
    return dset

class DataVarsStep(Step):
    """Step that calls ``func(dset=X)``.

    TODO - add this to xarray_filters and test.
    """
    # func should have a signature of **data_vars (expecting data_vars of "X")
    func = None

    def transform(self, X, y=None, **kw):
        """Return ``func(dset=X)``.

        Call-time keywords and step parameters are merged and printed, but
        only ``func`` is taken from them; nothing else is forwarded.
        """
        print('kw1', kw, self.get_params(deep=True))
        merged = kw.copy()
        merged.update(self.get_params(deep=True).copy())
        print('kw', merged)
        # TODO here should we filter args and kwargs (see func_signatures.py in xarray_filters)
        func = merged.pop('func')
        return func(dset=X)

In [None]:
@for_each_array
def to_radiance_or_reflectance(arr, attrs=None, to='REFLECTANCE', **kw):
    """Rescale ``arr`` in place as ``values * mult + add`` and return it.

    The band identifier is the text after the last underscore in
    ``arr.name``.  The factors are read from ``attrs`` under the keys
    ``'<to>_MULT_BAND_<band>'`` and ``'<to>_ADD_BAND_<band>'``.

    NOTE(review): ``attrs`` defaults to None but is dereferenced
    unconditionally, and a missing key yields None for the factor; both
    cases fail inside this function - confirm callers always pass complete
    attrs.
    """
    band = arr.name.split('_')[-1]
    add_key = '{}_ADD_BAND_{}'.format(to, band)
    mult_key = '{}_MULT_BAND_{}'.format(to, band)
    offset = attrs.get(add_key)
    gain = attrs.get(mult_key)
    # Slice assignment writes into the existing buffer, so its dtype is kept.
    arr.values[:] = arr.values * gain + offset
    return arr

In [None]:
def choose_bands(X, layers=None, include_normed_diffs=True, **kw):
    """Build a new MLDataset holding only the selected layers of ``X``.

    Parameters
    ----------
    X : dataset exposing its layers as attributes
    layers : iterable of str, optional
        Names of the layers to keep.  ``None`` (the default) selects
        ``DEFAULT_LAYERS``; previously ``None`` raised TypeError because it
        was iterated directly.
    include_normed_diffs : bool
        When true, the layers named in ``NORMALIZED_DIFFS`` are appended.
    **kw
        Accepted and ignored.

    Returns
    -------
    MLDataset with the chosen layers in selection order.
    """
    if layers is None:
        layers = DEFAULT_LAYERS
    selected = OrderedDict((layer, getattr(X, layer)) for layer in layers)
    if include_normed_diffs:
        for diff in NORMALIZED_DIFFS:
            selected[diff] = getattr(X, diff)
    return MLDataset(selected)

# Disabled variant kept as a bare string so it does not execute: it declares
# the defaults at class level, which per its TODO does not work yet.
'''
class ChooseBands(Generic):                   # TODO - this section should work but currently doesn't
    include_normed_diffs = True
    layers = DEFAULT_LAYERS
    func = choose_bands
'''
# Working variant: `layers` and `func` default to None and are supplied
# through the constructor instead.
class ChooseBands(Generic):
    include_normed_diffs = True
    layers = None
    func = None


In [None]:
# Band-selection step configured with choose_bands and the default layer names.
# NOTE(review): choose_step is not referenced again in the visible cells.
choose_step = ChooseBands(func=choose_bands, layers=DEFAULT_LAYERS)

In [None]:
class DropRows(Step):
    """Convert ``X`` to a features array and drop rows containing NaN.

    TODO - this could be a built-in step in xarray_filters, e.g.
    ``from xarray_filters.steps import DropRows; d = DropRows(); d.fit_transform(X)``
    """

    def transform(self, X, y=None, **kw):
        """Return an MLDataset whose single 'features' array has no NaN rows.

        Entries along the 'space' dimension with any missing value are
        dropped.
        """
        feature_dset = X.to_features()
        complete_rows = feature_dset.features.dropna('space', how='any')
        return MLDataset(OrderedDict(features=complete_rows))

    # fit is the same function as transform, so it also returns the new dataset.
    fit = transform

In [None]:
# Run every step eagerly, one at a time, keeping each intermediate result.
# With the changes above, some of the constructors could be simplified, e.g. not passing "func=something"
# Sample a scene using the row / path / months selection.
dset0 = Sampler(func=s3_landsat_sample, layer_specs=LAYER_SPECS).fit_transform(ROW_PATH_MONTHS)
# Replace sentinel values with NaN.
dset1 = ForEachStep(func=set_nans).fit_transform(dset0)
# Rescale each layer; pass_attrs=True hands the dataset attrs to the function.
dset2 = ForEachStep(func=to_radiance_or_reflectance, pass_attrs=True).fit_transform(dset1)
# Add the normalized-difference layers.
dset3 = DataVarsStep(func=normalized_diffs).fit_transform(dset2)
# Flatten to a features array and drop rows containing NaN.
dset4 = DropRows().fit_transform(dset3)
# These steps return numpy arrs
np_arr0 = steps.preprocessing.StandardScaler().fit_transform(dset4)
np_arr1 = steps.decomposition.PCA(n_components=5).fit_transform(np_arr0)
# `est` and `np_arr1` are used again by the prediction cell below.
est = steps.cluster.MiniBatchKMeans()
fitted = est.fit(np_arr1)



In [None]:
# Predict with the fitted estimator on the PCA output and display the result.
pred = est.predict(np_arr1)
pred

In [None]:
# Search space for the hyperparameter search.  Keys use the
# "<step name>__<parameter>" convention, so every prefix must match a step
# name in `pipe` below.
# NOTE(review): with include_normed_diffs=False only the plain layers remain,
# which may be fewer than the larger pca__n_components values - confirm the
# ranges are compatible.
param_distributions = {'est__n_clusters': list(range(8, 12)),
                       'choose__include_normed_diffs': [True, False],
                       'pca__n_components': list(range(5, 12))}

# The same steps as the eager run, assembled into a single pipeline.
# The 'choose' step is included so that 'choose__include_normed_diffs' above
# refers to an existing step; previously the pipeline had no step of that name.
# layer_specs is passed to the sampler explicitly, as in the eager run.
pipe = Pipeline([('sampler', Sampler(func=s3_landsat_sample, layer_specs=LAYER_SPECS)),
                 ('set_nans', ForEachStep(func=set_nans)),
                 ('radiance', ForEachStep(func=to_radiance_or_reflectance, pass_attrs=True)),
                 ('normed_diffs', DataVarsStep(func=normalized_diffs)),
                 ('choose', ChooseBands(func=choose_bands, layers=DEFAULT_LAYERS)),
                 ('drop_na', DropRows()),
                 ('standard', steps.preprocessing.StandardScaler()),
                 ('pca', steps.decomposition.PCA(n_components=5)),
                 ('est', steps.cluster.MiniBatchKMeans())])


In [None]:

# Settings for the evolutionary search (used by the disabled EaSearchCV cell).
model_selection = dict(
    select_method='selNSGA2',
    crossover_method='cxTwoPoint',
    mutate_method='mutUniformInt',
    init_pop='random',
    indpb=0.5,
    mutpb=0.9,
    cxpb=0.3,
    eta=20,
    ngen=2,
    mu=16,
    k=8,  # TODO ensure that k is not ignored - see elm issue #218
    early_stop=None,
)

# Fit the whole pipeline on the row / path / months selection.
fitted = pipe.fit(ROW_PATH_MONTHS)


In [None]:
# Display the object returned by pipe.fit(...).
fitted

In [None]:
# The following fails due to https://github.com/ContinuumIO/elm/issues/215
# It is kept inside a bare string so that it does not execute; re-enable it
# once that issue is resolved.
'''
ea = EaSearchCV(pipe,
                param_distributions=param_distributions,
                ngen=2,
                model_selection=model_selection,
                cv=3)
Xt, y = ea.fit(ROW_PATH_MONTHS)
'''