# LANDSAT and Ensemble Learning Models

[Ensemble Learning Models (Elm)](https://github.com/ContinuumIO/elm) was developed for a 2016 NASA SBIR Phase I.  Elm provides large data machine learning tools for satellite imagery and climate data.

 * Using the AWS S3 LANDSAT data
 * Using GeoTiff metadata
 * Feature engineering with `elm.pipeline.Pipeline`
 * Fitting / predicting with `distributed`

In [None]:
%matplotlib inline
import glob
import os
import re
import sys
from urllib.request import urlopen

from bokeh.models import WMTSTileSource
from cartopy import crs as ccrs
from collections import defaultdict, OrderedDict
from dask.diagnostics import ProgressBar
from dask.distributed import Client
from earthio import load_array, load_tif_meta, BandSpec, ElmStore
from earthio.landsat_util import landsat_metadata
from earthio.s3_landsat_util import SceneDownloader
from elm.model_selection.kmeans import kmeans_aic, kmeans_model_averaging
from elm.pipeline import Pipeline, steps
from holoviews.operation import decimate
from holoviews.operation.datashader import aggregate, shade, datashade, dynspread
from pyproj import Proj, transform
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
import dask
import dask.dataframe as dd
import datashader as ds
import datashader.transfer_functions as tf
import dill
import geoviews as gv
import holoviews as hv
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas as pd
import rasterio as rio
import requests
import xarray as xr
import xarray as xr
# Load the HoloViews notebook extension with the bokeh plotting backend.
hv.notebook_extension('bokeh')
# Override attributes on the imported HoloViews operations. These look like
# global rendering limits (sample cap, spreading size / threshold) -- TODO
# confirm the exact semantics against the HoloViews documentation.
decimate.max_samples = 1000
dynspread.max_px = 20
dynspread.threshold = 0.5

## S3 LANDSAT downloader
See [this example scene from the AWS S3 LANDSAT store](http://landsat-pds.s3.amazonaws.com/L8/015/033/LC80150332013207LGN00/index.html)

This example uses `SceneDownloader` to find scenes meeting spatial or cloud cover criteria.

In [None]:
# Shared downloader instance; later cells use it to select a scene and to
# download that scene's files.
s3_download = SceneDownloader()

## GeoTiff options

Use `earthio.BandSpec` (imported above) to control:

 * Resolution
 * Naming of the bands
 * Where to find each band's GeoTiff based on file name match

In [None]:
# Buffer size requested for every band, in pixels.
# Set to 800, 800 for 800 by 800 pix decimation
BUF_X_SIZE = 600
BUF_Y_SIZE = 600


def _band_spec_for(band_number):
    """Build the BandSpec that matches files named ``B<band_number>.TIF``."""
    return BandSpec(search_key='name',
                    search_value='B{}.TIF'.format(band_number),
                    name='band_{}'.format(band_number),
                    buf_xsize=BUF_X_SIZE,
                    buf_ysize=BUF_Y_SIZE)


# One spec per band 1..7, named band_1 .. band_7.
BAND_SPECS = [_band_spec_for(band_number) for band_number in range(1, 8)]

## Create `distributed.Client`

 * Defaults to creation of local scheduler / workers
 * Can point to remote scheduler / workers

In [None]:
# Address of an existing scheduler, if one was supplied through the environment.
scheduler = os.environ.get('DASK_SCHEDULER')
# With no address, Client() is created with its defaults (local scheduler / workers).
client = Client(scheduler) if scheduler else Client()

## Finding a cloud free image

(For a given LANDSAT row / path and month)

In [None]:
# Ask the downloader for the lowest-cloud-cover scene at path 15 / row 33,
# considering all twelve months.
clear_image = s3_download.lowest_cloud_cover_image(row=33, path=15, months=tuple(range(1,13)))
clear_image

In [None]:
# Use the first download URL of the selected scene record.
download_url = clear_image.download_url.values[0]
download_url

## LANDSAT `sampler` function
 * Uses `earthio.load_array` with the `band_specs` argument
 * Adds MTL file metadata with `earthio.landsat_util.landsat_metadata`

In [None]:
def sampler(download_url, **kwargs):
    '''Download one LANDSAT scene and load it as a sample

    Parameters:
        download_url: scene URL passed to s3_download.download_all_bands
        kwargs: Not used here
    Returns:
        Tuple (X, y, sample_weight).  X is the array loaded from the
        directory of the downloaded files, with the fields of the first
        downloaded ".txt" metadata file merged into X.attrs.
        y and sample_weight are always None.
    '''
    downloaded = s3_download.download_all_bands(download_url)
    scene_dir = os.path.dirname(downloaded[0])
    X = load_array(scene_dir, band_specs=BAND_SPECS)
    # The first ".txt" file among the downloads is used as the metadata file.
    text_files = [path for path in downloaded if path.endswith('.txt')]
    metadata = landsat_metadata(text_files[0])
    X.attrs.update(vars(metadata))
    return (X, None, None)

In [None]:
# Load the selected scene; the (always None) y and sample_weight are discarded.
X, _, _ = sampler(download_url)

In [None]:
# Display the loaded sample.
X

### Using MTL file metadata example
 * Calculate top of atmosphere (TOA) reflectance for Band 4 (the red band on Landsat 8; near infrared is Band 5)
 * Use the reflectance and sun elevation metadata from the MTL file
 * Use `xarray` plotting with custom color levels

In [None]:
# Band 4 rescaling coefficients and the sun elevation are read from the
# metadata that sampler() merged into X.attrs.
mult = X.REFLECTANCE_MULT_BAND_4
add = X.REFLECTANCE_ADD_BAND_4
# Sun elevation converted from degrees to radians.
theta = X.SUN_ELEVATION * (np.pi / 180.)
# Color levels used only by the commented-out xarray plot at the end of this cell.
levels = (-0.1, -0.05, 0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.6)
# NOTE: despite the variable name, the REFLECTANCE coefficients are used, so
# this is TOA reflectance: (DN * mult + add) / sin(sun elevation).
band_4_radiance = (X.band_4 * mult + add) / np.sin(theta)
# Clamp negative values to zero, in place.
band_4_radiance.values[band_4_radiance.values < 0.] = 0.
# Wrap as a HoloViews Dataset for the cells that follow.
band_4_radiance = hv.Dataset(band_4_radiance)
#((X.band_4 * mult + add) / np.sin(theta)).plot.pcolormesh(levels=levels)
#matplotlib.pyplot.title('Band 4 (NIR) TOA Reflectance');


In [None]:
# Inspect the data object held by the HoloViews Dataset.
band_4_radiance.data

In [None]:
# Summary statistics (with extra percentiles) of the band_4 column after
# converting the Dataset to a DataFrame.
band_4_radiance.dframe().describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.95])[['band_4']]

In [None]:
%%opts Image [ width=800 height=600]
# Render the clamped Band 4 values as an image.
#hv.Image(radiance, vdims=['band_4'])
hv.Image(band_4_radiance, vdims=['band_4'])

## Convert digital numbers to radiance or reflectance

Generalize the example given in the plot above to allow TOA radiance or reflectance for any band:

In [None]:
from functools import partial
def toa_rad_or_reflect(X, y=None, sample_weight=None,**kw):
    '''Convert every band of X from digital numbers to TOA radiance or reflectance

    For each band "<prefix>_<num>" the metadata attributes
    "<rad_or_reflect>_MULT_BAND_<num>" and "<rad_or_reflect>_ADD_BAND_<num>"
    are applied as values * mult + add.  For 'REFLECTANCE' the result is
    also divided by the sine of X.SUN_ELEVATION (given in degrees).

    Parameters:
        X: sample exposing data_vars, the bands and the metadata as attributes
        y: passed through unchanged
        sample_weight: passed through unchanged
        kw: must contain rad_or_reflect, either 'RADIANCE' or 'REFLECTANCE'
    Returns:
        Tuple (X, y, sample_weight).  X is the same object that was passed
        in; its band values are replaced (the input is modified).
    Raises:
        KeyError: if rad_or_reflect is not given
    '''
    rad_or_reflect = kw['rad_or_reflect']
    is_reflectance = rad_or_reflect == 'REFLECTANCE'
    if is_reflectance:
        # Same for every band, so compute it once.
        sin_sun_elevation = np.sin(X.SUN_ELEVATION * (np.pi / 180.))
    for band in X.data_vars:
        num = band.split('_')[-1]
        add = getattr(X, '{}_ADD_BAND_{}'.format(rad_or_reflect, num))
        mult = getattr(X, '{}_MULT_BAND_{}'.format(rad_or_reflect, num))
        band_arr = getattr(X, band)
        # Build a new array rather than writing into the existing buffer:
        # writing floats into an integer array of digital numbers would
        # silently truncate them (reflectance in [0, 1) would become 0).
        converted = band_arr.values * mult + add
        if is_reflectance:
            converted = converted / sin_sun_elevation
        band_arr.values = converted
    return (X, y, sample_weight)
toa_radiance = partial(toa_rad_or_reflect, rad_or_reflect='RADIANCE')
toa_reflectance = partial(toa_rad_or_reflect, rad_or_reflect='REFLECTANCE')

## Set `NaN` values for no-data regions

In [None]:
def set_nans(X, y=None, sample_weight=None, **kwargs):
    '''Return a float32 copy of X with no-data pixels set to NaN

    A pixel is treated as no-data when its value is <= 1 or when it is at
    (or above) the largest 16-bit value, 2**16 - 1.

    Parameters:
        X: sample exposing copy(deep=True), data_vars and the bands as attributes
        y: passed through unchanged
        sample_weight: passed through unchanged
        kwargs: Not used here
    Returns:
        Tuple (xx, y, sample_weight) where xx is a deep copy of X; X itself
        is not modified.
    '''
    xx = X.copy(deep=True)
    # 16-bit data tops out at 2**16 - 1; comparing against 2**16 never matches.
    saturated = 2 ** 16 - 1
    for band in xx.data_vars:
        band_arr = getattr(xx, band)
        values = band_arr.values.astype(np.float32)
        no_data = (values <= 1) | (values >= saturated)
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        values[no_data] = np.nan
        band_arr.values = values
    return (xx, y, sample_weight)

## `elm.pipeline.steps.ModifySample`
 * Use custom functions in an `elm.pipeline.Pipeline` of transformations

In [None]:
# Wrap the custom sample functions so they can be used as Pipeline steps.
set_nans_step = steps.ModifySample(set_nans)
reflectance_step = steps.ModifySample(toa_reflectance)

## Preprocessing example

Later, this notebook uses `elm.pipeline.Pipeline` to automate a series of transforms like the one below.  The cell below sets no-data regions to `NaN`, then converts digital numbers to TOA reflectance.

In [None]:
# Reload the raw scene, mask no-data pixels, then apply the reflectance step.
X, _, _ = sampler(download_url)
Xnew, _, _ = set_nans_step.fit_transform(X)
Xnew, _, _ = reflectance_step.fit_transform(Xnew)

## Plotting reflectance

Band 4, Band 3, Band 2 as RGB

In [None]:
%%opts RGB [width=800 height=600]
# Each band is divided by `scale` before display; presumably this stretches
# the values toward the range hv.RGB expects -- TODO confirm.
scale = 0.2
bands_432 = ['band_4', 'band_3', 'band_2']
hv.RGB(xr.Dataset({band: getattr(Xnew, band) / scale for band in bands_432}), 
       kdims=['x', 'y'])

In [None]:
%%opts Image [ width=800 height=600 ]

# Xnew.band_2.plot.imshow(levels=levels);
# matplotlib.pyplot.title('Band 2 TOA Reflectance');

# Show Band 2 with every pixel that is not greater than 0 (including NaN)
# replaced by 0.
hv.Image(np.where(Xnew.band_2 > 0., Xnew.band_2, 0))

## Normalized differences between bands

Normalized differences between band reflectances may be helpful in feature engineering to differentiate water, urban areas and forests.

 * NDWI - Normalized Difference Water Index
 * NDVI - Normalized Difference Vegetation Index
 * NDSI - Normalized Difference Soil Index
 * NBR - Normalized Burn Ratio

In [None]:
# Each entry maps an output name to the pair of bands it is computed from.
# NOTE(review): 'ndwi' and 'ndvi' use the same two bands in opposite order; if
# NormedBandsDiff computes (a - b) / (a + b), 'ndwi' is exactly -'ndvi'.
# Confirm the intended band pairs for this sensor.
normalized_diffs = {'ndwi': ('band_4', 'band_5'),
                    'ndvi': ('band_5', 'band_4'),
                    'ndsi': ('band_2', 'band_6'),
                    'nbr':  ('band_4', 'band_7'),
                 }
normed_diffs_step = steps.NormedBandsDiff(spec=normalized_diffs)

In [None]:
# Rebuild Xnew from the raw scene: mask no-data pixels, apply the reflectance
# step, then add the normalized-difference arrays.
download_url = clear_image.download_url.values[0]
X, _, _ = sampler(download_url)
Xnew, _, _ = set_nans_step.fit_transform(X)
Xnew, _, _ = reflectance_step.fit_transform(Xnew)
Xnew, _, _ = normed_diffs_step.fit_transform(Xnew)

## False Color - Normalized Differences as RGB

In [None]:
# Comparing NaN against the threshold below would otherwise emit a warning.
np.seterr(invalid='ignore')
def plot_once(bands, scale=1.0, threshold=None):
    '''Build an RGB element from three arrays of the global Xnew

    Parameters:
        bands: names of the arrays to use; a falsy value selects
               band_4, band_3, band_2
        scale: every array is divided by this value
        threshold: if not None, values below it are raised to it
    Returns:
        hv.RGB over the x / y dimensions
    '''
    if not bands:
        bands = ['band_4', 'band_3', 'band_2']
    scaled = {}
    for name in bands:
        scaled[name] = getattr(Xnew, name) / scale
    subset = xr.Dataset(scaled)
    if threshold is not None:
        for name in subset.data_vars:
            arr = getattr(subset, name)
            arr.values[arr < threshold] = threshold
    return hv.RGB(subset, kdims=['x', 'y'], vdims=bands)

In [None]:
%%opts RGB [width=800 height=600]
# False-color composite: ndsi, ndvi and ndwi are passed to hv.RGB in that
# order, with values below 0 raised to 0.
pseudo_1 = plot_once(['ndsi', 'ndvi', 'ndwi'], scale=.9, threshold=0.)
pseudo_1

In [None]:
%%opts Image [width=700 height=600]
%%opts Layout [tabs=True tight=True]
pl_th = lambda band: hv.Image(getattr(Xnew, band) > 0.)
pl = lambda band: hv.Image(getattr(Xnew, band))
threshold_plots = pl_th('ndwi') + pl_th('ndsi') + pl_th('ndvi') + pl_th('ndsi')
continuous_plots = (pl('ndwi') + pl('ndsi') + pl('ndvi') + pl('ndsi') )
bands = pl('band_4') + pl('band_3') + pl('band_2')
bands

In [None]:
%%opts Image [width=700 height=600]
%%opts Layout [tabs=True tight=True]
# Combine the continuous panels and the thresholded panels into one layout.
continuous_plots + threshold_plots

In [None]:
%%opts Image [width=700 height=600]
%%opts Layout [tabs=True tight=True]
# Three panels, each overlaying (*) two of the normalized-difference images.
pl('ndsi') * pl('ndvi') + pl('ndwi') * pl('ndvi') + pl('nbr') * pl('ndwi')

## Normalized Difference Soil Index

In [None]:
%%opts Image [width=800 height=600]
# Show the 'ndsi' array of Xnew.
hv.Image(Xnew.ndsi)

## Normalized Difference Water Index

In [None]:
%%opts Image [width=800 height=600]
# Show the 'ndwi' array of Xnew.
hv.Image(Xnew.ndwi)

## Normalized Burn Ratio

In [None]:
%%opts Image [width=800 height=600]
# Show the 'nbr' array of Xnew.
hv.Image(Xnew.nbr)

## Normalized Difference Vegetation Index

In [None]:
%%opts RGB [width=800 height=600]
hv.I

## Selecting bands for learning
The following function could allow hyperparameterization to control which bands and normalized differences become input features to machine learning.

In [None]:
# Names of the normalized-difference arrays that choose_bands can include.
NORMALIZED_DIFFS = ('nbr', 'ndsi', 'ndwi', 'ndvi')
# Band names taken from BAND_SPECS.
DEFAULT_BANDS = [band_spec.name for band_spec in BAND_SPECS]
def choose_bands(X, y=None, sample_weight=None, **kwargs):
    '''Select the bands / normalized differences used as input features

    Parameters:
        X: sample exposing each band and normalized difference as an attribute
        y: passed through unchanged
        sample_weight: passed through unchanged
        kwargs:
            bands: names of the bands to keep (default: DEFAULT_BANDS)
            include_normed_diffs: if True (default), also keep every
                                  name in NORMALIZED_DIFFS
    Returns:
        Tuple (es, y, sample_weight) where es is a new ElmStore holding the
        chosen arrays, each tagged with the canvas of the last chosen band,
        with the attrs of X merged in.
    Raises:
        ValueError: if bands is empty (there is then no band to take the
                    canvas from)
    '''
    # Only fall back to the module-level default when no bands were given.
    bands = list(kwargs['bands'] if 'bands' in kwargs else DEFAULT_BANDS)
    include_normed_diffs = kwargs.get('include_normed_diffs', True)
    if not bands:
        raise ValueError('choose_bands needs at least one band: the canvas of '
                         'the last band is attached to every output variable')
    chosen = {band: getattr(X, band) for band in bands}
    # Every output variable receives the canvas of the last requested band.
    canvas = chosen[bands[-1]].canvas
    if include_normed_diffs:
        for diff in NORMALIZED_DIFFS:
            chosen[diff] = getattr(X, diff)
    es = ElmStore(chosen, add_canvas=False)
    for name in es.data_vars:
        es[name].attrs['canvas'] = canvas
    es.attrs.update(X.attrs)
    print('Chose', es.data_vars)
    return (es, y, sample_weight)

## Using `elm.pipeline.steps` for preprocessing
The next cell allows a custom function to be used in a `Pipeline`:

In [None]:
# Wrap choose_bands as a Pipeline step; the keyword arguments are given to
# ModifySample alongside the function (presumably forwarded to it on each
# call -- TODO confirm).
choose_bands_step = steps.ModifySample(choose_bands,
                              bands=DEFAULT_BANDS,
                              include_normed_diffs=True)

These steps flatten rasters to columns and remove no-data pixels:

In [None]:
# Step instances used in the Pipeline below under the names 'flat' and 'drop_na'.
flat = steps.Flatten()
drop_na = steps.DropNaRows()

These steps use `sklearn.preprocessing.StandardScaler` to normalize the data and `PCA` to reduce dimensionality.

In [None]:
# Scaling step, followed by a PCA wrapped as a transform step (5 components
# by default; random_ensemble_member overrides pca__n_components).
standardize = steps.StandardScaler()
pca = steps.Transform(PCA(n_components=5))

##  `scikit-learn` estimator

The final step in `Pipeline` is a `scikit-learn` estimator.

In [None]:
# Final estimator with default parameters; est__n_clusters is set per
# ensemble member in random_ensemble_member.
estimator = MiniBatchKMeans()

## Creating a `Pipeline`
 * List of named steps for hyperparameterization

In [None]:
# Full processing chain as (name, step) pairs, listed in processing order.
# The names are the prefixes used when setting parameters
# (e.g. est__n_clusters, pca__n_components).
# NOTE(review): score_weights=[-1] presumably tells model selection that a
# lower kmeans_aic score is better -- confirm against the elm documentation.
pipe = Pipeline([('set_nans', set_nans_step),
                 ('reflect', reflectance_step),
                 ('normed_diffs', normed_diffs_step),
                 ('choose', choose_bands_step),
                 ('flat', flat),
                 ('drop_na', drop_na),
                 ('standard', standardize),
                 ('pca', pca),
                 ('est', estimator)],
                scoring=kmeans_aic,
                scoring_kwargs=dict(score_weights=[-1]))

## Controlling ensemble initialization

Starting with a group of `8` `Pipeline` instances with varying PCA and K-Means parameters.

In [None]:
INIT_ENSEMBLE_SIZE = 8
def random_ensemble_member():
    '''Return a new, unfitted copy of `pipe` with randomly drawn parameters

    The number of K-Means clusters is drawn from 7..11 and the number of
    PCA components is drawn from (4, len(DEFAULT_BANDS) - 1).
    '''
    cluster_options = range(7, 12)
    component_options = (4, len(DEFAULT_BANDS) - 1)
    drawn = {
        'est__n_clusters': np.random.choice(cluster_options),
        'pca__n_components': np.random.choice(component_options),
    }
    # new_with_params creates a new Pipeline instance (unfitted)
    return pipe.new_with_params(**drawn)

def ensemble_init_func(pipe, **kwargs):
    '''Create the initial ensemble of randomly parameterized Pipelines

    Parameters:
        pipe: a Pipeline instance (not used here: random_ensemble_member
              derives every member from the module-level `pipe`)
        kwargs: Not used here
    Returns:
        List of INIT_ENSEMBLE_SIZE Pipeline instances with varying
        N of components and N of clusters
    '''
    # Random choices of parameters, with some constraints, for each member
    return [random_ensemble_member() for _ in range(INIT_ENSEMBLE_SIZE)]

## Controlling model selection
`Pipeline.fit_ensemble` proceeds in generations with `model_selection` called after each generation.  In this example we are scoring with Akaike Information Criterion and modifying the `evolve_n` worst fit models.

In [None]:
def model_selection(models, best_idxes=None, **kwargs):
    '''Replace the worst-scoring ensemble members with new random ones

    Parameters:
        models: list of (tag, model) tuples
        best_idxes: indices into models, best first
        kwargs: must contain evolve_n (how many members to replace),
                generation (current generation) and ngen (total generations)
    Returns:
        List of (tag, model) tuples: the kept members in their original
        order, followed by the replaced tags paired with new random members.
        In the final generation, or when the ensemble has a single member,
        models is returned unchanged.
    '''
    evolve_n = kwargs['evolve_n']
    is_final_generation = kwargs['generation'] == kwargs['ngen'] - 1
    if is_final_generation:
        return models
    if INIT_ENSEMBLE_SIZE <= 1:
        return models
    top_idxes = best_idxes[:INIT_ENSEMBLE_SIZE - evolve_n]
    survivors = []
    replaced_tags = []
    for position, (tag, model) in enumerate(models):
        if position in top_idxes:
            survivors.append((tag, model))
        else:
            replaced_tags.append(tag)
    newcomers = [(tag, random_ensemble_member()) for tag in replaced_tags]
    return survivors + newcomers

## Using `dill` to load a trained model
(If it exists)

In [None]:
MODEL_PICKLE = 'landsat.dill'
def load_pickled_pipeline():
    '''Return the object stored in MODEL_PICKLE, or None if the file is absent'''
    if not os.path.exists(MODEL_PICKLE):
        return None
    # NOTE(review): dill.load can execute arbitrary code from the file;
    # only load files from a trusted source.
    with open(MODEL_PICKLE, 'rb') as pickle_file:
        return dill.load(pickle_file)

## Run `fit_ensemble`
 * Control number of fitting generations
 * Control model selection
 * Control ensemble initialization

In [None]:
def one_image_problem(pipe, ngen=3):
    '''Fit an ensemble on the single scene at download_url

    If load_pickled_pipeline() returns a truthy object, that object is
    returned as is and nothing is fitted.

    Parameters:
        pipe: the Pipeline whose fit_ensemble is called
        ngen: number of fitting generations
    Returns:
        The loaded pipeline, or the result of pipe.fit_ensemble
    '''
    previously_fitted = load_pickled_pipeline()
    if previously_fitted:
        return previously_fitted
    # Half of the ensemble is replaced in each model_selection call.
    fit_kwargs = {
        'model_selection': model_selection,
        'model_selection_kwargs': {'evolve_n': INIT_ENSEMBLE_SIZE // 2},
        'ensemble_init_func': ensemble_init_func,
        'models_share_sample': True,
        'ngen': ngen,
    }
    X, _, _ = sampler(download_url)
    print('FIT')
    return pipe.fit_ensemble(X=X, client=client, **fit_kwargs)

In [None]:
# Load a previously pickled pipeline if one exists, otherwise fit the ensemble.
fitted = one_image_problem(pipe)

## `Pipeline.predict_many`
 * Predicts for one or more samples and one or more ensemble members
 * Uses `distributed` for parallelism
 * Can return xarray data structure or serialize it
 * By default, reshapes 1-D predictions to 2-D spatial arrays

In [None]:
# Predict for the sample X using the distributed client.
preds = fitted.predict_many(X=X, client=client)

## `predict_many` returns a list of predictions

In [None]:
# Display the first prediction.
preds[0]

Here the number of predictions is equal to the number of ensemble members.

In [None]:
# Compare the number of predictions with the number of ensemble members.
len(preds),len(fitted.ensemble)

In [None]:
# Display the first prediction again.
preds[0]

## Plotting each ensemble member's prediction
 * Each prediction is an `ElmStore` (`xarray.Dataset`) with a `predict` 2-D `DataArray`

In [None]:
%%opts Image [width=700 height=500]
%%opts Layout [tabs=True]
# Plot the `predict` array of the first three predictions as one layout.
p = [hv.Image(p.predict) for p in preds[:3]]
p[0] + p[1] + p[2]

## Next Steps - Hierarchical Modeling

Notice in the predictions plotted above, most ensemble members arrived at similar clustering systems, but:

* The clusters were named differently in each model (i.e. cluster #1 is not the same in every ensemble member).
* The models differed in the water region of the image (Chesapeake Bay) with some models finding two in-water clusters and other models finding one

Future development with `elm` will automate the following cells' steps of predicting based on an ensemble of predictions.  The steps are to:

* Flatten all predictions
* Use a categorical to binary encoder
* Predict with K-Means based on the ensemble members' encoded predictions

In [None]:
from sklearn.preprocessing import OneHotEncoder
def sampler_layer_2(preds):
    '''Build a second-layer sample from the ensemble members' predictions

    Each prediction is flattened and its NaN rows are dropped; the first
    column of each becomes one categorical feature, and the features are
    one-hot encoded together.

    Parameters:
        preds: predictions, one per ensemble member
    Returns:
        ElmStore with a single 'flat' DataArray of dims (space, band),
        using the space coordinate and attrs of the last prediction
    '''
    # This will be simplified in Hierarchical modeling / vote count tasks
    columns = []
    for pred in preds:
        flattened, _, _ = steps.Flatten().fit_transform(pred.copy(deep=True))
        no_na, _, _ = steps.DropNaRows().fit_transform(flattened)
        columns.append(no_na.flat.values[:, 0])
    stacked = np.array(columns).T
    encoded = OneHotEncoder().fit_transform(stacked).todense()
    band_index = np.arange(encoded.shape[1])
    flat_arr = xr.DataArray(encoded,
                            coords=[('space', no_na.space),
                                    ('band', band_index)],
                            dims=('space', 'band'))
    return ElmStore({'flat': flat_arr}, attrs=no_na.attrs)
X_layer_2 = sampler_layer_2(preds)

## Pick a number of clusters to use (randomly)

In [None]:
# Use the ensemble of `fitted`, the object every other cell predicts with.
# `pipe` itself carries no fitted ensemble when `fitted` was loaded from the
# pickle file.
ensemble_models = [model for tag, model in fitted.ensemble]
# Draw a random index rather than passing the model objects to
# np.random.choice, so NumPy never has to coerce them into an array.
random = ensemble_models[np.random.choice(len(ensemble_models))]
random_n_clusters = random.get_params()['est__n_clusters']
random_n_clusters

## Make a second layer `Pipeline`

In [None]:
# Second-layer pipeline: a single K-Means estimator using the cluster count
# chosen above.
model_level_2 = MiniBatchKMeans(n_clusters=random_n_clusters)
pipe_level_2 = Pipeline([('est', model_level_2)])

## Fit and predict based on ensemble of predictions

In [None]:
# Fit one generation with an ensemble of size one on the encoded
# predictions, then predict on the same data.
pipe_level_2.fit_ensemble(X=X_layer_2, ngen=1, init_ensemble_size=1)
preds2 = pipe_level_2.predict_many(X=X_layer_2)
len(preds2)

## Plot prediction from hierarchical model

This shows some of the Phase II idea of hierarchical models (models on predictions from ensembles).

In [None]:
%%opts Image [width=800 height=600]
%%opts Layout [tabs=True]
# Only the first second-level prediction is plotted.
# NOTE(review): earlier cells plot `p.predict`; here the whole prediction
# object is passed to hv.Image -- confirm this is intended.
best = preds2[0]
hv.Image(best, kdims=['x', 'y'])