In [None]:
from __future__ import print_function
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # suppress Pandas warning
import dill
import glob
from bokeh.models import WMTSTileSource
from holoviews.operation import decimate
from holoviews.operation.datashader import aggregate, shade, datashade, dynspread
import dask
import dask.dataframe as dd
import datashader as ds
import datashader.transfer_functions as tf
import dill
import geoviews as gv
import holoviews as hv
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import xarray as xr
# Load HoloViews with the matplotlib plotting backend.
hv.extension('matplotlib')
# Notebook-wide defaults set on the imported decimate / dynspread operations.
# presumably max_samples caps how many samples decimate keeps — verify against HoloViews docs
decimate.max_samples = 1000
# presumably max_px / threshold bound how far dynspread grows sparse points — verify
dynspread.max_px = 20
dynspread.threshold = 0.5

### Import the NLDAS / VIC script's functions / imports

In [None]:
from nldas_soil_moisture_ml import *

### Getting ready to run this notebook:

To run the notebook:
* Install the environment with `elm`, `earthio`, `pynio` and `pydap`
* Set environment variables for your username and password that allow downloads from [https://hydro1.gesdisc.eosdis.nasa.gov/data/NLDAS/](https://hydro1.gesdisc.eosdis.nasa.gov/data/NLDAS/)
* Run the `nldas_soil_moisture_ml.py` script to train/predict for one hour's forecast
```
conda create --name nldas_py27 -c conda-forge -c  elm -c elm/label/dev -c ioam -c ncar pynio elm earthio pydap
source activate nldas_py27
export NLDAS_USER=myusername
export NLDAS_PASS=mypassword
python nldas_soil_moisture_ml.py
```
and that will create a `dill` file of serialized model outputs and inputs.

In [None]:
# glob.glob returns matches in arbitrary (filesystem) order, so sort first to
# make the single dump that is kept the same on every machine and every run.
model_dumps = sorted(glob.glob('2000_01_*dill'))[:1]
model_dumps

### Using `dill.load` to get the latest ML-predicted soil moisture

* `last_hour_X`: The `xarray.Dataset` that trained the models one hour ago
* `this_hour_X`: The `xarray.Dataset` that was used in prediction for the current time step
* `models`:      The trained models from the ensemble of `Pipeline` instances (using `last_hour_X`)
* `preds`:       Predictions from `models` based on `this_hour_X`
* `models2`:     The trained models from a second-layer model that was fit to `preds` (using `last_hour_X`)
* `preds2`:      Predictions from `models2` based on `this_hour_X`

In [None]:
# NOTE: dill.load can execute arbitrary code while unpickling; only load dump
# files that you produced yourself with nldas_soil_moisture_ml.py.
with open('2000_01_01T02_00_00.dill', 'rb') as dump_file:
    # The context manager closes the file handle even if unpickling fails.
    last_hour_X, this_hour_X, models, preds, models2, preds2 = dill.load(dump_file)

### `models` and `models2` are sorted by Pareto optimal order

Multiobjective Pareto sorting on *MSE* and *R<sup>2</sup>*, with bounds checks

In [None]:
# Each entry of `models` unpacks into a (tag, pipeline) pair; take the first one.
tag, best_layer_0 = models[0]

# Summary and score, followed by a blank line.
score_label = '\n\nScore (MSE, R2, bounds check ok):'
print(best_layer_0.summary, score_label, best_layer_0._score, end='\n\n')

# Full repr of the selected pipeline.
pipeline_repr = repr(best_layer_0)
print('Best Pipeline:\n\n', pipeline_repr)

### Input data - NLDAS Forcing A (FORA)

The NLDAS FORA files give the following fields for each hour:

```
[  
    'A_PCP_110_SFC_acc1h',
    'PEVAP_110_SFC_acc1h',
    'TMP_110_HTGL',
    'DSWRF_110_SFC',
    'PRES_110_SFC',
    'DLWRF_110_SFC',
    'V_GRD_110_HTGL',
    'SPF_H_110_HTGL',
    'U_GRD_110_HTGL',
    'CAPE_110_SPDY',
]
```

#### Up to `X_TIME_STEPS` (144 currently) hourly time steps are used to form an input data set

Shown below are the raw 144 hourly `DataArray`s loaded from FORA data.

In [None]:
# Names of every data variable in the Dataset loaded as `this_hour_X`.
variables = tuple(this_hour_X.data_vars)
# Display a slice from each end of the name tuple.
# NOTE(review): [1:10] skips index 0 and shows 9 names — confirm [:10] was not intended.
variables[1:10], variables[-10:]

### Example of differencing two `xarray.DataArray`s
The following cell shows descriptive statistics calculated after subtracting the last hour's precipitation field from the current hour's precipitation field.

In [None]:
# presumably hr_0 is the current hour and hr_1 the hour before it — verify against
# the variable naming used by nldas_soil_moisture_ml.
precip_hr_0 = this_hour_X['hr_0_A_PCP_110_SFC_acc1h']
precip_hr_1 = this_hour_X['hr_1_A_PCP_110_SFC_acc1h']

# Element-wise difference of the two precipitation fields.
precip_change = precip_hr_0 - precip_hr_1
precip_change.name = 'Delta Precip kg / m2'

# Descriptive statistics of the difference field.
precip_change.to_dataframe().describe()

### Computing differences and averages of differences in the `Pipeline`s of the ensemble

Since soil moisture is a long memory process, it makes sense to include rolling averages and averages of differences as a feature engineering step.  Each `Pipeline` of transformers starts with the `differencing_integrating` function.  

`ModifySample` is a wrapper that allows custom functions like `differencing_integrating` to be run inside a `Pipeline`.  Though not done in this example, `ModifySample` could be used here to optimize the keyword arguments to `differencing_integrating`.

In [None]:
# `diff_in_time` is not defined in this notebook; presumably it comes from the
# star-import of nldas_soil_moisture_ml — verify. It unpacks into two items.
label, diff = diff_in_time
# Display the original pair.
diff_in_time

In [None]:
# fit_transform is unpacked into three values here; only the first is kept.
# NOTE(review): this rebinds `_`, which IPython also uses for the last cell output.
diffs_avg_instant,_, _ = diff.fit_transform(last_hour_X)

In [None]:
# All data-variable names present after the transform step.
new_vars = tuple(diffs_avg_instant.data_vars)

# Keep only the names carrying the 'diff_' prefix, then show the first 24.
diff_var_names = [name for name in new_vars if name.startswith('diff_')]
diff_var_names[:24]

### `preds`, the predictions from the first layer ensemble, are now sorted from best model's prediction to worst

In [None]:
# First entry of `preds` (the first-layer predictions loaded from the dill dump).
best_layer_0_pred = preds[0]
# Display it.
best_layer_0_pred

### `preds2` are sorted from best to worst model output in the second layer of models

In [None]:
# First entry of `preds2` (the second-layer predictions loaded from the dill dump).
best_layer_1_pred = preds2[0]

#### Comparing VIC and ML predicted soil moisture in the second layer of models

In [None]:
# The two fields being compared: the SOIL_M_110_DBLY variable of this_hour_X
# and the `predict` variable of the selected second-layer prediction.
vic_soil_moisture = this_hour_X.SOIL_M_110_DBLY
elm_soil_moisture = best_layer_1_pred.predict

# Residual field (prediction minus SOIL_M_110_DBLY); the name set here becomes
# the column name when the array is converted to a DataFrame.
residuals_soil_moisture = elm_soil_moisture - vic_soil_moisture
residuals_soil_moisture.name = 'Soil Moisture Residuals: Elm - VIC (kg / m2)'

# One DataFrame per field.
df0 = vic_soil_moisture.to_dataframe()
df1 = elm_soil_moisture.to_dataframe()
df2 = residuals_soil_moisture.to_dataframe()

# Join on the shared index, then relabel the columns for display.
# NOTE(review): assigning three labels assumes each frame contributes exactly one column.
joined = df0.join(df1).join(df2)
joined.columns = [
    'VIC Soil Moisture (kg / m2)',
    'Elm Soil Moisture (kg / m2)',
    residuals_soil_moisture.name,
]
joined.describe()

### Comparing VIC and ML predicted soil moisture in the first layer of models
#### The best of the first layer models

In [None]:
# First entry of the first-layer predictions.
p0 = preds[0]

# Residuals: the `predict` variable minus SOIL_M_110_DBLY from this_hour_X.
predicted_field = p0['predict']
reference_field = this_hour_X['SOIL_M_110_DBLY']
resids = predicted_field - reference_field
resids.name = 'Best of First Layer Models'

# Descriptive statistics of the residuals.
resids.to_dataframe().describe()

#### The second best of the first layer models

In [None]:
# Second entry of the first-layer predictions.
p1 = preds[1]
# Residuals: the `predict` variable minus SOIL_M_110_DBLY from this_hour_X.
resids = p1.predict - this_hour_X.SOIL_M_110_DBLY
# Label this as the second-best model so its statistics table is not confused
# with the table for preds[0].
resids.name = 'Second Best of First Layer Models'
resids.to_dataframe().describe()

### Using `holoviews` to visualize `xarray.DataArray` predictions

In [None]:
%%opts Image [width=500 height=400]
%opts Image (cmap='viridis')
ds = hv.Dataset(preds2[0],
                kdims=['lon_110', 'lat_110'], vdims=['predict'])
elm = ds.to(hv.Image, ['lon_110', 'lat_110'], group='Elm Hierarchical Model').hist(bin_range=(-50, 450))

ds = hv.Dataset(this_hour_X,
                kdims=['lon_110', 'lat_110'], vdims=['SOIL_M_110_DBLY'])
vic = ds.to(hv.Image, ['lon_110', 'lat_110'], group='VIC Soil Moisture (kg / m2)').hist(bin_range=(-50, 450))

color_range = vic.range('SOIL_M_110_DBLY')
(elm + vic).redim.range(hv.Image, predict=color_range, SOIL_M_110_DBLY=color_range)

### Another `holoviews` example - Visualizing the NLDAS FORA data for one hour

In [None]:
# Build one hv.Image per entry of DIFFERENCE_COLS from its 'hr_0_' variable.
i = []
for col in DIFFERENCE_COLS:
    current = this_hour_X['hr_0_' + col]
    img = hv.Image(current, group=col)
    i += [img]

# Header line with the image count, then one tab-indented column name per line.
header = 'NLDAS FORA Datasets used for each hour({}):'.format(len(i))
print(header + '\n\t' + '\n\t'.join(DIFFERENCE_COLS))

In [None]:
%%opts Layout [tabs=True]
%%opts Image [width=500 height=400]
%opts Image (cmap='viridis')
i[0] + i[1] + i[2] + i[3] + i[4] + i[5] + i[6] + i[7] + i[8] + i[9]