# NLDAS Data Exploration

This notebook accomplishes the following:

- Downloads data file(s) from NASA
- Shows attribute statistics and visualizations
- Performs viz-related data cleaning
- Shows (corrected) attribute statistics and visualizations

### Setup Instructions:
1. Create *.netrc* file in home dir according to [GES DISC site instructions](https://disc.gsfc.nasa.gov/information/howto?title=How%20to%20Download%20Data%20Files%20from%20HTTP%20Service%20with%20wget)
2. Create environment, install notebook pkgs, enable extension:
```
conda create -n elm python=2.7 # 2.7 needed for pynio
source activate elm
conda install -c conda-forge pycurl lxml holoviews
jupyter nbextension enable --py widgetsnbextension # This should report "OK"
```

In [None]:
from __future__ import absolute_import, division, print_function

import gc
import os
import getpass

import six
import holoviews as hv
import numpy as np
import pandas as pd
import xarray as xr
from example_utils import GRBSelector, get_metadata, dl_file

# Load the HoloViews notebook extension with the Bokeh plotting backend.
hv.notebook_extension('bokeh')
#%matplotlib inline

## Download NLDAS GRIB file

This persists the file to disk, then loads the data into RAM as an xarray Dataset object.

In [None]:
# Build the GRIB file selector from example_utils; leaving it as the last
# expression displays it in the cell output.
# NOTE(review): the URL read in the following cells depends on the state of
# this object, so results are only reproducible for the same selection.
selector = GRBSelector()
selector

In [None]:
# Show the URL currently held by the selector; the download cell uses this value.
selector.selected_url

In [None]:
# Fetch the selected file with dl_file; its return value is used as the file
# path that xarray opens through the PyNIO engine.
data_fpath = dl_file(selector.selected_url)
ds = xr.open_dataset(data_fpath, engine='pynio')
ds

### Attributes alongside their descriptions

In [None]:
# Collect one formatted line per data variable: name, long name, units and
# initial time, then print them under a header naming the source file.
info = []
for var_name in ds.data_vars:
    # `raster` is deliberately left bound after the loop: the next cell displays it.
    raster = ds[var_name]
    info.append('{:<20} {} ({}) - {}'.format(
        var_name, raster.long_name, raster.units, raster.initial_time))
header = 'Rasters in {}\n'.format(os.path.basename(data_fpath))
print(header, '\n  '.join(info), sep='\n  ')

In [None]:
# `raster` is the loop variable left over from the previous cell, i.e. the
# last data variable that the loop visited.
raster

## Statistics and visualizations

Below we show the data as-is.

In [None]:
# Flatten the dataset to a DataFrame and summarise every column, including
# extra tail percentiles (2.5%, 5%, 95%, 97.5%).
ds.to_dataframe().describe(percentiles=(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975))

In [None]:
%opts Image RGB [width=300 height=200]
hvds = hv.Dataset(ds)
imgs = [hvds.to(hv.Image, ['lon_110', 'lat_110'], var).relabel(var) for var in ds.data_vars]
hv.Layout(imgs)

## Viz-related data cleaning

Noticing that -9999 seems to confuse the visualizations, we replace -9999 values with 0.

In [None]:
def set_to_na(da):
    """Overwrite the -9999 fill value in ``da`` with 0, in place.

    Despite the name, matching cells are set to 0 (not NaN), as described in
    the notebook text above this cell.

    Parameters
    ----------
    da : array-like wrapper
        Any object exposing a writable NumPy array as ``.values``
        (the notebook passes the data variables of ``ds``).

    Returns
    -------
    The same ``da`` object that was passed in. Returning it (rather than
    None) gives ``Dataset.apply`` a real array for every variable, which is
    what that API expects from its callback.
    """
    values = da.values
    # isclose rather than ==, so float round-off in the sentinel still matches.
    values[np.isclose(values, -9999.)] = 0
    return da
# Run the cleanup over every data variable (set_to_na edits each array in
# place), then recompute the summary statistics on the modified dataset.
ds.apply(set_to_na)
ds.to_dataframe().describe(percentiles=(0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975))

## Corrected visualizations

In [None]:
# Rebuild the per-variable images from the current state of `ds`, this time
# using each variable's long name (in parentheses) as the image group.
hvds = hv.Dataset(ds)
imgs = []
for var in ds.data_vars:
    group_label = '(' + ds[var].long_name + ')'
    img = hvds.to(hv.Image, ['lon_110', 'lat_110'], var, group=group_label)
    imgs.append(img.relabel(var))
hv.Layout(imgs)

In [None]:
from elm.model_selection import EaSearchCV
from xarray_filters import MLDataset

In [None]:
# Wrap the xarray Dataset in an MLDataset (from xarray_filters); the Sampler
# step defined later reads this notebook-level variable.
dset = MLDataset(ds)

In [None]:
# Display the MLDataset.
dset

In [None]:
# Preview the features form of the dataset; this is the same call the Sampler
# step makes to feed the search.
dset.to_features()

In [None]:
import datetime
from sklearn.model_selection import KFold
from itertools import product
from xarray_filters.pipeline import Step
from elm.pipeline import Pipeline
from elm.pipeline.steps import linear_model, decomposition, cluster
from elm.model_selection import EaSearchCV
from elm.model_selection.sorting import pareto_front
from elm.pipeline import Pipeline
from elm.model_selection import CVCacheSampler
from elm.pipeline.predict_many import predict_many
from elm.pipeline.steps import linear_model, cluster, decomposition
import sklearn.model_selection as sk_model_selection

In [None]:
# Search window: MAX_TIME_STEPS hourly timestamps counting back from START_DATE.
START_DATE = datetime.datetime(2000, 1, 1, 0, 0, 0)
MAX_TIME_STEPS = 8
DATES = np.array([
    START_DATE - datetime.timedelta(hours=hr) for hr in range(MAX_TIME_STEPS)
])
# One integer label per date, spread evenly across 0..5.
DATE_GROUPS = np.linspace(0, 5, DATES.size).astype(np.int32)
CV_CLASSES = dict(KFold=KFold)

# Evolutionary-algorithm settings passed to EaSearchCV as `model_selection`.
model_selection = dict(
    select_method='selNSGA2',
    crossover_method='cxTwoPoint',
    mutate_method='mutUniformInt',
    init_pop='random',
    indpb=0.5,
    mutpb=0.9,
    cxpb=0.3,
    eta=20,
    ngen=2,
    mu=16,
    k=8,  # TODO ensure that k is not ignored - make elm issue if it is
    early_stop=None,
)


class Sampler(Step):
    """Pipeline step that supplies the feature data for the search.

    NOTE: the step ignores its arguments and reads the notebook-level
    ``dset`` variable, so the cell defining ``dset`` must have run first.
    """
    def transform(self, X, y=None, **kw):
        # X, y and kw are unused; every call returns the features of the global dset.
        return dset.to_features()


class GetY(Step):
    """Pipeline step that splits one layer off a dataset to use as the target.

    ``layer`` names the data variable taken as ``y``; all remaining data
    variables are turned into the feature matrix.
    """
    layer = 'y'

    def transform(self, X, y=None, **kw):
        """Return ``(features, y)`` extracted from the dataset ``X``.

        ``y`` is the flattened values of the configured layer; ``features``
        is the values of ``.features`` after rebuilding an MLDataset from the
        other data variables and calling ``to_features()``. The incoming
        ``y`` argument and ``kw`` are ignored.
        """
        # OrderedDict is not brought in by any of the notebook's import cells,
        # so without this local import the first call raises NameError.
        from collections import OrderedDict

        layer = self.get_params()['layer']
        y = getattr(X, layer).values.ravel()
        X = MLDataset(OrderedDict([(k, v) for k, v in X.data_vars.items()
                                    if k != layer])).to_features()
        return X.features.values, y
    # Fitting is a no-op for this step, so fit_transform is the same callable.
    fit_transform = transform


# TODO - also test regressors
# Search space for the Ridge 'estimator' step.
regress_distributions = {
    'estimator__fit_intercept': [True, False],
    'estimator__normalize': [True, False],
}

# Search space for the KMeans 'estimator' step.
kmeans_distributions = {
    'estimator__n_clusters': list(range(4, 12)),
    'estimator__init': ['k-means++', 'random'],
    'estimator__copy_x': [False],
    # Each algorithm appears once. The list previously repeated "auto", which
    # double-weighted it in the search and never exercised "elkan".
    'estimator__algorithm': ["auto", "full", "elkan"],
}
# Search space for the 'pca' step.
pca_distributions = {
    'pca__n_components': list(range(2, 4)),
    'pca__whiten': [True, False],
}

# Supervised pipeline: split off the target layer, then fit a Ridge regressor.
regress = Pipeline([
    ('get_y', GetY()),
    ('estimator', linear_model.Ridge()),
])

# Supervised pipeline with a PCA step between the target split and the regressor.
pca_regress = Pipeline([
    ('get_y', GetY()),
    ('pca', decomposition.PCA()),
    ('estimator', linear_model.Ridge()),
])

# Unsupervised pipeline: a single KMeans step. The step name 'estimator' must
# match the 'estimator__*' keys used in kmeans_distributions.
kmeans = Pipeline([
    ('estimator', cluster.KMeans()),
])

# Candidate pipelines, keyed by scenario name.
pipes = {'one_step_unsupervised': kmeans,
         'get_y_supervised':  regress,
         'get_y_pca_then_regress': pca_regress,}

# Parameter search space for each pipeline, keyed like `pipes`.
# The PCA+regression entry is a freshly merged dict: calling .update() on the
# entry would mutate the shared pca_distributions dict through aliasing.
dists = {'one_step_unsupervised': kmeans_distributions,
         'get_y_supervised': regress_distributions,
         'get_y_pca_then_regress': dict(pca_distributions, **regress_distributions),}

# Scenario that the search below actually runs.
DEFAULT = 'one_step_unsupervised'

# Pick the pipeline and search space for the default scenario.
pipe, param_distributions = pipes[DEFAULT], dists[DEFAULT]
cv = KFold()
sampler = Sampler()
refit = True
# Data used for the final refit of the best estimator.
refit_Xy = sampler.fit_transform([datetime.datetime(2000, 1, 1)])
eas = []

search_options = dict(
    param_distributions=param_distributions,
    sampler=sampler,
    ngen=2,
    model_selection=model_selection,
    cv=cv,
    refit=refit,
    refit_Xy=refit_Xy,
)
ea = EaSearchCV(pipe, **search_options)
ea.fit(DATES) # TODO test that y is passed as a cv grouping variable

# Sanity check: the fitted search must expose per-generation CV results.
results = getattr(ea, 'cv_results_', None)
assert isinstance(results, dict) and 'gen' in results
