In [None]:
import numpy as np
import pandas as pd
%matplotlib inline

In [None]:
# Read the wave-height CSV into a DataFrame.
csv_path = 'assets/wave_height_hourly.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw table.
data.head(n=5)

In [None]:
# Build a time-indexed series of significant wave height and plot it.
timestamps = pd.to_datetime(data['time'])
wave = pd.Series(data['SignificantWaveHeight'].values, index=timestamps)
wave.plot()

In [None]:
from src.tde import UnivariateTDE

In [None]:
# Forecast the values of the next 12 hours based on the last 6.
FORECAST_HORIZON = 12
N_LAGS = 6
wave_tde = UnivariateTDE(wave, horizon=FORECAST_HORIZON, k=N_LAGS)
wave_tde.head(n=5)

In [None]:
# Target columns are those whose name contains a literal '+'; every other
# column is used as an input feature.
# regex=False matches '+' literally, so no backslash escape is needed
# ('\+' in a non-raw string is an invalid escape sequence).
is_future = wave_tde.columns.str.contains('+', regex=False)

# Fail early if the column naming does not yield both inputs and targets.
assert is_future.any() and not is_future.all(), \
    "expected both feature columns and '+' target columns in wave_tde"

X = wave_tde.iloc[:, ~is_future]
Y = wave_tde.iloc[:, is_future]

X.head()

In [None]:
# Preview the first five rows of the target columns.
Y.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; shuffle=False keeps the original
# row order instead of sampling rows at random.
split_parts = train_test_split(X, Y, test_size=0.3, shuffle=False)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

In [None]:
# Preview the first five rows of the test features.
X_test.head(n=5)

### Ensembles

In [None]:
from sklearn.ensemble \
    import (RandomForestRegressor,
            ExtraTreesRegressor,
            BaggingRegressor)
from sklearn.linear_model \
    import (Lasso,
            Ridge,
            OrthogonalMatchingPursuit,
            ElasticNet)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_decomposition import PLSRegression, PLSCanonical

# Registry of candidate learners: maps each estimator's class name to the
# class itself. Insertion order is the order in which models are built.
METHODS = {
    estimator.__name__: estimator
    for estimator in (
        RandomForestRegressor,
        PLSRegression,
        ExtraTreesRegressor,
        KNeighborsRegressor,
        Ridge,
        ElasticNet,
        BaggingRegressor,
    )
}

# Hyper-parameter grid per learner, keyed by the same names as METHODS.
# Each entry maps a constructor argument to the list of values to try;
# an empty dict means the learner has no grid to expand.
METHODS_PARAMETERS = {
    'RandomForestRegressor': {
        'n_estimators': [50, 100],
        'max_depth': [None, 3, 5],
    },
    'ExtraTreesRegressor': {
        'n_estimators': [50, 100],
        'max_depth': [None, 3, 5],
    },
    'KNeighborsRegressor': {
        'n_neighbors': [1, 5, 10, 20, 50],
        'weights': ['uniform', 'distance'],
    },
    'Ridge': {
        'alpha': [1, 0.5, 0.25, 0.75],
    },
    'ElasticNet': {},
    'PLSRegression': {
        'n_components': [2, 3, 5],
    },
    'BaggingRegressor': {
        'n_estimators': [50, 100],
    },
}



In [None]:
from src.utils import expand_grid_all

# Fixed seed so that the randomized learners (forests, bagging, ...) produce
# the same fitted models on every Restart & Run All.
SEED = 42

# Fitted estimators, keyed '<method name>_<index of the parameter combination>'.
models = {}

for learning_method, estimator_cls in METHODS.items():
    print(f'Creating {learning_method}')

    param_grid = METHODS_PARAMETERS[learning_method]
    if len(param_grid) > 0:
        # NOTE(review): expand_grid_all presumably returns a column-oriented
        # mapping (parameter name -> one value per combination); confirm in
        # src.utils. The number of combinations is the length of its first column.
        gs_df = expand_grid_all(param_grid)
        n_gs = len(gs_df[[*gs_df][0]])
        candidates = [{k: gs_df[k][i] for k in gs_df} for i in range(n_gs)]
    else:
        # No grid for this learner: fit it once with its default settings.
        candidates = [{}]

    n_candidates = len(candidates)
    for i, candidate in enumerate(candidates):
        # Human-readable progress counter starts at 1; the model key keeps index i.
        print(f'Training {i + 1} out of {n_candidates}')

        # A None entry means "leave this argument at the estimator's default".
        pars = {p: v for p, v in candidate.items() if v is not None}
        print(pars)

        model = estimator_cls(**pars)

        # Seed only estimators that expose random_state, and never override a
        # value supplied explicitly through the grid.
        if 'random_state' in model.get_params(deep=False) and 'random_state' not in pars:
            model.set_params(random_state=SEED)

        model.fit(X_train, Y_train)
        models[f'{learning_method}_{i}'] = model



In [None]:
# Forecast the test period with every fitted model.
# Each prediction frame reuses Y_test's index and column labels so that
# forecasts stay aligned with the true targets for comparison and plotting.
preds_all = {
    method_: pd.DataFrame(
        fitted.predict(X_test),
        index=Y_test.index,
        columns=Y_test.columns,
    )
    for method_, fitted in models.items()
}

In [None]:
# List the names under which forecasts are stored.
preds_all.keys()

In [None]:
# Number of forecast sets currently stored in preds_all.
len(preds_all)

In [None]:
# Inspect the forecasts of the random-forest model stored under index 0.
preds_all['RandomForestRegressor_0']

In [None]:
# Base models only: a later cell stores the averaged forecast back into
# preds_all under 'Ensemble', so skip that key to keep this cell idempotent
# when it is re-run after the ensemble has been added.
model_names = [name for name in preds_all if name != 'Ensemble']

# All prediction frames share the same columns; take them from the first model.
horizon_names = preds_all[model_names[0]].columns.tolist()

# Regroup the forecasts: one DataFrame per horizon column, one column per model.
yhat_by_horizon_ = {}
for h_ in horizon_names:
    yhat_by_horizon_[h_] = pd.DataFrame(
        {name: preds_all[name][h_] for name in model_names}
    )

In [None]:
# Forecasts of every model for the 't+1' target column
# (presumably the one-step-ahead horizon; confirm against UnivariateTDE's column naming).
yhat_by_horizon_['t+1']

In [None]:
# Simple ensemble: for each horizon, average the forecasts across models row by row.
avg_preds = pd.DataFrame({
    horizon: per_model.mean(axis=1)
    for horizon, per_model in yhat_by_horizon_.items()
})
avg_preds.head(n=5)

In [None]:
# Store the averaged forecast alongside the individual models' forecasts.
preds_all.update({'Ensemble': avg_preds})

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test set for every stored forecast
# (each individual model plus whatever else is in preds_all).
error = {
    name: mean_squared_error(Y_test, forecast)
    for name, forecast in preds_all.items()
}

error

In [None]:
# Bar chart of the test error per model, sorted from lowest to highest.
ranked_error = pd.Series(error).sort_values()
ranked_error.plot.bar(figsize=(30, 13), fontsize=30)