In [None]:
import numpy as np
import pandas as pd
%matplotlib inline

In [None]:
# Load the hourly wave-height records from the CSV shipped under assets/.
csv_path = 'assets/wave_height_hourly.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw table.
data.head(n=5)

In [None]:
# Build a series of significant wave height indexed by the parsed 'time'
# column, then plot it. Passing `.values` discards the original index and
# leaves the series unnamed.
timestamps = pd.to_datetime(data['time'])
heights = data['SignificantWaveHeight'].values
wave = pd.Series(heights, index=timestamps)
wave.plot()

In [None]:
from src.tde import UnivariateTDE

In [None]:
# Predict the next value (horizon=1) from the 6 preceding ones (k=6),
# working on the first-differenced series rather than the raw heights.
# NOTE(review): diff() leaves a NaN in the first position -- confirm that
# UnivariateTDE drops incomplete rows.
wave_diff = wave.diff()
wave_tde = UnivariateTDE(wave_diff, horizon=1, k=6)
wave_tde.head(n=5)

In [None]:
# Separate the explanatory variables from the target variable: 't+1' is the
# target, every other column of the embedded frame is a predictor.
target_col = 't+1'
y = wave_tde[target_col]
X = wave_tde.drop(columns=target_col)

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out split without shuffling: row order is preserved and the final
# 30% of the rows become the test set.
holdout_parts = train_test_split(X, y, test_size=0.3, shuffle=False)
X_train, X_test, y_train, y_test = holdout_parts

In [None]:
# Preview the first five rows of the training predictors.
X_train.head(n=5)

In [None]:
# Preview the first five training targets.
y_train.head(n=5)

# Estimação de Desempenho

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Manual time-series cross-validation on the training portion only: each of
# the 5 folds fits a fresh Ridge model on the fold's training indices and
# records the mean absolute error on the fold's validation indices.
cv = TimeSeriesSplit(n_splits=5)

# The fold indices are positional, so index into the raw arrays.
X_arr = X_train.values
y_arr = y_train.values

cv_err = []
for tr_idx, ts_idx in cv.split(X_train, y_train):
    print('Training indices:')
    print(tr_idx[-5:])

    print('Training the model')
    model = Ridge()
    model.fit(X_arr[tr_idx], y_arr[tr_idx])

    print('Making predictions')
    pred = model.predict(X_arr[ts_idx])

    print('Computing error')
    cv_err.append(mean_absolute_error(y_arr[ts_idx], pred))

print(cv_err)

In [None]:
# Error estimated by cross-validation: the mean of the per-fold MAE values.
mean_cv_err = np.mean(cv_err)
print(f'Average error is: {mean_cv_err}')

In [None]:
# Error obtained on the test set: refit Ridge on the whole training portion
# and score its predictions on the held-out rows.
model = Ridge().fit(X_train, y_train)
pred_test = model.predict(X_test)
err_test = mean_absolute_error(y_true=y_test, y_pred=pred_test)
print(f'Test error is: {err_test}')

# Seleção de Modelo

In [None]:
# Candidate regressors to compare, keyed by display name.
# The forest and the tree are randomised learners; pinning random_state
# keeps the comparison reproducible across Restart & Run All.
RANDOM_STATE = 42

model_list = {
    'Ridge': Ridge(),
    # NOTE(review): Lasso's default alpha may be too strong for a differenced
    # series -- confirm it is a meaningful baseline here.
    'Lasso': Lasso(),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=10,
                                                   random_state=RANDOM_STATE),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'KNeighborsRegressor': KNeighborsRegressor(),
}

from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validate every candidate on the training portion with the shared
# time-series splitter and keep its mean MAE.
err_by_method = {}
for method, estimator in model_list.items():
    print(f'Running model: {method}')
    fold_scores = cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=cv,
        scoring='neg_mean_absolute_error',
    )

    # The scorer reports negated MAE, so flip the sign back to an error.
    err_by_method[method] = -np.mean(fold_scores)

In [None]:
# Bar chart of the cross-validated MAE of each candidate model.
pd.Series(err_by_method).plot(kind='bar')

In [None]:
from pprint import pprint
# Print the per-model cross-validated MAE values as text.
pprint(err_by_method)

# Seleção de Parâmetros

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid searched for KNeighborsRegressor: two weighting
# schemes crossed with three neighbourhood sizes (6 combinations).
parameters = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': [1, 5, 10],
}

In [None]:
# Grid search over the KNN hyper-parameters using time-ordered folds.
model = KNeighborsRegressor()
cv = TimeSeriesSplit(n_splits=5)

# Score candidates with (negated) MAE so the search optimises the same
# metric that is reported everywhere else in this notebook; without
# `scoring`, GridSearchCV falls back to the estimator's default score (R^2).
# refit=True retrains the best configuration on all of the data passed to fit.
grid_search = GridSearchCV(estimator=model,
                           param_grid=parameters,
                           scoring='neg_mean_absolute_error',
                           cv=cv,
                           refit=True)


In [None]:
# Run the search on the training portion only; the test set stays untouched.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Show the estimator refitted with the best-scoring parameter combination.
grid_search.best_estimator_

In [None]:
# Score the refitted best configuration on the held-out test set.
# NOTE: this overwrites the Ridge test error stored in err_test earlier.
pred_test_best = grid_search.predict(X_test)
err_test = mean_absolute_error(y_true=y_test, y_pred=pred_test_best)
print(f'Test error is: {err_test}')