In [1]:
#!/usr/bin/env python

# Python Standard Library Modules
import os
import pathlib
import sys
import warnings

# External Libraries
from gluonts.dataset import common
from gluonts.evaluation.backtest import make_evaluation_predictions
from gluonts.model import deepar
from gluonts.mx.trainer import Trainer
from hyperopt import fmin, hp, tpe, STATUS_OK, STATUS_FAIL, Trials
from hyperopt.pyll import scope
import matplotlib
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose

warnings.filterwarnings("ignore")
mx.random.seed(0)
np.random.seed(0)

prediction_length = 30
# validation_length = 30
# if validation_length:
#     prediction_length = prediction_length + validation_length

## YAHOO Finance Data

In [2]:
def covert_yahoo_series_dir(path: str, prediction_length: int) -> list:
    """Clean and load all coin histories in the Yahoo Finance folder

    Params:
        path: folder full of historical crypto coins timeseries data
        prediction_length: length on which to predict
    Returns:
        List of Gluon-compatible dicts from the coin data
    """
    gluon_list = []
    for file in os.listdir(path):
        coin_gluon_dict = dict()
        file_path = path + file
        coin = pd.read_csv(file_path)
        coin["Date"] = pd.to_datetime(coin["Date"])
        coin.set_index("Date", inplace=True)
        if len(coin) < 100:
            continue
        coin = coin.asfreq("D")
        total_na_vals = coin.isna().sum()[0]
        if (total_na_vals / len(coin)) > 0.25:
            continue
        # Get values for ListDatasets
        coin_closes = coin["Close"]
        coin_closes.index = pd.DatetimeIndex(coin_closes.index)
        coin_closes = coin_closes.asfreq("D")
#         coin_closes.fillna(method='bfill', inplace=True)
#         coin_closes.dropna(inplace=True)
        start = coin_closes.index[0]

        coin_gluon_dict["test"] = {
            "start": start,
            "target": coin_closes,
            "name": file,
        }

#         coin_gluon_dict["validation"] = {
#             "start": start,
#             "target": coin_closes[:-30],
#             "name": file,
#         }

        coin_gluon_dict["train"] = {
            "start": start,
            "target": coin_closes[:-prediction_length],
            "name": file,
        }

        gluon_list.append(coin_gluon_dict)

    return gluon_list


gluon_list = covert_yahoo_series_dir("../data/historical_yahoo_crypto/", 30)

In [3]:
# Enforce List Order BTC, ETH, Cardano at the beginning
new_list = []
btc_gluon_dict = None
eth_gluon_dict = None
card_gluon_dict = None
for index, value in enumerate(gluon_list):
    if value['test']['name'] == "BTC-USD.csv":
        btc_gluon_dict = value
    elif value['test']['name'] == "ETH-USD.csv":
        eth_gluon_dict = value
    elif value['test']['name'] == "ADA-USD.csv":
        card_gluon_dict = value
    else:
        new_list.append(value)


new_list.insert(0, card_gluon_dict)   
new_list.insert(0, eth_gluon_dict)
new_list.insert(0, btc_gluon_dict)

In [4]:
warnings.filterwarnings("always")

import mxnet as mx
mx.random.seed(0)
np.random.seed(0)


test_data = common.ListDataset(
    [
    ],
    freq="D",
)

# validation_data = common.ListDataset(
#     [
#     ],
#     freq="D",
# )

train_data = common.ListDataset(
    [
    ],
    freq="D",
)


for coin_gluon_dict in new_list:
    test_data.list_data.append(coin_gluon_dict['test'])
#     validation_data.list_data.append(coin_gluon_dict['validation'])
    train_data.list_data.append(coin_gluon_dict['train'])
    
    

## HYPEROPT

In [None]:
mx.random.seed(0)

# search_space = {
#     'epochs': scope.int(hp.quniform('epochs', 1, 20, q=1)),
#     'num_layers': scope.int(hp.quniform('num_layers', 1, 8, q=1)),
#     'num_cells': scope.int(hp.quniform('num_cells', 30, 100, q=1)),
#     'cell_type': hp.choice('cell_type', ['lstm', 'gru']),
#     'batch_size': scope.int(hp.quniform('batch_size', 16, 256, q=1)),
#     'learning_rate': hp.quniform('learning_rate', 5e-5, 1e-1, 0.00005),
#     'context_length': scope.int(hp.quniform('context_length', 1, 200, q=1)),
# }


search_space = {
    'epochs': scope.int(hp.quniform('epochs', 1, 10, q=1)),
    'num_layers': scope.int(hp.quniform('num_layers', 1, 4, q=1)),
    'num_cells': scope.int(hp.quniform('num_cells', 30, 100, q=1)),
    'cell_type': hp.choice('cell_type', ['lstm', 'gru']),
    'batch_size': scope.int(hp.quniform('batch_size', 40, 256, q=1)),
    'learning_rate': hp.quniform('learning_rate', 5e-5, 1e-1, 0.00005),
    'context_length': scope.int(hp.quniform('context_length', 1, 200, q=1)),
}

def global_objective(params):
    print(f"TRYING: {params}")
    try:
        deepar_params = {k: v for k,v in params.items() if k not in ('epochs', 'batch_size', 'learning_rate')}
        epochs = params['epochs']
        batch_size = params['batch_size']
        learning_rate = params['learning_rate']
        cell_type = params['cell_type']
        trainer = Trainer(
                          epochs=epochs, 
                          batch_size=batch_size, 
                          learning_rate=learning_rate,
                          clip_gradient=2.5, # to avoid weird endless NaN bug
                          hybridize=False, # to avoid weird endless NaN bug
                          )
        estimator = deepar.DeepAREstimator(
            freq="1D", 
            prediction_length=prediction_length, 
            trainer=trainer,
            **deepar_params
        )
        predictor = estimator.train(training_data=train_data)
        global_loss = 0
        predictions = predictor.predict(train_data.list_data)
        for index, value in enumerate(range(len(new_list))):
            prediction = next(predictions)
            name = test_data.list_data[index]['name']
            # Skip graphs with absurd loss, stablecoins etc.
            if name in {"USDT-USD.csv", "CCXX-USD.csv", "TUSD-USD.csv"}:
                continue
            full_actual = test_data.list_data[index]['target']
            actual = full_actual[-30:]
            preds = pd.Series(prediction.mean)
            preds.index = actual.index

            # SCALING
            scaler = MinMaxScaler()
            scaled_actual = np.array(actual)
            scaler.fit([scaled_actual])
            scaled_actual = scaler.fit_transform(np.array(scaled_actual[:, np.newaxis]))
            scaled_preds = scaler.transform([preds])
            scaled_preds = scaled_preds.reshape(-1, 1)
            mse = mean_squared_error(scaled_actual, scaled_preds)
            global_loss += mse

        return {'loss': global_loss, 'status': STATUS_OK}
    
    except BaseException as e:
        print(f"ERROR: {e}")
        return {'status': STATUS_FAIL}

trials = Trials()
best = fmin(
    global_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)


TRYING: {'batch_size': 63, 'cell_type': 'lstm', 'context_length': 33, 'epochs': 3, 'learning_rate': 0.04735, 'num_cells': 71, 'num_layers': 1}
  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

  return init(self, **all_args)




  0%|          | 0/50 [00:00<?, ?it/s]
[A
 56%|#####6    | 28/50 [00:03<00:02,  8.76it/s, epoch=1/3, avg_epoch_loss=0.0553]


ERROR:                                                 
TRYING: {'batch_size': 248, 'cell_type': 'lstm', 'context_length': 87, 'epochs': 9, 'learning_rate': 0.0175, 'num_cells': 59, 'num_layers': 4}
  1%|          | 1/100 [00:03<05:22,  3.26s/trial, best loss=?]

  return init(self, **all_args)




  0%|          | 0/50 [00:00<?, ?it/s]
[A
 28%|##8       | 14/50 [00:10<00:26,  1.36it/s, epoch=1/9, avg_epoch_loss=2.05]
[A
 50%|#####     | 25/50 [00:18<00:18,  1.38it/s, epoch=1/9, avg_epoch_loss=1.25]


ERROR:                                                         
TRYING: {'batch_size': 67, 'cell_type': 'lstm', 'context_length': 154, 'epochs': 4, 'learning_rate': 0.08695, 'num_cells': 74, 'num_layers': 3}
  2%|▏         | 2/100 [00:21<19:37, 12.02s/trial, best loss=?]

  return init(self, **all_args)




  0%|          | 0/50 [00:00<?, ?it/s]
[A
 16%|#6        | 8/50 [00:08<00:45,  1.08s/it, epoch=1/4, avg_epoch_loss=5.99]


ERROR:                                                         
TRYING: {'batch_size': 214, 'cell_type': 'lstm', 'context_length': 88, 'epochs': 8, 'learning_rate': 0.06035, 'num_cells': 81, 'num_layers': 3}
  3%|▎         | 3/100 [00:30<16:56, 10.48s/trial, best loss=?]

  return init(self, **all_args)




  0%|          | 0/50 [00:00<?, ?it/s]
[A
 36%|###6      | 18/50 [00:10<00:18,  1.72it/s, epoch=1/8, avg_epoch_loss=3.47]
[A


In [6]:
trainer = Trainer(epochs=2, learning_rate=0.0001)
estimator = deepar.DeepAREstimator(
        freq="D", 
#         num_cells=150,
#         num_layers=3,
        prediction_length=prediction_length, 
        trainer=trainer,
#         context_length=100,
#         use_feat_dynamic_real=True,
    )

predictor = estimator.train(
    training_data=train_data,
#     validation_data=validation_data
)


mx.random.seed(0)
global_loss = 0
predictions = predictor.predict(train_data.list_data)
for index, value in enumerate(range(len(gluon_list))):
    prediction = next(predictions)
    name = test_data.list_data[index]['name']
    # Skip graphs with absurd loss, stablecoins etc.
    if name in {"USDT-USD.csv", "CCXX-USD.csv", "TUSD-USD.csv"}:
        continue
#     print(name)
    full_actual = test_data.list_data[index]['target']
    actual = full_actual[-30:]
    preds = pd.Series(prediction.mean)
    preds.index = actual.index
    # PLOT ALL CRYPTO PREDICTIONS
#     plt.figure()
#     preds.plot(legend=True, label=f"{name} PREDICTED")
#     actual.plot(legend=True, label=f"{name} ACTUAL")
#     plt.show()

    # SCALING
    scaler = MinMaxScaler()
    scaled_actual = np.array(actual)
    scaler.fit([scaled_actual])
    scaled_actual = scaler.fit_transform(np.array(scaled_actual[:, np.newaxis]))
    scaled_preds = scaler.transform([preds])
    scaled_preds = scaled_preds.reshape(-1, 1)
    mse = mean_squared_error(scaled_actual, scaled_preds)
#     print(f"mse: {mse}")
    global_loss += mse
    
print(f"global_loss is {global_loss}")
print(f"average global_loss is {global_loss / len(gluon_list)}")

100%|██████████| 50/50 [00:03<00:00, 13.95it/s, epoch=1/2, avg_epoch_loss=0.63]
100%|██████████| 50/50 [00:03<00:00, 15.67it/s, epoch=2/2, avg_epoch_loss=-.13]


global_loss is 47.499963074804114
average global_loss is 0.4611646900466419
