In [None]:
#
# Initialization
#

import os
import sys

import ipynbname
from pathlib import Path

# Set notebook's src module path. Note that you may have to update your IDE's project settings to do the same for the
#  local library imports to work the same
MODULE_PATH = ipynbname.path().parent.parent
sys.path.append(str(MODULE_PATH))

# Keep paths consistent throughout notebook
os.chdir(MODULE_PATH)

# This should always be `./src`
print(f"Current working directory [{os.getcwd()}]")

# Place all local artifacts in a disposable, git-ignored directory
local_artifact_dir = Path(os.getcwd()).parent / "out"
local_artifact_dir.mkdir(parents=True, exist_ok=True)

# Autoreload imports at the beginning of cell execution.
#  https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [None]:
#
# Setup utils
#

import subprocess

from utils.logger_util import LoggerUtil
from utils.utils import Utils
from utils import config

# Shared logger (writes under the local artifact directory) and utility helper used by the rest of the notebook
LOGGER = LoggerUtil(config.MODEL_ID, local_artifact_dir / "logs")
UTILS = Utils(LOGGER)

UTILS.describe_env()

# AWS instance specs can be found here https://aws.amazon.com/sagemaker/pricing/
AWS_INSTANCE = 'ml.m5.large' # 2 vCPU, 0 GPU, 8 GB memory, $0.134/hour
AWS_INSTANCE_2 = 'ml.m5.4xlarge' # 16 vCPU, 0 GPU, 64 GB memory, $0.922/hour
AWS_GPU_INSTANCE = 'ml.g4dn.xlarge' # 4 vCPU, 1 GPU, 16 GB memory, $0.736/hour
AWS_GPU_INSTANCE_2 = 'ml.g4dn.2xlarge' # 8 vCPU, 1 GPU, 32 GB memory, $1.053/hour
AWS_GPU_INSTANCE_3 = 'ml.g4dn.4xlarge' # 16 vCPU, 1 GPU, 64 GB memory, $1.505/hour
AWS_GPU_INSTANCE_4 = 'ml.g4dn.8xlarge' # 32 vCPU, 1 GPU, 128 GB memory, $2.72/hour
AWS_GPU_INSTANCE_5 = 'ml.g4dn.16xlarge' # 64 vCPU, 1 GPU, 256 GB memory, $5.44/hour

# Local mode defaults to CPU and is upgraded to GPU only when `nvidia-smi` runs and exits with status 0.
LOCAL_INSTANCE = 'local'
try:
    if subprocess.call('nvidia-smi') == 0:
        LOCAL_INSTANCE = 'local_gpu'
except OSError:
    # Raised when the binary is missing or cannot be executed. Catching only OSError (instead of a bare `except`)
    # keeps the CPU fallback while letting KeyboardInterrupt and genuine programming errors propagate.
    LOGGER.log("The nvidia-smi binary was not found and thus GPU computation is not supported. Using the default CPU "
               "computation")

# Change this to your desired instance type
INSTANCE_TYPE = LOCAL_INSTANCE
# True when the selected instance type is the local one detected above
IS_LOCAL = LOCAL_INSTANCE == INSTANCE_TYPE

# Does the model use filedataset or CSVs
FILEDATASET_BASED = True

# Is the model univariate
ONE_DIM_TARGET = True

In [None]:
#
# Parse dataset
#

from data_processing.parse import Parse

# Directory (inside the local artifact directory) that receives the generated datasets
dataset_dir_path = local_artifact_dir / "datasets"

PARSE = Parse(LOGGER)

# Build the train and test datasets in `dataset_dir_path`, truncating at the given starting date.
# Alternative starting date kept for reference: "2021-03-01 00:00:00"
PARSE.create_train_test_dataset(
    dataset_dir_path,
    starting_date_truncate="2020-01-01 00:00:00",
    one_dim_target=ONE_DIM_TARGET,
    filedataset_based=FILEDATASET_BASED,
)

In [None]:
#
# Setup local/aws environment. If aws, upload the datasets to S3
#

from data_processing.aws_handler import AWSHandler
from sagemaker import LocalSession

AWS_HANDLER = AWSHandler(LOGGER, config.MODEL_ID)

# Filled in by whichever branch below applies
sagemaker_session = None

# Local directory for this model's trained artifacts
model_output_dir_path = local_artifact_dir / config.MODEL_ID / "models"
model_output_dir_path.mkdir(parents=True, exist_ok=True)

if not IS_LOCAL:
    # Remote run: reuse the handler's session, upload the datasets, and address everything through the handler's
    # S3 URIs
    sagemaker_session = AWS_HANDLER.sagemaker_session

    AWS_HANDLER.upload_train_datasets(dataset_dir_path, filedataset_based=FILEDATASET_BASED)
    dataset_dir_uri = AWS_HANDLER.s3_dataset_dir_uri

    model_output_dir_uri = AWS_HANDLER.s3_model_output_uri
else:
    LOGGER.log("Notebook is set to local mode, not uploading to S3")

    # Local run: datasets and model output are addressed through file:// URIs
    dataset_dir_uri = f"file://{dataset_dir_path}"
    model_output_dir_uri = f"file://{model_output_dir_path}"

    # Local-mode session configured to use local code, with the model output directory as container root
    sagemaker_session = LocalSession()
    sagemaker_session.config = {
        'local': {
            'local_code': True,
            'container_root': str(model_output_dir_path)
        }
    }

LOGGER.log(f"Model output dir is [{model_output_dir_uri}]")

In [None]:
#
# Configure sagemaker and estimator
#

from ml.train import Train

TRAIN = Train(LOGGER)

# Local runs pass no extra estimator arguments; remote runs add output locations, spot instances and time limits
train_kwargs = {}
if not IS_LOCAL:
    train_kwargs.update({
        # 'checkpoint_s3_uri': model_output_dir_uri,
        'output_path': model_output_dir_uri,
        'code_location': model_output_dir_uri,
        'use_spot_instances': True,
        'max_wait': 18 * 60 * 60, # 18 hours
        'max_run': 18 * 60 * 60, # 18 hours
    })

# Build the estimator for the chosen instance type, then train it on the datasets
estimator = TRAIN.create_model(config.SM_ROLE, INSTANCE_TYPE, sagemaker_session, train_kwargs)
TRAIN.fit_model(estimator, dataset_dir_uri)

In [None]:
#
# Load model
#

import pandas as pd
import matplotlib.pyplot as plt
from itertools import islice
from gluonts.model.predictor import Predictor

# Uncomment if you want to quickly compare AWS model with local model
# IS_LOCAL = False

if not IS_LOCAL:
    # Remote run: fetch the trained model from S3 into the local artifact directory
    model_dir_path = AWS_HANDLER.download_model_from_s3(str(TRAIN.model_data_path), local_artifact_dir)
else:
    # model_output_dir_path is basically the same path as it was before, though sagemaker appends a random temp
    # directory to the path. The path from TRAIN includes that random temp directory
    # model_dir_path = TRAIN.model_data_path.parent.parent / "model"
    # NOTE(review): the active path below is `<artifact dir>/local_cli/model`, not a path derived from TRAIN —
    # confirm this is the directory the local training run actually writes to.
    model_dir_path = local_artifact_dir / "local_cli" / "model"

LOGGER.log(f"Model dir is [{model_dir_path}]")

# Rebuild the predictor from the serialized model directory and log its attributes
predictor = Predictor.deserialize(model_dir_path)
LOGGER.log(f"Predictor metadata [{vars(predictor)}]")

def plot_prob_forecasts(ts_list, forecast_list, plot_length=100):
    """Plot each observed series together with its probabilistic forecast, one figure per pair.

    Args:
        ts_list: Iterable of observed series. Each item must support slicing and expose
            ``.plot(figsize=..., linewidth=...)``.
        forecast_list: Iterable of forecasts paired positionally with ``ts_list``. Each item must
            expose ``.plot(prediction_intervals=..., color=...)``. Any iterable is accepted; it
            does not need to support ``len()``.
        plot_length: Number of trailing observations of each series to draw.

    Pairs are consumed until the shorter of the two inputs is exhausted.
    """
    # Loop-invariant: the intervals and the legend labels (widest interval listed first) are the same for every figure.
    prediction_intervals = (50.0, 90.0)
    legend = ["observations", "median prediction"] + [
        f"{k}% prediction interval" for k in reversed(prediction_intervals)
    ]

    # zip() already stops at the shorter input, so no islice/len cap is needed.
    for target, forecast in zip(ts_list, forecast_list):
        # The series plot creates the figure; the forecast is then drawn onto the current axes.
        target[-plot_length:].plot(figsize=(10, 7), linewidth=2)
        forecast.plot(prediction_intervals=prediction_intervals, color='g')
        plt.grid(which="both")
        plt.legend(legend, loc="upper left")
        plt.show()

def plot_prob_forecasts_multi(ts_list, forecast_list, close_index, plot_length=60):
    """Plot one dimension of each multivariate series together with its forecast, one figure per pair.

    Args:
        ts_list: Iterable of observed multivariate series. ``target[close_index]`` must yield a
            sliceable object exposing ``.plot(ax=...)``.
        forecast_list: Iterable of forecasts paired positionally with ``ts_list``. Each item must
            expose ``.copy_dim(close_index)`` whose result has
            ``.plot(prediction_intervals=..., color=...)``. Any iterable is accepted; it does not
            need to support ``len()``.
        close_index: Index of the dimension to draw from both the series and the forecast.
        plot_length: Number of trailing observations of the selected dimension to draw.

    Pairs are consumed until the shorter of the two inputs is exhausted.
    """
    # Loop-invariant: the intervals and the legend labels (widest interval listed first) are the same for every figure.
    prediction_intervals = (50.0, 90.0)
    legend = ["observations", "median prediction"] + [
        f"{k}% prediction interval" for k in reversed(prediction_intervals)
    ]

    # zip() already stops at the shorter input, so no islice/len cap is needed.
    for target, forecast in zip(ts_list, forecast_list):
        _, ax = plt.subplots(1, 1, figsize=(10, 7))
        target[close_index][-plot_length:].plot(ax=ax)  # plot the time series
        forecast.copy_dim(close_index).plot(prediction_intervals=prediction_intervals, color='g')
        plt.grid(which="both")
        plt.legend(legend, loc="upper left")
        plt.show()

In [None]:
#
# Define test data and make a prediction
#

from gluonts.evaluation.backtest import make_evaluation_predictions
from gluonts.dataset.common import load_datasets
from gluonts.dataset.stat import calculate_dataset_statistics
from gluonts.dataset.split import DateSplitter

import data_processing.gluonts_helper as gh

# Timestamps at which the test data is cut; one test dataset is built per entry.
test_dates = ["2021-05-27 12:50:00", "2021-05-27 15:55:00", "2021-05-28 16:00:00", "2021-05-28 17:00:00"]
# test_dates = ["2021-05-22 17:00:00"]
# Outputs of this cell: one entry per test date, plus the feature column list printed at the end.
test_datasets = []
feature_columns = []

if FILEDATASET_BASED:
    # Each argument is the parent directory of the corresponding configured file path.
    datasets = load_datasets(
        metadata=(dataset_dir_path / config.METADATA_DATASET_FILENAME).parent,
        train=(dataset_dir_path / config.TRAIN_DATASET_FILENAME).parent,
        test=(dataset_dir_path / config.TEST_DATASET_FILENAME).parent,
        one_dim_target=ONE_DIM_TARGET,
        cache=True
    )

    # Recover the feature column list from the metadata: entries named `feature_column_<index>` are collected by
    # their integer index, then laid out in index order (0 is used for any index that is missing).
    # NOTE(review): the stored value is `feat.cardinality`; the evaluation cell looks up "close" in
    # `feature_columns`, so presumably the column name is carried in that field — confirm against the code that
    # writes the metadata.
    feature_columns_map = {}
    for feat in datasets.metadata.feat_static_cat:
        if feat.name.startswith("feature_column_"):
            feature_index = int(feat.name.split("_")[2])
            feature_columns_map[feature_index] = feat.cardinality
    feature_columns = [feature_columns_map.get(ele, 0) for ele in range(len(feature_columns_map))]

    for idx, date in enumerate(test_dates):
        # 1) Get a slice of the dataset for each test date, with ample history
        splitter = DateSplitter(pd.Period(date, freq='T'))
        # Only the test template is used below; `train_dataset` is not read again in this cell.
        train_dataset, test_template = splitter.split(datasets.test)
        # NOTE(review): `prediction_length` is passed negated here — confirm this is what the installed gluonts
        # splitter expects.
        test_dataset = test_template.generate_instances(
            prediction_length=-config.HYPER_PARAMETERS["prediction_length"],
            max_history=config.FREQTRADE_MAX_CONTEXT,
        )

        # Entries appended by this branch expose their data through `.dataset`.
        LOGGER.log(f"Test dataset [{idx}] stats: {calculate_dataset_statistics(test_dataset.dataset)}")
        test_datasets.append(test_dataset)
else:
    # CSV-based flow: read the test CSV with its first column as the index.
    test_dataset_filename = dataset_dir_path / config.TEST_CSV_FILENAME
    test_df = pd.read_csv(filepath_or_buffer=test_dataset_filename, header=0, index_col=0)

    feature_columns = gh.get_feature_columns(test_df, exclude_close=False)

    for idx, date in enumerate(test_dates):
        # Keep the rows up to `date`, limited to the last FREQTRADE_MAX_CONTEXT of them.
        # NOTE(review): presumably the index holds timestamps comparable to `date` — confirm.
        split_df = test_df[:date].tail(config.FREQTRADE_MAX_CONTEXT)
        test_dataset = gh.df_to_multivariate_target_dataset(split_df, feature_columns)

        # Entries appended by this branch are passed to gluonts directly (no `.dataset` access), unlike the
        # file-dataset branch above; consumers of `test_datasets` have to cope with both kinds.
        LOGGER.log(f"Test dataset [{idx}] stats: {calculate_dataset_statistics(test_dataset)}")
        test_datasets.append(test_dataset)

print(f"feature_columns are [{feature_columns}]")

In [None]:
#
# Evaluate and visualize the prediction
#
import json

from gluonts.evaluation import Evaluator, MultivariateEvaluator

# Evaluate the predictor on every prepared test dataset: log aggregated metrics and plot the forecasts.
for test_dataset in test_datasets:
    # Entries built from the file dataset expose their series through `.dataset`, whereas entries built from the
    # CSV are handed to gluonts directly when their statistics are computed. Unwrap only when the attribute exists
    # so that both kinds of entry can be evaluated instead of failing on the CSV-based ones.
    forecast_it, ts_it = make_evaluation_predictions(
        dataset=getattr(test_dataset, "dataset", test_dataset),  # test dataset
        predictor=predictor,  # predictor
        num_samples=100,  # number of sample paths we want for evaluation
    )

    # Materialize both iterators: they are consumed by the evaluator and again by the plots below.
    forecasts = list(forecast_it)
    # Only used by the optional diagnostics below
    forecast_entry = forecasts[0]
    tss = list(ts_it)

    # LOGGER.log(f"Number of sample paths: {forecast_entry.num_samples}")
    # LOGGER.log(f"Dimension of samples: {forecast_entry.samples.shape}")
    # LOGGER.log(f"Start date of the forecast window: {forecast_entry.start_date}")
    # LOGGER.log(f"Frequency of the time series: {forecast_entry.freq}")

    if ONE_DIM_TARGET:
        evaluator = Evaluator(quantiles=[0.1])
    else:
        evaluator = MultivariateEvaluator(quantiles=[0.1])

    agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=len(test_dataset))

    # Drop every aggregated metric whose name starts with a digit before logging.
    # Iterate over a copy of the keys because entries are deleted while looping.
    for key in list(agg_metrics.keys()):
        if key[0].isdigit():
            del agg_metrics[key]
    LOGGER.log("Aggregated performance")
    LOGGER.log(json.dumps(agg_metrics, indent=4))

    if ONE_DIM_TARGET:
        plot_prob_forecasts(tss, forecasts)
    else:
        # Multivariate target: report and plot only the "close" dimension
        close_index = feature_columns.index("close")
        # close_index = feature_columns.index("log_return_close")
        LOGGER.log("'close' performance")
        LOGGER.log(item_metrics.iloc[close_index])

        plot_prob_forecasts_multi(tss, forecasts, close_index)

In [None]:
from mxnet.gluon import nn
import numpy as np

def count_model_params(net: "nn.HybridBlock") -> int:
    """Return the total number of scalar parameters held by ``net``.

    Args:
        net: Network whose ``collect_params()`` returns a mapping of parameters, each exposing a
            ``shape``. The annotation is quoted so it is not evaluated when the function is defined.

    Returns:
        The sum, over all collected parameters, of the product of each parameter's shape, as a
        built-in ``int``. A network without parameters yields ``0``.
    """
    # Convert each product to a built-in int: np.prod returns a numpy scalar (a float for an empty
    # shape), which would otherwise leak into the total despite the declared `int` return type.
    return sum(int(np.prod(param.shape)) for param in net.collect_params().values())

# Report the size of the network wrapped by the loaded predictor, labelled with the network's class name
num_model_param = count_model_params(predictor.prediction_net)
net_name = predictor.prediction_net.__class__.__name__
print(f"Number of parameters in {net_name}: {num_model_param}")

def plot_prob_forecasts_multi2(ts_list, forecast_list, close_index, plot_length=60):
    """Print each forecast, then plot one dimension of its series and forecast, one figure per pair.

    Args:
        ts_list: Iterable of observed multivariate series. ``target[close_index]`` must yield a
            sliceable object exposing ``.plot(ax=...)``.
        forecast_list: Iterable of forecasts paired positionally with ``ts_list``. Each item must
            expose ``.copy_dim(close_index)`` whose result has
            ``.plot(prediction_intervals=..., color=...)``. Any iterable is accepted; it does not
            need to support ``len()``.
        close_index: Index of the dimension to draw from both the series and the forecast.
        plot_length: Number of trailing observations of the selected dimension to draw.

    Pairs are consumed until the shorter of the two inputs is exhausted.
    """
    # Loop-invariant: the intervals and the legend labels (widest interval listed first) are the same for every figure.
    prediction_intervals = (50.0, 90.0)
    legend = ["observations", "median prediction"] + [
        f"{k}% prediction interval" for k in reversed(prediction_intervals)
    ]

    # zip() already stops at the shorter input, so no islice/len cap is needed.
    for target, forecast in zip(ts_list, forecast_list):
        # Dump the raw forecast object before drawing it
        print(forecast)
        _, ax = plt.subplots(1, 1, figsize=(10, 7))
        target[close_index][-plot_length:].plot(ax=ax)  # plot the time series
        forecast.copy_dim(close_index).plot(prediction_intervals=prediction_intervals, color='g')
        plt.grid(which="both")
        plt.legend(legend, loc="upper left")
        plt.show()

# close_index = feature_columns.index("close")
# LOGGER.log("'close' performance")
# LOGGER.log(item_metrics.iloc[close_index])
#
# plot_prob_forecasts_multi2(tss, forecasts, close_index)

In [None]:
#
# NOTE: FURTHER CELLS ARE COMPATIBLE WITH AWS SAGEMAKER ONLY, LOCAL MODE WILL NOT WORK
# Hyperparameter tune the model
#

from ml.tune import Tune

# Tuning helper built on the shared utility and logger objects
TUNE = Tune(UTILS, LOGGER)

# Full URIs of the train and test dataset files inside the dataset directory.
# NOTE(review): neither variable is read again in the cells visible here — confirm whether they are still needed.
train_dataset_uri = f"{dataset_dir_uri}/{config.TRAIN_DATASET_FILENAME}"
test_dataset_uri = f"{dataset_dir_uri}/{config.TEST_DATASET_FILENAME}"

# Note: Feel free to tune the tuner, i.e. update the max number of jobs and the hyperparameters. Default is 10 jobs,
# but you may want to change this as you refine the model. Additionally, if you find the best model has a parameter at
# the end of the range you gave it, then you should look to move that range to determine if the model performs better
# along that vector.
# The tuner wraps the estimator created in the training cell and is fitted against the dataset directory URI.
tuner = TUNE.create_tuner(estimator)
TUNE.fit_tuner(tuner, dataset_dir_uri)

In [None]:
#
# Get updates for the hyperparameter tuning job. Ensure the job has completed before going to the next cell.
# NOTE(review): whether this call waits for completion or only reports the current status is decided inside
# `Tune.get_tune_job_update` — confirm before relying on either.
#

TUNE.get_tune_job_update()

In [None]:
#
# Evaluate the metrics of the hyperparameter tuning job. Run this only once the tuning job has completed.
#

TUNE.report_job_analytics()
