In [13]:
#
# Initialization
#
import os
import sys

import ipynbname
from pathlib import Path

# Set notebook's src module path. Note that you may have to update your IDE's project settings to do the same for the
#  local library imports to work the same
MODULE_PATH = ipynbname.path().parent.parent
sys.path.append(str(MODULE_PATH))

# Keep paths consistent throughout notebook
os.chdir(MODULE_PATH)

# This should always be `./src`
print(f"Current working directory [{os.getcwd()}]")

# Place all local artifacts in a disposable, git-ignored directory
local_artifact_dir = Path(os.getcwd()).parent / "out"
local_artifact_dir.mkdir(parents=True, exist_ok=True)

# Autoreload imports at the beginning of cell execution.
#  https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

Current working directory [/Users/jbeckman/projects/capia/src]
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
#
# Setup utils
#

import subprocess

from utils.logger_util import LoggerUtil
from utils.utils import Utils
from utils import config

# Shared logger (writes beneath the local artifact directory) and the utility helper built on top of it
LOGGER = LoggerUtil(config.MODEL_ID, local_artifact_dir / "logs")
UTILS = Utils(LOGGER)

UTILS.describe_env()

# AWS instance specs can be found here https://aws.amazon.com/sagemaker/pricing/
AWS_INSTANCE = 'ml.m5.large' # 2 vCPU, 0 GPU, 8 GB memory, $0.134/hour
# NOTE(review): the specs/price in the comment below look like ml.m5.2xlarge; ml.m5.4xlarge is listed by AWS as
#  16 vCPU / 64 GB - confirm which instance is actually intended
AWS_INSTANCE_2 = 'ml.m5.4xlarge' # 8 vCPU, 0 GPU, 32 GB memory, $0.538/hour
AWS_GPU_INSTANCE = 'ml.g4dn.xlarge' # 4 vCPU, 1 GPU, 16 GB memory, $0.736/hour
AWS_GPU_INSTANCE_2 = 'ml.g4dn.2xlarge' # 8 vCPU, 1 GPU, 32 GB memory, $1.053/hour

# Default to CPU-only local mode; switch to local GPU mode only when `nvidia-smi` runs and exits with status 0
LOCAL_INSTANCE = 'local'
try:
    if subprocess.call('nvidia-smi') == 0:
        LOCAL_INSTANCE = 'local_gpu'
except OSError:
    # Only a failure to launch the binary (e.g. FileNotFoundError when it is not installed) means "no GPU support".
    # Anything else (KeyboardInterrupt, programming errors) should propagate rather than be silently swallowed.
    LOGGER.log("The nvidia-smi binary was not found and thus GPU computation is not supported. Using the default CPU "
               "computation")

# Change this to your desired instance type
INSTANCE_TYPE = LOCAL_INSTANCE
# True when the selected instance type is the detected local one, i.e. nothing runs on AWS
IS_LOCAL = LOCAL_INSTANCE == INSTANCE_TYPE

2021-04-16 07:16:46.332314 The model id is [giia-0.5.8]
2021-04-16 07:16:46.333394 The MXNet version is [1.7.0]
2021-04-16 07:16:46.333779 The GPU count is [0]
2021-04-16 07:16:46.342938 The nvidia-smi binary was not found and thus GPU computation is not supported. Using the default CPU computation


In [15]:
#
# Parse dataset
#

from data_processing.parse import Parse

# Dataset parser wired to the shared notebook logger
PARSE = Parse(LOGGER)

# All dataset files live in a `datasets` folder beneath the disposable artifact directory
dataset_dir_path = local_artifact_dir.joinpath("datasets")

# Write the train and test dataset CSVs into that folder
PARSE.split_train_test_dataset(dataset_dir_path)

2021-04-16 07:16:57.810654 First sample:
2021-04-16 07:16:57.813693 
                        open     high      low    close    volume
date                                                             
2017-08-17 04:00:00  4261.48  4280.56  4261.48  4261.48  2.189061
2021-04-16 07:16:57.814085 Last sample:
2021-04-16 07:16:57.816821 
                         open      high       low     close     volume
date                                                                  
2021-03-26 01:25:00  52151.39  52178.81  52108.76  52122.84  93.536227
2021-04-16 07:16:58.134366 Parsed train and test datasets can be found in [/Users/jbeckman/projects/capia/out/datasets]


In [None]:
#
# Setup local/aws environment. If aws, upload the datasets to S3
#

from data_processing.aws_handler import AWSHandler
from sagemaker import LocalSession

AWS_HANDLER = AWSHandler(LOGGER, config.MODEL_ID)

# Placeholder; the branch below selects the session that matches the chosen instance type
sagemaker_session = None

# Model artifacts are written beneath <artifact dir>/<model id>/models
model_output_dir_path = local_artifact_dir.joinpath(config.MODEL_ID, "models")
model_output_dir_path.mkdir(parents=True, exist_ok=True)

if not IS_LOCAL:
    # Remote run: use the handler's session, push the datasets to S3 and take both URIs from the handler
    sagemaker_session = AWS_HANDLER.sagemaker_session

    AWS_HANDLER.upload_train_datasets(dataset_dir_path)
    dataset_dir_uri = AWS_HANDLER.s3_dataset_dir_uri

    model_output_dir_uri = AWS_HANDLER.s3_model_output_uri
else:
    LOGGER.log("Notebook is set to local mode, not uploading to S3")

    # Local run: datasets and model output are addressed through file:// URIs
    dataset_dir_uri = "file://" + str(dataset_dir_path)
    model_output_dir_uri = "file://" + str(model_output_dir_path)

    # Local-mode session that runs local code and uses the model output directory as its container root
    sagemaker_session = LocalSession()
    sagemaker_session.config = {
        'local': dict(local_code=True, container_root=str(model_output_dir_path))
    }

LOGGER.log(f"Model output dir is [{model_output_dir_uri}]")

In [None]:
#
# Configure sagemaker and estimator
#

from ml.train import Train

TRAIN = Train(LOGGER)

# Local runs pass no extra estimator arguments. Remote runs direct output and code to the model output URI, turn on
# spot instances and cap both the wait time and the run time at 18 hours (values are in seconds).
# ('checkpoint_s3_uri': model_output_dir_uri is intentionally left disabled.)
train_kwargs = {} if IS_LOCAL else dict(
    output_path=model_output_dir_uri,
    code_location=model_output_dir_uri,
    use_spot_instances=True,
    max_wait=18 * 60 * 60,
    max_run=18 * 60 * 60,
)

# Build the estimator for the selected instance type, then fit it on the datasets
estimator = TRAIN.create_model(config.SM_ROLE, INSTANCE_TYPE, sagemaker_session, train_kwargs)
TRAIN.fit_model(estimator, dataset_dir_uri)

In [16]:
#
# Load model
#

import pandas as pd
import matplotlib.pyplot as plt
from itertools import islice
from gluonts.model.predictor import Predictor

if not IS_LOCAL:
    # Remote run: fetch the trained model from S3 into the local artifact directory
    model_dir_path = AWS_HANDLER.download_model_from_s3(str(TRAIN.model_data_path), local_artifact_dir)
else:
    # Local run: model_output_dir_path is basically the same path as it was before, though sagemaker appends a
    # random temp directory to it (the path from TRAIN includes that temp directory), so a fixed location is used.
    # Alternative kept for reference: model_dir_path = TRAIN.model_data_path.parent.parent / "model"
    model_dir_path = local_artifact_dir.joinpath("local_cli", "model")

LOGGER.log(f"Model dir is [{model_dir_path}]")

# Rebuild the predictor from its serialized form and log its attributes for inspection
predictor = Predictor.deserialize(model_dir_path)
LOGGER.log(f"Predictor metadata [{vars(predictor)}]")


def plot_prob_forecasts(ts_list, forecast_list, plot_length=100):
    """Show one figure per (observed series, forecast) pair.

    The two inputs are paired positionally and iteration stops at the shorter one. Any iterables are accepted,
    including one-shot iterators/generators; neither needs to support ``len()``.

    Args:
        ts_list: Iterable of observed series. Each item must support ``item[-plot_length:]`` and the result must
            expose ``.plot(figsize=..., linewidth=...)``.
        forecast_list: Iterable of forecasts exposing ``.plot(prediction_intervals=..., color=...)``.
        plot_length: Number of trailing observations drawn for each series.

    Returns:
        None. Figures are rendered through ``plt.show()``.
    """
    # Loop-invariant: the intervals and the legend labels are identical for every figure
    prediction_intervals = (50.0, 90.0)
    legend = ["observations", "median prediction"] + [
        f"{k}% prediction interval" for k in reversed(prediction_intervals)
    ]

    # zip() already stops at the shorter input, so no explicit length cap is needed
    for target, forecast in zip(ts_list, forecast_list):
        # The series plot opens the figure; the forecast and decorations then go through the pyplot state machine
        target[-plot_length:].plot(figsize=(10, 7), linewidth=2)
        forecast.plot(prediction_intervals=prediction_intervals, color='g')
        plt.grid(which="both")
        plt.legend(legend, loc="upper left")
        plt.show()
    
def plot_prob_forecasts_multi(ts_list, forecast_list, plot_length=100, num_series=5):
    """Show one figure per dimension of every (observed data, multivariate forecast) pair.

    The two inputs are paired positionally and iteration stops at the shorter one. Any iterables are accepted,
    including one-shot iterators/generators; neither needs to support ``len()``.

    Args:
        ts_list: Iterable of observed data. ``item[i]`` must return the i-th series, which in turn must support
            ``[-plot_length:]`` and ``.plot(ax=...)``.
        forecast_list: Iterable of forecasts exposing ``.copy_dim(i)``, whose result exposes
            ``.plot(prediction_intervals=..., color=...)``.
        plot_length: Number of trailing observations drawn for each series.
        num_series: How many dimensions (``0 .. num_series - 1``) are plotted per pair. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        None. Figures are rendered through ``plt.show()``.
    """
    # Loop-invariant: the intervals and the legend labels are identical for every figure
    prediction_intervals = (50.0, 90.0)
    legend = ["observations", "median prediction"] + [
        f"{k}% prediction interval" for k in reversed(prediction_intervals)
    ]

    # zip() already stops at the shorter input, so no explicit length cap is needed
    for target, forecast in zip(ts_list, forecast_list):
        for dim in range(num_series):
            # A fresh figure per dimension; the figure handle itself is not needed
            _, ax = plt.subplots(1, 1, figsize=(10, 7))
            target[dim][-plot_length:].plot(ax=ax)  # plot the time series
            # The forecast is drawn without an explicit axes argument, i.e. via the pyplot state machine
            forecast.copy_dim(dim).plot(prediction_intervals=prediction_intervals, color='g')
            plt.grid(which="both")
            plt.legend(legend, loc="upper left")
            plt.show()

2021-04-16 07:17:15.143892 Model dir is [/Users/jbeckman/projects/capia/out/local_cli/model]
2021-04-16 07:17:15.412521 Predictor metadata [{'prediction_length': 12, 'freq': '5min', 'lead_time': 0, 'input_names': ['past_target', 'past_observed_values'], 'prediction_net': gluonts.model.lstnet._network.LSTNetPredict(ar_window=18, channels=90, context_length=24, dropout_rate=0.2, dtype=numpy.float32, kernel_size=6, lead_time=0, num_series=5, output_activation=None, prediction_length=12, rnn_cell_type="gru", rnn_num_cells=100, rnn_num_layers=90, scaling=True, skip_rnn_cell_type="gru", skip_rnn_num_cells=10, skip_rnn_num_layers=9, skip_size=9), 'batch_size': 16, 'input_transform': gluonts.transform._base.Chain(trans=[gluonts.transform._base.Chain(trans=[gluonts.transform.convert.AsNumpyArray(dtype=numpy.float32, expected_ndim=2, field="target"), gluonts.transform.feature.AddObservedValuesIndicator(dtype=numpy.float32, imputation_method=gluonts.transform.feature.DummyValueImputation(dummy_va

In [17]:
#
# Define test data and make a prediction
#

from gluonts.evaluation.backtest import make_evaluation_predictions
from gluonts.dataset.common import ListDataset, FileDataset
from utils.splitter import DateSplitter
from gluonts.dataset.field_names import FieldName
from gluonts.dataset.stat import calculate_dataset_statistics

# test_file_dataset = FileDataset(path=(dataset_dir_path / config.TEST_DATASET_FILENAME).parent, freq="min")
# Load the dataset files at 5-minute frequency. FileDataset is pointed at the directory that contains the test
# dataset file (the `.parent` of its path), not at the file itself.
# NOTE(review): the recorded output of this cell is "Exception: Data must be 1-dimensional". The predictor metadata
#  logged earlier reports expected_ndim=2 / num_series=5, so the failure presumably comes from handling a 2-D target
#  somewhere in the split or statistics calls below - TODO confirm where it originates.
test_file_dataset = FileDataset(path=(dataset_dir_path / config.TEST_DATASET_FILENAME).parent, freq="5min")

# Disabled experiment with overriding the frequency, kept for reference:
# pd.Timedelta(config.HYPER_PARAMETERS["prediction_length"], unit="5min")
# for data in iter(test_file_dataset):
#     data.start.freqstr = "min"

# One test dataset is collected per split date listed below
test_datasets = []
test_dates = ["2020-11-20 12:50:00", "2021-01-20 15:55:00", "2021-01-20 17:10:00"]
for idx, date in enumerate(test_dates):
    # 1) Get a slice of the dataset around each date with ample history
    # NOTE(review): prediction_length is passed negated; presumably the project-local DateSplitter treats a negative
    #  length specially - verify against utils.splitter
    splitter = DateSplitter(
        prediction_length=-config.HYPER_PARAMETERS["prediction_length"],
        split_date=date,
        max_history=config.HYPER_PARAMETERS["past_length"]
        # max_history=config.HYPER_PARAMETERS["past_length"] + config.HYPER_PARAMETERS["prediction_length"]
    )
    # Only the test half of the split is used below; train_dataset is assigned but not read in this cell
    (_, train_dataset), (_, test_dataset) = splitter.split(test_file_dataset)

    # # 2) Disabled: keep only the "close" series and drop the other time-series before predicting
    # for data in iter(test_dataset):
    #     if data['item_id'] == "close":
    #         test_dataset = ListDataset([{
    #             FieldName.START: data[FieldName.START],
    #             FieldName.TARGET: data[FieldName.TARGET],
    #             FieldName.FEAT_STATIC_CAT: data[FieldName.FEAT_STATIC_CAT],
    #             FieldName.ITEM_ID: data[FieldName.ITEM_ID],
    #         }], freq=config.DATASET_FREQ)
    #         break

    # Log summary statistics for this slice, then keep it for the evaluation cell
    LOGGER.log(f"Test dataset [{idx}] stats: {calculate_dataset_statistics(test_dataset)}")
    test_datasets.append(test_dataset)

Exception: Data must be 1-dimensional

In [None]:
#
# Evaluate and visualize the prediction
#
import json

from gluonts.evaluation import Evaluator, MultivariateEvaluator

# The evaluator configuration is identical for every dataset, so build it once outside the loop
evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])

for test_dataset in test_datasets:
    forecast_it, ts_it = make_evaluation_predictions(
        dataset=test_dataset,  # test dataset
        predictor=predictor,  # predictor
        num_samples=100,  # number of sample paths we want for evaluation
    )

    # Materialize both iterators (forecasts first) so they can be evaluated and plotted afterwards
    forecasts = list(forecast_it)
    tss = list(ts_it)

    agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=len(test_dataset))

    LOGGER.log(json.dumps(agg_metrics, indent=4))
    # A bare expression inside a loop is never rendered by the notebook, so display the per-item metrics
    # explicitly (`display` is provided as a builtin by the IPython kernel)
    display(item_metrics.head())

    plot_prob_forecasts(tss, forecasts)

In [None]:
#
# NOTE: FURTHER CELLS ARE COMPATIBLE WITH AWS SAGEMAKER ONLY, LOCAL MODE WILL NOT WORK
# Hyperparameter tune the model
#

from ml.tune import Tune

TUNE = Tune(UTILS, LOGGER)

# Full URIs of the individual train and test dataset files beneath the dataset directory
train_dataset_uri = "{}/{}".format(dataset_dir_uri, config.TRAIN_DATASET_FILENAME)
test_dataset_uri = "{}/{}".format(dataset_dir_uri, config.TEST_DATASET_FILENAME)

# Note: The tuner itself is meant to be tuned, i.e. adjust the max number of jobs and the hyperparameter ranges as
# the model is refined (the default is 10 jobs). If the best model ends up with a parameter at the edge of the range
# it was given, shift that range to find out whether the model keeps improving in that direction.
tuner = TUNE.create_tuner(estimator)
TUNE.fit_tuner(tuner, dataset_dir_uri)

In [None]:
#
# Get updates for Hyperparameter tune job. Ensure this is completed before going to the next cell
#

# Delegates to the project-local Tune helper; re-run this cell until the tuning job is complete before moving on
TUNE.get_tune_job_update()

In [None]:
#
# Evaluate the metrics of the tune job
#

# Delegates to the project-local Tune helper to report the tuning job's analytics
TUNE.report_job_analytics()
