In [1]:
#
# Initialization
#
import os
import sys

import ipynbname
from pathlib import Path

# Set notebook's src module path. Note that you may have to update your IDE's project settings to do the same for the
#  local library imports to work the same
MODULE_PATH = ipynbname.path().parent.parent
sys.path.append(str(MODULE_PATH))

# Keep paths consistent throughout notebook
os.chdir(MODULE_PATH)

# This should always be `./src`
print(f"Current working directory [{os.getcwd()}]")

# Place all local artifacts in a disposable, git-ignored directory
local_artifact_dir = Path(os.getcwd()).parent / "out"
local_artifact_dir.mkdir(parents=True, exist_ok=True)

# Autoreload imports at the beginning of cell execution.
#  https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

Current working directory [/Users/jbeckman/projects/capia/src]


In [2]:
#
# Setup utils
#

import subprocess

from utils.logger_util import LoggerUtil
from utils.utils import Utils
from utils import config

# Shared logger (writes under the local artifact directory) and the utility helper built on top of it
LOGGER = LoggerUtil(config.MODEL_ID, local_artifact_dir / "logs")
UTILS = Utils(LOGGER)

UTILS.describe_env()

# AWS instance specs can be found here https://aws.amazon.com/sagemaker/pricing/
AWS_INSTANCE = 'ml.m5.large' # 2 vCPU, 0 GPU, 8 GB memory, $0.134/hour
AWS_INSTANCE_2 = 'ml.m5.4xlarge' # 8 vCPU, 0 GPU, 32 GB memory, $0.538/hour
AWS_GPU_INSTANCE = 'ml.g4dn.xlarge' # 4 vCPU, 1 GPU, 16 GB memory, $0.736/hour
AWS_GPU_INSTANCE_2 = 'ml.g4dn.2xlarge' # 8 vCPU, 1 GPU, 32 GB memory, $1.053/hour

# Default to CPU-only local mode; upgrade to local GPU mode only when `nvidia-smi` runs successfully
LOCAL_INSTANCE = 'local'
try:
    # A zero exit status means the binary exists and executed without error
    if subprocess.call('nvidia-smi') == 0:
        LOCAL_INSTANCE = 'local_gpu'
except OSError:
    # subprocess raises an OSError subclass (e.g. FileNotFoundError) when the binary cannot be launched. Catching
    # only OSError, rather than using a bare `except`, lets KeyboardInterrupt/SystemExit and unrelated errors
    # propagate instead of being reported as a missing binary
    LOGGER.log("The nvidia-smi binary was not found and thus GPU computation is not supported. Using the default CPU "
               "computation")

# Change this to your desired instance type
INSTANCE_TYPE = AWS_GPU_INSTANCE_2
# Local mode is active only when the chosen instance type is the detected local instance ('local' or 'local_gpu')
IS_LOCAL = LOCAL_INSTANCE == INSTANCE_TYPE

2021-05-11 14:15:09.910933 Background logger started
2021-05-11 14:15:09.911395 The model id is [giia-0.6.2]
2021-05-11 14:15:09.911488 The MXNet version is [1.7.0]
2021-05-11 14:15:09.911653 The GluonTS version is [0.7.1.dev3+gecb903f]
2021-05-11 14:15:09.911896 The GPU count is [0]
2021-05-11 14:15:09.917194 The nvidia-smi binary was not found and thus GPU computation is not supported. Using the default CPU computation


In [3]:
#
# Parse dataset
#

from data_processing.parse import Parse

# All generated datasets are kept in their own folder under the local artifact directory
dataset_dir_path = local_artifact_dir.joinpath("datasets")

# Build the parser, then let it create the train and test dataset CSVs in that folder
PARSE = Parse(LOGGER)
PARSE.split_train_test_dataset(dataset_dir_path)

Number of momentum_indicator: 9
Number of overlap_studies: 1
Number of pattern_recognition: 2
Number of volume_bin: 1
2021-05-11 14:16:18.561781 First sample:
2021-05-11 14:16:18.569640 
              open    high     low   close     volume        mfi       roc  \
date                                                                         
2021-01-01  736.42  737.09  735.94  737.02  647.71994  67.016654 -0.024417   

                  adx        rsi      slowd      slowk     macd  macdsignal  \
date                                                                          
2021-01-01  41.517721  48.978247  28.422228  36.793794 -0.39571   -0.524754   

            macdhist         hma  pattern_count  pattern_detected volume_bin  
date                                                                          
2021-01-01  0.129044  736.669073              1                63          1  
2021-05-11 14:16:18.569849 Last sample:
2021-05-11 14:16:18.575791 
                        open     hi

In [4]:
#
# Set up the local/AWS environment. If AWS, upload the datasets to S3
#

from data_processing.aws_handler import AWSHandler
from sagemaker import LocalSession

AWS_HANDLER = AWSHandler(LOGGER, config.MODEL_ID)

sagemaker_session = None

# Model artifacts live in a per-model folder under the local artifact directory
model_output_dir_path = local_artifact_dir.joinpath(config.MODEL_ID, "models")
model_output_dir_path.mkdir(parents=True, exist_ok=True)

if not IS_LOCAL:
    # AWS mode: reuse the handler's session, upload the datasets and work against the handler's S3 URIs
    sagemaker_session = AWS_HANDLER.sagemaker_session

    AWS_HANDLER.upload_train_datasets(dataset_dir_path)
    dataset_dir_uri = AWS_HANDLER.s3_dataset_dir_uri

    model_output_dir_uri = AWS_HANDLER.s3_model_output_uri
else:
    # Local mode: nothing is uploaded, so point both URIs at the local file system
    LOGGER.log("Notebook is set to local mode, not uploading to S3")

    dataset_dir_uri = f"file://{dataset_dir_path}"
    model_output_dir_uri = f"file://{model_output_dir_path}"

    sagemaker_session = LocalSession()
    sagemaker_session.config = {
        'local': {'local_code': True, 'container_root': str(model_output_dir_path)},
    }

LOGGER.log(f"Model output dir is [{model_output_dir_uri}]")

2021-05-11 14:18:17.170105 Data will be uploaded to [sagemaker-us-east-2-941048668662]
2021-05-11 14:18:17.469087 Uploaded metadata/metadata.json to s3://sagemaker-us-east-2-941048668662/giia-0.6.2/datasets
2021-05-11 14:18:17.469499 Data will be uploaded to [sagemaker-us-east-2-941048668662]
2021-05-11 14:18:28.222064 Uploaded train/data.json to s3://sagemaker-us-east-2-941048668662/giia-0.6.2/datasets
2021-05-11 14:18:28.222466 Data will be uploaded to [sagemaker-us-east-2-941048668662]
2021-05-11 14:18:31.520954 Uploaded test/data.json to s3://sagemaker-us-east-2-941048668662/giia-0.6.2/datasets
2021-05-11 14:18:31.521408 Model output dir is [s3://sagemaker-us-east-2-941048668662/giia-0.6.2/models]


In [5]:
#
# Configure sagemaker and estimator
#

from ml.train import Train

TRAIN = Train(LOGGER)

# Local runs pass no extra estimator arguments. AWS runs store output and code under the model output URI and
# train on spot instances with an 18 hour ceiling for both waiting and running
train_kwargs = {} if IS_LOCAL else dict(
    # checkpoint_s3_uri=model_output_dir_uri,
    output_path=model_output_dir_uri,
    code_location=model_output_dir_uri,
    use_spot_instances=True,
    max_wait=18 * 60 * 60,  # 18 hours
    max_run=18 * 60 * 60,  # 18 hours
)

# Build the estimator for the chosen instance type and fit it on the datasets
estimator = TRAIN.create_model(config.SM_ROLE, INSTANCE_TYPE, sagemaker_session, train_kwargs)
TRAIN.fit_model(estimator, dataset_dir_uri)

2021-05-11 18:20:49 Starting - Starting the training job...
2021-05-11 18:21:12 Starting - Launching requested ML instancesProfilerReport-1620757162: InProgress
......
2021-05-11 18:22:12 Starting - Preparing the instances for training.........
2021-05-11 18:23:53 Downloading - Downloading input data...
2021-05-11 18:24:23 Training - Training image download completed. Training in progress..[34m2021-05-11 18:24:23,642 sagemaker-training-toolkit INFO     Imported framework sagemaker_mxnet_container.training[0m
[34m2021-05-11 18:24:23,665 sagemaker_mxnet_container.training INFO     MXNet training environment: {'SM_HOSTS': '["algo-1"]', 'SM_NETWORK_INTERFACE_NAME': 'eth0', 'SM_HPS': '{"batch_size":128,"context_length":60,"dropout_rate":0.0528,"epochs":4,"learning_rate":0.003,"num_cells":96,"num_layers":4,"prediction_length":5}', 'SM_USER_ENTRY_POINT': 'deepar_dynamic_real.py', 'SM_FRAMEWORK_PARAMS': '{}', 'SM_RESOURCE_CONFIG': '{"current_host":"algo-1","hosts":["algo-1"],"network_interf

In [None]:
#
# Load model
#

import pandas as pd
import matplotlib.pyplot as plt
from itertools import islice
from gluonts.model.predictor import Predictor

if not IS_LOCAL:
    # AWS mode: obtain the model directory through the handler, using the model data path recorded by TRAIN
    model_dir_path = AWS_HANDLER.download_model_from_s3(str(TRAIN.model_data_path), local_artifact_dir)
else:
    # model_output_dir_path is basically the same path as it was before, though sagemaker appends a random temp
    # directory to the path. The path from TRAIN includes that random temp directory
    # model_dir_path = TRAIN.model_data_path.parent.parent / "model"
    model_dir_path = local_artifact_dir.joinpath("local_cli", "model")

LOGGER.log(f"Model dir is [{model_dir_path}]")

# Rebuild the predictor from the serialized model directory and log its attributes for reference
predictor = Predictor.deserialize(model_dir_path)
LOGGER.log(f"Predictor metadata [{vars(predictor)}]")


def plot_prob_forecasts(ts_list, forecast_list, plot_length=100):
    """Plot each observed series together with its probabilistic forecast, one figure per pair.

    Series and forecasts are paired positionally; pairing stops at the shorter of the two inputs.

    :param ts_list: iterable of observed series; each must support slicing and expose a pandas-style ``.plot``
        (presumably the ``ts_it`` output of ``make_evaluation_predictions`` - confirm against the caller)
    :param forecast_list: iterable of forecast objects exposing ``.plot(prediction_intervals=..., color=...)``.
        It no longer needs to support ``len()``
    :param plot_length: number of trailing observations of each series to draw
    :return: None; the figures are rendered via ``plt.show()``
    """
    # Loop-invariant plot configuration, built once. The interval labels are listed widest-first
    prediction_intervals = (50.0, 90.0)
    legend = ["observations", "median prediction"] + [
        f"{k}% prediction interval" for k in reversed(prediction_intervals)
    ]

    # zip already stops at the shorter input, so no extra length cap is needed
    for target, forecast in zip(ts_list, forecast_list):
        # The observation plot creates the figure; the forecast is then drawn on the current axes
        target[-plot_length:].plot(figsize=(10, 7), linewidth=2)
        forecast.plot(prediction_intervals=prediction_intervals, color='g')
        plt.grid(which="both")
        plt.legend(legend, loc="upper left")
        plt.show()
    
def plot_prob_forecasts_multi(ts_list, forecast_list, plot_length=60, num_dims=5):
    """Plot multivariate series against their forecasts, one figure per (series, dimension) pair.

    Series and forecasts are paired positionally; pairing stops at the shorter of the two inputs.

    :param ts_list: iterable of observed multivariate series; ``target[i]`` must yield dimension ``i`` as a
        sliceable object with a pandas-style ``.plot`` (presumably a DataFrame with one column per dimension -
        confirm against the caller)
    :param forecast_list: iterable of forecast objects exposing ``copy_dim(i)``, whose result is plotted
    :param plot_length: number of trailing observations of each dimension to draw
    :param num_dims: number of leading dimensions to plot; defaults to 5, the value that used to be hard-coded
    :return: None; the figures are rendered via ``plt.show()``
    """
    # Loop-invariant plot configuration, built once. The interval labels are listed widest-first
    prediction_intervals = (50.0, 90.0)
    legend = ["observations", "median prediction"] + [
        f"{k}% prediction interval" for k in reversed(prediction_intervals)
    ]

    # zip already stops at the shorter input, so no extra length cap is needed
    for target, forecast in zip(ts_list, forecast_list):
        for dim in range(num_dims):
            # A fresh figure per dimension; the forecast is drawn on the current axes, i.e. the ones just created
            _, ax = plt.subplots(1, 1, figsize=(10, 7))
            target[dim][-plot_length:].plot(ax=ax)  # plot the time series
            forecast.copy_dim(dim).plot(prediction_intervals=prediction_intervals, color='g')
            plt.grid(which="both")
            plt.legend(legend, loc="upper left")
            plt.show()

In [None]:
#
# Define test data and make a prediction
#

from gluonts.evaluation.backtest import make_evaluation_predictions
from gluonts.dataset.common import ListDataset, FileDataset
from utils.splitter import DateSplitter
from gluonts.dataset.field_names import FieldName
from gluonts.dataset.stat import calculate_dataset_statistics

# Load the test data from the directory that holds the test dataset file, at the configured frequency
# test_file_dataset = FileDataset(path=(dataset_dir_path / config.TEST_DATASET_FILENAME).parent, freq="min")
test_file_dataset = FileDataset(
    path=(dataset_dir_path / config.TEST_DATASET_FILENAME).parent,
    freq=config.DATASET_FREQ,
)

# pd.Timedelta(config.HYPER_PARAMETERS["prediction_length"], unit="5min")
# for data in iter(test_file_dataset):
#     data.start.freqstr = "min"

# One test dataset is produced per split date below
test_dates = ["2021-04-20 12:50:00", "2021-04-20 15:55:00", "2021-04-21 16:00:00", "2021-04-22 17:00:00"]
test_datasets = []
for idx, date in enumerate(test_dates):
    # 1) Get a slice of the dataset around this date with ample history
    splitter = DateSplitter(
        split_date=date,
        max_history=config.FREQTRADE_MAX_CONTEXT,
        prediction_length=-config.HYPER_PARAMETERS["prediction_length"],
    )
    (_, train_dataset), (_, test_dataset) = splitter.split(test_file_dataset)

    # # 2) Remove the other time series, as we only want to predict "close"
    # for data in iter(test_dataset):
    #     if data['item_id'] == "close":
    #         test_dataset = ListDataset([{
    #             FieldName.START: data[FieldName.START],
    #             FieldName.TARGET: data[FieldName.TARGET],
    #             FieldName.FEAT_STATIC_CAT: data[FieldName.FEAT_STATIC_CAT],
    #             FieldName.ITEM_ID: data[FieldName.ITEM_ID],
    #         }], freq=config.DATASET_FREQ)
    #         break

    # Record the statistics of this slice before keeping it for evaluation
    dataset_stats = calculate_dataset_statistics(test_dataset)
    LOGGER.log(f"Test dataset [{idx}] stats: {dataset_stats}")
    test_datasets.append(test_dataset)

In [None]:
#
# Evaluate and visualize the prediction
#
import json

from gluonts.evaluation import Evaluator, MultivariateEvaluator

# For every prepared test dataset: generate forecasts, compute evaluation metrics, show them and plot the result
for test_dataset in test_datasets:
    forecast_it, ts_it = make_evaluation_predictions(
        dataset=test_dataset,  # test dataset
        predictor=predictor,  # predictor
        num_samples=100,  # number of sample paths we want for evaluation
    )

    # Materialize both iterators: they are consumed by the evaluator and again by the plotting helper
    forecasts = list(forecast_it)
    forecast_entry = forecasts[0]  # only referenced by the optional diagnostics below
    tss = list(ts_it)

    # LOGGER.log(f"Number of sample paths: {forecast_entry.num_samples}")
    # LOGGER.log(f"Dimension of samples: {forecast_entry.samples.shape}")
    # LOGGER.log(f"Start date of the forecast window: {forecast_entry.start_date}")
    # LOGGER.log(f"Frequency of the time series: {forecast_entry.freq}")

    evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
    agg_metrics, item_metrics = evaluator(iter(tss), iter(forecasts), num_series=len(test_dataset))

    LOGGER.log(json.dumps(agg_metrics, indent=4))
    # A bare expression inside a loop body is never rendered by the notebook (only the last top-level expression
    # of a cell is), so the per-item metrics preview has to be displayed explicitly. `display` is provided as a
    # builtin by the IPython kernel
    display(item_metrics.head())

    plot_prob_forecasts(tss, forecasts)

In [None]:
#
# NOTE: FURTHER CELLS ARE COMPATIBLE WITH AWS SAGEMAKER ONLY, LOCAL MODE WILL NOT WORK
# Hyperparameter tune the model
#

from ml.tune import Tune

# URIs of the individual train and test dataset files inside the dataset directory
train_dataset_uri = f"{dataset_dir_uri}/{config.TRAIN_DATASET_FILENAME}"
test_dataset_uri = f"{dataset_dir_uri}/{config.TEST_DATASET_FILENAME}"

TUNE = Tune(UTILS, LOGGER)

# Note: Feel free to tune the tuner, i.e. update the max number of jobs and the hyperparameters. Default is 10 jobs,
# but you may want to change this as you refine the model. Additionally, if you find the best model has a parameter
# at the end of the range you gave it, then you should look to move that range to determine if the model performs
# better along that vector
tuner = TUNE.create_tuner(estimator)
TUNE.fit_tuner(tuner, dataset_dir_uri)

In [None]:
#
# Get updates for the hyperparameter tuning job. Ensure the job has completed before going to the next cell
#
# NOTE(review): presumably reports on the job launched through TUNE.fit_tuner - confirm in ml.tune

TUNE.get_tune_job_update()

In [None]:
#
# Evaluate the metrics of the hyperparameter tuning job
#
# NOTE(review): presumably only meaningful once the tuning job has finished - confirm in ml.tune

TUNE.report_job_analytics()
