In [None]:
#
# Import all dependencies and define notebook-wide constants
#
import mxnet
import numpy as np
import pandas as pd
import boto3
import s3fs
import sagemaker
from sagemaker.mxnet import MXNet

# Name of the model; used as the key prefix for the train/test S3 paths.
MODEL_NAME = "gia"
# Local path of the CSV that is loaded with pandas and split into train/test.
# DATASET = "datasets/SandP_1995_2019_monthly.csv"
DATASET = "datasets/SandP_1995_2020_daily.csv"

In [None]:
#
# Run basic checks
#

# Show which MXNet version the kernel picked up.
print(mxnet.__version__)
# Number of GPUs reported by MXNet for this machine.
gpu_count = mxnet.context.num_gpus()
print(f"The GPU count is [{gpu_count}]")

In [None]:
#
# Parse dataset
#

# Load the dataset; the first row is the header and the first column the index.
df = pd.read_csv(DATASET, header=0, index_col=0)
print("First sample:")
print(df.head(1))
print("\nLast sample:")
print(df.tail(1))

# Fractions of the dataset assigned to the training and testing splits,
# in that order (there is no separate validation split).
fractions = np.array([0.7, 0.3])

# Row position where the training split ends and the testing split begins.
split_index = int(fractions[0] * len(df))

# Contiguous, order-preserving split (no shuffling). Plain positional slicing
# is used instead of np.array_split, which routes DataFrames through the
# deprecated DataFrame.swapaxes.
train = df.iloc[:split_index]
test = df.iloc[split_index:]
assert len(train) + len(test) == len(df), "train/test split lost or duplicated rows"

# Alternative label-based split: train = df[: "2013-12-01"]
train.to_csv("datasets/train.csv")

# Alternative label-based split: test = df[: "2015-04-15"]
test.to_csv("datasets/test.csv")

In [None]:
#
# Configure the S3 locations the dataset(s) will be uploaded to
# Dataset retrieved from:
#   https://finance.yahoo.com/quote/%5EGSPC/history?period1=788936400&period2=1564545600&interval=1mo&filter=history&frequency=1mo
#

# Resolve the session's default bucket; both data channels live under it.
sagemaker_session = sagemaker.Session()
s3_bucket = sagemaker_session.default_bucket()

# Build the train and test prefixes from the same (bucket, model name) pair.
s3_train_data_path, s3_test_data_path = (
    template.format(s3_bucket, MODEL_NAME)
    for template in ("s3://{}/{}/train", "s3://{}/{}/test")
)

print("Data will be uploaded to: ", s3_bucket)

In [None]:
#
# Upload dataset to S3
#

s3 = boto3.resource('s3')


def copy_to_s3(local_file, s3_path, override=True):
    """Upload a local file to an ``s3://<bucket>/<key>`` location.

    Args:
        local_file: Path of the local file to upload.
        s3_path: Destination URI; must start with ``s3://``.
        override: When False, an object already stored under exactly the
            destination key is left untouched and the upload is skipped.

    Raises:
        AssertionError: If ``s3_path`` does not start with ``s3://``.
    """
    assert s3_path.startswith('s3://'), 's3_path must start with s3://'

    # "s3://bucket/some/key" -> ("bucket", "some/key")
    bucket, _, key = s3_path[len('s3://'):].partition('/')
    buk = s3.Bucket(bucket)

    # The Prefix filter also returns sibling keys that merely start with `key`
    # (e.g. "train.csv.bak"), so require an exact key match.
    already_exists = any(obj.key == key for obj in buk.objects.filter(Prefix=key))

    if already_exists:
        if not override:
            # Report the bucket/key parsed from s3_path, not the notebook-level
            # default bucket plus the full URI.
            print('File s3://{}/{} already exists.\nSet override to upload anyway.\n'.format(bucket, key))
            return
        print('Overwriting existing file')

    with open(local_file, 'rb') as data:
        print('Uploading file to {}'.format(s3_path))
        buk.put_object(Key=key, Body=data)

# Push both splits to their respective S3 prefixes.
for local_csv, s3_uri in (
    ("datasets/train.csv", s3_train_data_path + "/train.csv"),
    ("datasets/test.csv", s3_test_data_path + "/test.csv"),
):
    copy_to_s3(local_csv, s3_uri)

# Read the first line of the uploaded training file back as a sanity check.
s3filesystem = s3fs.S3FileSystem()
with s3filesystem.open(s3_train_data_path + "/train.csv", 'rb') as fp:
    first_line = fp.readline().decode("utf-8")
print(first_line[:100] + "...")

In [None]:
#
# Configure sagemaker and estimator
#

# NOTE(review): the execution role ARN is account-specific and hardcoded here.
role ='arn:aws:iam::941048668662:role/service-role/AmazonSageMaker-ExecutionRole-20191206T145896'

# Instance types: 'local' is the one passed to the estimator below;
# aws_instance is defined alongside it but not used in this cell.
local_instance='local'
aws_instance='ml.m5.large'

# Hyperparameters forwarded to the estimator for the train.py entry point.
training_hyperparameters = {
    'epochs': 15,
    'prediction_length': 19,
    'num_layers': 5,
    'dropout_rate': 0.18,
}

estimator = MXNet(
    entry_point='train.py',
    # source_dir='entry_point',
    role=role,
    train_instance_type=local_instance,
    train_instance_count=1,
    framework_version='1.6.0',
    py_version='py3',
    hyperparameters=training_hyperparameters,
)

# One input channel per uploaded split.
training_channels = {"train": s3_train_data_path, "test": s3_test_data_path}
estimator.fit(training_channels)

In [None]:
#
# NOTE: THIS DOES NOT WORK LOCALLY AND IT IS NOT SUPPOSED TO.
# Hyperparameter tune the model
#

from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter

# Search space explored by the tuning job.
tuning_ranges = {
    'epochs': IntegerParameter(5, 20),
    'prediction_length': IntegerParameter(5, 20),
    'num_layers': IntegerParameter(1, 5),
    'dropout_rate': ContinuousParameter(0, 0.5),
}

# The objective value is extracted from the training output with this regex.
tuning_metrics = [{'Name': 'loss', 'Regex': "MSE: ([0-9\\.]+)"}]

tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name='loss',
    hyperparameter_ranges=tuning_ranges,
    metric_definitions=tuning_metrics,
    max_jobs=10,
    max_parallel_jobs=5,
    objective_type='Minimize',
)

tuner.fit({'train': s3_train_data_path, "test": s3_test_data_path})

# Keep the job name around; later cells look the job up by name.
tuning_job_name = tuner.latest_tuning_job.job_name
print("Tuning job name: " + tuning_job_name)

In [None]:
sage_client = boto3.Session().client('sagemaker')

# Re-run this cell to refresh the status of the hyperparameter tuning job.
tuning_job_result = sage_client.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)

status = tuning_job_result['HyperParameterTuningJobStatus']
if status != 'Completed':
    print('Reminder: the tuning job has not been completed.')

job_count = tuning_job_result['TrainingJobStatusCounters']['Completed']
print("%d training jobs have completed" % job_count)

# Objective settings of the job: optimisation direction and metric name.
tuning_objective = tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']
is_minimize = (tuning_objective['Type'] != 'Maximize')
objective_name = tuning_objective['MetricName']


In [None]:
import pandas as pd

# Fetch one row per training job launched by the tuning job.
job_analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)

full_df = job_analytics.dataframe()

# Always rebind `df` to the tuning results so the display at the end of this
# cell never falls back to the dataset frame loaded earlier in the notebook.
df = full_df
if len(df) > 0:
    # Keep only jobs that reported a finite-or-better objective value.
    df = df[df['FinalObjectiveValue'] > -float('inf')]

if len(df) > 0:
    # Best job first: ascending when minimising, descending when maximising.
    df = df.sort_values('FinalObjectiveValue', ascending=is_minimize)
    print("Number of training jobs with valid objective: %d" % len(df))
    print({"lowest":min(df['FinalObjectiveValue']),"highest": max(df['FinalObjectiveValue'])})
    # None disables truncation (pandas >= 1.0); the former -1 sentinel is
    # deprecated and rejected by newer pandas releases.
    pd.set_option('display.max_colwidth', None)  # Don't truncate TrainingJobName
else:
    print("No training jobs have reported valid results yet.")

df
