In [1]:
#
# Setup constants
#

import subprocess

# Model identity; the initialization cell joins these as f"{MODEL_NAME}-{MODEL_VERSION}"
#  to name the logger.
MODEL_NAME = "giia"
MODEL_VERSION = "0.3.3"
# Input CSVs, given as relative paths (the initialization cell chdir's to MODULE_PATH,
#  so they are resolved from there).
DATASETS = [
    "datasets/SandP_1995_2020_daily.csv",
    "datasets/AWK_2008IPO_2020_daily.csv"
]
# IAM role ARN (the role name marks it as a SageMaker execution role).
# NOTE(review): account-specific value is hard-coded here; consider loading it from
#  configuration or the environment instead.
SM_ROLE ='arn:aws:iam::941048668662:role/service-role/AmazonSageMaker-ExecutionRole-20191206T145896'

# Placeholder only; the initialization cell assigns the real value via `%cache MODULE_PATH = ...`.
MODULE_PATH = ""

In [2]:
#
# Initialization
#

import os
import sys
import cache_magic
from pathlib import Path

# Set MODULE_PATH to the parent of the directory the notebook starts in and add it to sys.path so
#  the local `utils` imports below resolve. Your IDE's project settings may need the same source
#  root for those imports to resolve there as well.
# `%cache` (cache_magic) persists the value between runs. Because of the os.chdir() below, a re-run
#  that recomputed the expression would start from the already-changed working directory.
# NOTE(review): the cached value is an absolute, machine-specific path; presumably the cache must be
#  reset if the project is moved -- confirm against the cache_magic documentation.
%cache MODULE_PATH = os.path.dirname(Path().resolve())
sys.path.append(MODULE_PATH)

# Keep paths consistent throughout notebook: relative paths (e.g. "datasets/...") are resolved
#  from MODULE_PATH from here on.
os.chdir(MODULE_PATH)

# Autoreload imports at the beginning of cell execution.
#  https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

# Project-local helpers; importable only after MODULE_PATH was appended to sys.path above.
from utils.logging import LoggerUtil
from utils.utils import Utils

# Shared logger named "<MODEL_NAME>-<MODEL_VERSION>" and a utilities object built on top of it.
LOGGER = LoggerUtil(f"{MODEL_NAME}-{MODEL_VERSION}")
UTILS = Utils(LOGGER)

LOGGER.log("Current working directory [{}]".format(os.getcwd()))
# Per the recorded output this logs a version string and the GPU count.
UTILS.describe_env()

%cache magic is now registered in ipython
loading cached value for variable 'MODULE_PATH'. Time since pickling  9 days, 0:25:04.606537
2020-07-03 15:16:52.929588 Background logger started
2020-07-03 15:16:52.930171 Current working directory [/Users/jbeckman/giia/src]
2020-07-03 15:16:52.930341 1.6.0
2020-07-03 15:16:52.930599 The GPU count is [0]


In [None]:
from gluonts.dataset.multivariate_grouper import MultivariateGrouper
from gluonts.dataset.common import ListDataset, TrainDatasets
from gluonts.evaluation.backtest import backtest_metrics


def load_multivariate(dataset):
    """Regroup a GluonTS ``TrainDatasets`` into multivariate train/test datasets.

    The target dimension is taken from the cardinality of the first static
    categorical feature in ``dataset.metadata``. Train and test splits are each
    passed through their own ``MultivariateGrouper``.

    Args:
        dataset: Object exposing ``metadata`` (with a non-empty
            ``feat_static_cat`` list), ``train`` and ``test``.

    Returns:
        A ``TrainDatasets`` carrying the original metadata and the grouped
        train and test datasets.

    Raises:
        IndexError: If ``dataset.metadata.feat_static_cat`` is empty.
        ValueError: If the cardinality cannot be converted to an integer.
    """
    # GluonTS declares the metadata ``cardinality`` field as a string, while the
    # grouper uses ``max_target_dim`` as an integer bound, so coerce it here.
    # int() is a no-op when the value is already an int.
    target_dim = int(dataset.metadata.feat_static_cat[0].cardinality)

    # Separate grouper instances per split, as in the original code.
    grouper_train = MultivariateGrouper(max_target_dim=target_dim)
    grouper_test = MultivariateGrouper(max_target_dim=target_dim)
    return TrainDatasets(
        metadata=dataset.metadata,
        train=grouper_train(dataset.train),
        test=grouper_test(dataset.test),
    )

# NOTE(review): `data` is not defined in any cell visible in this part of the notebook; it has to
#  be created before this cell can run on a fresh kernel -- TODO confirm where it comes from.
dataset = load_multivariate(data)


In [20]:
import pandas as pd

# Load the AWK daily file; the first CSV column becomes the row index.
df_1 = pd.read_csv("datasets/AWK_2008IPO_2020_daily.csv", index_col=0, header=0)

# Second frame: the same file read again, with every value halved.
# NOTE(review): presumably a stand-in for a second, differently scaled series -- confirm.
df_2 = pd.read_csv("datasets/AWK_2008IPO_2020_daily.csv", index_col=0, header=0) / 2

print("-----------------------")

# Copies of both frames re-indexed by their "Open" column (not referenced again in this cell).
dataframes = [frame.set_index(["Open"]) for frame in (df_1, df_2)]

# Side-by-side (column-wise) join that keeps only index labels present in both frames.
combined_df = pd.concat([df_1, df_2], join="inner", axis=1)
print(combined_df)

# Same column-wise concatenation, this time with pandas' default join.
df = pd.concat([df_1, df_2], axis=1)
# print(df.head(1))

# import xarray
#
# output_as_dataarray = xarray.concat(
#     [xarray.DataArray(X,
#                       dims=['record', 'edge'],
#                       coords={'record': range(X.shape[0]),
#                               'edge': ['start', 'end']},
#                      ) for X in (A, B, C)],
#     dim='descriptor',
# ).assign_coords(descriptor=['A', 'B', 'C'])

-----------------------
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2008-04-23   20.600000   21.450001   20.219999   20.600000   14.833152   
2008-04-24   20.719999   21.000000   20.309999   20.809999   14.984364   
2008-04-25   21.000000   21.250000   20.700001   21.170000   15.243580   
2008-04-28   21.490000   21.490000   20.809999   21.000000   15.121177   
2008-04-29   21.389999   21.389999   20.820000   21.070000   15.171576   
...                ...         ...         ...         ...         ...   
2020-06-19  131.059998  131.130005  126.360001  126.360001  126.360001   
2020-06-22  126.330002  127.599998  125.699997  126.830002  126.830002   
2020-06-23  127.709999  128.179993  124.879997  125.220001  125.220001   
2020-06-24  124.339996  125.199997  123.099998  124.400002  124.400002   
2020-06-25  124.400002  124.400002  120.669998  122.220001  122.220001   

             

In [None]:
import numpy as np

def encode_missing_values(dataset: list) -> None:
    """Pad every sub-list of ``dataset`` with NaN so all share the longest length.

    The sub-lists are extended in place and nothing is returned. Sub-lists that
    already have the maximum length are left unchanged. An empty ``dataset`` is
    a no-op.

    Args:
        dataset: List of mutable lists, modified in place.
    """
    # default=0 keeps an empty ``dataset`` a no-op; a bare max() over an empty
    # iterable raises ValueError.
    max_len = max((len(sublist) for sublist in dataset), default=0)
    for sublist in dataset:
        # The multiplier is 0 for sub-lists already at max_len, so they are untouched.
        sublist.extend([np.nan] * (max_len - len(sublist)))
