In [35]:
#
# Initialization
#

import os
import sys

import ipynbname
from pathlib import Path

# Set notebook's src module path. Note that you may have to update your IDE's project settings to do the same for the
#  local library imports to work the same
MODULE_PATH = ipynbname.path().parent.parent
sys.path.append(str(MODULE_PATH))

# Keep paths consistent throughout notebook
os.chdir(MODULE_PATH)

# This should always be `./src`
print(f"Current working directory [{os.getcwd()}]")

# Place all local artifacts in a disposable, git-ignored directory
local_artifact_dir = Path(os.getcwd()).parent / "out"
local_artifact_dir.mkdir(parents=True, exist_ok=True)

# Autoreload imports at the beginning of cell execution.
#  https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

Current working directory [/Users/jbeckman/projects/capia/giia/src]
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
#
# Setup utils
#

import subprocess

from utils.logger_util import LoggerUtil
from utils.utils import Utils
from utils import config

# Shared logger, constructed with the model id and the `logs` directory under the local artifact dir, plus the
#  utility helper that is built on top of it
LOGGER = LoggerUtil(config.MODEL_ID, local_artifact_dir / "logs")
UTILS = Utils(LOGGER)

UTILS.describe_env()

# AWS instance specs can be found here https://aws.amazon.com/sagemaker/pricing/
# AWS_INSTANCE = 'ml.m5.large'        # 2 vCPU,   0 GPU,  8 GB memory,     $0.134/hour
# AWS_INSTANCE = 'ml.m5.4xlarge'      # 16 vCPU,  0 GPU,  64 GB memory,    $0.922/hour
# AWS_INSTANCE = 'ml.g4dn.xlarge'     # 4 vCPU,   1 GPU,  16 GB memory,    $0.736/hour
AWS_INSTANCE = 'ml.g4dn.8xlarge'    # 32 vCPU,  1 GPU,  128 GB memory,   $2.72/hour
# AWS_INSTANCE = 'ml.p2.xlarge'       # 4 vCPU,   1 GPU,  61 GB memory,    $0.900/hour
# AWS_INSTANCE = 'ml.p3.2xlarge'      # 8 vCPU,   1 GPU,  61 GB memory,    $3.825/hour
LOCAL_INSTANCE = 'local'

# Probe for an NVIDIA GPU: a zero exit status from `nvidia-smi` switches local runs to GPU mode.
# Only OSError is caught (it is what subprocess raises when the binary is missing or cannot be executed), so
#  unrelated failures and interrupts such as KeyboardInterrupt are no longer silently swallowed.
try:
    if subprocess.call('nvidia-smi') == 0:
        LOCAL_INSTANCE = 'local_gpu'
except OSError:
    LOGGER.log("The nvidia-smi binary was not found and thus GPU computation is not supported. Using the default CPU "
               "computation")

# Change this to your desired instance type
# INSTANCE_TYPE = AWS_INSTANCE
INSTANCE_TYPE = LOCAL_INSTANCE
# True whenever the selected instance type is the local one ('local' or 'local_gpu')
IS_LOCAL = LOCAL_INSTANCE == INSTANCE_TYPE

# Does the model use filedataset or CSVs
FILEDATASET_BASED = True

# Is the model univariate
ONE_DIM_TARGET = True

2023-02-19 14:43:49.423105 The model id is [giia-transformer-1.1.2]
2023-02-19 14:43:49.424160 The MXNet version is [1.9.1]
2023-02-19 14:43:49.424272 The GluonTS version is [0.12.1]
2023-02-19 14:43:49.424363 The SageMaker version is [2.111.0]
2023-02-19 14:43:49.424540 The GPU count is [0]
2023-02-19 14:43:49.443715 The nvidia-smi binary was not found and thus GPU computation is not supported. Using the default CPU computation


In [37]:
#
# Parse dataset
#

from data_processing.parse import Parse

# Dataset parser; it is handed the shared notebook logger
PARSE = Parse(LOGGER)

# Location for dataset files inside the local artifact directory
dataset_dir_path = local_artifact_dir.joinpath("datasets")

# Load the data frame, passing the starting-date truncation timestamp.
#  Alternative starting date that can be swapped in: "2021-03-01 00:00:00"
df = PARSE.get_df(starting_date_truncate="2020-01-01 00:00:00")

2023-02-19 14:43:54.275464 First sample:
2023-02-19 14:43:54.278621 
              open    high     low   close  volume
date                                              
2020-01-01  128.66  128.66  128.66  128.66     0.0
2023-02-19 14:43:54.278757 Last sample:
2023-02-19 14:43:54.280988 
                        open    high      low   close     volume
date                                                            
2023-02-07 01:43:00  1624.49  1624.6  1624.49  1624.6  23.077475
2023-02-19 14:43:54.281101 Number of raw columns: 5
2023-02-19 14:43:54.281188 Number of rows: 1631624


In [None]:
#
# Plot data
#

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import laplace


def plot_distribution(dataset, scale=1e6, xlim=(-2e6, 2e6), random_state=None):
    """Overlay a histogram of ``dataset`` with samples from a Laplace distribution fitted to it.

    The data is multiplied by ``scale``, a location (mean) and a MAD-based scale are estimated from it,
    and the same number of synthetic values is drawn from ``laplace(loc=mu, scale=b)``. Both samples are
    drawn as semi-transparent histograms on one axes and the figure is shown.

    Args:
        dataset: One-dimensional numeric data (anything ``np.asarray`` can convert, e.g. a pandas Series
            or a list). NaN and infinite entries are ignored.
        scale: Factor applied to every value before fitting and plotting. Defaults to ``1e6``.
        xlim: ``(left, right)`` limits of the x axis, in scaled units. Defaults to ``(-2e6, 2e6)``.
        random_state: Forwarded to ``laplace.rvs``; pass an int or generator to make the synthetic
            sample reproducible. ``None`` (default) keeps the previous unseeded behaviour.

    Raises:
        ValueError: If ``dataset`` contains no finite values.
    """
    # Work on a flat float array; non-finite entries would otherwise turn the fitted parameters into NaN
    values = np.asarray(dataset, dtype=float).ravel() * scale
    values = values[np.isfinite(values)]
    if values.size == 0:
        raise ValueError("plot_distribution requires at least one finite value")

    mu = np.mean(values)
    # Robust spread estimate: median absolute deviation around the median
    mad = np.median(np.abs(values - np.median(values)))
    # 0.674 is the Gaussian consistency constant (MAD ~= 0.674 * sigma for normal data).
    # NOTE(review): for a Laplace distribution MAD = b * ln(2) ~= 0.693 * b, so this slightly
    #  overestimates b -- confirm whether the Gaussian constant is intended here.
    b = mad / 0.674
    print(f"mu={mu}, b={b}")
    synthetic_values = laplace.rvs(loc=mu, scale=b, size=values.size, random_state=random_state)

    fig, ax = plt.subplots()
    # NOTE(review): the two histograms use different bin counts, so bar heights are not directly comparable
    ax.hist(values, bins=500, alpha=0.5, label='Original Data')
    ax.hist(synthetic_values, bins=50, alpha=0.5, label='Synthetic Data')
    ax.set_xlabel(f"Value (scaled by {scale:g})")
    ax.set_ylabel("Count")
    ax.set_title("Original data vs. fitted Laplace samples")
    ax.legend()
    ax.set_xlim(xlim)
    plt.show()

# Plot the distribution of the "roc" column of the parsed data frame.
# NOTE(review): the sample rows logged by the parse cell list only open/high/low/close/volume --
#  confirm that `PARSE.get_df` actually provides a "roc" column before relying on this cell.
plot_distribution(df["roc"])
