In [227]:
#!/usr/bin/env python

# Python Standard Library Modules
import os
import random
import pathlib
import sys
import warnings

# External Libraries
from gluonts.dataset import common
from gluonts.evaluation.backtest import make_evaluation_predictions
from gluonts.model import deepar
from gluonts.mx.trainer import Trainer
from hyperopt import fmin, hp, tpe, STATUS_OK, STATUS_FAIL, Trials
from hyperopt.pyll import scope
import matplotlib
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import seasonal_decompose

warnings.filterwarnings("ignore")
mx.random.seed(0)
np.random.seed(0)

prediction_length = 30
# validation_length = 30
# if validation_length:
#     prediction_length = prediction_length + validation_length

### BIOTECH

In [228]:
gluon_list = []
def covert_yahoo_series_dir(path: str, prediction_length: int, gluon_list=None) -> list:
    """Clean and load all biotech histories in the Yahoo biotech folder

    Params:
        path: folder full of historical crypto coins timeseries data
        prediction_length: length on which to predict
    Returns:
        List of Gluon-compatible dicts from the coin data
    """
    if gluon_list is not None:
        print("NOT NONE")
        gluon_list = gluon_list
    else:
        gluon_list = []
    for file in os.listdir(path):
        print(file)
        stock_gluon_dict = dict()
        file_path = path + file
        stock = pd.read_csv(file_path)
        stock["Date"] = pd.to_datetime(stock["Date"])
        stock.set_index("Date", inplace=True)
        if len(stock) < 100:
            continue
        total_na_vals = stock.isna().sum()[0]
#         print(f"TOTAL NAs BEFORE ASFREQ B for {file} is {total_na_vals}")
        stock = stock.asfreq("B")
        total_na_vals = stock.isna().sum()[0]
#         print(f"TOTAL NAs for {file} is {total_na_vals}")
        if (total_na_vals / len(stock)) > 0.25:
            print(f"CONTINUED {file}")
            continue
        # Get values for ListDatasets
        stock_closes = stock["Close"]
        stock_closes.index = pd.DatetimeIndex(stock_closes.index)
#         stock_closes.dropna(inplace=True)
#         coin_closes.fillna(method='bfill', inplace=True)
#         coin_closes.dropna(inplace=True)
        stock_closes = stock_closes.asfreq("B")
#         stock_closes.dropna(inplace=True)
        start = stock_closes.index[0]
#         stock_closes = stock_closes.reset_index()

        stock_gluon_dict["test"] = {
            "start": start,
            "target": stock_closes,
            "name": file,
        }

        stock_gluon_dict["train"] = {
            "start": start,
            "target": stock_closes[:-prediction_length],
            "name": file,
        }

        gluon_list.append(stock_gluon_dict)

    return gluon_list


gluon_list = covert_yahoo_series_dir("../data/historical_yahoo_biotech/", prediction_length=30)

TYRA.csv
NAUT.csv
BBIO.csv
VIR.csv
JNCE.csv
NBIX.csv
VRTX.csv
LVTX.csv
PSTX.csv
HARP.csv
BCYC.csv
CNTA.csv
SGMO.csv
ITCI.csv
KNSA.csv
MTEM.csv
OLMA.csv
SGTX.csv
MYOV.csv
EDIT.csv
NKTX.csv
IBRX.csv
OMGA.csv
REPL.csv
PHVS.csv
COGT.csv
FGEN.csv
ERAS.csv
RXDX.csv
RFL.csv
TECH.csv
PGEN.csv
ABOS.csv
RAIN.csv
NXTC.csv
KRON.csv
ANAB.csv
DNLI.csv
RPRX.csv
CYTK.csv
OPT.csv
FDMT.csv
GRTS.csv
GRCL.csv
RARE.csv
SNDX.csv
TPTX.csv
ERYP.csv
ALNY.csv
FREQ.csv
LYEL.csv
KURA.csv
GMAB.csv
MESO.csv
QURE.csv
SANA.csv
TSHA.csv
STOK.csv
ADCT.csv
MIST.csv
PASG.csv
ATRA.csv
KLDO.csv
DCPH.csv
ANNX.csv
APRE.csv
BLU.csv
NVAX.csv
MDGL.csv
SPRB.csv
ZYME.csv
CABA.csv
APLS.csv
VERV.csv
KDMN.csv
MGTX.csv
NUVB.csv
PCVX.csv
BMEA.csv
MRVI.csv
IPSC.csv
PTCT.csv
DCAL.BO.csv
ALGS.csv
SEER.csv
DSGN.csv
RCUS.csv
ENTA.csv
PRAX.csv
VOR.csv
BCEL.csv
VIRX.csv
DRNA.csv
MOR.csv
NGM.csv
KNTE.csv
AKUS.csv
ALKS.csv
JAZZ.csv
PLRX.csv
GTHX.csv
ABSI.csv
ARVN.csv
ADPT.csv
YMAB.csv
PMVP.csv
RGNX.csv
BMRN.csv
TCRR.csv
CYT.csv
INZY.csv
GOSS.c

In [229]:
gluon_list = covert_yahoo_series_dir("/Users/dan/projects/gluon/data/historical_yahoo_tech_big_movers/", prediction_length=30, gluon_list=gluon_list)


NOT NONE
ZG.csv
MSFT (1).csv
INTC.csv
PYPL.csv
ATVI.csv
EA.csv
AMD.csv
MTCH.csv
NVDA.csv
FB.csv
TTD.csv
AAPL (1).csv
TSLA.csv
AMZN (1).csv
YELP.csv
GOOG (1).csv
BABA.csv
CRM.csv


In [230]:
warnings.filterwarnings("always")

import mxnet as mx
mx.random.seed(0)
np.random.seed(0)


test_data = common.ListDataset(
    [
    ],
    freq="B",
)

train_data = common.ListDataset(
    [
    ],
    freq="B",
)

for stock_gluon_dict in gluon_list:
    test_data.list_data.append(stock_gluon_dict['test'])
    train_data.list_data.append(stock_gluon_dict['train'])

In [231]:
len(train_data.list_data)

236

In [232]:
trainer = Trainer(epochs=50, batch_size=256, learning_rate=0.0001)
estimator = deepar.DeepAREstimator(
        freq="B", 
#         num_cells=150,
#         num_layers=3,
        prediction_length=prediction_length, 
        trainer=trainer,
    )

predictor = estimator.train(
    training_data=train_data,
)

random.seed(0)
mx.random.seed(0)
np.random.seed(0)
global_loss = 0
predictions = predictor.predict(train_data.list_data)




for index, value in enumerate(range(len(gluon_list))):
    prediction = next(predictions)
    name = test_data.list_data[index]['name']
    full_actual = test_data.list_data[index]['target']
    full_actual.dropna(inplace=True)
    actual = full_actual[-30:]
    preds = pd.Series(prediction.mean)
    preds.index = actual.index
#     mse = mean_squared_error(actual, preds)
    
    # PLOT ALL BIOTECH PREDICTIONS
#     plt.figure()
#     preds.plot(legend=True, label=f"{name} PREDICTED")
#     actual.plot(legend=True, label=f"{name} ACTUAL")
#     plt.show()

    
    scaler = MinMaxScaler()
    scaled_actual = np.array(actual)
    scaler.fit([scaled_actual])
    scaled_actual = scaler.fit_transform(np.array(scaled_actual[:, np.newaxis]))
    scaled_preds = scaler.transform([preds])
    scaled_preds = scaled_preds.reshape(-1, 1)
    mse = mean_squared_error(scaled_actual, scaled_preds)
#     print(f"mse: {mse}")
    global_loss += mse
    
print(f"global_loss is {global_loss}")
print(f"average global_loss is {global_loss / len(gluon_list)}")

  return init(self, **all_args)
100%|██████████| 50/50 [00:03<00:00, 15.12it/s, epoch=1/50, avg_epoch_loss=4.06]
100%|██████████| 50/50 [00:03<00:00, 16.06it/s, epoch=2/50, avg_epoch_loss=3.33]
100%|██████████| 50/50 [00:03<00:00, 15.93it/s, epoch=3/50, avg_epoch_loss=2.53]
100%|██████████| 50/50 [00:03<00:00, 15.94it/s, epoch=4/50, avg_epoch_loss=2.3]
100%|██████████| 50/50 [00:03<00:00, 16.09it/s, epoch=5/50, avg_epoch_loss=2.2]
100%|██████████| 50/50 [00:03<00:00, 16.04it/s, epoch=6/50, avg_epoch_loss=2.18]
100%|██████████| 50/50 [00:03<00:00, 16.14it/s, epoch=7/50, avg_epoch_loss=2.04]
100%|██████████| 50/50 [00:03<00:00, 16.23it/s, epoch=8/50, avg_epoch_loss=2.05]
100%|██████████| 50/50 [00:03<00:00, 15.84it/s, epoch=9/50, avg_epoch_loss=2.08]
100%|██████████| 50/50 [00:03<00:00, 16.21it/s, epoch=10/50, avg_epoch_loss=2.03]
100%|██████████| 50/50 [00:03<00:00, 16.20it/s, epoch=11/50, avg_epoch_loss=2.11]
100%|██████████| 50/50 [00:03<00:00, 15.47it/s, epoch=12/50, avg_epoch_loss=2

global_loss is 145.4655384213718
average global_loss is 0.6163794000905585


In [201]:
average global_loss is 1.2ish               # 18
average global_loss is 0.8193485594635367   # 36
average global_loss is 1.0578433766058088   # 54
average global_loss is 1.0184960608247695   # 69
average global_loss is 1.0639288802030842   # 86
average global_loss is 0.9494059324316426   # 204
average global_loss is 0.8880175967582968   # 241
average global_loss is 1.9637798520177      # 276 different exchange
average global_loss is 0.9105032333595303   # 240 attempted restore
average global_loss is 0.9554277347125465   # 258 w/ big tech movers
average global_loss is 0.335191223822778   # 258 w/ big tech movers 10 epochs
average global_loss is 0.6163794000905585   # 258 w/ big tech movers 50 epochs
average global_loss is 0.4051840267830506   # 258 wo/ big tech movers 10 epochs

SyntaxError: invalid syntax (<ipython-input-201-02862b3e764e>, line 1)