In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tqdm import tqdm
import tensorflow.keras.backend as K
import os
import time
import pandas as pd
import numpy as np
import psutil
# Silence warnings for the whole kernel session. Note that filterwarnings("ignore")
# with no category suppresses *every* warning, including deprecation notices
# from pandas/h5py/TensorFlow, not only the harmless ones.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Choose the compute device: with no visible GPU, enable XLA JIT compilation
# for the CPU run; otherwise let TensorFlow grow GPU memory on demand instead
# of reserving it all up front.
gpu_devices = tf.config.list_physical_devices('GPU')

if not gpu_devices:
    print('Using CPU')
    tf.config.optimizer.set_jit(True)
else:
    print('Using GPU')
    for gpu in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# psutil is already imported in the notebook's import cell.
# Read the memory statistics once so both printed figures describe the same
# snapshot (two separate virtual_memory() calls can disagree).
memory_snapshot = psutil.virtual_memory()
print('used: {}% free: {:.2f}GB'.format(memory_snapshot.percent,
                                        memory_snapshot.free / 1024**3))

In [None]:
from pathlib import Path
# Location of the local store file, relative to the kernel's working directory.
# NOTE(review): the ".h5" suffix suggests an HDF5 store — confirm where it is read/written.
DATA_STORE = Path('model_data.h5')

In [None]:
# Get News headlines
def get_news():
    """Load the news headlines of every ticker from the S3-hosted HDF5 file.

    Reads ``<ticker>/table`` for each top-level group of
    ``charlanguagemodeldata/news_db.h5``, keeps the creation time, headline
    text and ticker, removes rows that repeat a (ticker, timestamp) pair at
    microsecond resolution, truncates timestamps to whole seconds and appends
    the end marker to every headline.

    Credentials are deliberately not passed in code: s3fs/botocore resolve
    them from the standard chain (``AWS_ACCESS_KEY_ID`` /
    ``AWS_SECRET_ACCESS_KEY`` environment variables, the shared credentials
    file, or an instance role).

    Returns:
        pandas.DataFrame: indexed by ``time`` (ascending, second resolution)
        with columns ``ticker`` and ``headline``. Empty, with the same
        layout, when the file contains no ticker groups.
    """
    import h5py
    import s3fs

    s3 = s3fs.S3FileSystem(anon=False,
                           client_kwargs={'region_name': 'us-west-2'})
    frames = []
    # Open the remote object in its own context so the S3 handle is closed too.
    with s3.open("charlanguagemodeldata/news_db.h5", 'rb') as remote:
        with h5py.File(remote, 'r', lib_version='latest') as f:
            for ticker in tqdm(list(f.keys())):
                # Dataset.value was removed in h5py 3.0; [()] reads the whole table.
                table = pd.DataFrame(f[ticker + '/table'][()])
                table = (table[['versionCreated', 'text', 'ticker']]
                         .rename(columns={'versionCreated': 'time',
                                          'text': 'headline'}))
                # HDF5 byte strings arrive as object columns holding bytes.
                for column in table.columns[table.dtypes == object]:
                    table[column] = table[column].str.decode('utf-8')
                table['time'] = pd.to_datetime(table['time'], unit='ns')
                frames.append(table.set_index('time'))

    if not frames:
        return pd.DataFrame(columns=['ticker', 'headline'],
                            index=pd.DatetimeIndex([], name='time'))

    # One concat instead of DataFrame.append per ticker (quadratic, and
    # removed in pandas 2.0).
    news = pd.concat(frames)
    # Compare timestamps at microsecond resolution when dropping duplicates.
    news.index = news.index.floor('us')
    news = news.sort_index()
    # Drop rows with an identical (ticker, datetime) pair, keeping the first.
    news = news.reset_index().set_index(['ticker', 'time'])
    news = news[~news.index.duplicated()].reset_index().set_index('time')
    # The final index is truncated to whole seconds.
    news.index = news.index.floor('s')
    # NOTE(review): the original prepends an empty start token and appends the
    # literal "<\s" as the end token; the intended markers may have been
    # stripped from the source — confirm before training on this text.
    news['headline'] = '' + news['headline'] + '<\\s'
    return news

def get_prices(interval):
    """Load Open/Close prices for one bar interval from the S3-hosted HDF5 file.

    Reads ``prices/<interval>/table`` from
    ``charlanguagemodeldata/universe.h5``, keeps rows from 2019 onwards and
    drops repeated (ticker, time) pairs.

    SECURITY: an AWS access key and secret used to be embedded here in plain
    text. They have been removed from the code and should be rotated.
    s3fs/botocore now resolve credentials from the standard chain
    (``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` environment variables,
    the shared credentials file, or an instance role).

    Args:
        interval: name of the group under ``prices`` to read.

    Returns:
        pandas.DataFrame: indexed by (``ticker``, ``time``), sorted, with
        columns ``Open`` and ``Close``.

    Raises:
        KeyError: if ``interval`` is not a group under ``prices``; the message
            lists the intervals that are available.
    """
    import h5py
    import s3fs

    idx = pd.IndexSlice
    get_columns = ['time', 'ticker', 'Open', 'Close']
    s3 = s3fs.S3FileSystem(anon=False,
                           client_kwargs={'region_name': 'us-west-2'})
    # Open the remote object in its own context so the S3 handle is closed too.
    with s3.open("charlanguagemodeldata/universe.h5", 'rb') as remote:
        with h5py.File(remote, 'r', lib_version='latest') as f:
            available = list(f['prices'].keys())
            if interval not in available:
                raise KeyError('unknown price interval {!r}; available: {}'
                               .format(interval, available))
            # Dataset.value was removed in h5py 3.0; [()] reads the whole table.
            prices = pd.DataFrame(f['prices/' + interval + '/table'][()],
                                  columns=get_columns)

    # HDF5 byte strings arrive as object columns holding bytes.
    for column in prices.columns[prices.dtypes == object]:
        prices[column] = prices[column].str.decode('utf-8')
    prices['time'] = pd.to_datetime(prices['time'], unit='ns')
    prices = prices.set_index(['ticker', 'time'])
    # Sort on both levels first: slicing the time level requires a fully
    # lexsorted MultiIndex, whatever order the rows were stored in.
    prices = prices.sort_index().loc[idx[:, '2019':], :]
    return prices[~prices.index.duplicated()]

In [None]:
news = get_news()
# Preview the first and last rows. DataFrame.append was removed in pandas 2.0;
# pd.concat produces the same frame.
pd.concat([news.head(), news.tail()])