In [1]:
# Install TensorFlow. The constraint is only a lower bound (>=1.7), so pip may
# resolve to any newer release; later cells import `tensorflow.compat.v1`.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub, used below to load the Universal Sentence Encoder module.
!pip3 install --quiet "tensorflow-hub>=0.7.0"
# Install seaborn (imported below as `sns`).
!pip3 install --quiet seaborn
# Install pandas-market-calendars, which supplies the NYSE trading calendar.
!pip3 install --quiet pandas-market-calendars
# NOTE(review): `torch` and `tqdm` are imported below but not installed here;
# presumably they are preinstalled in the runtime - confirm.

In [2]:
from absl import logging

import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import seaborn as sns
import json
import itertools
import pandas as pd
import torch
import pandas_market_calendars as mcal
import datetime
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

In [3]:
# TF-Hub handle of the Universal Sentence Encoder module that is later passed to embed_useT().
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [4]:
# Switch TensorFlow to TF1-style behavior; this runs before the embedding graph is built.
tf.disable_v2_behavior()
# NOTE(review): `tf` is already `tensorflow.compat.v1` (see the import cell), so this call
# goes through `compat.v1` a second time; presumably redundant with the line above - confirm.
tf.compat.v1.disable_eager_execution()

Instructions for updating:
non-resource variables are not supported in the long term


In [5]:
# Root directory of the dataset; the price and tweet sub-paths are derived from it below.
stocknet_dataset_filepath = './stocknet-dataset-master'
# Inclusive date window, written as 'YYYY-MM-DD' strings. Price rows are kept with >= / <=
# and tweet files are skipped when their file name compares below/above these bounds.
start_date = '2014-01-01'
end_date = '2016-01-01'

In [6]:
# Memo of already-resolved lookups, keyed by the `start_day` value exactly as passed in.
cache = {}
calendar = mcal.get_calendar('NYSE')
def next_trading_day(start_day=None, SAFE_DELTA = 4):
    """Return the first NYSE trading day strictly after ``start_day``.

    Args:
        start_day: Anything ``pd.to_datetime`` accepts (e.g. a 'YYYY-MM-DD'
            string or a ``datetime.date``). Defaults to today's UTC date.
        SAFE_DELTA: Initial number of calendar days after ``start_day`` to
            search. The window is doubled until a trading day is found.

    Returns:
        The trading day formatted as a 'YYYY-MM-DD' string.

    Raises:
        IndexError: If the calendar reports no trading day within roughly a
            year after ``start_day``.

    Results are memoized in the module-level ``cache``.
    """
    if start_day is None:
        start_day = datetime.datetime.now(datetime.timezone.utc).date()
    if start_day in cache:
        return cache[start_day]
    # Begin the search on the following calendar day and take the first session
    # found. Taking position 1 of a window that starts on start_day itself is
    # only correct when start_day is a trading day; for a weekend or holiday it
    # skips the first session after it (e.g. Saturday -> Tuesday, not Monday).
    first_candidate = pd.to_datetime(start_day) + pd.Timedelta(days=1)
    span = max(int(SAFE_DELTA), 1)
    while True:
        last_candidate = first_candidate + pd.Timedelta(days=span - 1)
        business_days = calendar.valid_days(start_date=first_candidate, end_date=last_candidate)
        if len(business_days) > 0:
            break
        if span > 366:
            raise IndexError(
                "no trading day found within %d days after %s" % (span, start_day))
        # Unusually long closure: widen the window instead of failing outright.
        span *= 2
    next_day = business_days[0].date().strftime("%Y-%m-%d")
    cache[start_day] = next_day
    return next_day

In [7]:
preprocessed_prices_filepath = stocknet_dataset_filepath + '/price/preprocessed'
preprocessed_tweets_filepath = stocknet_dataset_filepath + '/tweet/preprocessed'

# ticker -> DataFrame of price rows inside [start_date, end_date]
company_to_price_df = {}
# ticker -> {trading day 'YYYY-MM-DD' -> list of tweet dicts assigned to that day}
company_to_tweets = {}

# Listings are sorted so that a re-run visits files in the same order
# (os.listdir makes no ordering guarantee).
for filename in sorted(os.listdir(preprocessed_prices_filepath)):
    if filename.startswith('.'):
        # Skip hidden files such as .DS_Store.
        continue
    company_name = filename.split('.')[0]

    # Not enough data for GMRE
    if company_name == 'GMRE':
        continue
    with open(os.path.join(preprocessed_prices_filepath, filename)) as file:
        # NOTE(review): read_csv is called without header=None, so the first line of each
        # file is consumed as a header row and then renamed below - confirm that the files
        # really start with a header line and that these seven names match their columns.
        df = pd.read_csv(file, sep='\t')
    df.columns = ['date', 'open', 'high', 'low', 'close', 'adjust_close', 'volume']
    mask = (df['date'] >= start_date) & (df['date'] <= end_date)
    df = df.loc[mask]
    company_to_price_df[company_name] = df.dropna()

for filename in tqdm(sorted(os.listdir(preprocessed_tweets_filepath))):
    company_dir = os.path.join(preprocessed_tweets_filepath, filename)
    if not os.path.isdir(company_dir):
        # Only per-company directories hold tweet files; ignore stray files.
        continue
    company_name = filename.split('.')[0]
    dates_to_tweets = {}
    # Sorted so tweets from several calendar days that land on the same trading
    # day are always concatenated in chronological file order.
    for tweet_filename in sorted(os.listdir(company_dir)):
        # File names are compared as strings against the 'YYYY-MM-DD' window bounds.
        if tweet_filename < start_date or tweet_filename > end_date:
            continue
        with open(os.path.join(company_dir, tweet_filename), encoding='utf-8') as file:
            # One JSON document per line; blank lines are ignored.
            list_of_tweets = [json.loads(line) for line in file if line.strip()]
        # Tweets are bucketed under the trading day returned by next_trading_day().
        date_idx = next_trading_day(tweet_filename)
        dates_to_tweets.setdefault(date_idx, []).extend(list_of_tweets)
    company_to_tweets[company_name] = dates_to_tweets

HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




In [8]:
# Spot check: show the first tweet record stored under one AAPL date bucket.
# Uncomment to list the loaded tickers, or the dates of the last company processed
# (dates_to_tweets is the leftover loop variable from the loading cell):
#print(company_to_tweets.keys())
#print(dates_to_tweets.keys())
print(company_to_tweets['AAPL']['2015-10-02'][0])

{'text': ['apple', 'releases', 'ios', '9.0', '.', '2', 'with', 'bug', 'fixes', ',', 'performance', 'improvements', '-', 'URL', '-', '$', 'aapl', 'URL'], 'created_at': 'Thu Oct 01 09:57:36 +0000 2015', 'user_id_str': '229597766'}


In [9]:
# Reduce logging output.
# absl's logger takes absl's own verbosity constants.
logging.set_verbosity(logging.ERROR)
# `logging` here is absl.logging, whose ERROR constant is an absl verbosity value rather than
# a standard-library level. The TensorFlow logger is a standard-library Logger, so it must be
# given TensorFlow's (stdlib-compatible) ERROR level instead.
tf.get_logger().setLevel(tf.compat.v1.logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Wrap a Universal Sentence Encoder TF-Hub module in a plain Python callable.
def embed_useT(module):
    """Build an embedding function for the TF-Hub module at ``module``.

    A dedicated graph is created containing a string placeholder, the hub
    module and its output; a MonitoredSession is opened on that graph. The
    returned callable feeds its argument to the placeholder and returns the
    result of running the module output. The session is not closed here, so it
    stays open for as long as the returned callable is in use.
    """
    graph = tf.Graph()
    with graph.as_default():
        text_input = tf.placeholder(tf.string)
        encoder = hub.Module(module)
        encoded = encoder(text_input)
        session = tf.train.MonitoredSession()

    def run_embedding(x):
        return session.run(encoded, {text_input: x})

    return run_embedding
embed_fn = embed_useT(module_url)

In [10]:
# Generate embeddings: one encoder call per (company, date) bucket; the resulting
# vector is stored on each tweet record under the 'embedding' key.
for company in tqdm(company_to_tweets.keys()):
    for date, day_tweets in company_to_tweets[company].items():
        # Tweets are stored tokenized; rebuild a single string per tweet.
        messages = [' '.join(tweet['text']) for tweet in day_tweets]
        message_embeddings = embed_fn(messages)
        for k, tweet in enumerate(day_tweets):
            tweet['embedding'] = list(message_embeddings[k])

HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




In [11]:
# Create date mapping
# Gather every date that occurs in a price table or as a tweet bucket key.
date_universe = set()
for price_df in company_to_price_df.values():
    date_universe.update(price_df['date'])
for tweets_by_date in company_to_tweets.values():
    date_universe.update(tweets_by_date.keys())
date_universe = sorted(date_universe)

# Both lookups are shifted by 5, so the sixth-earliest date receives index 0
# and the first five dates receive negative indices.
index_to_date = {}
date_to_index = {}
for position, day in enumerate(date_universe):
    index_to_date[position - 5] = day
    date_to_index[day] = position - 5

In [12]:
# Calculate dimensions for tensor
n_stocks = len(company_to_tweets)
n_days = len(date_universe)
# Largest number of tweets stored under any single (company, date) pair; 0 if there are none.
max_tweets = max(
    (len(company_to_tweets[c][d])
     for c in company_to_tweets
     for d in date_universe
     if d in company_to_tweets[c]),
    default=0,
)
# Create index mapping for stocks alphabetically
company_to_index = {name: position for position, name in enumerate(sorted(company_to_tweets))}
# print dimensions
for dimension in (n_stocks, n_days, max_tweets):
    print(dimension)

87
505
555


In [13]:
# Construct tensors
# Zero-filled arrays with n_days-5 entries along the second axis and a fixed axis of length 6;
# presumably one 6-day window per start day - confirm. The last axis holds 3 price values
# (price_tensor) or a 512-wide embedding per tweet slot (smi_tensor).
# NOTE(review): neither array is referenced again in the visible cells (StockDataset allocates
# its own storage). With the dimensions printed above (87, 505, 555), smi_tensor requests
# 87*500*6*555*512 float64 values, roughly 593 GB - confirm whether this cell is still needed.
price_tensor = np.zeros((n_stocks, n_days-5, 6, 3))
smi_tensor = np.zeros((n_stocks, n_days-5, 6, max_tweets, 512))

In [14]:
class StockDataset(Dataset):
    """Sliding-window dataset over per-stock prices and tweet embeddings.

    Item ``i`` covers the ``window`` consecutive dates starting at position
    ``i`` of ``date_universe``, for all stocks at once.

    Args:
        company_to_price_df: Mapping ticker -> DataFrame with at least the
            columns 'date', 'high', 'low' and 'adjust_close'.
        company_to_tweets: Mapping ticker -> {date -> list of tweet dicts},
            each tweet dict carrying an 'embedding' sequence. The sorted keys
            of this mapping define the stock axis.
        date_universe: Ordered sequence of dates; its order defines the day axis.
        n_days: Length of the day axis.
        n_stocks: Length of the stock axis.
        max_tweets: Number of tweet slots reserved per (stock, day).
        window: Number of consecutive days per item (default 6).
        embedding_dim: Width of one tweet embedding (default 512).

    Raises:
        KeyError: If a price table refers to a ticker missing from
            ``company_to_tweets`` or any date is missing from ``date_universe``.
        IndexError: If a (stock, day) bucket holds more than ``max_tweets`` tweets.
    """

    def __init__(self, company_to_price_df, company_to_tweets, date_universe, n_days, n_stocks, max_tweets,
                 window=6, embedding_dim=512):
        # Initialize class members
        self.n_stocks = n_stocks
        self.n_days = n_days
        self.max_tweets = max_tweets
        self.window = window
        self.embedding_dim = embedding_dim
        company_to_index = {c: i for i, c in enumerate(sorted(company_to_tweets))}
        date_to_index = {d: i for i, d in enumerate(date_universe)}

        # Price tensor: (n_stocks, n_days, 3) holding high, low, adjust_close.
        self.price_data = np.zeros((n_stocks, n_days, 3))
        for company, df in company_to_price_df.items():
            if len(df) == 0:
                continue
            # Plain dict lookups so an unknown date or ticker still raises KeyError.
            d_indices = [date_to_index[d] for d in df['date']]
            c_index = company_to_index[company]
            # One vectorized write per company instead of a Python loop over rows.
            self.price_data[c_index, d_indices, :] = np.asarray(
                df[['high', 'low', 'adjust_close']], dtype=float)

        # Tweet tensor: (n_stocks, n_days, max_tweets, embedding_dim); unused slots stay zero.
        self.smi_data = np.zeros((n_stocks, n_days, max_tweets, embedding_dim))
        # Number of real tweets per (stock, day), so consumers can mask the zero padding.
        self.tweet_counts = np.zeros((n_stocks, n_days))
        for company, tweets_by_date in company_to_tweets.items():
            c_index = company_to_index[company]
            for date, day_tweets in tweets_by_date.items():
                d_index = date_to_index[date]
                n_tweets = len(day_tweets)
                if n_tweets > max_tweets:
                    raise IndexError(
                        "%d tweets for %s on %s exceed max_tweets=%d"
                        % (n_tweets, company, date, max_tweets))
                self.tweet_counts[c_index, d_index] = n_tweets
                if n_tweets == 0:
                    continue
                # Axes: stock, day, tweet slot, embedding
                self.smi_data[c_index, d_index, :n_tweets, :] = np.asarray(
                    [tweet['embedding'] for tweet in day_tweets])

    def __len__(self):
        """Number of full windows that fit on the day axis (never negative)."""
        return max(self.n_days - self.window + 1, 0)

    def __getitem__(self, idx):
        """Return the window starting at day position ``idx``.

        Returns a dict with
          'price':    array of shape (n_stocks, window, 3)
          'smi':      array of shape (n_stocks, window, max_tweets, embedding_dim)
          'n_tweets': array of shape (n_stocks, window)
        The arrays are slices (views) of the dataset's storage.

        Raises:
            IndexError: If ``idx`` is outside ``range(-len(self), len(self))``.
                Raising here is also what lets plain iteration over the
                dataset terminate.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        n_windows = len(self)
        if idx < 0:
            # Python-style negative indexing counts from the last window.
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError("window index out of range")

        stop = idx + self.window
        # construct output
        output = {
            'price': self.price_data[:, idx:stop, :],
            'smi': self.smi_data[:, idx:stop, :, :],
            'n_tweets': self.tweet_counts[:, idx:stop],
        }
        return output

In [15]:
# Build the dataset from the loaded price tables, the tweets (which now carry an 'embedding'
# entry) and the dimensions computed above.
price_dataset = StockDataset(company_to_price_df, company_to_tweets, date_universe, n_days, n_stocks, max_tweets)

In [16]:
# Serve the dataset in shuffled batches of 4 items; num_workers=0 loads in the main process.
dataloader = DataLoader(price_dataset, batch_size=4,
                        shuffle=True, num_workers=0)

In [17]:
# Loop over the loader, disabled by wrapping it in a string literal. As the cell's last
# expression, the string itself is echoed as the cell output instead of being executed.
"""for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch)
    print(sample_batched)"""

'for i_batch, sample_batched in enumerate(dataloader):\n    print(i_batch)\n    print(sample_batched)'