In [1]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet "tensorflow-hub>=0.7.0"
!pip3 install --quiet seaborn

In [2]:
import itertools
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
from absl import logging
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

In [3]:
# TF-Hub handle of the Universal Sentence Encoder module that is later loaded to embed
# the tweets. The trailing `#@param` annotation (Colab form field) lists the selectable handles.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [4]:
# The encoder is driven through graph/session style APIs (tf.placeholder, hub.Module,
# MonitoredSession), so TF2 behaviour and eager execution are switched off up front.
# `tf` is already the `tensorflow.compat.v1` namespace, hence no extra `compat.v1` prefix.
tf.disable_v2_behavior()
tf.disable_eager_execution()

Instructions for updating:
non-resource variables are not supported in the long term


In [5]:
# Root directory of the local StockNet dataset copy; the preprocessed price and tweet
# folders are resolved relative to this path.
stocknet_dataset_filepath = './stocknet-dataset-master'

In [6]:
# Load the preprocessed StockNet data into two lookups:
#   company_to_price_df: company name -> DataFrame of daily prices (rows with NaN dropped)
#   company_to_tweets:   company name -> {tweet file name (a date string) -> list of tweet dicts}
preprocessed_prices_filepath = stocknet_dataset_filepath + '/price/preprocessed'
preprocessed_tweets_filepath = stocknet_dataset_filepath + '/tweet/preprocessed'

company_to_price_df = {}
company_to_tweets = {}

for filename in sorted(os.listdir(preprocessed_prices_filepath)):
    price_path = os.path.join(preprocessed_prices_filepath, filename)
    # Ignore hidden entries (.DS_Store, .ipynb_checkpoints, ...) and anything that is not a file.
    if filename.startswith('.') or not os.path.isfile(price_path):
        continue
    company_name = filename.split('.')[0]

    # Not enough data for GMRE
    if company_name == 'GMRE':
        continue
    with open(price_path) as file:
        # NOTE(review): read_csv treats the first line as a header, which is then
        # overwritten by the names below. If the files have no header row, the first
        # record of every company is silently lost -- confirm against the dataset.
        df = pd.read_csv(file, sep='\t')
    df.columns = ['date', 'open', 'high', 'low', 'close', 'adjust_close', 'volume']
    company_to_price_df[company_name] = df.dropna()

for filename in tqdm(sorted(os.listdir(preprocessed_tweets_filepath))):
    company_dir = os.path.join(preprocessed_tweets_filepath, filename)
    # Every company is expected to be a directory of per-date files; skip stray entries
    # instead of failing on os.listdir() or registering a bogus company.
    if filename.startswith('.') or not os.path.isdir(company_dir):
        continue
    company_name = filename.split('.')[0]
    dates_to_tweets = {}
    for tweet_filename in sorted(os.listdir(company_dir)):
        if tweet_filename.startswith('.'):
            continue
        # One JSON document per line; blank lines would make json.loads raise.
        with open(os.path.join(company_dir, tweet_filename), encoding='utf-8') as file:
            dates_to_tweets[tweet_filename] = [json.loads(line) for line in file if line.strip()]
    company_to_tweets[company_name] = dates_to_tweets

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm_notebook(os.listdir(preprocessed_tweets_filepath)):


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




In [8]:
# Sanity check: show the first stored tweet for AAPL on 2015-10-02.
# The printed record is a dict with a tokenised 'text' list plus 'created_at' and 'user_id_str'.
print(company_to_tweets['AAPL']['2015-10-02'][0])

{'text': ['rt', '$', 'tsla', 'hft', 'algos', 'triggered', 'buy', 'in', 'sigma-x', ',', 'crossfinder', ',', 'ats', ',', 'lx', '@', '08:28', ',', 'p', '/', 't', '245.00', 'quant', '$', 'msft', '$', 'fb', '$', 'gpro', '$', 'amzn', '$', 'goog', '$', 'aapl', '$', 'nflx', '$', 'qqq'], 'created_at': 'Fri Oct 02 12:29:15 +0000 2015', 'user_id_str': '242469235'}


In [9]:
# Reduce logging output.
logging.set_verbosity(logging.ERROR)
# `logging` is absl.logging here, and its ERROR constant lives on absl's own verbosity
# scale rather than the standard-library one. tf.get_logger() returns a standard-library
# logger, so it must be given TensorFlow's (standard) ERROR level instead.
tf.get_logger().setLevel(tf.compat.v1.logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Import the Universal Sentence Encoder's TF Hub module
def embed_useT(module):
    """Build a sentence-embedding function backed by a TF-Hub module.

    A dedicated graph is created holding a string placeholder, the hub module and the
    module's output tensor. A MonitoredSession is opened on that graph and captured by
    the returned function, so the module is loaded once and reused for every call.

    Args:
        module: TF-Hub module handle (URL or path) passed to ``hub.Module``.

    Returns:
        A callable taking a batch of sentences ``x`` and returning the result of running
        the module's output tensor on it.

    Note:
        The session is never closed explicitly; it lives as long as the returned
        callable is referenced.
    """
    with tf.Graph().as_default():
        sentences = tf.placeholder(tf.string)
        embed = hub.Module(module)
        embeddings = embed(sentences)
        # Must be created while the graph above is the default graph.
        session = tf.train.MonitoredSession()

    def embed_sentences(x):
        return session.run(embeddings, {sentences: x})

    return embed_sentences
embed_fn = embed_useT(module_url)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


In [10]:
# Generate embeddings
# All tweets of one (company, date) pair are embedded in a single embed_fn call and the
# resulting vector is stored on each tweet dict under the 'embedding' key.
for company in tqdm(company_to_tweets.keys()):
    for tweets in company_to_tweets[company].values():
        if not tweets:
            # Nothing to embed; avoids feeding an empty batch to the encoder.
            continue
        # Tweets are stored tokenised; the encoder is fed whitespace-joined strings.
        messages = [' '.join(tweet['text']) for tweet in tweets]
        message_embeddings = embed_fn(messages)
        for tweet, embedding in zip(tweets, message_embeddings):
            tweet['embedding'] = list(embedding)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for company in tqdm_notebook(company_to_tweets.keys()):


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




In [35]:
# Create date mapping
# Collect every date that appears in any price table or any tweet folder.
date_universe = set()
for price_df in company_to_price_df.values():
    date_universe.update(price_df.date)
for tweets_by_date in company_to_tweets.values():
    date_universe.update(tweets_by_date.keys())
date_universe = sorted(date_universe)
# Positions are shifted by -5, so the five earliest dates get indices -5..-1 and the
# sixth date is index 0.
index_to_date = dict(enumerate(date_universe, start=-5))
date_to_index = {day: position for position, day in index_to_date.items()}

In [13]:
# Calculate dimensions for tensor
n_stocks = len(company_to_tweets)
n_days = len(date_universe)
# Largest number of tweets stored for any single (company, date) pair; 0 if there are none.
max_tweets = max(
    (
        len(company_to_tweets[c][d])
        for c in company_to_tweets.keys()
        for d in date_universe
        if d in company_to_tweets[c]
    ),
    default=0,
)
# Create index mapping for stocks alphabetically
company_to_index = {name: position for position, name in enumerate(sorted(company_to_tweets.keys()))}
# print dimensions
print(n_stocks, n_days, max_tweets, sep='\n')

87
1473
555


In [14]:
# Construct tensors
# price_tensor axes: (stock, day, lag slot, price feature) -- 6 lag slots x 3 features.
# smi_tensor axes:   (stock, day, lag slot, tweet slot, embedding component) -- up to
#                    max_tweets tweets per slot; the last axis is fixed at 512
#                    (assumed to equal the encoder's output width -- TODO confirm).
# The day axis has n_days-5 entries because date_to_index shifts positions by -5.
# NOTE(review): np.zeros allocates float64 by default. With the dimensions printed above
# (87 stocks, 1473 days, 555 tweets) smi_tensor has about 2.2e11 elements, i.e. roughly
# 1.7 TB; this only works while the OS hands out zero pages lazily. Consider float32
# and/or a sparse or ragged layout.
price_tensor = np.zeros((n_stocks, n_days-5, 6, 3))
smi_tensor = np.zeros((n_stocks, n_days-5, 6, max_tweets, 512))

In [38]:
# Price tensor
# For every company and every one of its trading days from the sixth onwards, store a
# window of six consecutive trading days (the five preceding days plus the day itself,
# oldest first) with three features per day:
#   price_tensor[stock, day, lag, feature], feature order = high / low / adjust_close.
# The first five trading days of a company only ever appear as lags.
price_features = ['high', 'low', 'adjust_close']
for company in tqdm(company_to_price_df.keys()):
    company_index = company_to_index[company]
    # Sort once and slice windows out of a plain array instead of scanning the whole
    # table for every date. Assumes each date occurs at most once per company.
    prices_sorted = company_to_price_df[company].sort_values('date', kind='mergesort')
    dates = list(prices_sorted['date'])
    feature_rows = prices_sorted[price_features].values.astype(float)
    for end in range(5, len(dates)):
        date_index = date_to_index[dates[end]]
        #stocks, day, lags, hi/lo/close
        price_tensor[company_index, date_index, :, :] = feature_rows[end - 5:end + 1]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for company in tqdm_notebook(company_to_price_df.keys()):


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




In [39]:
# SMI tensor
# For every company and every one of its tweet dates from the sixth onwards, store the
# tweet embeddings of a six-date window (the five preceding tweet dates plus the date
# itself, oldest first):
#   smi_tensor[stock, day, lag, tweet, embedding component].
# Unused tweet slots keep whatever the tensor held before (zeros after allocation).
# NOTE(review): the lags step over dates that have tweets for this company, which need
# not be the same days as the trading-day lags used for the price tensor -- confirm
# this is intended.
for company in tqdm(company_to_tweets.keys()):
    company_index = company_to_index[company]
    tweets_by_date = company_to_tweets[company]
    dates = sorted(tweets_by_date.keys())
    # Stack each date's embeddings once; every date is reused by up to six windows.
    embeddings_by_date = {
        day: np.asarray([tweet['embedding'] for tweet in tweets_by_date[day]])
        for day in dates
    }
    for end in range(5, len(dates)):
        date_index = date_to_index[dates[end]]
        for lag, lag_date in enumerate(dates[end - 5:end + 1]):
            day_embeddings = embeddings_by_date[lag_date]
            if len(day_embeddings):
                #stocks, day, lags, tweet, embedding
                smi_tensor[company_index, date_index, lag, :len(day_embeddings), :] = day_embeddings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for company in tqdm_notebook(company_to_tweets.keys()):


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




In [None]:
# Persist both tensors as compressed .npz archives in the working directory. The arrays
# are passed positionally, so each archive stores its array under NumPy's default key
# ('arr_0').
np.savez_compressed('prices.npz', price_tensor)
np.savez_compressed('smi.npz', smi_tensor)
# Alternative HDF5 export, kept for reference only. It was previously a bare string
# literal, which the notebook evaluated and echoed as this cell's output.
# import h5py
# with h5py.File('prices.h5', 'w') as hf:
#     hf.create_dataset("prices",  data=price_tensor)
# with h5py.File('smi.h5', 'w') as hf:
#     hf.create_dataset("smi",  data=smi_tensor)