In [1]:
# Install the latest Tensorflow version.
!pip3 install --quiet "tensorflow>=1.7"
# Install TF-Hub.
!pip3 install --quiet "tensorflow-hub>=0.7.0"
!pip3 install --quiet seaborn

In [28]:
from absl import logging

import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import seaborn as sns
import json
import itertools
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [3]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [4]:
tf.disable_v2_behavior()
tf.compat.v1.disable_eager_execution()

Instructions for updating:
non-resource variables are not supported in the long term


In [5]:
stocknet_dataset_filepath = './stocknet-dataset-master'

In [6]:
preprocessed_prices_filepath = stocknet_dataset_filepath + '/price/preprocessed'
preprocessed_tweets_filepath = stocknet_dataset_filepath + '/tweet/preprocessed'

company_to_price_df = {}
company_to_tweets = {}

for filename in os.listdir(preprocessed_prices_filepath):
    with open(preprocessed_prices_filepath + '/' + filename) as file:
        company_name = filename.split('.')[0]
        
        # Not enough data for GMRE
        if company_name == 'GMRE':
            continue
        df = pd.read_csv(file, sep='\t')
        df.columns = ['date', 'open', 'high', 'low', 'close', 'adjust_close', 'volume']
        company_to_price_df[company_name] = df.dropna()

for filename in tqdm_notebook(os.listdir(preprocessed_tweets_filepath)):
    company_name = filename.split('.')[0]
    dates_to_tweets = {}
    for tweet_filename in os.listdir(preprocessed_tweets_filepath + '/' + filename):
        with open(preprocessed_tweets_filepath + '/' + filename + '/' + tweet_filename) as file:
            list_of_tweets = []
            for line in file:
                tweet_json = json.loads(line)
                list_of_tweets.append(tweet_json)
            dates_to_tweets[tweet_filename] = list_of_tweets
    company_to_tweets[company_name] = dates_to_tweets

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for filename in tqdm_notebook(os.listdir(preprocessed_tweets_filepath)):


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




In [7]:
#print(company_to_tweets.keys())
#print(dates_to_tweets.keys())
print(company_to_tweets['AAPL']['2015-10-02'][0])

{'text': ['rt', '$', 'tsla', 'hft', 'algos', 'triggered', 'buy', 'in', 'sigma-x', ',', 'crossfinder', ',', 'ats', ',', 'lx', '@', '08:28', ',', 'p', '/', 't', '245.00', 'quant', '$', 'msft', '$', 'fb', '$', 'gpro', '$', 'amzn', '$', 'goog', '$', 'aapl', '$', 'nflx', '$', 'qqq'], 'created_at': 'Fri Oct 02 12:29:15 +0000 2015', 'user_id_str': '242469235'}


In [8]:
# Reduce logging output.
logging.set_verbosity(logging.ERROR)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Import the Universal Sentence Encoder's TF Hub module
def embed_useT(module):
    with tf.Graph().as_default():
        sentences = tf.placeholder(tf.string)
        embed = hub.Module(module)
        embeddings = embed(sentences)
        session = tf.train.MonitoredSession()
    return lambda x: session.run(embeddings, {sentences: x})
embed_fn = embed_useT(module_url)

In [9]:
# Generate embeddings
for company in tqdm_notebook(company_to_tweets.keys()):
  for date in company_to_tweets[company].keys():
    messages = []
    for j in range(len(company_to_tweets[company][date])):
      messages.append(' '.join(company_to_tweets[company][date][j]['text']))
    message_embeddings = embed_fn(messages)
    for k in range(len(company_to_tweets[company][date])):
      company_to_tweets[company][date][k]['embedding'] = list(message_embeddings[k])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for company in tqdm_notebook(company_to_tweets.keys()):


HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




In [10]:
# Create date mapping
date_universe = set()
for company in company_to_price_df.keys():
    date_universe = date_universe.union(set(company_to_price_df[company].date))
for company in company_to_tweets.keys():
    date_universe = date_universe.union(set(company_to_tweets[company].keys()))
date_universe = sorted(list(date_universe))
index_to_date = {i-5:d for i,d in enumerate(date_universe)}
date_to_index = {d:i-5 for i,d in enumerate(date_universe)}

In [11]:
# Calculate dimensions for tensor
n_stocks = len(company_to_tweets.keys())
n_days = len(date_universe)
max_tweets = 0
for c,d in itertools.product(company_to_tweets.keys(), date_universe):
    if d in company_to_tweets[c]:
        max_tweets = max(max_tweets, len(company_to_tweets[c][d]))
# Create index mapping for stocks alphabetically
company_to_index = {c:i for i,c in enumerate(sorted(list(company_to_tweets.keys())))}
# print dimensions
print(n_stocks)
print(n_days)
print(max_tweets)

87
1473
555


In [12]:
# Construct tensors
price_tensor = np.zeros((n_stocks, n_days-5, 6, 3))
smi_tensor = np.zeros((n_stocks, n_days-5, 6, max_tweets, 512))

In [93]:
class StockDataset(Dataset):
    """Price dataset"""

    def __init__(self, company_to_price_df, company_to_tweets, date_universe, n_days, n_stocks, max_tweets):
        # Initialize class members
        self.n_stocks = n_stocks
        self.n_days = n_days
        self.max_tweets = max_tweets
        company_to_index = {c:i for i,c in enumerate(sorted(list(company_to_tweets.keys())))}
        date_to_index = {d:i for i,d in enumerate(date_universe)}
        # Get price data tensor: n_stocks, n_days, 3
        self.price_data = np.zeros((n_stocks, n_days, 3))
        for company in company_to_price_df.keys():
            df = company_to_price_df[company]
            for index, row in df.iterrows():
                d_index = date_to_index[row['date']]
                c_index = company_to_index[company]
                self.price_data[c_index, d_index, 0] = row['high']
                self.price_data[c_index, d_index, 1] = row['low']
                self.price_data[c_index, d_index, 2] = row['adjust_close']
        # Get smi data tensor
        self.smi_data = np.zeros((n_stocks, n_days, max_tweets, 512))
        self.tweet_counts = np.zeros((n_stocks, n_days))
        for company in company_to_tweets.keys():
            dates = sorted(list(company_to_tweets[company].keys()))
            for date in dates:
                n_tweets = len(company_to_tweets[company][date])
                tweets = [company_to_tweets[company][date][k]['embedding'] for k in range(n_tweets)]
                c_index = company_to_index[company]
                d_index = date_to_index[date]
                self.tweet_counts[c_index, d_index] = n_tweets
                for i,embedding in enumerate(tweets):
                    #stocks, day, lags, tweet, embedding
                    self.smi_data[c_index, d_index, i, :] = embedding[:]

    def __len__(self):
        return self.n_days-5

    def __getitem__(self, idx):
        """
        gets a price tensor of shape (n_stocks, 6, 3)
        gets a smi tensor of shape (n_stocks, 6, K, 512)
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        
        #price_output = np.zeros((self.n_stocks, 6, 3))
        price_output = self.price_data[:, idx:idx+6, :]
        
        #smi_output = np.zeros((self.n_stocks, 6, self.max_tweets, 512))
        smi_output = self.smi_data[:, idx:idx+6, :, :]
        
        tweet_count = self.tweet_counts[:, idx:idx+6]
        
        # construct output
        output = {'price': price_output, 'smi': smi_output, 'n_tweets': tweet_count}
                
        return output

In [94]:
price_dataset = StockDataset(company_to_price_df, company_to_tweets, date_universe, n_days, n_stocks, max_tweets)

In [102]:
dataloader = DataLoader(price_dataset, batch_size=4,
                        shuffle=True, num_workers=0)

In [103]:
for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch)
    print(sample_batched)

0
{'price': tensor([[[[ 0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000],
          [ 0.0451,  0.0452, -0.5003],
          [ 0.0321,  0.0473,  0.1828],
          [ 0.0347,  0.0369, -1.5201],
          [ 0.0462,  0.0572,  1.5875]],

         [[ 0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000],
          [ 0.0652,  0.0686, -0.2044],
          [ 0.0599,  0.0696, -0.2322],
          [ 0.0800,  0.0841,  0.0557],
          [ 0.0696,  0.0835,  0.1022]],

         [[ 0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000],
          [ 0.0821,  0.0912,  0.1205],
          [ 0.0674,  0.0710, -1.3656],
          [ 0.0749,  0.0854,  0.1309],
          [ 0.0766,  0.1139,  2.0578]],

         ...,

         [[ 0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000],
          [ 0.0609,  0.0672,  0.0377],
          [ 0.0580,  0.0639, -0.3015],
          [ 0.0580,  0.0621, -0.3392],
          [ 0.0712,  0.0860,  1.1213]],

         [[ 0.0000,  0.0000, 

1
{'price': tensor([[[[ 4.7991e-02,  5.3194e-02, -7.9163e-01],
          [ 4.3625e-02,  4.3942e-02, -3.2238e+00],
          [ 4.8463e-02,  6.5396e-02,  1.5546e+00],
          [ 4.5087e-02,  4.9670e-02, -1.8694e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

         [[ 1.1568e-01,  1.2247e-01,  3.5839e-02],
          [ 1.1308e-01,  1.1725e-01, -2.6876e-02],
          [ 1.1621e-01,  1.2090e-01, -8.9630e-03],
          [ 1.2456e-01,  1.2613e-01,  8.9630e-03],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

         [[ 1.0106e-01,  1.0920e-01,  2.7372e-01],
          [ 1.0013e-01,  1.0013e-01, -2.2810e-01],
          [ 9.1108e-02,  9.9425e-02, -9.1240e-02],
          [ 9.6171e-02,  1.0450e-01, -1.8250e-02],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00]],

         ...,

         [[ 7.7209e-02,  8.4041e-02,  2.5106e-01]

2
{'price': tensor([[[[ 0.0793,  0.0926,  1.0009],
          [ 0.0715,  0.0766, -0.4213],
          [ 0.0748,  0.0793, -0.8454],
          [ 0.0756,  0.0846,  0.1223],
          [ 0.0741,  0.0766, -0.6048],
          [ 0.0802,  0.0877,  0.8161]],

         [[ 0.1532,  0.1614,  0.1474],
          [ 0.1495,  0.1577,  0.0693],
          [ 0.1641,  0.1655,  0.0780],
          [ 0.1500,  0.1541, -0.1474],
          [ 0.1527,  0.1559, -0.1907],
          [ 0.1545,  0.1545, -0.0173]],

         [[ 0.1432,  0.1547,  0.1667],
          [ 0.1396,  0.1567,  0.3509],
          [ 0.1505,  0.1757,  0.3772],
          [ 0.1366,  0.1429, -0.3597],
          [ 0.1389,  0.1418, -0.4474],
          [ 0.1477,  0.1570,  0.5439]],

         ...,

         [[ 0.1158,  0.1220,  0.2966],
          [ 0.1098,  0.1269,  0.4044],
          [ 0.1137,  0.1173, -0.1168],
          [ 0.1155,  0.1232,  0.1348],
          [ 0.1170,  0.1257,  0.0539],
          [ 0.1175,  0.1323,  0.4134]],

         [[ 0.0966,  0.1217, 

3
{'price': tensor([[[[ 0.0934,  0.1109,  0.7260],
          [ 0.0922,  0.0978, -0.1037],
          [ 0.0959,  0.0984,  0.5448],
          [ 0.0913,  0.1029,  0.7693],
          [ 0.0932,  0.1073,  0.9072],
          [ 0.0849,  0.0938, -0.5514]],

         [[ 0.1631,  0.1641, -0.0087],
          [ 0.1505,  0.1689,  0.0780],
          [ 0.1594,  0.1636,  0.0780],
          [ 0.1510,  0.1620,  0.1300],
          [ 0.1453,  0.1505, -0.0520],
          [ 0.1635,  0.1640, -0.0260]],

         [[ 0.1507,  0.1545, -0.3130],
          [ 0.1538,  0.1813,  0.6782],
          [ 0.1416,  0.1608, -0.2000],
          [ 0.1466,  0.1545,  0.0609],
          [ 0.1433,  0.1514, -0.5304],
          [ 0.1427,  0.1574,  0.1478]],

         ...,

         [[ 0.1345,  0.1384,  0.0089],
          [ 0.1293,  0.1488,  0.2127],
          [ 0.1461,  0.1604,  0.6736],
          [ 0.1277,  0.1356,  0.2039],
          [ 0.1277,  0.1298, -0.1330],
          [ 0.1264,  0.1277, -0.2748]],

         [[ 0.1227,  0.1248, 

4
{'price': tensor([[[[ 0.0749,  0.1000,  0.1037],
          [ 0.0850,  0.0968,  0.3952],
          [ 0.0998,  0.1027, -0.4871],
          [ 0.0846,  0.1031,  0.4608],
          [ 0.0899,  0.1115,  0.8704],
          [ 0.0904,  0.1015, -0.2429]],

         [[ 0.1438,  0.1479, -0.1214],
          [ 0.1641,  0.1656,  0.1474],
          [ 0.1604,  0.1624, -0.1560],
          [ 0.1578,  0.1625,  0.0260],
          [ 0.1625,  0.1682,  0.1734],
          [ 0.1459,  0.1469, -0.4421]],

         [[ 0.1448,  0.1654, -0.1465],
          [ 0.1523,  0.1580, -0.9908],
          [ 0.1722,  0.1761, -0.0431],
          [ 0.1479,  0.1554, -1.3009],
          [ 0.1657,  0.1935,  0.5772],
          [ 0.1534,  0.1644, -1.6025]],

         ...,

         [[ 0.1085,  0.1378, -0.0798],
          [ 0.1144,  0.1364,  0.2039],
          [ 0.1439,  0.1490,  0.2482],
          [ 0.1238,  0.1388,  0.2038],
          [ 0.1293,  0.1515,  0.4432],
          [ 0.1285,  0.1310, -0.6204]],

         [[ 0.1166,  0.1182, 

5
{'price': tensor([[[[ 5.2274e-02,  5.9695e-02, -5.7473e-02],
          [ 4.5437e-02,  4.8941e-02, -6.3211e-01],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 3.2277e-02,  3.6638e-02, -1.6569e+00],
          [ 4.9290e-02,  5.3798e-02,  5.8424e-01]],

         [[ 1.2274e-01,  1.2433e-01,  2.3942e-01],
          [ 7.0735e-02,  7.1787e-02, -4.3659e-01],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 7.0599e-02,  7.0599e-02, -2.9725e-01],
          [ 7.8161e-02,  8.8554e-02,  1.7650e-01]],

         [[ 7.4227e-02,  8.4625e-02, -3.7071e-02],
          [ 3.3219e-02,  5.6017e-02, -2.2614e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
          [ 8.4989e-02,  1.2461e-01,  1.1678e+00],
          [ 8.1457e-02,  1.1164e-01,  1.7517e+00]],

         ...,

         [[ 7.2048e-02,  7.2778e-02, -2.9020e-01]

6
{'price': tensor([[[[ 1.0697e-02,  1.7093e-02,  5.9273e-02],
          [ 1.4102e-02,  1.5845e-02,  9.8650e-03],
          [ 3.9730e-03,  9.7850e-03, -3.1607e-01],
          [ 8.4460e-03,  1.0444e-02, -2.7655e-01],
          [ 5.9830e-02,  8.8698e-02,  7.3091e+00],
          [ 6.3860e-03,  1.7474e-02, -2.1730e-01]],

         [[ 4.9573e-02,  5.1774e-02,  2.9009e-01],
          [ 3.5040e-02,  3.6778e-02,  2.9009e-02],
          [ 2.5491e-02,  3.3303e-02, -2.9009e-02],
          [ 3.4171e-02,  3.6343e-02,  1.9340e-02],
          [ 3.4171e-02,  3.6776e-02, -6.7688e-02],
          [ 3.1558e-02,  3.3300e-02, -7.7356e-02]],

         [[ 1.5374e-02,  2.7660e-02, -1.0795e-01],
          [-2.0950e-03,  1.8694e-02, -1.2463e+00],
          [ 1.7498e-02,  2.7009e-02,  4.2197e-01],
          [ 1.1944e-02,  3.4035e-02,  6.6731e-01],
          [ 1.8360e-02,  1.8860e-02, -2.1589e-01],
          [ 1.5010e-02,  2.2876e-02,  0.0000e+00]],

         ...,

         [[ 2.1040e-02,  3.4026e-02,  5.4837e-01]

KeyboardInterrupt: 