<a href="https://colab.research.google.com/github/96jonesa/CSE-517-Project/blob/main/testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!pip3 install --quiet "tensorflow-hub>=0.7.0"
!pip3 install --quiet seaborn
!pip3 install --quiet pandas-market-calendars

In [2]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from absl import logging
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import seaborn as sns
import json
import itertools
import pandas as pd
import torch
import pandas_market_calendars as mcal
import datetime
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

# Layers

In [3]:
class GRU(nn.Module):
    """Thin wrapper around a single-layer ``nn.GRU``.

    Args:
        input_size: number of features per time step.
        hidden_size: number of features in the hidden state.
        batch_first: forwarded to ``nn.GRU``; when True the input is laid out
            as (batch, seq, feature).
    """

    def __init__(self, input_size, hidden_size, batch_first=False):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.batch_first = batch_first

        self.gru = nn.GRU(input_size, hidden_size, batch_first=self.batch_first)

    def forward(self, input):
        """Run the GRU and return ``(output, hn)`` exactly as ``nn.GRU`` does."""
        sequence_output, final_hidden = self.gru(input)
        return sequence_output, final_hidden

In [4]:
# Bilinear blending layer: output = ReLU(left^T W right + b), where W is a
# learned parameter tensor and b is a learned bias.

class Blend(nn.Module):
    """Combine two feature vectors with a bilinear map followed by ReLU.

    Args:
        left_size: feature size of the ``left`` input.
        right_size: feature size of the ``right`` input.
        output_size: feature size of the blended output.
    """

    def __init__(self, left_size, right_size, output_size):
        super().__init__()
        self.left_size, self.right_size = left_size, right_size
        self.output_size = output_size

        self.bilinear = nn.Bilinear(self.left_size, self.right_size, output_size, bias=True)
        self.relu = nn.ReLU()

    def forward(self, left, right):
        """Return ``ReLU(bilinear(left, right))``."""
        blended = self.bilinear(left, right)
        return self.relu(blended)

In [5]:
class MANSF(nn.Module):
    """Reduced MAN-SF model used for testing: price GRU + tweet GRUs + bilinear blend.

    The constructor accepts the full set of hyper-parameters, but this version's
    forward pass only uses the GRU encoders, the blend layer and the final linear
    layer. ``gat_1_inter_size`` and ``gat_2_inter_size`` are accepted and ignored;
    ``attn_inter_size``, ``leakyrelu_slope``, ``elu_alpha`` and ``U`` are stored
    but not used in ``forward``.

    Args:
        T: number of lag days in each input window.
        gru_hidden_size: hidden size shared by all three GRUs.
        use_embed_size: size of each tweet embedding.
        blend_size: output size of the bilinear blend layer.
    """

    def __init__(self, T, gru_hidden_size, attn_inter_size, use_embed_size,
                 blend_size, gat_1_inter_size, gat_2_inter_size, leakyrelu_slope, elu_alpha, U):
        super(MANSF, self).__init__()
        self.T = T
        self.gru_hidden_size = gru_hidden_size
        self.attn_inter_size = attn_inter_size
        self.use_embed_size = use_embed_size
        self.blend_size = blend_size
        self.leakyrelu_slope = leakyrelu_slope
        self.elu_alpha = elu_alpha
        self.U = U

        # Price encoder consumes 3 features per day.
        self.gru_p = GRU(3, gru_hidden_size, batch_first=True)
        self.gru_m = GRU(use_embed_size, gru_hidden_size, batch_first=True)
        self.gru_s = GRU(gru_hidden_size, gru_hidden_size, batch_first=True)
        self.blend = Blend(gru_hidden_size, gru_hidden_size, blend_size)
        self.sigmoid = nn.Sigmoid()
        self.elu = nn.ELU(elu_alpha)
        self.final_linear = nn.Linear(blend_size, 1, bias=True)

    # p is price data tensor of shape (num_stocks, T, 3), for the day under consideration
    # m is smi data, indexable by day t in [0, T), where m[t] has shape
    #       (num_stocks, K, use_embed_size) and K is the (padded) number of tweets per stock
    # m_mask and neighborhoods are accepted for interface compatibility but are
    #       not used by this reduced model
    def forward(self, p, m, m_mask, neighborhoods):
        """Return a sigmoid score per stock, shape (num_stocks, 1).

        All intermediate tensors are derived from the inputs, so the module
        follows the device of ``p``/``m`` instead of relying on a global
        ``device`` variable.
        """
        ## price encoding: final hidden state, shape (1, num_stocks, hidden)
        _, h_p = self.gru_p(p)

        ## smi encoding (day level): per-tweet GRU outputs for each day,
        ## concatenated along the sequence axis in day order
        day_outputs = [self.gru_m(m[t])[0] for t in range(self.T)]
        r = torch.cat(day_outputs, dim=1)

        ## smi encoding (aggregate): final hidden state, shape (1, num_stocks, hidden)
        _, h_s = self.gru_s(r)

        ## blending
        x = self.blend(h_p, h_s)

        ## drop only the leading GRU layer axis; a bare squeeze() would also
        ## collapse the stock axis when exactly one stock is present
        x = x.squeeze(0)

        ## final layer
        y = self.sigmoid(self.final_linear(x))

        ## return result
        return y

# Data Processing

In [6]:
# One-time setup: uncomment to download the StockNet dataset archive.
#!wget https://github.com/yumoxu/stocknet-dataset/archive/master.zip

In [7]:
# One-time setup: uncomment to extract the downloaded archive.
#!unzip master.zip

In [8]:
# TF Hub handle of the Universal Sentence Encoder module that prep_dataset
# loads to embed tweets (exposed as a Colab form field).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [9]:
# `tf` is imported as tensorflow.compat.v1. Switch TensorFlow to TF1-style
# graph execution, which prep_dataset relies on (tf.placeholder, hub.Module,
# tf.train.MonitoredSession).
tf.disable_v2_behavior()
tf.compat.v1.disable_eager_execution()

Instructions for updating:
non-resource variables are not supported in the long term


In [10]:
# Root directory of the extracted StockNet dataset.
stocknet_dataset_filepath = './stocknet-dataset-master'
# Date ranges (inclusive, 'YYYY-MM-DD' strings compared lexicographically in
# prep_dataset) for the train / validation / test splits.
train_start_date = '2014-01-01'
train_end_date = '2015-07-31'
val_start_date = '2015-08-01'
val_end_date = '2015-09-30'
test_start_date = '2015-10-01'
test_end_date = '2016-01-01'

In [11]:
def prep_dataset(dataset_filepath, start_date, end_date):
    """Load StockNet prices and tweets for a date range and embed the tweets.

    Args:
        dataset_filepath: root directory of the StockNet dataset; prices are
            read from ``price/raw`` and tweets from ``tweet/preprocessed``.
        start_date: inclusive lower bound, ``'YYYY-MM-DD'`` string.
        end_date: inclusive upper bound, ``'YYYY-MM-DD'`` string.

    Returns:
        Tuple ``(company_to_price_df, company_to_tweets, date_universe, n_days,
        n_stocks, max_tweets)`` where
        - ``company_to_price_df`` maps ticker -> price DataFrame within the range,
        - ``company_to_tweets`` maps ticker -> {date -> list of tweet dicts}, each
          tweet dict gaining an ``'embedding'`` entry,
        - ``date_universe`` is the sorted list of all dates seen in prices or tweets,
        - ``n_days`` is ``len(date_universe)``, ``n_stocks`` the number of tickers
          with tweets, and ``max_tweets`` the largest tweet count for any
          (ticker, date) pair.

    Reads the module-level ``module_url`` to choose the sentence encoder.
    """
    cache = {}
    calendar = mcal.get_calendar('NYSE')

    def next_trading_day(start_day=None, SAFE_DELTA = 4):
        """Map a calendar day to the trading day its tweets are assigned to.

        Looks at the NYSE sessions in ``[start_day, start_day + SAFE_DELTA days]``
        and returns the second one as a ``'YYYY-MM-DD'`` string (memoized).
        """
        if start_day is None:
            start_day = datetime.datetime.utcnow().date()
        if start_day in cache:
            return cache[start_day]
        start = pd.to_datetime(start_day)
        end = start + np.timedelta64(SAFE_DELTA, 'D')
        business_days = calendar.valid_days(start_date=start, end_date=end)
        # NOTE(review): assuming valid_days includes start_day when it is a
        # session, index 1 is the following session; when start_day is a
        # weekend/holiday, index 1 skips the first following session.
        # Confirm this alignment is intended.
        next_day = business_days[1].date().strftime("%Y-%m-%d")
        cache[start_day] = next_day
        return next_day

    # Use the function argument (previously ignored in favour of a global).
    raw_prices_filepath = os.path.join(dataset_filepath, 'price', 'raw')
    preprocessed_tweets_filepath = os.path.join(dataset_filepath, 'tweet', 'preprocessed')

    company_to_price_df = {}
    company_to_tweets = {}

    # Load per-company price tables restricted to the requested date range.
    for filename in os.listdir(raw_prices_filepath):
        company_name = filename.split('.')[0]

        # Not enough data for GMRE
        if company_name == 'GMRE':
            continue
        df = pd.read_csv(os.path.join(raw_prices_filepath, filename))
        df.columns = ['date', 'open', 'high', 'low', 'close', 'adjust_close', 'volume']
        in_range = (df['date'] >= start_date) & (df['date'] <= end_date)
        company_to_price_df[company_name] = df.loc[in_range].dropna()

    # Load tweets; each tweet file is named by its date and holds one JSON object per line.
    for filename in tqdm(os.listdir(preprocessed_tweets_filepath)):
        company_name = filename.split('.')[0]
        company_dir = os.path.join(preprocessed_tweets_filepath, filename)
        dates_to_tweets = {}
        for tweet_filename in os.listdir(company_dir):
            if tweet_filename < start_date or tweet_filename > end_date:
                continue
            with open(os.path.join(company_dir, tweet_filename)) as file:
                list_of_tweets = [json.loads(line) for line in file]
            date_idx = next_trading_day(tweet_filename)
            dates_to_tweets.setdefault(date_idx, []).extend(list_of_tweets)
        company_to_tweets[company_name] = dates_to_tweets

    # Reduce logging output.
    logging.set_verbosity(logging.ERROR)
    tf.get_logger().setLevel(logging.ERROR)
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    # Import the Universal Sentence Encoder's TF Hub module
    with tf.Graph().as_default():
        sentences = tf.placeholder(tf.string)
        embed = hub.Module(module_url)
        embeddings = embed(sentences)
        session = tf.train.MonitoredSession()

    # Generate embeddings: one encoder call per (company, date) covering all of
    # that date's tweets. The session is closed afterwards so repeated calls
    # (train/val/test) do not accumulate open sessions.
    try:
        for company in tqdm(company_to_tweets.keys()):
            for date, tweets in company_to_tweets[company].items():
                if not tweets:
                    continue
                messages = [' '.join(tweet['text']) for tweet in tweets]
                message_embeddings = session.run(embeddings, {sentences: messages})
                for tweet, embedding in zip(tweets, message_embeddings):
                    tweet['embedding'] = list(embedding)
    finally:
        session.close()

    # Create date universe: every date that appears in any price table or tweet map
    date_universe = set()
    for price_df in company_to_price_df.values():
        date_universe.update(price_df['date'])
    for dates_to_tweets in company_to_tweets.values():
        date_universe.update(dates_to_tweets.keys())
    date_universe = sorted(date_universe)

    # Calculate dimensions for tensor
    n_stocks = len(company_to_tweets)
    n_days = len(date_universe)
    max_tweets = max(
        (len(tweets)
         for dates_to_tweets in company_to_tweets.values()
         for tweets in dates_to_tweets.values()),
        default=0,
    )

    return company_to_price_df, company_to_tweets, date_universe, n_days, n_stocks, max_tweets

In [12]:
# Build the train / validation / test splits. Each call loads prices and
# tweets for its date range and embeds the tweets.
(train_company_to_price_df, train_company_to_tweets, train_date_universe,
 train_n_days, train_n_stocks, train_max_tweets) = prep_dataset(
    stocknet_dataset_filepath, train_start_date, train_end_date)

(val_company_to_price_df, val_company_to_tweets, val_date_universe,
 val_n_days, val_n_stocks, val_max_tweets) = prep_dataset(
    stocknet_dataset_filepath, val_start_date, val_end_date)

(test_company_to_price_df, test_company_to_tweets, test_date_universe,
 test_n_days, test_n_stocks, test_max_tweets) = prep_dataset(
    stocknet_dataset_filepath, test_start_date, test_end_date)

HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




HBox(children=(IntProgress(value=0, max=87), HTML(value='')))




# Dataset and DataLoader

In [13]:
class StockDataset(Dataset):
    """Sliding-window dataset of price ratios and tweet embeddings per trading day.

    Sample ``idx`` covers the ``window`` dates at positions ``idx + 1`` through
    ``idx + window`` of ``date_universe``; its label compares the close on the
    date after the window with the close on the window's last date.

    Args:
        company_to_price_df: ticker -> DataFrame with 'date', 'high', 'low',
            'close' columns. The frames are not modified.
        company_to_tweets: ticker -> {date -> list of tweet dicts}, each dict
            holding an 'embedding' sequence of length ``EMBED_SIZE``.
        date_universe: ordered list of date keys.
        n_days: number of dates in ``date_universe``.
        n_stocks: number of tickers (stock axis size).
        max_tweets: maximum number of tweets for any (ticker, date).
    """

    # Length of each tweet embedding vector.
    EMBED_SIZE = 512

    def __init__(self, company_to_price_df, company_to_tweets, date_universe, n_days, n_stocks, max_tweets):
        # Initialize class members
        self.n_stocks = n_stocks
        self.n_days = n_days
        self.max_tweets = max_tweets
        self.window = 6
        window = self.window
        # Each sample needs `window` feature days plus one label day, and the
        # very first date has no previous close to normalise by.
        self.n_samples = max(n_days - (window + 1), 0)

        # Build maps (stocks are indexed alphabetically)
        self.company_to_index = {c: i for i, c in enumerate(sorted(company_to_tweets.keys()))}
        self.date_to_index = {d: i for i, d in enumerate(date_universe)}
        self.index_to_date = {i: d for i, d in enumerate(date_universe)}

        # Store data
        self.company_to_price_df = company_to_price_df
        self.company_to_tweets = company_to_tweets

        # Price tensor (n_stocks, n_days, 3): high / low / close, each divided
        # by the previous row's close. The first row of every table has no
        # predecessor and is left at zero.
        self.price_data = np.zeros((n_stocks, n_days, 3))
        close_by_company = {}
        for company, df in company_to_price_df.items():
            close_by_company[company] = dict(zip(df['date'], df['close']))
            if len(df) < 2:
                continue
            c_index = self.company_to_index[company]
            day_indices = [self.date_to_index[d] for d in df['date'].iloc[1:]]
            prev_close = df['close'].to_numpy(dtype=float)[:-1, None]
            high_low_close = df[['high', 'low', 'close']].to_numpy(dtype=float)[1:]
            self.price_data[c_index, day_indices, :] = high_low_close / prev_close

        # Which stocks are usable for these dates, shape (n_samples, n_stocks)
        self.usable_stocks = torch.ones((self.n_samples, self.n_stocks))

        # Labels of shape (n_samples, n_stocks)
        self.labels = torch.zeros((self.n_samples, self.n_stocks))

        # Get labels: 1 if next close / current close >= 1.0055, 0 if <= 0.995;
        # anything in between, or with a missing close, is marked unusable.
        for i in range(self.n_samples):
            current_day = self.index_to_date[i + window]
            day_after = self.index_to_date[i + window + 1]
            for company, c_index in self.company_to_index.items():
                closes = close_by_company.get(company, {})
                if current_day not in closes or day_after not in closes:
                    self.usable_stocks[i, c_index] = 0
                    continue
                ratio = closes[day_after] / closes[current_day]
                if ratio >= 1.0055:
                    self.labels[i, c_index] = 1
                elif ratio <= 0.995:
                    self.labels[i, c_index] = 0
                else:
                    self.usable_stocks[i, c_index] = 0

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        """Return one day's sample, restricted to the usable stocks.

        Returns a tuple of
        - price array (n_usable, window, 3),
        - smi array (n_usable, window, max_tweets, EMBED_SIZE),
        - tweet count array (n_usable, window),
        - boolean tensor (n_stocks,) marking usable stocks,
        - label tensor (n_usable,),
        - tweet mask tensor (window, n_usable, max_tweets, 1).

        A stock is usable when the price filter accepted it and it has at
        least one tweet on every date of the window.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Size of sliding window
        window = self.window

        # Work on a copy so the stored price filter is never modified.
        usable_stocks = self.usable_stocks[idx].clone()

        # Dates that we need to look up
        dates_range = [self.index_to_date[i] for i in range(idx + 1, idx + window + 1)]

        # Get price data tensor: n_stocks, window, 3
        price_data = self.price_data[:, idx + 1:idx + window + 1, :]

        # Extract tweets for specific window
        smi_data = np.zeros((self.n_stocks, window, self.max_tweets, self.EMBED_SIZE))
        tweet_counts = np.zeros((self.n_stocks, window))
        for company, tweets_by_date in self.company_to_tweets.items():
            c_index = self.company_to_index[company]
            for date_idx, date in enumerate(dates_range):
                tweets = tweets_by_date.get(date, [])
                n_tweets = len(tweets)
                tweet_counts[c_index, date_idx] = n_tweets
                if n_tweets == 0:
                    usable_stocks[c_index] = 0
                    continue
                smi_data[c_index, date_idx, :n_tweets, :] = [tweet['embedding'] for tweet in tweets]

        usable_stocks = (usable_stocks == 1)
        usable_np = usable_stocks.numpy()

        # Mask marking real (non-padding) tweet slots
        m_mask = torch.zeros(window, self.n_stocks, self.max_tweets, 1)
        for t in range(window):
            for i in range(self.n_stocks):
                m_mask[t, i, 0:int(round(tweet_counts[i][t])), 0] = 1

        price_output = price_data[usable_np, :, :]
        smi_output = smi_data[usable_np, :, :, :]
        tweet_count = tweet_counts[usable_np, :]
        m_mask = m_mask[:, usable_stocks, :, :]
        labels = self.labels[idx][usable_stocks]

        # construct output
        return price_output, smi_output, tweet_count, usable_stocks, labels, m_mask

In [14]:
# Wrap each prepared split in a StockDataset.
train_dataset = StockDataset(
    company_to_price_df=train_company_to_price_df,
    company_to_tweets=train_company_to_tweets,
    date_universe=train_date_universe,
    n_days=train_n_days,
    n_stocks=train_n_stocks,
    max_tweets=train_max_tweets,
)
val_dataset = StockDataset(
    company_to_price_df=val_company_to_price_df,
    company_to_tweets=val_company_to_tweets,
    date_universe=val_date_universe,
    n_days=val_n_days,
    n_stocks=val_n_stocks,
    max_tweets=val_max_tweets,
)
test_dataset = StockDataset(
    company_to_price_df=test_company_to_price_df,
    company_to_tweets=test_company_to_tweets,
    date_universe=test_date_universe,
    n_days=test_n_days,
    n_stocks=test_n_stocks,
    max_tweets=test_max_tweets,
)

In [15]:
# One sample is one trading day; the number of usable stocks differs from day
# to day, so samples are loaded one at a time (batch_size=1). Only the
# training split is shuffled.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1, num_workers=0)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=1, num_workers=0)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=1, num_workers=0)

# Device setup

In [16]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


# Training

In [17]:
# Instantiate the model. T matches the dataset's 6-day window and
# use_embed_size matches the 512-dimensional tweet embeddings.
# NOTE(review): the attention / GAT related arguments (attn_inter_size,
# gat_*_inter_size, leakyrelu_slope, U) are not used by the forward pass of
# the MANSF class defined in this notebook.
mansf = MANSF(T=6,
              gru_hidden_size=64,
              attn_inter_size=32,
              use_embed_size=512,
              blend_size=32,
              gat_1_inter_size=32,
              gat_2_inter_size=32,
              leakyrelu_slope=0.01,
              elu_alpha=1.0,
              U=8)

In [18]:
# Move the model's parameters to the selected device.
mansf = mansf.to(device)

In [19]:
# Adam optimizer over all model parameters, and binary cross-entropy
# (averaged over elements) applied to the model's sigmoid outputs.
optimizer = optim.Adam(mansf.parameters(), lr=5e-5)
loss_fn = nn.BCELoss(reduction='mean')

In [20]:
# Per-epoch accuracy history, appended to by the training loop.
train_acc_list = []
val_acc_list = []

In [24]:
def _prepare_batch(batch):
    """Move one DataLoader batch (a single day) to `device` in model layout.

    Drops the leading batch axis of size 1, casts price/smi to float32, puts
    the day axis first in `smi`, and builds an identity adjacency matrix
    restricted to the usable stocks.
    """
    price, smi, n_tweets, usable_stocks, labels, m_mask = batch

    price = price.to(device, dtype=torch.float32).squeeze(0)
    smi = smi.to(device, dtype=torch.float32).squeeze(0).permute(1, 0, 2, 3)
    usable_stocks = usable_stocks.to(device).squeeze(0)
    labels = labels.to(device)
    m_mask = m_mask.to(device).squeeze(0)

    # Size the adjacency matrix from the data instead of hard-coding 87 stocks.
    neighborhoods = torch.eye(usable_stocks.shape[0], device=device)
    neighborhoods = neighborhoods[usable_stocks, :][:, usable_stocks]

    return price, smi, m_mask, neighborhoods, labels


def _count_correct(y, labels):
    """Number of predictions (thresholded at 0.5) that match the labels."""
    return torch.sum((y > 0.5).view(-1) == labels.view(-1)).item()


for epoch in range(18):
    # ---- training pass ----
    mansf.train()
    correct = 0.0
    total = 0.0
    running_loss = 0.0
    for batch in tqdm(train_dataloader):
        price, smi, m_mask, neighborhoods, labels = _prepare_batch(batch)

        # Skip days on which no stock is usable.
        if price.shape[0] == 0:
            continue

        # Clear gradients before the backward pass so nothing left over from
        # an interrupted run can leak into this step.
        optimizer.zero_grad()
        y = mansf(price, smi, m_mask, neighborhoods)
        loss = loss_fn(y.view(-1), labels.view(-1))
        loss.backward()
        optimizer.step()

        correct += _count_correct(y, labels)
        total += len(y)
        # Accumulate (not overwrite) the sample-weighted loss.
        running_loss += loss.item() * len(y)

    train_acc = correct / total if total else float('nan')
    train_loss = running_loss / total if total else float('nan')
    train_acc_list.append(train_acc)

    # ---- validation pass (no gradients needed) ----
    mansf.eval()
    correct = 0.0
    total = 0.0
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            price, smi, m_mask, neighborhoods, labels = _prepare_batch(batch)

            if price.shape[0] == 0:
                continue

            y = mansf(price, smi, m_mask, neighborhoods)
            correct += _count_correct(y, labels)
            total += len(y)

    val_acc = correct / total if total else float('nan')
    val_acc_list.append(val_acc)

    # 'loss' is the mean training loss per sample over the epoch.
    print('epoch:', epoch, 'loss:', train_loss, 'train_acc:', train_acc, 'val_acc:', val_acc)

HBox(children=(IntProgress(value=0, max=392), HTML(value='')))

tensor([], size=(1, 0, 6, 3))
tensor([[[[ 97.3200,  96.4200,  91.3289],
          [ 97.8400,  96.6400,  91.9313],
          [ 99.2400,  97.5500,  93.2019],
          [ 99.4400,  98.2500,  92.5995],
          [ 98.7000,  97.6700,  92.3831],
          [ 97.4500,  95.3300,  89.9829]],

         [[122.3600, 121.3200, 113.3894],
          [122.8800, 121.6700, 113.9273],
          [123.0500, 121.6600, 113.7511],
          [124.3900, 122.2500, 114.3632],
          [131.2400, 128.6300, 120.5771],
          [129.3400, 127.0400, 118.1472]],

         [[364.8500, 358.5200, 358.6100],
          [324.8700, 314.7600, 324.0100],
          [324.8200, 316.5000, 320.4100],
          [322.9000, 319.5000, 320.0000],
          [322.7300, 318.5000, 322.5100],
          [320.6800, 311.8600, 312.9900]],

         [[ 50.4700,  49.9600,  48.9673],
          [ 50.2000,  49.9400,  48.9087],
          [ 50.0800,  49.5400,  48.5861],
          [ 49.8700,  49.4200,  48.3123],
          [ 50.2700,  49.6300,  48.8793]

tensor([[[[312.9800, 310.0100, 310.3500],
          [314.7500, 306.9600, 308.5200],
          [308.3800, 300.8500, 302.1900],
          [303.0000, 292.3800, 295.2900],
          [301.2800, 295.3300, 298.4200],
          [303.1400, 296.1100, 300.4600]],

         [[106.4700, 103.6900, 103.9400],
          [104.7200, 102.5200, 103.6000],
          [103.0200,  99.9000, 101.0000],
          [103.8500, 100.1100, 103.3200],
          [104.7400, 102.0300, 102.1300],
          [105.3400, 102.6800, 105.0300]],

         [[ 18.2100,  17.8900,  17.2040],
          [ 18.0300,  17.6800,  17.2136],
          [ 17.8100,  17.2900,  16.7136],
          [ 17.4400,  16.7800,  16.2135],
          [ 17.1800,  16.8700,  16.2905],
          [ 17.3400,  17.1000,  16.6270]],

         [[ 55.0700,  54.1000,  52.9178],
          [ 54.6900,  53.7900,  53.0645],
          [ 53.9600,  52.3300,  51.3922],
          [ 52.7700,  50.5300,  49.5830],
          [ 51.5500,  50.6200,  50.0426],
          [ 52.0800,  51.500

KeyboardInterrupt: 

In [23]:
# Sanity check: run the model on the first validation day.
mansf.eval()

price, smi, n_tweets, usable_stocks, labels, m_mask = next(iter(val_dataloader))

# Drop the DataLoader's batch axis (batch_size is 1), cast the features to
# float32 and move everything to the selected device.
price = price.to(device, dtype=torch.float32).squeeze(0)
smi = smi.to(device, dtype=torch.float32).squeeze(0)
n_tweets = n_tweets.to(device).squeeze(0)
usable_stocks = usable_stocks.to(device).squeeze(0)
labels = labels.to(device)
m_mask = m_mask.to(device).squeeze(0)

# Put the day axis first: (window, n_usable, max_tweets, embed_size).
smi = smi.permute(1, 0, 2, 3)

# Identity adjacency restricted to the usable stocks; sized from the data
# rather than a hard-coded stock count.
neighborhoods = torch.eye(usable_stocks.shape[0], device=device)
neighborhoods = neighborhoods[usable_stocks, :]
neighborhoods = neighborhoods[:, usable_stocks]

# Inference only, so skip building the autograd graph.
with torch.no_grad():
    y = mansf(price, smi, m_mask, neighborhoods)

In [None]:
# Model outputs (sigmoid scores) for the usable stocks of the sampled day.
print(y)

In [None]:
# Ground-truth labels for the same stocks (1 = up, 0 = down).
print(labels)

# Figures

In [None]:
def plot(X, Y, xlabel, ylabel, legend, title):
    """Draw several series against a shared x-axis on one new figure.

    Args:
        X: x values shared by every series.
        Y: sequence of y-value sequences, one per series.
        xlabel: x-axis label.
        ylabel: y-axis label.
        legend: sequence of series labels, aligned with ``Y``.
        title: figure title.
    """
    fig, ax = plt.subplots()

    for series_index, series in enumerate(Y):
        ax.plot(X, series, label=legend[series_index])

    ax.grid(color='0.95')
    ax.legend()
    ax.set(xlabel=xlabel, ylabel=ylabel, title=title)

In [None]:
# The accuracy plot call is disabled by wrapping it in a string literal.
# range(18) assumes all 18 training epochs completed; the x values must match
# the lengths of train_acc_list / val_acc_list when this is re-enabled.
"""plot(range(18),
     [train_acc_list, val_acc_list],
     'epoch',
     'accuracy',
     ['training accuracy', 'validation accuracy'],
     'accuracy vs. epoch')"""