<a href="https://colab.research.google.com/github/96jonesa/CSE-517-Project/blob/main/scaffolding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [28]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

#GRU

##GRU
This is just a wrapper around nn.GRU for the sake of consistency. Used in the Price Encoder, day-level SMI Encoder, and temporal SMI Encoder.

In [29]:
class GRU(nn.Module):
    def __init__(self, input_size, hidden_size, batch_first=False):
        super(GRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_first = batch_first

        self.gru = nn.GRU(input_size, hidden_size, batch_first=self.batch_first)

    def forward(self, input, h_0):
        output, hn = self.gru(input, h_0)
        return output, hn

#Self-Attention

##LinearAttention
The attention mechanism used in Feng et. al. Used in the Price Encoder, day-level SMI Encoder, and temporal SMI Encoder. Given input $h$, returns
$q_t = \sum_{i=t-T}^T \beta_i h_i$ where $\beta_i = \dfrac{\exp\left( u^T \tanh \left( W h_i + b \right) \right)}{\sum_{k=t-T}^t \exp\left( u^T \tanh \left( W h_k + b \right) \right)}$.

In [30]:
# attention weights are softmax(u^T tanh(W input + b)) where W is learned parameter matrix, u is a learned parameter vector, and b is a learned offset

class LinearAttention(nn.Module):
    def __init__(self, input_size, intermediate_size, weights_size):
        super(LinearAttention, self).__init__()
        self.input_size = input_size
        self.intermediate_size = intermediate_size
        self.weights_size = weights_size

        self.linear_1 = nn.Linear(self.input_size, self.intermediate_size, bias=True)
        self.linear_2 = nn.Linear(self.intermediate_size, self.weights_size, bias=False)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=2)

    def forward(self, input):
        intermediate = self.tanh(self.linear_1(input))
        attention_weights = self.softmax(self.linear_2(intermediate))
        attention_weights = attention_weights.permute(0, 2, 1)
        output_features = torch.bmm(attention_weights, input)

        return output_features

#Blending

##Blend
Applies a learned bilinear transformation to the left and right vectors, then inputs the result to a ReLU non-linearity. Used to obtain Multi-Modal Encodings from Price Encodings and temporal SMI Encodings. Given Price Encodings $q_t$ and temporal SMI Encodings $c_t$, returns
$x_t = \mathcal{B} \left( c_t, q_t \right) = \text{ReLU} \left( q_t^T W c_t + b \right)$.

In [32]:
# output is ReLU(left^T W right + b) where W is a learned paramater matrix
# and b is a learned bias

class Blend(nn.Module):
    def __init__(self, left_size, right_size, output_size):
        super(Blend, self).__init__()
        self.left_size = left_size
        self.right_size = right_size
        self.output_size = output_size

        self.bilinear = nn.Bilinear(self.left_size, self.right_size, output_size, bias=True)
        self.relu = nn.ReLU()
    
    def forward(self, left, right):
        output = self.relu(self.bilinear(left, right))

        return output

#Single-Headed Graph Attention Network (SGAT)

##SharedLinear
This is just a wrapper around nn.Linear for the sake of consistency. Used to apply a shared linear transformation to all inputs of an SGAT layer. Under current implementation, this should be applied before passing inputs to SGAT.

In [33]:
# need shared learned parameter matrix W to multiply against each input vector

class SharedLinear(nn.Module):
    def __init__(self, input_size, output_size):
        super(SharedLinear, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        
        self.linear = nn.Linear(input_size, output_size, bias=False)
    
    def forward(self, input):
        output = self.linear(input)

        return output

##SGAT
A single-headed GAT layer. A shared linear transform $W$ is applied to all the nodes *before* passing them as input to this module (by passing them as input to a SharedLinear layer), then a shared self-attention mechanism is applied to each node $i$ in its immediate neighborhood $\mathcal{N}_i$. For each node $j\in \mathcal{N}_i$, normalized attention coefficients $\alpha_{i,j}$ are computed to represent the importance of the relations between stocks $i$ and $j$. That is,
$\alpha_{i,j} = \dfrac{\exp ( \text{LeakyReLU} ( a_w^T [ W x_i 
\oplus W x_j ] ) )}{\sum_{k\in \mathcal{N}_i} \exp ( \text{LeakyReLU} ( a_w^T [ W x_i \oplus W x_k ] ) )}$
where $\oplus$ denotes concatenation and $a_w$ is a learned parameter matrix. An updated feature vector $z_i$ for the $i$-th stock is computed by applying these attention weights to the linearly transformed multi-modal feature vectors of all of the stocks in $\mathcal{N}_i$

In [131]:
# merge code with MGAT code to form general case GAT code

class SGAT(nn.Module):
    def __init__(self, input_size, weights_size, leakyrelu_slope=0.01):
        super(SGAT, self).__init__()
        self.input_size = input_size
        self.weights_size = weights_size
        self.leakyrelu_slope = leakyrelu_slope
        
        self.linear = nn.Linear(2 * input_size, weights_size, bias=False)
        self.leakyrelu = nn.LeakyReLU(self.leakyrelu_slope)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input, neighborhoods, index):
        stock = input[index]
        neighborhood = neighborhoods[index]
        stack_stock = stock.expand(len(neighborhood), stock.shape[0])
        stack_neighbors = input[neighborhood]
        cat_stock = torch.cat((stack_stock, stack_neighbors), dim=1)
        attention_weights = self.softmax(self.leakyrelu(self.linear(cat_stock)))  # check this
        output_features = torch.mm(attention_weights.T, stack_neighbors)  # check this

        return output_features

#MAN-SF

In [None]:
class MANSF(nn.Module):
    def __init__(self, T, num_stocks, gru_hidden_size, attn_inter_size, use_embed_size,
                 blend_size, gat_1_inter_size, gat_2_inter_size, leakyrelu_slope, elu_alpha, U):
        super(MANSF, self).__init__()
        self.T = T
        self.num_stocks = num_stocks
        self.gru_hidden_size = gru_hidden_size
        self.attn_inter_size = attn_inter_size
        self.K = K
        self.use_embed_size = use_embed_size
        self.blend_size = blend_size
        self.gat_1_inter_size = gat_1_inter_size
        self.gat_2_inter_size = gat_2_inter_size
        self.leakyrelu_slope = leakyrelu_slope
        self.elu_alpha = elu_alpha
        self.U = U

        self.gru_p = GRU(3, gru_hidden_size, batch_first=True)
        self.gru_m = GRU(use_embed_size, gru_hidden_size, batch_first=True)
        self.gru_s = GRU(gru_hidden_size, gru_hidden_size, batch_first=True)
        self.attn_p = LinearAttention(gru_hidden_size, attn_inter_size, 1)
        self.attn_m = LinearAttention(gru_hidden_size, attn_inter_size, 1)
        self.attn_s = LinearAttention(gru_hidden_size, attn_inter_size, 1)
        self.blend = Blend(use_embed_size, use_embed_size, blend_size)
        self.shared_linears_1 = []
        for u in range(U):
            self.shared_linears_1.append(SharedLinear(blend_size, gat_1_inter_size))
        self.shared_linears_2 = []
        for u in range(U):
            self.shared_linears_2.append(SharedLinear(U * gat_1_inter_size, gat_2_inter_size))
        self.mgat_1 = []
        for u in range(U):
            self.mgat_1.append(SGAT(gat_1_inter_size, 1, leakyrelu_slope=leakyrelu_slope))
        self.mgat_2 = []
        for u in range(U):
            self.mgat_2.append(SGAT(gat_2_inter_size, 1, leakyrelu_slope=leakyrelu_slope))
        self.sigmoid = nn.Sigmoid()
        self.elu = nn.ELU(elu_alpha)
        self.final_linear = nn.Linear(U * gat_2_inter_size, 1, bias=True)

    # p is price data tensor of shape (num_stocks, T, 3), for the day under consideration
    #
    # m is smi data list of tensors of shape (num_stocks, K, use_embed_size) of length T,
    #       where K is the number of tweets for the given stock on the day under consideration
    #
    # neighorhoods is a list of adjacency lists, where each stock is indexed with the same
    #       indices they have in p and m
    #
    # TODO: tensorize day-level smi
    # TODO: tensorize sgat 
    def forward(self, p, m, neighborhoods):
        ## price encoding
        h_p, _ = self.gru_p(p)
        q = self.attn_p(h_p)

        ## smi encoding (day level)
        r = torch.zeros(self.num_stocks, 0, gru_hidden_size)
        for t in range(T):
            h_m, _ = self.gru_m(m[t])
            r_t = self.attn_m(h_m)
            r = torch.cat((r, r_t), 1)

        ## smi encoding (aggregate)
        h_s, _ = self.gru_s(r)
        c = self.attn_s(h_s)

        ## blending
        x = self.blend(q, c)

        ## reshaping (eliminating superfluous dimension)
        x = x.view(x.shape[0], x.shape[2])

        ## first gat layer
        #  first head
        shared_linear = self.shared_linears_1[0]
        Wx = shared_linear(x)
        sgat = self.mgat_1[0]
        z = sgat(Wx, neighborhoods, 0)
        z = self.elu(z)

        for i in range(1, self.num_stocks):
            z_i = sgat(Wx, neighborhoods, i)
            z_i = self.elu(z_i)
            z = torch.cat((z, z_i), 0)

        #  remaining heads
        for u in range(1, self.U):
            shared_linear = self.shared_linears_1[u]
            Wx = shared_linear(x)
            sgat = mgat_1[u]
            z_u = sgat(Wx, neighborhoods, 0)
            z_u = self.elu(z_u)

            for i in range(1, self.num_stocks):
                z_u_i = sgat(Wx, neighborhoods, i)
                z_u_i = self.elu(z_u_i)
                z_u = torch.cat((z_u, z_u_i), 0)
            
            z = torch.cat((z, z_u), 1)
        
        ## second gat layer
        #  first head
        shared_linear = self.shared_linears_2[0]
        Wx = shared_linear(z)
        sgat = self.mgat_2[0]
        new_z = sgat(Wx, neighborhoods, 0)
        new_z = self.sigmoid(new_z)

        for i in range(1, self.num_stocks):
            new_z_i = sgat(Wx, neighborhoods, i)
            new_z_i = self.sigmoid(new_z_i)
            new_z = torch.cat((new_z, new_z_i), 0)

        #  remaining heads
        for u in range(1, self.U):
            shared_linear = self.shared_linears_2[u]
            Wx = shared_linear(z)
            sgat = mgat_2[u]
            new_z_u = sgat(Wx, neighborhoods, 0)
            new_z_u = self.sigmoid(new_z_u)

            for i in range(1, self.num_stocks):
                new_z_u_i = sgat(Wx, neighborhoods, i)
                new_z_u_i = self.sigmoid(new_z_u_i)
                new_z_u = torch.cat((new_z_u, new_z_u_i), 0)
            
            new_z = torch.cat((new_z, new_z_u), 1)
        
        ## final layer
        y = self.sigmoid(self.final_linear(new_z))

        ## return result
        return y

#Sandbox

##Price Encoding

In [37]:
T = 5  # number of days in lookback window
batch_size = 4
gru_p_hidden_size = 64

p = torch.rand(batch_size, T, 3)  # p_i = [p_i^c, p_i^h, p_i^l], not bothering to normalize for shape tests

print('p.shape', p.shape)

h_p_0 = torch.randn(1, batch_size, gru_p_hidden_size)  # randomly initialized initial hidden state
gru_p = GRU(3, gru_p_hidden_size, batch_first=True)

h_p, h_p_n = gru_p(p, h_p_0)

print('h_p.shape', h_p.shape)
print('h_p_n.shape', h_p_n.shape)

p.shape torch.Size([4, 5, 3])
h_p.shape torch.Size([4, 5, 64])
h_p_n.shape torch.Size([1, 4, 64])


In [38]:
attn_p_intermediate_size = 10

attn_p = LinearAttention(gru_p_hidden_size, attn_p_intermediate_size, 1)

q = attn_p(h_p)

print('q.shape', q.shape)

q.shape torch.Size([4, 1, 64])


##SMI Encoding

In [98]:
K = [7, 9, 11, 13, 15]  # number of tweets for each day in lookback window
T = 5  # number or days in lookback window
batch_size = 4
gru_m_hidden_size = 64
use_embedding_size = 512

r = torch.zeros(batch_size, 0, gru_m_hidden_size)

gru_m = GRU(use_embedding_size, gru_m_hidden_size, batch_first=True)

for t in range(T):

    m = torch.rand(batch_size, K[0], use_embedding_size)

    print('m.shape', m.shape)

    h_m_0 = torch.randn(1, batch_size, gru_m_hidden_size)  # randomly initialized initial hidden state

    h_m, h_m_n = gru_m(m, h_m_0)

    print('h_m.shape', h_m.shape)
    print('h_m_n.shape', h_m_n.shape)

    attn_m_intermediate_size = 10

    attn_m = LinearAttention(gru_m_hidden_size, attn_m_intermediate_size, 1)

    r_t = attn_m(h_m)

    print('r_t.shape', r_t.shape)

    r = torch.cat((r, r_t), 1)

    print('r.shape', r.shape)

    print()

m.shape torch.Size([4, 7, 512])
h_m.shape torch.Size([4, 7, 64])
h_m_n.shape torch.Size([1, 4, 64])
r_t.shape torch.Size([4, 1, 64])
r.shape torch.Size([4, 1, 64])

m.shape torch.Size([4, 7, 512])
h_m.shape torch.Size([4, 7, 64])
h_m_n.shape torch.Size([1, 4, 64])
r_t.shape torch.Size([4, 1, 64])
r.shape torch.Size([4, 2, 64])

m.shape torch.Size([4, 7, 512])
h_m.shape torch.Size([4, 7, 64])
h_m_n.shape torch.Size([1, 4, 64])
r_t.shape torch.Size([4, 1, 64])
r.shape torch.Size([4, 3, 64])

m.shape torch.Size([4, 7, 512])
h_m.shape torch.Size([4, 7, 64])
h_m_n.shape torch.Size([1, 4, 64])
r_t.shape torch.Size([4, 1, 64])
r.shape torch.Size([4, 4, 64])

m.shape torch.Size([4, 7, 512])
h_m.shape torch.Size([4, 7, 64])
h_m_n.shape torch.Size([1, 4, 64])
r_t.shape torch.Size([4, 1, 64])
r.shape torch.Size([4, 5, 64])



In [99]:
gru_s_hidden_size = 64

print('r.shape', r.shape)

h_s_0 = torch.randn(1, batch_size, gru_s_hidden_size)  # randomly initialized initial hidden state
gru_s = GRU(gru_m_hidden_size, gru_s_hidden_size, batch_first=True)

h_s, h_s_n = gru_s(r, h_s_0)

print('h_s.shape', h_s_0.shape)
print('h_s_n.shape', h_s_n.shape)

r.shape torch.Size([4, 5, 64])
h_s.shape torch.Size([1, 4, 64])
h_s_n.shape torch.Size([1, 4, 64])


In [100]:
attn_s_intermediate_size = 10

attn_s = LinearAttention(gru_s_hidden_size, attn_s_intermediate_size, 1)

c = attn_s(h_s)

print('c.shape', c.shape)

c.shape torch.Size([4, 1, 64])


##Blending

In [295]:
blend_size = 9

In [296]:
blend = Blend(q.shape[2], c.shape[2], blend_size)

In [297]:
x = blend(q, c)

In [298]:
print('x.shape', x.shape)

x.shape torch.Size([4, 1, 9])


In [299]:
num_stocks = x.shape[0]

In [300]:
x = x.view(x.shape[0], x.shape[2])

In [301]:
print('x.shape', x.shape)

x.shape torch.Size([4, 9])


##GAT

In [302]:
intermediate_size = 5

In [303]:
nhoods = [[0,1], [0,1,2], [1,2], [3]]

In [304]:
shared_linear = SharedLinear(blend_size, intermediate_size)

In [305]:
Wx = shared_linear(x)

In [306]:
print('Wx.shape', Wx.shape)

torch.Size([3, 5])


##MGAT 1

In [312]:
U = 8

In [313]:
elu = nn.ELU()

In [314]:
sgat = SGAT(intermediate_size, 1)

z = sgat(Wx, nhoods, 0)

z = elu(z)

for i in range(1, num_stocks):
    z_i = sgat(Wx, nhoods, i)
    z_i = elu(z_i)
    z = torch.cat((z, z_i), 0)

for u in range(1, U):
    sgat = SGAT(intermediate_size, 1)

    z_u = sgat(Wx, nhoods, 0)

    z_u = elu(z_u)

    for i in range(1, num_stocks):
        z_u_i = sgat(Wx, nhoods, i)
        z_u_i = elu(z_u_i)
        z_u = torch.cat((z_u, z_u_i), 0)
    
    z = torch.cat((z, z_u), 1)

In [315]:
print('z.shape', z.shape)

z.shape torch.Size([4, 40])


##MGAT 2

In [316]:
U = 8

In [317]:
sigmoid = nn.Sigmoid()

In [318]:
new_intermediate_size = 11

In [319]:
shared_linear = SharedLinear(z.shape[1], new_intermediate_size)

In [320]:
Wx = shared_linear(z)

In [321]:
print('Wx.shape', Wx.shape)

Wx.shape torch.Size([4, 11])


In [322]:
sgat = SGAT(new_intermediate_size, 1)

z = sgat(Wx, nhoods, 0)

z = sigmoid(z)

for i in range(1, num_stocks):
    z_i = sgat(Wx, nhoods, i)
    z_i = sigmoid(z_i)
    z = torch.cat((z, z_i), 0)

for u in range(1, U):
    sgat = SGAT(new_intermediate_size, 1)

    z_u = sgat(Wx, nhoods, 0)

    z_u = sigmoid(z_u)

    for i in range(1, num_stocks):
        z_u_i = sgat(Wx, nhoods, i)
        z_u_i = sigmoid(z_u_i)
        z_u = torch.cat((z_u, z_u_i), 0)
    
    z = torch.cat((z, z_u), 1)

In [323]:
print('z.shape', z.shape)

z.shape torch.Size([4, 88])


##Final Layer

In [332]:
linear = nn.Linear(z.shape[1], 1, bias=True)
sigmoid = nn.Sigmoid()

In [333]:
sigmoid(linear(z))

tensor([[0.5032],
        [0.5310],
        [0.5262],
        [0.5571]], grad_fn=<SigmoidBackward>)