### LSTM
***
- LSTM(Long Short Term Memory) is a more enhanced RNN that is used to counteract the exploding/ vanishing gradient problems caused by a vanilla RNN (Recurrent Neural Network)

1. **Notebook Summary:**
- Inner Workings Behind LSTM
- Different Applications of LSTM
    - Stacked LSTM
    - Parallel LSTM
    - Bidirectional LSTM
- PyTorch Example of LSTM

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import numpy as np

device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [2]:
class LSTMScratch(nn.Module):
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Weights for the Inputs
        self.w_ii = nn.Parameter(torch.rand(input_size, hidden_size)).to(device)
        self.w_if = nn.Parameter(torch.rand(input_size, hidden_size)).to(device)
        self.w_ig = nn.Parameter(torch.rand(input_size, hidden_size)).to(device)
        self.w_io = nn.Parameter(torch.rand(input_size, hidden_size)).to(device)

        # Biases for the Inputs
        self.b_ii = nn.Parameter(torch.rand([1])).to(device)
        self.b_if = nn.Parameter(torch.rand([1])).to(device)
        self.b_ig = nn.Parameter(torch.rand([1])).to(device)
        self.b_io = nn.Parameter(torch.rand([1])).to(device)

        # Weights for the Hidden State
        self.w_hi = nn.Parameter(torch.rand(hidden_size, hidden_size)).to(device)
        self.w_hf = nn.Parameter(torch.rand(hidden_size, hidden_size)).to(device)
        self.w_hg = nn.Parameter(torch.rand(hidden_size, hidden_size)).to(device)
        self.w_ho = nn.Parameter(torch.rand(hidden_size, hidden_size)).to(device)

        # Biases for the Hidden State
        self.b_hi = nn.Parameter(torch.rand([1])).to(device)
        self.b_hf = nn.Parameter(torch.rand([1])).to(device)
        self.b_hg = nn.Parameter(torch.rand([1])).to(device)
        self.b_ho = nn.Parameter(torch.rand([1])).to(device)
        
    def forward(self, x_t, prev_h, prev_c):
        
        
        x_t = x_t.to(device)
        prev_h = prev_h.to(device)
        prev_c = prev_c.to(device)
        
        i_t = F.sigmoid(torch.matmul(x_t, self.w_ii) + self.b_ii + torch.matmul(self.w_hi, prev_h) + self.b_hi)
        f_t = F.sigmoid(torch.matmul(x_t, self.w_if) + self.b_if + torch.matmul(self.w_hf, prev_h) + self.b_hf)
        g_t = F.tanh(torch.matmul(x_t, self.w_ig) + self.b_ig + torch.matmul(self.w_hg, prev_h) + self.b_hg)
        o_t = F.tanh(torch.matmul(x_t, self.w_io) + self.b_io + torch.matmul(self.w_ho, prev_h) + self.b_ho)
        c_t = torch.multiply(f_t, prev_c) + torch.multiply(i_t, g_t)
        h_t = torch.multiply(o_t, F.tanh(c_t))

        return h_t, c_t
        

In [3]:
lstm_scratch = LSTMScratch(2, 1)
lstm_scratch.forward(torch.Tensor([[1,2]]), torch.FloatTensor(1), torch.FloatTensor(1))

(tensor([[0.5858]], device='cuda:0', grad_fn=<MulBackward0>),
 tensor([[0.6742]], device='cuda:0', grad_fn=<AddBackward0>))

In [25]:
class StackedLSTMScratch(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers:int=1):
        super().__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm1 = nn.LSTM(input_size, hidden_size)
        self.lstm2 = nn.LSTM(input_size, hidden_size)

    def forward(self, x):
        hidden_state_list = []
        cell_state_list = []
        h_t0, c_t0 = self.lstm1(x)
        hidden_state_list.append(h_t0)
        cell_state_list.append(c_t0)
        
        h_t1, c_t1 = self.lstm2(h_t0)
        
        hidden_state_list.append(h_t1)
        cell_state_list.append(c_t1)

        return h_t1, hidden_state_list, cell_state_list
        
        

stackLSTM = StackedLSTMScratch(2,1,2)
stackLSTM.forward(torch.Tensor([[1,2]]))

(tensor([[-0.0988]], grad_fn=<SqueezeBackward1>),
 [tensor([[0.1508]], grad_fn=<SqueezeBackward1>),
  tensor([[-0.0988]], grad_fn=<SqueezeBackward1>)],
 [(tensor([[0.1508]], grad_fn=<SqueezeBackward1>),
   tensor([[0.4671]], grad_fn=<SqueezeBackward1>)),
  (tensor([[-0.0988]], grad_fn=<SqueezeBackward1>),
   tensor([[-0.2313]], grad_fn=<SqueezeBackward1>))])

### 3. PyTorch 
***

**`nn.RNN(input_size, hidden_state, num_layers)`**
- `input_size`: the number of expect feature in the input x
- `hidden_size`: the number of features in the hidden state
- `num_layers`: the number of RNNs stacked together
    <br></br>
1. `.forward(inputs, hidden_state=torch.zeros(1, hidden_state)))`
     - forward has two inputs, the previous activation layer and the input values
     - for the previous activation layer, when set to a dimension, create a zero vector of that dimension as its default value
       - inputs: represents the data you want to look through

In [18]:
torch.manual_seed(42)
data = torch.rand(1, 2)
data

tensor([[0.8823, 0.9150]])

In [19]:
lstm = nn.LSTM(2, 1, num_layers=10) # Num of layers determines the number of units 

lstm(data)

(tensor([[0.2389]], grad_fn=<SqueezeBackward1>),
 (tensor([[ 0.1356],
          [ 0.2857],
          [-0.0390],
          [-0.0674],
          [ 0.0053],
          [ 0.0398],
          [ 0.0340],
          [-0.0032],
          [ 0.1631],
          [ 0.2389]], grad_fn=<SqueezeBackward1>),
  tensor([[ 0.2596],
          [ 0.4242],
          [-0.0732],
          [-0.2216],
          [ 0.0178],
          [ 0.0833],
          [ 0.2536],
          [-0.0051],
          [ 0.4905],
          [ 0.5370]], grad_fn=<SqueezeBackward1>)))

In [25]:
import keras
model = keras.Sequential()
model.add(keras.layers.LSTM(3))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [110]:
model = keras.Sequential()
# embedding_layer = keras.layers.Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)

# snn_model.add(embedding_layer)
model.add(keras.Input((1,10)))
model.add(keras.layers.LSTM(4)) # Determines the Number of LSTM Cells
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_29 (LSTM)              (None, 4)                 240       
                                                                 
 dense_9 (Dense)             (None, 1)                 5         
                                                                 
Total params: 245 (980.00 Byte)
Trainable params: 245 (980.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [56]:
F.tanh(torch.Tensor([-10]))

tensor([-1.])