# LSTM Cell Writeup


## Input Gate($i$)

$$i_t = \sigma(W_{x_i}x_t+W_{h_i}h_{t-1}+b_i)$$
- Partials
	- $i_t = \sigma (p)$
		- $\nabla_pi_t=(1-\sigma(p))\sigma(p)$
	- $p = q + v + b_i$
		- $\nabla_qp=1$
		- $\nabla_vp=1$
		- $\nabla_{b_i}p=1$
	- $q = W_{x_i}x_t$
		- $\nabla_{W_{x_i}}q = x_t^T$
		- $\nabla_{x_t}q = W_{x_i}^T$
	- $v = W_{h_i}h_{t-1}$
		- $\nabla_{W_{h_i}}v = h_{t-1}^T$
		- $\nabla_{h_{t-1}}v = W_{h_i}^T$
- $\nabla_{W_{x_i}}{i_t}=\nabla_pi_t \nabla_qp \nabla_{W_{x_i}}q = x_t^T\nabla_pi_t$
- $\nabla_{W_{h_i}}{i_t}=\nabla_pi_t \nabla_vp \nabla_{W_{h_i}}v = h_{t-1}^T\nabla_pi_t$
- $\nabla_{b_i}i_t = \nabla_pi_t \nabla_{b_i}p = \nabla_pi_t$

## Forget gate($f$)
$$f_t=\sigma({W_{x_f}x_t}+{W_{h_f}h_{t-1}}+{b_f})$$
- Partials
	- $f_t = \sigma (p)$
		- $\nabla_pf_t=(1-\sigma(p))\sigma(p)$
	- $p = q + v + b_f$
		- $\nabla_qp=1$
		- $\nabla_vp=1$
		- $\nabla_{b_f}p=1$
	- $q = W_{x_f}x_t$
		- $\nabla_{W_{x_f}}q = x_t^T$
		- $\nabla_{x_t}q = W_{x_f}^T$
	- $v = W_{h_f}h_{t-1}$
		- $\nabla_{W_{h_f}}v = h_{t-1}^T$
		- $\nabla_{h_{t-1}}v = W_{h_f}^T$
- $\nabla_{W_{x_f}}{f_t}=\nabla_pf_t \nabla_qp \nabla_{W_{x_f}}q = x_t^T\nabla_pf_t$
- $\nabla_{W_{h_f}}{f_t}=\nabla_pf_t \nabla_vp \nabla_{W_{h_f}}v = h_{t-1}^T\nabla_pf_t$
- $\nabla_{b_f}f_t = \nabla_pf_t \nabla_{b_f}p = \nabla_pf_t$
## Output gate($o$)
$$o_t=\sigma({W_{x_o}x_t}+{W_{h_o}h_{t-1}}+{b_o})$$
- Partials
	- $o_t = \sigma (p)$
		- $\nabla_po_t=(1-\sigma(p))\sigma(p)$
	- $p = q + v + b_o$
		- $\nabla_qp=1$
		- $\nabla_vp=1$
		- $\nabla_{b_o}p=1$
	- $q = W_{x_o}x_t$
		- $\nabla_{W_{x_o}}q = x_t^T$
		- $\nabla_{x_t}q = W_{x_o}^T$
	- $v = W_{h_o}h_{t-1}$
		- $\nabla_{W_{h_o}}v = h_{t-1}^T$
		- $\nabla_{h_{t-1}}v = W_{h_o}^T$
- $\nabla_{W_{x_o}}{o_t}=\nabla_po_t \nabla_qp \nabla_{W_{x_o}}q = x_t^T\nabla_po_t$
- $\nabla_{W_{h_o}}{o_t}=\nabla_po_t \nabla_vp \nabla_{W_{h_o}}v = h_{t-1}^T\nabla_po_t$
- $\nabla_{b_o}o_t = \nabla_po_t \nabla_{b_o}p = \nabla_po_t$
## Gate gate($g$)
$$g_t=\tanh({W_{x_g}x_t}+{W_{h_g}h_{t-1}}+{b_g})$$
- Partials
	- $g_t = \tanh (p)$
		- $\nabla_pg_t=1-\tanh^2(p)$
	- $p = q + v + b_g$
		- $\nabla_qp=1$
		- $\nabla_vp=1$
		- $\nabla_{b_g}p=1$
	- $q = W_{x_g}x_t$
		- $\nabla_{W_{x_g}}q = x_t^T$
		- $\nabla_{x_t}q = W_{x_g}^T$
	- $v = W_{h_g}h_{t-1}$
		- $\nabla_{W_{h_g}}v = h_{t-1}^T$
		- $\nabla_{h_{t-1}}v = W_{h_g}^T$
- $\nabla_{W_{x_g}}{g_t}=\nabla_pg_t \nabla_qp \nabla_{W_{x_g}}q = x_t^T\nabla_pg_t$
- $\nabla_{W_{h_g}}{g_t}=\nabla_pg_t \nabla_vp \nabla_{W_{h_g}}v = h_{t-1}^T\nabla_pg_t$
- $\nabla_{b_g}g_t = \nabla_pg_t \nabla_{b_g}p = \nabla_pg_t$
## Cell State Update($c_t$)
$$c_t = {f_t \odot c_{t-1}}+{i_t \odot g_t}$$
- Partials
	- $c_t = q + p$
		- $\nabla_qc_t=1$
		- $\nabla_pc_t=1$
	- $q = f_t \odot c_{t-1}$
		- $\nabla_{f_t}q=c_{t-1}$
		- $\nabla_{c_{t-1}}q=f_t$
	- $p = i_t \odot g_t$
		- $\nabla_{i_t}p=g_t$
		- $\nabla_{g_t}p=i_t$
- $\nabla_{f_t}c_t=\nabla_qc_t\nabla_{f_t}q=c_{t-1}$
- $\nabla_{i_t}c_t= \nabla_{p}c_t\nabla_{i_t}p=g_t$
- $\nabla_{g_t}c_t= \nabla_{p}c_t\nabla_{g_t}p=i_t$

## Hidden State Update($h_t$)
$$h_t =o_t \odot \tanh(c_t)$$

- Partials
	- $h_t = o_t \odot q$
		- $\nabla_{o_t}h_t=q$
		- $\nabla_{q}h_t=o_t$
	- $q=\tanh(c_t)$
		- $\nabla_{c_t}q=1-\tanh^2(c_t)$
- $\nabla_{o_t}h_t=q$
- $\nabla_{c_t}h_t=\nabla_{q}h_t\nabla_{c_t}q=o_t\nabla_{c_t}q$



# LSTM Cell Code

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset, dataset_dict

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Folder that holds the CSV files to load.
data_path = "data/Car_data/car_data"
# List all CSV files in the folder. os.listdir() returns entries in an
# arbitrary, platform-dependent order, so sort the paths to make the row
# order of the concatenated dataset reproducible across runs and machines.
csv_files = sorted(
    os.path.join(data_path, file)
    for file in os.listdir(data_path)
    if file.endswith('.csv')
)

# Initialize an empty list to store DataFrames
# (it must exist before the loop below appends to it).
dataframes = []

# Load each CSV file into a DataFrame and append to the list
for file in csv_files:
    df = pd.read_csv(file)
    dataframes.append(df)

# Concatenate all DataFrames into one, renumbering the rows 0..N-1
dataset = pd.concat(dataframes, ignore_index=True)



Index(['Local_X', 'Local_Y', 'v_Vel', 'v_Acc', 'Space_Headway', 'dis_cen',
       'i_l', 'i_r', 'i_f', 'dis_l', 'dis_r', 'dis_f'],
      dtype='object')


In [27]:
# Show the (rows, columns) shape of the concatenated DataFrame as a sanity check.
print(dataset.shape)

(629800, 12)


In [None]:

class LSTM(nn.Module):
    """LSTM cell written out gate by gate with explicit per-gate parameters.

    Each gate owns an input weight ``W*``, a recurrent weight ``U*`` and a
    bias ``b*``; they are created as ``nn.Parameter`` objects on the instance.

    The parameter names are declared below as bare annotations on purpose.
    ``nn.Module.register_parameter`` raises ``KeyError("attribute '<name>'
    already exists")`` when the name already resolves to a non-parameter
    attribute, so a class-level ``Wi = None`` placeholder would make the
    later ``self.Wi = nn.Parameter(...)`` assignment fail.
    """

    # Hyper-parameters; overwritten per instance by the constructor.
    input_size = None
    hidden_size = None
    num_layers = None
    dropout = None
    # Weights and biases (annotations only -- see the class docstring)
    # Input gate
    Wi: nn.Parameter
    Ui: nn.Parameter
    bi: nn.Parameter
    # Forget gate
    Wf: nn.Parameter
    Uf: nn.Parameter
    bf: nn.Parameter
    # Cell gate
    Wg: nn.Parameter
    Ug: nn.Parameter
    bg: nn.Parameter
    # Output gate
    Wo: nn.Parameter
    Uo: nn.Parameter
    bo: nn.Parameter
    # Hidden state
    h = None
    # Cell state
    c = None

    
    def __init__(self, input_size=12, hidden_size = 64, num_layers=1, dropout=0.0):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        # Initialize weights and biases
        self.init_weights(input_size, hidden_size)


    def init_weights(self, input_size, hidden_size):  # xavier initialization
        # Input gate parameters
        self.Wi = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.Ui = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.bi = nn.Parameter(torch.zeros(hidden_size))
        
        # Forget gate parameters
        self.Wf = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.Uf = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.bf = nn.Parameter(torch.ones(hidden_size))  # Often initialized to 1 to encourage remembering
        
        # Cell gate parameters
        self.Wg = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.Ug = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.bg = nn.Parameter(torch.zeros(hidden_size))
        
        # Output gate parameters
        self.Wo = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.Uo = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.bo = nn.Parameter(torch.zeros(hidden_size))
        
        # Apply Xavier initialization to all weight matrices
        for weight in [self.Wi, self.Ui, self.Wf, self.Uf, self.Wg, self.Ug, self.Wo, self.Uo]:
            nn.init.xavier_uniform_(weight)

    def input_gate(self, x, h):
        # Input gate
        q = self.Wi @ x
        v = self.Ui @ h
        p = q + v + self.bi
        i = torch.sigmoid(p)
        return i
    
    def forget_gate(self, x, h):
        # Forget gate
        q = self.Wf @ x
        v = self.Uf @ h
        p = q + v + self.bf
        f = torch.sigmoid(p)
        return f

    def output_gate(self, x, h):
        # Output gate
        q = self.Wo @ x
        v = self.Uo @ h
        p = q + v + self.bo
        o = F.sigmoid(p)
        return o

    def cell_gate(self, x, h):
        # Cell gate
        q = self.Wg @ x
        v = self.Ug @ h
        p = q + v + self.bg
        g = torch.tanh(p)
        return g

    def cell_state_update(self, i, f, g, c_old):
        # Cell state update
        c_new = f * c_old + i * g
        return c_new
    
    def hidden_state_update(self, o, c_new):
        # Hidden state update
        h_new = o * torch.tanh(c_new)
        return h_new

    # LSTM by hand
    def lstm_cell(self, x, h_old, c_old):
        i = self.input_gate(x, h_old) 
        f = self.forget_gate(x, h_old)
        g = self.cell_gate(x, h_old)
        o = self.output_gate(x, h_old)

        # New cell state
        c_new = self.cell_state_update(i, f, g, c_old)
        # New hidden state
        h_new = self.hidden_state_update(o, c_new)
        return h_new, c_new
    
    def forward(self, x):

Using device: cuda
