In [None]:
import numpy as np

class LSTMCell:
    """A single LSTM cell with a one-step forward pass and SGD backward pass.

    Each gate has a weight matrix of shape
    ``(hidden_size + input_size, hidden_size)`` that is applied to the
    row-wise concatenation ``[h_prev, x]``, plus a ``(1, hidden_size)`` bias.

    The cell keeps its cell state ``C`` and hidden state ``h`` as attributes.
    ``forward`` also caches the gate activations of the most recent step so
    that ``backward`` can compute gradients for that step. Only one step is
    cached, so ``backward`` must be paired with the latest ``forward`` call.
    """

    def __init__(self, input_size, hidden_size):
        """Create randomly initialised gate weights and zeroed biases/state.

        Args:
            input_size: Number of features in each input row ``x``.
            hidden_size: Number of units in the hidden and cell states.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Forget gate.
        self.W_f = np.random.randn(hidden_size + input_size, hidden_size)
        self.b_f = np.zeros((1, hidden_size))

        # Input gate.
        self.W_i = np.random.randn(hidden_size + input_size, hidden_size)
        self.b_i = np.zeros((1, hidden_size))

        # Candidate cell state.
        self.W_C = np.random.randn(hidden_size + input_size, hidden_size)
        self.b_C = np.zeros((1, hidden_size))

        # Output gate.
        self.W_o = np.random.randn(hidden_size + input_size, hidden_size)
        self.b_o = np.zeros((1, hidden_size))

        # Recurrent state; broadcasts against batched inputs on first use.
        self.C = np.zeros((1, hidden_size))
        self.h = np.zeros((1, hidden_size))

        # Activations of the latest forward step, consumed by backward().
        self._cache = None
        # Gradient w.r.t. the previous cell state, set by backward().
        self.grad_C_prev = None

    def sigmoid(self, x):
        """Return the logistic sigmoid of ``x`` element-wise.

        Uses the identity ``sigmoid(x) = 0.5 * (1 + tanh(x / 2))`` so that
        large-magnitude inputs do not overflow in ``exp``.
        """
        return 0.5 * (1.0 + np.tanh(0.5 * x))

    def tanh(self, x):
        """Return the hyperbolic tangent of ``x`` element-wise."""
        return np.tanh(x)

    def forward(self, x, h_prev):
        """Advance the cell by one step.

        The previous cell state is read from ``self.C``; the new cell and
        hidden states are written back to ``self.C`` and ``self.h``.

        Args:
            x: Input array; concatenated with ``h_prev`` along axis 1.
            h_prev: Previous hidden state, same number of rows as ``x``.

        Returns:
            Tuple ``(h, C)`` of the new hidden state and new cell state.
        """
        concat_input = np.concatenate((h_prev, x), axis=1)
        C_prev = self.C

        f_t = self.sigmoid(np.dot(concat_input, self.W_f) + self.b_f)  # forget gate
        i_t = self.sigmoid(np.dot(concat_input, self.W_i) + self.b_i)  # input gate
        tilde_C_t = self.tanh(np.dot(concat_input, self.W_C) + self.b_C)  # candidate
        o_t = self.sigmoid(np.dot(concat_input, self.W_o) + self.b_o)  # output gate

        C_t = f_t * C_prev + i_t * tilde_C_t
        tanh_C_t = self.tanh(C_t)

        # Keep everything backward() needs; C_prev in particular is lost
        # once self.C is overwritten below.
        self._cache = (f_t, i_t, o_t, tilde_C_t, C_prev, tanh_C_t)

        self.C = C_t
        self.h = o_t * tanh_C_t

        return self.h, self.C

    def backward(self, x, h_prev, delta_h, delta_C, learning_rate):
        """Backpropagate through the latest forward step and apply an SGD update.

        Must be called after ``forward`` with the same ``x`` and ``h_prev``
        that were passed to that call, since the gate activations are taken
        from the cache filled by ``forward``.

        Args:
            x: Input used in the matching ``forward`` call.
            h_prev: Previous hidden state used in the matching ``forward`` call.
            delta_h: Gradient of the loss w.r.t. the hidden state output.
            delta_C: Gradient of the loss w.r.t. the cell state output
                (excluding the path through the hidden state).
            learning_rate: Step size for the in-place parameter update.

        Returns:
            Tuple ``(grad_h_prev, grad_x)``: gradients of the loss w.r.t.
            ``h_prev`` and ``x``. The gradient w.r.t. the previous cell state
            is additionally stored in ``self.grad_C_prev``.

        Raises:
            RuntimeError: If ``forward`` has not been called yet.
        """
        cache = getattr(self, "_cache", None)
        if cache is None:
            raise RuntimeError("forward() must be called before backward()")
        f_t, i_t, o_t, tilde_C_t, C_prev, tanh_C_t = cache

        concat_input = np.concatenate((h_prev, x), axis=1)

        # Output gate: h = o * tanh(C), then through the sigmoid derivative.
        delta_o_t = delta_h * tanh_C_t * o_t * (1 - o_t)

        # Total gradient reaching the cell state: direct path plus the path
        # through h = o * tanh(C).
        delta_C_total = delta_C + delta_h * o_t * (1 - tanh_C_t ** 2)

        # Candidate cell state (tanh derivative uses the cached activation).
        delta_C_t = delta_C_total * i_t * (1 - tilde_C_t ** 2)

        # Input gate.
        delta_i = delta_C_total * tilde_C_t * i_t * (1 - i_t)

        # Forget gate: scales the *previous* cell state, not the new one.
        delta_f = delta_C_total * C_prev * f_t * (1 - f_t)

        # Gradient w.r.t. the concatenated input, computed with the same
        # weights the forward pass used (i.e. before the update below).
        grad_concat = (
            np.dot(delta_o_t, self.W_o.T)
            + np.dot(delta_i, self.W_i.T)
            + np.dot(delta_f, self.W_f.T)
            + np.dot(delta_C_t, self.W_C.T)
        )
        grad_h_prev = grad_concat[:, :self.hidden_size]
        grad_x = grad_concat[:, self.hidden_size:]

        # Exposed for callers that chain steps; not part of the return value
        # to keep the original two-element return shape.
        self.grad_C_prev = delta_C_total * f_t

        # In-place SGD step on every parameter.
        updates = (
            (self.W_o, self.b_o, delta_o_t),
            (self.W_C, self.b_C, delta_C_t),
            (self.W_i, self.b_i, delta_i),
            (self.W_f, self.b_f, delta_f),
        )
        for weight, bias, delta in updates:
            weight -= learning_rate * np.dot(concat_input.T, delta)
            bias -= learning_rate * np.sum(delta, axis=0, keepdims=True)

        return grad_h_prev, grad_x
