In [None]:
import numpy as np

def sigmoid(x):
    """Compute the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Args:
        x: Real scalar or array-like.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``x``; every value lies in ``[0, 1]``.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1] and therefore cannot overflow, whereas the
    # naive exp(-x) overflows (RuntimeWarning) for large negative x.
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () converts a 0-d result back into a scalar and leaves
    # arrays of higher rank unchanged.
    return result[()]

def d_sigmoid(x):
    """Return ``x * (1 - x)``.

    For ``x = sigmoid(z)`` this is the derivative of the logistic function
    at ``z``; the argument is therefore the *activation*, not the
    pre-activation.
    """
    complement = 1 - x
    return x * complement

def tanh(x):
    """Compute the hyperbolic tangent element-wise.

    Args:
        x: Scalar or array-like.

    Returns:
        ``tanh(x)`` with the same shape as ``x``.
    """
    # np.tanh saturates cleanly to +/-1 for large |x|; the explicit ratio
    # (e^x - e^-x) / (e^x + e^-x) evaluates to inf / inf = nan there.
    return np.tanh(x)

def d_tanh(x):
    """Return ``1 - x ** 2``.

    For ``x = tanh(z)`` this is the derivative of tanh at ``z``; the
    argument is therefore the *activation*, not the pre-activation.
    """
    squared = x ** 2
    return 1 - squared

def softmax(x):
    """Compute the softmax along axis 0.

    Args:
        x: Array-like of scores. Each column (slice along axis 0) is
            normalised independently.

    Returns:
        Array with the same shape as ``x`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Shift by the per-column maximum rather than the global one: with a
    # global shift, a column far below the overall maximum underflows to
    # all zeros and the normalisation becomes 0 / 0 = nan.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0, keepdims=True)

class MyLSTMCell():
    """Single LSTM cell with peephole connections, using column vectors.

    Forward equations (``@`` is a matrix product, ``*`` is element-wise):

        i_t = sigmoid(Wix @ x_t + Wim @ h_prev + Wic * c_prev + bi)
        f_t = sigmoid(Wfx @ x_t + Wfm @ h_prev + Wfc * c_prev + bf)
        g_t = tanh(Wcx @ x_t + Wcm @ h_prev + bc)
        c_t = f_t * c_prev + i_t * g_t
        o_t = sigmoid(Wox @ x_t + Wom @ h_prev + Woc * c_t + bo)
        m_t = o_t * tanh(c_t)

    Inputs and states are 2-D arrays laid out as ``(features, columns)``:
    ``x_t`` is ``(input_size, n)`` and ``h_prev`` / ``c_prev`` are
    ``(hidden_size, n)``. The peephole weights ``Wic``, ``Wfc`` and ``Woc``
    are diagonal and stored as ``(hidden_size,)`` vectors.

    ``backward`` stores the parameter gradients of the step it was called
    for in ``dW_*`` / ``db_*`` attributes. Each call overwrites them; they
    are not accumulated across time steps.
    """

    def __init__(self, input_size, hidden_size):
        """Draw the weights from a standard normal and zero the biases.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
        """
        super().__init__()
        # Creation order is kept so that a fixed NumPy seed keeps producing
        # the same weights for each attribute.
        self.Wix = np.random.randn(hidden_size, input_size)
        self.Wim = np.random.randn(hidden_size, hidden_size)
        self.Wic = np.random.randn(hidden_size)
        self.Wfx = np.random.randn(hidden_size, input_size)
        self.Wfm = np.random.randn(hidden_size, hidden_size)
        self.Wfc = np.random.randn(hidden_size)
        self.Wcx = np.random.randn(hidden_size, input_size)
        self.Wcm = np.random.randn(hidden_size, hidden_size)
        self.Wox = np.random.randn(hidden_size, input_size)
        self.Wom = np.random.randn(hidden_size, hidden_size)
        self.Woc = np.random.randn(hidden_size)

        self.bi = np.zeros((hidden_size, 1))
        self.bf = np.zeros((hidden_size, 1))
        self.bc = np.zeros((hidden_size, 1))
        self.bo = np.zeros((hidden_size, 1))

    def _peepholes(self):
        """Return (Wic, Wfc, Woc) as column vectors that broadcast over states."""
        return (
            self.Wic.reshape(-1, 1),
            self.Wfc.reshape(-1, 1),
            self.Woc.reshape(-1, 1),
        )

    def forward(self, x_t, h_prev, c_prev):
        """Run one time step.

        Args:
            x_t: Input, shape ``(input_size, n)``.
            h_prev: Previous cell output, shape ``(hidden_size, n)``.
            c_prev: Previous cell state, shape ``(hidden_size, n)``.

        Returns:
            Tuple ``(m_t, c_t, temp)``: the new output, the new cell state,
            and the tuple of intermediates that ``backward`` expects
            (``x_t, h_prev, c_prev, i_t, f_t, c_hat_t, c_t, o_t, m_t``).
            ``c_hat_t`` is the tanh-activated candidate.
        """
        wic, wfc, woc = self._peepholes()

        # Weights are (hidden, input) / (hidden, hidden), so they multiply
        # the column-vector inputs from the left.
        i_t = sigmoid(
            np.dot(self.Wix, x_t) + np.dot(self.Wim, h_prev)
            + wic * c_prev + self.bi
        )
        f_t = sigmoid(
            np.dot(self.Wfx, x_t) + np.dot(self.Wfm, h_prev)
            + wfc * c_prev + self.bf
        )
        # Store the activated candidate: backward differentiates through
        # the tanh using this value.
        c_hat_t = tanh(np.dot(self.Wcx, x_t) + np.dot(self.Wcm, h_prev) + self.bc)
        # Gating is element-wise, not a matrix product.
        c_t = f_t * c_prev + i_t * c_hat_t
        # The output gate peeks at the *new* cell state.
        o_t = sigmoid(
            np.dot(self.Wox, x_t) + np.dot(self.Wom, h_prev)
            + woc * c_t + self.bo
        )
        m_t = o_t * tanh(c_t)

        temp = (x_t, h_prev, c_prev, i_t, f_t, c_hat_t, c_t, o_t, m_t)
        return m_t, c_t, temp

    def backward(self, temp, dm_next, dc_next):
        """Back-propagate through one time step.

        Args:
            temp: Intermediates tuple returned by ``forward`` for this step.
            dm_next: Gradient of the loss w.r.t. this step's output ``m_t``,
                shape ``(hidden_size, n)``.
            dc_next: Gradient w.r.t. this step's cell state ``c_t`` flowing
                back from the following time step, shape ``(hidden_size, n)``.

        Returns:
            Tuple ``(dx_t, dh_prev, dc_prev)`` of gradients w.r.t. the
            step's input, previous output and previous cell state.

        Side effects:
            Overwrites ``self.dW_*`` and ``self.db_*`` with this step's
            parameter gradients, each shaped like its parameter.
        """
        x_t, h_prev, c_prev, i_t, f_t, c_hat_t, c_t, o_t, _ = temp
        wic, wfc, woc = self._peepholes()
        tanh_c = np.tanh(c_t)

        # Gradients w.r.t. the gate pre-activations.
        do_t = dm_next * tanh_c * d_sigmoid(o_t)

        # c_t reaches the loss through m_t, through the next time step, and
        # through the output gate's peephole connection.
        dc_t = dm_next * o_t * (1 - tanh_c ** 2) + dc_next + do_t * woc

        df_t = dc_t * c_prev * d_sigmoid(f_t)
        di_t = dc_t * c_hat_t * d_sigmoid(i_t)
        dc_hat_t = dc_t * i_t * d_tanh(c_hat_t)

        # Input-to-hidden weights.
        self.dW_fx = np.dot(df_t, x_t.T)
        self.dW_ix = np.dot(di_t, x_t.T)
        self.dW_cx = np.dot(dc_hat_t, x_t.T)
        self.dW_ox = np.dot(do_t, x_t.T)

        # Hidden-to-hidden weights.
        self.dW_fm = np.dot(df_t, h_prev.T)
        self.dW_im = np.dot(di_t, h_prev.T)
        self.dW_cm = np.dot(dc_hat_t, h_prev.T)
        self.dW_om = np.dot(do_t, h_prev.T)

        # Diagonal peephole weights: summed over columns and shaped like
        # the parameters they belong to.
        self.dW_fc = np.sum(df_t * c_prev, axis=1).reshape(self.Wfc.shape)
        self.dW_ic = np.sum(di_t * c_prev, axis=1).reshape(self.Wic.shape)
        self.dW_oc = np.sum(do_t * c_t, axis=1).reshape(self.Woc.shape)

        # Biases.
        self.db_f = np.sum(df_t, axis=1, keepdims=True)
        self.db_i = np.sum(di_t, axis=1, keepdims=True)
        self.db_c = np.sum(dc_hat_t, axis=1, keepdims=True)
        self.db_o = np.sum(do_t, axis=1, keepdims=True)

        dx_t = (
            np.dot(self.Wfx.T, df_t)
            + np.dot(self.Wix.T, di_t)
            + np.dot(self.Wcx.T, dc_hat_t)
            + np.dot(self.Wox.T, do_t)
        )

        dh_prev = (
            np.dot(self.Wfm.T, df_t)
            + np.dot(self.Wim.T, di_t)
            + np.dot(self.Wcm.T, dc_hat_t)
            + np.dot(self.Wom.T, do_t)
        )

        # c_prev feeds c_t directly and the input/forget gate peepholes.
        dc_prev = dc_t * f_t + df_t * wfc + di_t * wic

        return dx_t, dh_prev, dc_prev

In [None]:
class MyLSTM():
    """One ``MyLSTMCell`` unrolled over a sequence, with a softmax read-out.

    At each step the read-out is ``y_t = softmax(W.T @ h_t + b)`` with
    ``W`` of shape ``(hidden_size, output_size)`` and ``b`` of shape
    ``(output_size, 1)``.

    NOTE: ``backward`` returns and ``update`` applies gradients only for
    the read-out parameters ``W`` and ``b``; the cell's own weights are not
    updated here.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the cell and the randomly initialised read-out layer.

        Args:
            input_size: Number of features per time step.
            hidden_size: Number of hidden units in the cell.
            output_size: Number of output classes.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm_layer = MyLSTMCell(input_size, hidden_size)
        self.W = np.random.randn(hidden_size, output_size)
        self.b = np.zeros((output_size, 1))

    def forward(self, X):
        """Run the network over a sequence, starting from zero state.

        Args:
            X: Sequence of per-step inputs; each element is reshaped to an
                ``(input_size, 1)`` column vector.

        Returns:
            Tuple ``(outputs, temps)``: per-step softmax outputs of shape
            ``(output_size, 1)`` and the per-step intermediates returned by
            the cell, both as lists in time order.
        """
        h_t = np.zeros((self.hidden_size, 1))
        c_t = np.zeros((self.hidden_size, 1))

        outputs = []
        temps = []

        for x in X:
            x_t = np.reshape(x, (-1, 1))
            h_t, c_t, temp = self.lstm_layer.forward(x_t, h_t, c_t)
            # W is (hidden, output), so it is transposed to map h_t to logits.
            y_t = softmax(np.dot(self.W.T, h_t) + self.b)
            outputs.append(y_t)
            temps.append(temp)

        return outputs, temps

    def backward(self, outputs, temps, targets=None):
        """Back-propagate through time and return the read-out gradients.

        Args:
            outputs: Per-step arrays of shape ``(output_size, 1)``. When
                ``targets`` is None these are used directly as the gradient
                of the loss w.r.t. the pre-softmax logits.
            temps: Per-step intermediates returned by ``forward``.
            targets: Optional per-step target distributions (e.g. one-hot
                vectors). When given, ``outputs`` are taken to be the
                softmax outputs and a cross-entropy loss is assumed, so the
                logit gradient is ``output - target``.

        Returns:
            Tuple ``(dW, db)`` shaped like ``self.W`` and ``self.b``,
            summed over all time steps.
        """
        dW = np.zeros_like(self.W)
        db = np.zeros_like(self.b)

        dh_next = np.zeros((self.hidden_size, 1))
        dc_next = np.zeros((self.hidden_size, 1))

        if targets is None:
            dlogits = [np.asarray(y) for y in outputs]
        else:
            dlogits = [
                np.asarray(y) - np.reshape(t, np.shape(y))
                for y, t in zip(outputs, targets)
            ]

        # Walk the sequence from the last step to the first so that
        # dh_next / dc_next carry gradients from later steps to earlier ones.
        for dz, temp in zip(reversed(dlogits), reversed(list(temps))):
            m_t = temp[-1]  # the cell output that fed the read-out layer
            dW += np.dot(m_t, dz.T)
            db += np.sum(dz, axis=1, keepdims=True)

            dh = np.dot(self.W, dz) + dh_next
            _, dh_next, dc_next = self.lstm_layer.backward(temp, dh, dc_next)

        return dW, db

    def update(self, grads, lr):
        """Apply one gradient-descent step to the read-out parameters.

        Args:
            grads: Tuple ``(dW, db)`` as returned by ``backward``.
            lr: Learning rate.
        """
        dW, db = grads
        self.W -= dW * lr
        self.b -= db * lr