In [1]:
# Forward pass of a single cell (one time step)
# Inputs: x_t, h_prev, and the shared parameters

Sitian:

### RNN basics

RNN can be used for sequence prediction, such as predicting the stock price at time T<sub>t+1</sub> given a sequence of stock prices from T<sub>0</sub>, T<sub>1</sub>, ..., T<sub>t</sub>.

Once the length of the input (i.e., the observed sequence) is fixed, the length of the unrolled RNN structure is fixed as well. For example, if the stock price sequence has 5 time points, [12, 13, 15, 16, 14], we can 'unfold' the RNN into 5 rows, where each row takes an input, passes it through a hidden unit, and produces an output. Each 'row' is also called a time step. In this setting, the expected output of one time step is also the input of the next time step, e.g., 13 is the expected output for the input 12 and is itself the input whose expected output is 15. Note that the hidden unit at each time step is connected to both the current input and the hidden unit of the previous time step, so the 5 time steps are not independent of each other.

A common way to signify the parameters in an RNN is as follows:  
- x: the input of this time step
- y: the output of this time step
- h: the hidden state of this time step
- h_prev: the hidden state of previous time step

h is calculated as the activation (e.g., tanh) of the sum np.dot(Wxh, x) + np.dot(Whh, h_prev) + bh, where bh is the bias of the hidden layer and: 

- Wxh: the weight matrix from an input x to the N hidden units, where N is referred to as the "hidden size" and is chosen by the user (e.g., 64).  
- Whh: the weight matrix from the previous hidden state to the current hidden state.

In addition, the collection of 'x' and 'y' values over time can be represented as vectors, denoted as 'X' and 'Y'. However, 'h' and 'Wxh' etc. at each time step, which are also essentially vectors, will be written in lowercase for ease of visualization.

### Key process in the Algorithm 



In [None]:
# Toy illustration of how one sequence is turned into (input, target) pairs.
x = [1,2,3,4,5]
# Symbolic placeholders for the outputs; written as strings because the bare
# names a, b, c, d, e are not defined anywhere and would raise NameError.
y = ["a","b","c","d","e"]


# Training pairs: each target is the value that follows its input.
x_train = [1,2,3,4]
y_train = [2,3,4,5]

# Prediction: the same one-step-ahead mapping, extended by one time step.
x_pred = [1,2,3,4,5]
y_pred = [2,3,4,5,6]


In [None]:
import numpy as np

In [1]:
# 每一时刻的前向计算,用以得到该时刻当前 paramter 下的 h 和 output

def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The maximum of ``x`` is subtracted before exponentiating. This does not
    change the result mathematically but keeps ``np.exp`` from overflowing
    for large inputs.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    total = exponentials.sum()
    return exponentials / total

def time_step_forward(x_t, h_prev, model_state):
    """
    Run the forward computation of a single RNN time step.

    Parameters
    ----------
    x_t : scalar or array-like of shape (1,)
        Input of the sequence at the current time step. A bare scalar is
        accepted and promoted to a 1-D vector of length 1 so that the
        matrix product below has the intended shape.
    h_prev : np.ndarray of shape (hidden_size,)
        Hidden state of the previous time step.
    model_state : dict
        Parameters shared by all time steps, read under the keys
        "W_xh" (hidden_size, 1), "W_hh" (hidden_size, hidden_size),
        "W_hy" (hidden_size, 1), "b_h" (hidden_size,) and "b_y" (1,).

    Returns
    -------
    h_preactivation : np.ndarray of shape (hidden_size,)
        Weighted input of the hidden layer before the tanh.
    h : np.ndarray of shape (hidden_size,)
        Hidden state of the current time step.
    y_pred : np.ndarray of shape (1,)
        Output prediction of the current time step.
    """

    # Shared parameters
    W_xh = model_state["W_xh"]
    W_hy = model_state["W_hy"]
    W_hh = model_state["W_hh"]
    b_h = model_state["b_h"]
    b_y = model_state["b_y"]

    # A scalar input would make np.dot(W_xh, x_t) keep the (hidden_size, 1)
    # shape and broadcast against the other terms into a matrix, so force a
    # 1-D vector here.
    x_t = np.atleast_1d(np.asarray(x_t, dtype=float))

    # Hidden state: (hidden_size, 1) . (1,) + (hidden_size, hidden_size) . (hidden_size,)
    h_preactivation = np.dot(W_xh, x_t) + np.dot(W_hh, h_prev) + b_h
    h = np.tanh(h_preactivation)

    # Linear read-out y_t = h . W_hy + b_y. A softmax is deliberately not
    # applied: the output has a single element, and the softmax of a
    # one-element vector is always 1.0, which would make every prediction
    # constant.
    y_pred = np.dot(h, W_hy) + b_y

    return h_preactivation, h, y_pred

In [15]:
# 整个 RNN (即所有时刻) 的前向计算，用以得到当前 paramter 下每一时刻的 loss 和 h，用以反向传播

def rnn_forward(X, Y, model_state, usage = "train"):

    """
    Run the forward pass over every time step of the sequence.

    Parameters
    ----------
    X : array-like
        Input sequence with one value per time step. Any layout (a list,
        shape (T,) or shape (1, T)) is flattened to T time steps.
    Y : array-like or None
        Expected output (label) for each time step, flattened the same way.
        Only used when ``usage == "train"``; targets beyond the length of X
        are ignored.
    model_state : dict
        Shared parameters plus "h_init", the hidden state fed to the first
        time step. The entries "h_preactivation_all" and "h_all" are
        overwritten with the values recorded during this pass.
    usage : {"train", "predict"}
        Selects what is returned.

    Returns
    -------
    usage == "train":   (model_state, y_pred_all, loss_all), where
        loss_all[t] = y_pred_t - Y[t], i.e. the derivative of the squared
        error 0.5 * (y_pred_t - Y[t])**2 with respect to the prediction.
    usage == "predict": y_pred_all

    Raises
    ------
    ValueError
        If ``usage`` is not "train" or "predict", or if training is requested
        without at least one target per input.
    """

    # Validate the mode first so a typo fails before any work is done.
    if usage not in ("train", "predict"):
        raise ValueError("Type 'train' or 'predict'")

    # len() of a (1, T) array is 1, so flatten to iterate over time steps.
    inputs = np.asarray(X, dtype=float).reshape(-1)

    targets = None
    if usage == "train":
        if Y is None:
            raise ValueError("Y is required when usage is 'train'")
        targets = np.asarray(Y, dtype=float).reshape(-1)
        if len(targets) < len(inputs):
            raise ValueError("Y must provide one target for every time step of X")

    # Per-time-step records
    h_preactivation_all = []
    h_all = []
    y_pred_all = []
    loss_all = []

    # The first time step receives the initial hidden state.
    h_prev = model_state["h_init"]

    for t in range(len(inputs)):
        # Slice (rather than index) so the step receives a shape-(1,) vector.
        x_t = inputs[t:t + 1]
        h_preactivation, h_prev, y_pred = time_step_forward(x_t, h_prev, model_state)

        h_preactivation_all.append(h_preactivation)
        h_all.append(h_prev)
        y_pred_all.append(y_pred)
        if targets is not None:
            # Stored as prediction minus target so that it can be used
            # directly as a gradient in a descent update.
            loss_all.append(y_pred - targets[t])

    model_state.update([('h_preactivation_all', h_preactivation_all),
                        ('h_all', h_all)])
    if usage == "train":
        return model_state, y_pred_all, loss_all
    return y_pred_all
        

In [2]:
# 每一时刻的反向传播过程, 用以得到当前时刻 loss 对 parameter 的 gradient

def tanh_derivative(x):
    """Derivative of tanh evaluated at the pre-activation ``x``."""
    return 1 - np.tanh(x)**2

def time_step_backward(dy, t, model_state, x_t=None, dh_next=None):

    """
    Back-propagate through a single time step.

    The forward convention assumed here is
        h_preactivation = W_xh . x_t + W_hh . h_prev + b_h
        h               = tanh(h_preactivation)
        y_pred          = h . W_hy + b_y

    Parameters
    ----------
    dy : scalar or array-like of shape (1,)
        Gradient of the loss with respect to y_pred at time step t.
    t : int
        Index of the time step.
    model_state : dict
        Must contain "W_hh", "W_hy", "h_init", and the per-step records
        "h_all" and "h_preactivation_all" of the forward pass.
    x_t : scalar or array-like of shape (1,)
        Input that was fed to time step t. Required; it defaults to None only
        so that the three leading parameters keep their positions.
    dh_next : np.ndarray of shape (hidden_size,), optional
        Gradient flowing into h from time step t + 1. Defaults to zeros,
        which is the correct value for the last time step.

    Returns
    -------
    dict with the six gradients "dW_hh", "dW_xh", "dW_hy", "db_h", "db_y"
    and "dh_prev" (the gradient to hand to time step t - 1).

    Raises
    ------
    ValueError
        If ``x_t`` is not supplied.
    """

    if x_t is None:
        raise ValueError("x_t (the input of time step t) is required")

    W_hh = model_state["W_hh"]
    W_hy = model_state["W_hy"]
    h = model_state["h_all"][t]
    h_preactivation = model_state["h_preactivation_all"][t]
    # The first time step was fed the initial hidden state.
    h_prev = model_state["h_all"][t - 1] if t > 0 else model_state["h_init"]

    dy = np.atleast_1d(np.asarray(dy, dtype=float))
    x_t = np.atleast_1d(np.asarray(x_t, dtype=float))
    if dh_next is None:
        dh_next = np.zeros_like(h)

    # Gradient w.r.t. the hidden pre-activation: contributions from the
    # output of this step and from the next time step, through the tanh.
    dh_raw = tanh_derivative(h_preactivation) * (dh_next + np.dot(W_hy, dy))

    # All operands are 1-D vectors, so the weight gradients are outer
    # products (np.dot would collapse them to scalars).
    dW_hy = np.outer(h, dy)
    dW_hh = np.outer(dh_raw, h_prev)
    dW_xh = np.outer(dh_raw, x_t)
    db_h = dh_raw.copy()
    db_y = dy.copy()

    dh_prev = np.dot(W_hh.T, dh_raw)

    # Collect all gradients in a dictionary
    gradients = {'dW_hh': dW_hh, 'dW_xh': dW_xh, 'dW_hy': dW_hy, 'db_h': db_h, 'db_y': db_y, 'dh_prev': dh_prev}

    return gradients

In [3]:
# 所有cell的反向传播 
def rnn_backward(X, Y, model_state, y_pred_all, loss_all):
    """
    Back-propagate through time and sum the gradients of all time steps.

    Parameters
    ----------
    X : array-like
        Input sequence, flattened to one value per time step.
    Y : array-like
        Expected output for each time step, flattened the same way.
    model_state : dict
        Shared parameters together with the "h_all" and
        "h_preactivation_all" records written by the forward pass.
    y_pred_all : sequence
        Prediction of every time step from the forward pass.
    loss_all : sequence
        Kept for interface compatibility. The error signal is recomputed
        here as y_pred - Y so that its sign is always the one required by a
        gradient-descent update, regardless of how loss_all was built.

    Returns
    -------
    dict with the accumulated gradients "dW_hh", "dW_xh", "dW_hy", "db_h"
    and "db_y", each shaped like the parameter it belongs to.
    """
    # len() of a (1, T) array is 1, so flatten to iterate over time steps.
    inputs = np.asarray(X, dtype=float).reshape(-1)
    targets = np.asarray(Y, dtype=float).reshape(-1)

    # Accumulators start at zero, shaped like the corresponding parameters.
    all_gradients = {
        'dW_hh': np.zeros_like(model_state["W_hh"], dtype=float),
        'dW_xh': np.zeros_like(model_state["W_xh"], dtype=float),
        'dW_hy': np.zeros_like(model_state["W_hy"], dtype=float),
        'db_h': np.zeros_like(model_state["b_h"], dtype=float),
        'db_y': np.zeros_like(model_state["b_y"], dtype=float),
    }

    # Nothing was recorded for an empty sequence, so there is nothing to add.
    if len(inputs) == 0:
        return all_gradients

    # No gradient flows into the last time step from the future.
    dh_next = np.zeros_like(model_state["h_all"][0])

    # Walk the time steps in reverse order
    for t in reversed(range(len(inputs))):

        # Derivative of 0.5 * (y_pred - Y)**2 with respect to y_pred.
        dy = np.asarray(y_pred_all[t], dtype=float).reshape(-1) - targets[t]
        gradients = time_step_backward(dy, t, model_state,
                                       x_t=inputs[t:t + 1], dh_next=dh_next)

        # Accumulate the contribution of this time step
        for key in all_gradients:
            all_gradients[key] += gradients[key]

        # Hand the hidden-state gradient to the previous time step
        dh_next = gradients['dh_prev']

    return all_gradients



In [None]:
def update_model_state(model_state, gradients, learning_rate):
    """
    Apply one gradient-descent step to the shared parameters.

    Arguments:
    model_state -- dictionary holding the parameters "W_hh", "W_xh", "W_hy",
                   "b_h", "b_y"; any other entries are left untouched
    gradients -- dictionary holding the matching gradients "dW_hh", "dW_xh",
                 "dW_hy", "db_h", "db_y"
    learning_rate -- the learning rate, scalar

    Returns:
    model_state -- the same dictionary, with the five parameters updated
                   in place
    """
    # Each parameter is paired with the gradient that updates it.
    parameter_gradient_pairs = (
        ('W_hh', 'dW_hh'),
        ('W_xh', 'dW_xh'),
        ('W_hy', 'dW_hy'),
        ('b_h', 'db_h'),
        ('b_y', 'db_y'),
    )

    # Gradient-descent rule: parameter <- parameter - learning_rate * gradient
    for parameter_key, gradient_key in parameter_gradient_pairs:
        model_state[parameter_key] -= learning_rate * gradients[gradient_key]

    return model_state


In [None]:
# Demonstrate one gradient-descent update on a small, self-contained example,
# so that this cell runs on a fresh kernel.
learning_rate = 0.01
parameters = {
    "W_hh": np.ones((2, 2)),
    "W_xh": np.ones((2, 1)),
    "W_hy": np.ones((2, 1)),
    "b_h": np.zeros(2),
    "b_y": np.zeros(1),
}
# One gradient per parameter, keyed as "d" + parameter name.
gradients = {"d" + name: np.ones_like(value) for name, value in parameters.items()}
parameters = update_model_state(parameters, gradients, learning_rate)

In [None]:
def train_RNN(X, Y, epochs = 20, learning_rate = 0.001, hidden_size = 100, seed = None):
    """
    Train the RNN with plain gradient descent and return the trained state.

    Parameters
    ----------
    X : array-like
        Input sequence, one value per time step.
    Y : array-like
        Expected output for each time step of X.
    epochs : int
        Number of full forward/backward/update passes over the sequence.
    learning_rate : float
        Step size of the gradient-descent update.
    hidden_size : int
        Number of hidden units.
    seed : int or None
        Seed of the random initialisation; pass an int for reproducible runs.

    Returns
    -------
    dict with the trained parameters "W_xh", "W_hh", "W_hy", "b_h", "b_y",
    the initial hidden state "h_init", and the records of the last forward
    pass under "h_preactivation_all" and "h_all".
    """

    rng = np.random.RandomState(seed)

    # Small initial values keep the tanh units away from saturation; with
    # unit-variance weights and many hidden units the pre-activations can
    # become large enough that the tanh derivative is close to zero.
    init_scale = 0.01

    # W_xh is (hidden_size, 1) so that np.dot(W_xh, x_t) with x_t of shape
    # (1,) yields a vector of length hidden_size.
    model_state = {
        "W_xh": rng.randn(hidden_size, 1) * init_scale,
        "W_hh": rng.randn(hidden_size, hidden_size) * init_scale,
        "W_hy": rng.randn(hidden_size, 1) * init_scale,
        "b_h": rng.randn(hidden_size) * init_scale,
        "b_y": rng.randn(1) * init_scale,
        # Hidden state fed to the first time step
        "h_init": np.zeros(hidden_size),
        "h_preactivation_all": None,
        "h_all": None,
    }

    for _ in range(epochs):
        model_state, y_pred_all, loss_all = rnn_forward(X, Y, model_state)
        gradients = rnn_backward(X, Y, model_state, y_pred_all, loss_all)
        model_state = update_model_state(model_state, gradients, learning_rate)

    return model_state

In [None]:
def predict(X, model_state):
    """
    Return the prediction of every time step for the input sequence X.

    X: input sequence, one value per time step
    model_state: dict with the shared parameters and "h_init"

    return: list with one prediction per time step
    """
    # Targets play no role in prediction; a zero placeholder of matching
    # length keeps the call valid even if the forward pass indexes Y.
    placeholder_targets = np.zeros(np.size(X))
    return rnn_forward(X, placeholder_targets, model_state, usage = "predict")

In [None]:
stock_price = [1,2,3,4,5,6]
# One-step-ahead pairs: every value except the last is an input, and its
# target is the value that follows it, so X and Y have the same length.
X = stock_price[:-1]
Y = stock_price[1:]

MODEL_STATE = train_RNN(X, Y, epochs = 50,learning_rate = 0.01)


In [1]:
#https://www.tensorflow.org/text/tutorials/text_generation#download_the_shakespeare_dataset
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where the dataset has already been downloaded to this location.
path_to_file = '/home/sitian/.keras/datasets/shakespeare.txt'
# Read the raw bytes and decode explicitly; the context manager closes the
# file handle as soon as the content has been read.
with open(path_to_file, 'rb') as shakespeare_file:
    text = shakespeare_file.read().decode(encoding='utf-8')

In mathematical terms, if you have a loss function L and the output y_t = W_hy * h_t + b_y, then by the chain rule, dW_hy = dL/dy_t * dy_t/dW_hy. The term dy_t/dW_hy is essentially h_t, the input to the weights.