In [1]:
import numpy as np

data = "abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz abcdefghijklmnopqrstuvwxyz"
chs_set = sorted(set(data))
ch2int = {ch: idx for idx, ch in enumerate(chs_set)}
int2ch = {idx: ch for idx, ch in enumerate(chs_set)}

def label_encoder(ch):
    return ch2int[ch]

data=list(map(label_encoder,data))    # 数字化

OUT_SIZE=len(chs_set)

$$
h_{t}=tanh(W_{x}x_{t}+W_{h_{t-1}}h_{t-1}+b_{h_{t-1}}) \\
y_{t}=W_{h_{t}}h_{t}+b_{h_{t}} \\
$$

保持与之前DNN实现上的一致，易得三个权重矩阵的维度分别为：

$$
W_{x}=(x,h) \\
W_{h_{t-1}}=(h,h) \\
W_{h_{t}}=(h,y) \\
$$

参数设置与初始化：

In [2]:
from collections import OrderedDict
unit_I = len(chs_set)
unit_h = 100
unit_O = unit_I

t_size = 10    # 状态数，即时间窗口大小
max_iter = 1000
lr = 0.1

params = OrderedDict({
    'W_x':np.random.randn(unit_I, unit_h)*0.01,
    'W_h_pre':np.random.randn(unit_h, unit_h)*0.01,
    'b_h_pre':np.zeros((1, unit_h)),
    'W_h_cur':np.random.randn(unit_h, unit_O)*0.01,
    'b_h_cur':np.zeros((1, unit_O)),
})

h_pre = np.zeros((1, unit_h))    # h_{t-1}

首先是前向传播，因为RNN的一个样本有多个有序状态，所以一个样本的loss是该样本所有状态loss的累加：

In [3]:
def softmax_row(row_data):
    return np.exp(row_data)/np.sum(np.exp(row_data))


def forward_prop(x, y, h_pre):
    '''
    x: 数字表示的字符串
    y: 数字表示的字符串，由x右移一位得到
    h_pre: 隐层的上一个状态
    '''
    x_s = np.zeros((t_size, unit_I))    # 使用矩阵保存x当前轮的所有状态，一行为一个状态下的one-hot向量
    h_s = np.zeros((t_size+1, unit_h))    # 使用最后一行保存上一轮的隐层状态，使用-1调用
    h_s[-1] = h_pre
    y_pred_s = np.zeros((t_size, unit_O))    # score
    logit_s = np.zeros((t_size, unit_O))    # 下一个字符的概率向量
    loss = 0

    for t in range(t_size):    # 串行计算所有状态下的变量
        x_s[t][x[t]] = 1    # one-hot向量的对应位置置1
        h_s[t] = np.tanh(np.dot(x_s[t], params['W_x'])+np.dot(h_s[t-1], params['W_h_pre'])+params['b_h_pre'])    # 隐层状态
        logit_s[t] = softmax_row(
            np.dot(h_s[t], params['W_h_cur'])+params['b_h_cur'])    # softmax输出
        loss += -np.log(logit_s[t][y[t]])    # 概率向量中，真实标签idx对应位置的值即为损失

    cache={
        'logit_s':logit_s,
        'h_s':h_s,
        'x_s':x_s
    }
    return loss, cache


# test
# x = data[:t_size]
# y = data[1:t_size+1]
# h_pre = np.zeros((1, unit_h))
# loss, cache=forward_prop(x, y, h_pre)

反向传播：

In [4]:
def backward_prop(cache, x, y):
    logit_s, h_s, x_s = cache['logit_s'], cache['h_s'], cache['x_s']

    # loss关于参数的梯度
    grads = OrderedDict({
        'W_x': np.zeros_like(params['W_x']),
        'W_h_pre': np.zeros_like(params['W_h_pre']),
        'b_h_pre': np.zeros_like(params['b_h_pre']),
        'W_h_cur': np.zeros_like(params['W_h_cur']),
        'b_h_cur': np.zeros_like(params['b_h_cur']),
    })

    dh_post = np.zeros_like(h_s[0])    # loss对h_{t+1}的梯度

    for t in range(t_size-1, -1, -1):    # 倒推
        dlogit = np.copy(logit_s[t])
        dlogit[y[t]] -= 1

        grads['W_h_cur'] += np.dot(np.mat(h_s[t]).T, np.mat(dlogit))    # 向量转矩阵
        grads['b_h_cur'] += dlogit

        # 后一状态的dh_post是对当前层dh_cur有贡献的
        dh_cur = np.dot(dlogit, params['W_h_cur'].T) + dh_post
        dh_pre = (1-h_s[t]*h_s[t])*dh_cur

        grads['b_h_pre'] += dh_pre
        grads['W_h_pre'] += np.dot(h_s[t-1].T, dh_pre)
        grads['W_x'] += np.dot(np.mat(x_s[t]).T, np.mat(dh_pre))

        dh_post = np.dot(dh_pre, params['W_h_pre'].T)    # 更新后一状态的dh_post

    # 梯度截断
    for grad in grads.values():
        np.clip(grad, -5, 5, out=grad)

    return grads


# test
# grads = backward_prop(cache, x, y)
# for param in params.keys():
#     assert grads[param].shape==params[param].shape

训练模型：

In [5]:
idx = 0    # 子串起始游标

# AdaGrad的累计平方梯度
R_AdaGrad = OrderedDict({
    'W_x': np.zeros_like(params['W_x']),
    'W_h_pre': np.zeros_like(params['W_h_pre']),
    'b_h_pre': np.zeros_like(params['b_h_pre']),
    'W_h_cur': np.zeros_like(params['W_h_cur']),
    'b_h_cur': np.zeros_like(params['b_h_cur']),
})

for iter_cnt in range(max_iter):
    if idx+t_size >= len(data) or iter_cnt == 0:
        h_pre = np.zeros((1, unit_h))
        idx = 0

    X = data[idx:idx+t_size]
    Y = data[idx+1:idx+t_size+1]

    loss, cache = forward_prop(X, Y, h_pre)
    h_pre = cache['h_s'][-2]
    grads = backward_prop(cache, X, Y)

    for param in params.keys():
        R_AdaGrad[param] += grads[param]**2
        params[param] -= lr*grads[param]/np.sqrt(R_AdaGrad[param]+1e-8)

    idx += t_size
    
    if (iter_cnt+1)%100==0:
        print('{} iter: loss:{}'.format(iter_cnt+1,loss))

100 iter: loss:35.98180993952694
200 iter: loss:1.037309804680368
300 iter: loss:0.7041263367788266
400 iter: loss:0.1770009519520916
500 iter: loss:0.23905165689674382
600 iter: loss:0.12560925425913708
700 iter: loss:0.0684359747189963
800 iter: loss:0.07240104857925224
900 iter: loss:0.0444685356632706
1000 iter: loss:0.06587915855459452


使用参数预测：

In [8]:
def predict(ch, t_size):
    one_hot_vec = np.zeros((1, unit_O))
    one_hot_vec[0,ch2int[ch]] = 1
    h = np.zeros((1, unit_h))
    res=list()

    for t in range(t_size):
        h = np.tanh(np.dot(one_hot_vec, params['W_x'])
                    + np.dot(h, params['W_h_pre'])
                    + params['b_h_pre'])    # 隐层状态
        logit = softmax_row(np.dot(h, params['W_h_cur'])
                            + params['b_h_cur'])    # softmax输出
        
        next_item=np.random.choice(range(unit_O), p=logit.ravel())    # 按概率选择下一个出现的对象
        res.append(int2ch[next_item])
        
        one_hot_vec = np.zeros((1, unit_O))
        one_hot_vec[0,next_item] = 1
    
    return res

predict('y',5)

['z', ' ', 'a', 'b', 'c']