<a href="https://colab.research.google.com/github/1190303311/AI/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

单层 LSTM 模型的实现，参考博客：https://blog.csdn.net/FlyingLittlePig/article/details/72229041

LSTM前向和反向传播：
http://arunmallya.github.io/writeups/nn/lstm/index.html#/

In [None]:
import numpy as np
import nltk, itertools, csv
import codecs

# Name of the text encoding of the corpus file.
TXTCODING = 'utf-8'
# Placeholder that replaces every word falling outside the dictionary.
unknown_token = 'UNKNOWN_TOKEN'
# Markers wrapped around every sentence before it is tokenized into words.
start_token = 'START_TOKEN'
end_token = 'END_TOKEN'

# Download nltk's 'punkt' package (presumably the data backing the
# sent_tokenize / word_tokenize calls below -- confirm against nltk docs).
nltk.download('punkt')
# 解析评论文件为数值向量
class tokenFile2vector:
    """Turn a CSV file of text comments into next-word training sequences.

    The first column of every CSV row is lower-cased, split into sentences,
    wrapped in ``start_token`` / ``end_token`` and tokenized into words. The
    ``dict_size - 1`` most frequent words plus ``unknown_token`` form the
    dictionary; every other word is mapped to ``unknown_token``.

    Args:
        file_path: Path of the CSV file; its first row is treated as a header.
        dict_size: Maximum dictionary size, including ``unknown_token``.
    """

    def __init__(self, file_path, dict_size):
        self.file_path = file_path
        self.dict_size = dict_size

    def _get_sentences(self):
        """Read the file and return its sentences with start/end markers."""
        # Text mode with newline='' is what the csv module expects; it also
        # removes the need to decode a binary stream by hand.
        with open(self.file_path, 'r', encoding=TXTCODING, newline='') as f:
            reader = csv.reader(f, skipinitialspace=True)
            # Skip the header row; the default keeps an empty file from
            # raising StopIteration.
            next(reader, None)
            sents = [
                '%s %s %s' % (start_token, sent, end_token)
                # csv.reader yields [] for blank lines, which have no column 0.
                for row in reader if row
                for sent in nltk.sent_tokenize(row[0].lower())
            ]
        print('Get sentences.', len(sents))

        return sents

    def _get_dict_wordsIndex(self, sents):
        """Tokenize sentences and build the dictionary and its word->index map.

        Returns:
            ``(sent_words, dict_words, index_of_words)``: the tokenized
            sentences, the dictionary word list (``unknown_token`` last) and
            a dict mapping each dictionary word to its position in that list.
        """
        sent_words = [nltk.word_tokenize(sent) for sent in sents]
        word_freq = nltk.FreqDist(itertools.chain.from_iterable(sent_words))
        print('Get words.', len(word_freq))

        # Reserve one dictionary slot for unknown_token.
        common_words = word_freq.most_common(self.dict_size - 1)
        dict_words = [word for word, _count in common_words]
        dict_words.append(unknown_token)
        index_of_words = {word: ix for ix, word in enumerate(dict_words)}

        return sent_words, dict_words, index_of_words

    @staticmethod
    def _as_object_array(rows):
        """Pack variable-length lists into a 1-D object array, one list per slot.

        ``np.array`` on ragged nested lists raises ValueError on recent NumPy
        releases, so the array is allocated first and filled element-wise.
        """
        packed = np.empty(len(rows), dtype=object)
        for i, row in enumerate(rows):
            packed[i] = row
        return packed

    def get_vector(self):
        """Build the training data.

        Returns:
            ``(X_train, y_train, dict_words, index_of_words)``. ``X_train[i]``
            holds the word indices of sentence ``i`` without its last token
            and ``y_train[i]`` the same sentence without its first token, so
            each target is the word following the corresponding input. Both
            are 1-D object arrays whose elements are Python lists.
        """
        sents = self._get_sentences()
        sent_words, dict_words, index_of_words = self._get_dict_wordsIndex(sents)

        # Words outside the dictionary map to unknown_token. The dict lookup
        # replaces a linear scan of dict_words for every single token.
        unknown_ix = index_of_words[unknown_token]
        sent_ids = [[index_of_words.get(w, unknown_ix) for w in words]
                    for words in sent_words]

        X_train = self._as_object_array([ids[:-1] for ids in sent_ids])
        y_train = self._as_object_array([ids[1:] for ids in sent_ids])

        return X_train, y_train, dict_words, index_of_words

# Corpus location and dictionary size handed to the tokenizer.
file_path = 'results-20170508-103637.csv'
dict_size = 8000
# Parse the file once and keep the sequences plus both dictionary views.
myTokenFile = tokenFile2vector(file_path=file_path, dict_size=dict_size)
(X_train, y_train,
 dict_words, index_of_words) = myTokenFile.get_vector()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Get sentences. 24700
Get words. 30347




In [None]:
# Display the target word indices of the first training sentence.
y_train[0]

[401, 1]

处理好的文本输入格式示例（对应 X_train[0] 与 y_train[0]）：y 是 x 向后错开一个时间步的序列，即每个位置的目标是下一个词的下标。
x: [1, 2, 3]  y: [2, 3, 4]

In [None]:
import numpy as np

def softmax(x):
    """Return the softmax of x, normalized over all of its elements.

    The maximum is subtracted before exponentiating so large scores do not
    overflow; the result is unchanged by that shift.
    """
    scores = np.array(x)
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / np.sum(exp_scores)

def sigmoid(x):
    """Return the element-wise logistic function 1 / (1 + exp(-x)).

    Evaluated as exp(-log(1 + exp(-x))) through np.logaddexp, which never
    forms exp(-x) directly and therefore does not overflow for large
    negative inputs.
    """
    x = np.asarray(x, dtype=float)
    return np.exp(-np.logaddexp(0.0, -x))

def tanh(x):
    """Return the element-wise hyperbolic tangent of x.

    Delegates to np.tanh, which saturates at +/-1. The explicit
    (e^x - e^-x) / (e^x + e^-x) formula evaluates to inf / inf = nan once
    |x| is large enough for exp to overflow.
    """
    return np.tanh(x)

class myLSTM():
    """Single-layer LSTM language model over integer word indices.

    Inputs are sequences of word indices, used as one-hot vectors by
    selecting columns of the input weight matrices. At every time step the
    model emits a softmax distribution over ``data_dim`` words.

    Args:
        data_dim: Input/output dimension (vocabulary size).
        hidden_dim: Size of the hidden state and of the cell state.
    """

    def __init__(self, data_dim, hidden_dim=100):
        self.data_dim = data_dim
        self.hidden_dim = hidden_dim

        # Recurrent weights, input weights and bias of the input (i),
        # forget (f) and output (o) gates and of the candidate state (a).
        self.whi, self.wxi, self.bi = self._init_wh_wx()
        self.whf, self.wxf, self.bf = self._init_wh_wx()
        self.who, self.wxo, self.bo = self._init_wh_wx()
        self.wha, self.wxa, self.ba = self._init_wh_wx()

        # Output projection from the hidden state to vocabulary scores.
        bound = np.sqrt(1.0 / self.hidden_dim)
        self.wy = np.random.uniform(-bound, bound,
                                    (self.data_dim, self.hidden_dim))
        self.by = np.random.uniform(-bound, bound, (self.data_dim, 1))

    def _init_wh_wx(self):
        """Return freshly drawn ``(wh, wx, b)`` for one gate."""
        h_bound = np.sqrt(1.0 / self.hidden_dim)
        x_bound = np.sqrt(1.0 / self.data_dim)
        wh = np.random.uniform(-h_bound, h_bound,
                               (self.hidden_dim, self.hidden_dim))
        wx = np.random.uniform(-x_bound, x_bound,
                               (self.hidden_dim, self.data_dim))
        b = np.random.uniform(-x_bound, x_bound, (self.hidden_dim, 1))

        return wh, wx, b

    def _init_s(self, T):
        """Allocate zeroed per-time-step state arrays for a length-T sequence.

        The gate/state arrays have ``T + 1`` slots: slots ``0..T-1`` hold the
        time steps and the extra last slot stays zero, so that index ``-1``
        serves as the initial state of step 0.
        """
        def state():
            return np.zeros((T + 1, self.hidden_dim, 1))

        return {'iss': state(),   # input gate
                'fss': state(),   # forget gate
                'oss': state(),   # output gate
                'ass': state(),   # candidate (current input) state
                'hss': state(),   # hidden state
                'css': state(),   # cell state
                'ys': np.zeros((T, self.data_dim, 1))}  # output distribution

    def forward(self, x):
        """Run the forward pass for one sequence of word indices.

        Returns:
            The dict built by ``_init_s`` with every time step filled in.
        """
        T = len(x)
        stats = self._init_s(T)

        for t in range(T):
            # At t == 0 the index t-1 is -1, i.e. the never-written zero slot.
            ht_pre = np.array(stats['hss'][t - 1]).reshape(-1, 1)

            stats['iss'][t] = self._cal_gate(self.whi, self.wxi, self.bi, ht_pre, x[t], sigmoid)
            stats['fss'][t] = self._cal_gate(self.whf, self.wxf, self.bf, ht_pre, x[t], sigmoid)
            stats['oss'][t] = self._cal_gate(self.who, self.wxo, self.bo, ht_pre, x[t], sigmoid)
            stats['ass'][t] = self._cal_gate(self.wha, self.wxa, self.ba, ht_pre, x[t], tanh)

            # ct = ft * ct_pre + it * at
            stats['css'][t] = (stats['fss'][t] * stats['css'][t - 1]
                               + stats['iss'][t] * stats['ass'][t])
            # ht = ot * tanh(ct)
            stats['hss'][t] = stats['oss'][t] * tanh(stats['css'][t])
            # yt = softmax(wy . ht + by)
            stats['ys'][t] = softmax(self.wy.dot(stats['hss'][t]) + self.by)

        return stats

    def _cal_gate(self, wh, wx, b, ht_pre, x, activation):
        """Return ``activation(wh . ht_pre + wx[:, x] + b)`` as a column vector."""
        # Selecting column x equals multiplying wx by the one-hot vector of x.
        return activation(wh.dot(ht_pre) + wx[:, x].reshape(-1, 1) + b)

    def predict(self, x):
        """Return the most probable word index at every time step of x."""
        stats = self.forward(x)
        pre_y = np.argmax(stats['ys'].reshape(len(x), -1), axis=1)
        return pre_y

    def loss(self, x, y):
        """Return the softmax cross-entropy averaged over all target words.

        Args:
            x: Collection of input index sequences.
            y: Collection of target index sequences, aligned with ``x``.
        """
        cost = 0
        n_words = 0
        for xi, yi in zip(x, y):
            stats = self.forward(xi)
            # Integer dtype keeps the fancy index valid for empty targets.
            targets = np.asarray(yi, dtype=int)
            # Probability assigned to the correct word at every time step.
            pre_yi = stats['ys'][np.arange(len(targets)), targets]
            cost -= np.sum(np.log(pre_yi))
            n_words += len(targets)

        if n_words == 0:
            # No target words: the average is undefined.
            return float('nan')
        return cost / n_words

    def _init_wh_wx_grad(self):
        """Return zero gradients shaped like one gate's ``(wh, wx, b)``."""
        dwh = np.zeros(self.whi.shape)
        dwx = np.zeros(self.wxi.shape)
        db = np.zeros(self.bi.shape)

        return dwh, dwx, db

    def bptt(self, x, y):
        """Back-propagate through time for a single ``(x, y)`` sample.

        The loss is the summed softmax cross-entropy over the time steps.

        Returns:
            ``[dwhf, dwxf, dbf, dwhi, dwxi, dbi, dwha, dwxa, dba,
            dwho, dwxo, dbo, dwy, dby]``.
        """
        dwhi, dwxi, dbi = self._init_wh_wx_grad()
        dwhf, dwxf, dbf = self._init_wh_wx_grad()
        dwho, dwxo, dbo = self._init_wh_wx_grad()
        dwha, dwxa, dba = self._init_wh_wx_grad()
        dwy, dby = np.zeros(self.wy.shape), np.zeros(self.by.shape)

        stats = self.forward(x)

        # Gradient of softmax cross-entropy w.r.t. the pre-softmax scores:
        # predicted distribution minus the one-hot target.
        targets = np.asarray(y, dtype=int)
        delta_o = stats['ys'].copy()
        delta_o[np.arange(len(targets)), targets] -= 1

        # Gradients flowing back from step t+1 into h_t and c_t.
        delta_h_next = np.zeros((self.hidden_dim, 1))
        delta_c_next = np.zeros((self.hidden_dim, 1))

        for t in reversed(range(len(targets))):
            h_t = stats['hss'][t]
            # Index -1 at t == 0 is the zero initial state.
            h_pre = stats['hss'][t - 1]
            c_pre = stats['css'][t - 1]
            i_t, f_t = stats['iss'][t], stats['fss'][t]
            o_t, a_t = stats['oss'][t], stats['ass'][t]
            tanh_c = tanh(stats['css'][t])

            # Output layer.
            dwy += delta_o[t].dot(h_t.reshape(1, -1))
            dby += delta_o[t]

            # Hidden state: contribution of this step's output plus whatever
            # the gates of step t+1 sent back.
            delta_ht = self.wy.T.dot(delta_o[t]) + delta_h_next

            # Cell state: through h_t = o_t * tanh(c_t) plus the path
            # c_{t+1} = f_{t+1} * c_t + ... from the following step.
            delta_ct = delta_ht * o_t * (1 - tanh_c ** 2) + delta_c_next

            # Gate outputs.
            delta_ot = delta_ht * tanh_c
            delta_it = delta_ct * a_t
            delta_ft = delta_ct * c_pre
            delta_at = delta_ct * i_t

            # Gate pre-activations (sigmoid' = s(1-s), tanh' = 1 - a^2).
            delta_at_net = delta_at * (1 - a_t ** 2)
            delta_it_net = delta_it * i_t * (1 - i_t)
            delta_ft_net = delta_ft * f_t * (1 - f_t)
            delta_ot_net = delta_ot * o_t * (1 - o_t)

            dwhf, dwxf, dbf = self._cal_grad_delta(dwhf, dwxf, dbf, delta_ft_net, h_pre, x[t])
            dwhi, dwxi, dbi = self._cal_grad_delta(dwhi, dwxi, dbi, delta_it_net, h_pre, x[t])
            dwha, dwxa, dba = self._cal_grad_delta(dwha, dwxa, dba, delta_at_net, h_pre, x[t])
            dwho, dwxo, dbo = self._cal_grad_delta(dwho, dwxo, dbo, delta_ot_net, h_pre, x[t])

            # Every gate reads h_{t-1} through its recurrent matrix, so the
            # gradient w.r.t. h_{t-1} is the sum of W^T . delta_net.
            delta_h_next = (self.whf.T.dot(delta_ft_net)
                            + self.whi.T.dot(delta_it_net)
                            + self.wha.T.dot(delta_at_net)
                            + self.who.T.dot(delta_ot_net))
            delta_c_next = delta_ct * f_t

        return [dwhf, dwxf, dbf,
                dwhi, dwxi, dbi,
                dwha, dwxa, dba,
                dwho, dwxo, dbo,
                dwy, dby]

    def _cal_grad_delta(self, dwh, dwx, db, delta_net, ht_pre, x):
        """Accumulate one time step into a gate's ``(dwh, dwx, db)`` in place.

        Args:
            delta_net: Gradient w.r.t. the gate pre-activation, a column.
            ht_pre: Previous hidden state, a column.
            x: Input word index at this time step.
        """
        # d(net)/d(wh) is the outer product of delta_net with h_{t-1}.
        dwh += delta_net.dot(ht_pre.reshape(1, -1))
        # The input is one-hot at index x, so only that column of wx gets a
        # gradient.
        dwx[:, x] += delta_net.ravel()
        db += delta_net

        return dwh, dwx, db

    def sgd_step(self, x, y, lr):
        """Apply one SGD update computed from a single ``(x, y)`` sample."""
        dwhf, dwxf, dbf, \
            dwhi, dwxi, dbi, \
            dwha, dwxa, dba, \
            dwho, dwxo, dbo, \
            dwy, dby = self.bptt(x, y)

        self.whf, self.wxf, self.bf = self._update_wh_wx(lr, self.whf, self.wxf, self.bf, dwhf, dwxf, dbf)
        self.whi, self.wxi, self.bi = self._update_wh_wx(lr, self.whi, self.wxi, self.bi, dwhi, dwxi, dbi)
        self.wha, self.wxa, self.ba = self._update_wh_wx(lr, self.wha, self.wxa, self.ba, dwha, dwxa, dba)
        self.who, self.wxo, self.bo = self._update_wh_wx(lr, self.who, self.wxo, self.bo, dwho, dwxo, dbo)

        self.wy, self.by = self.wy - lr * dwy, self.by - lr * dby

    def _update_wh_wx(self, lr, wh, wx, b, dwh, dwx, db):
        """Take a gradient-descent step on ``(wh, wx, b)`` in place."""
        wh -= lr * dwh
        wx -= lr * dwx
        b -= lr * db

        return wh, wx, b

    def train(self, X_train, y_train, lr=0.005, n_epoch=5):
        """Train with per-sample SGD, halving ``lr`` whenever the loss rises.

        Returns:
            The list of average losses, one entry per epoch.
        """
        losses = []

        for epoch in range(n_epoch):
            for i in range(len(y_train)):
                self.sgd_step(X_train[i], y_train[i], lr)

            loss = self.loss(X_train, y_train)
            losses.append(loss)
            print('epoch: ', epoch + 1, 'loss = ', loss)
            if len(losses) > 1 and losses[-1] > losses[-2]:
                lr *= 0.5
                print('decrease lr to ', lr)

        return losses



In [None]:
# Size the model from the dictionary that was actually built, so the model's
# input/output dimension cannot drift away from the word indices stored in
# X_train / y_train if dict_size is changed.
lstm = myLSTM(len(dict_words), hidden_dim=100)
# Train on the first 200 sentences for 3 epochs.
lstm.train(X_train[:200], y_train[:200], lr=0.005, n_epoch=3)

  if __name__ == '__main__':


epoch:  1 loss =  6.430563203617257
epoch:  2 loss =  6.128558763228318
epoch:  3 loss =  6.005181943263411
