In [1]:
import csv
import torch.nn as nn
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime

import matplotlib.pyplot as plt

In [2]:
#nltk.download("book")

原始数据的处理:
1.将文本转换为Tokens
2.删除低频词语，添加UNKNOWN_TOKEN
3.添加开始和结束标记
4.建立训练数据矩阵(单词到序号的映射)

In [3]:
# Pre-processing configuration: vocabulary cap and the special marker tokens.
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print("Reading CSV file...")
# newline='' is what the csv module asks for so quoted multi-line fields are
# parsed correctly; the explicit encoding keeps decoding independent of the
# platform default (assumes the dump is UTF-8 - TODO confirm).
with open('data/reddit-comments-2015-08.csv', 'rt', encoding='utf-8', newline='') as f:
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the header row (the default avoids StopIteration on an empty file).
    next(reader, None)
    # Lower-case every comment and split it into sentences; blank CSV rows
    # come back as [] and are skipped instead of raising IndexError on row[0].
    sentences = itertools.chain.from_iterable(
        nltk.sent_tokenize(row[0].lower()) for row in reader if row
    )
    # Wrap every sentence in the start/end markers. This list must be built
    # inside the `with` block because the generator above reads the file lazily.
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print ("Parsed %d sentences." % (len(sentences)))

# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_sentences))
print ("Found %d unique words tokens." % len(word_freq))

# Get the most common words and build index_to_word and word_to_index vectors.
# One slot is reserved for the unknown token, hence vocabulary_size - 1.
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = {w: i for i, w in enumerate(index_to_word)}

print ("Using vocabulary size %d." % vocabulary_size)
print ("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
tokenized_sentences = [
    [w if w in word_to_index else unknown_token for w in sent]
    for sent in tokenized_sentences
]

print ("\nExample sentence: '%s'" % sentences[0])
print ("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

Reading CSV file...
Parsed 79170 sentences.
Found 65408 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'documentary' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'


In [4]:
# Create the training data.
# x is every sentence without its last token and y is the same sentence
# shifted left by one token (the "next word" targets); both are stored as
# vocabulary indices.
# Sentences differ in length, so the nested lists are ragged: dtype=object is
# required, otherwise recent NumPy releases raise ValueError instead of
# building an array of per-sentence lists.
x_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences], dtype=object)
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences], dtype=object)


In [5]:
# Show one training pair, first as words and then as vocabulary indices.
example_idx = 17
x_example = x_train[example_idx]
y_example = y_train[example_idx]
x_words = " ".join(index_to_word[idx] for idx in x_example)
y_words = " ".join(index_to_word[idx] for idx in y_example)
print("x:\n%s\n%s" % (x_words, x_example))
print("\ny:\n%s\n%s" % (y_words, y_example))

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 51, 27, 16, 10, 858, 54, 25, 34, 69]

y:
what are n't you understanding about this ? ! SENTENCE_END
[51, 27, 16, 10, 858, 54, 25, 34, 69, 1]


RNN的输入x将会是一串单词，但是，我们需要将上面的序号转换为one-hot编码方式，其中，公式为:
s(t)=tanh(U*x(t) + W*s(t-1))
o(t)=softmax(V*s(t))

x:输入数据 8000
U 100×8000
s:隐藏层数据 100
W 100×100
o:输出层数据 8000
V 8000×100

下面是对这个网络的参数的设置:
词汇表的大小为(vocabulary size):C=8000
隐藏层的大小为(hidden layer size):H=100
ps:隐藏层的大小可以看作是我们网络的记忆力，它越大我们就可以学习更加复杂的部分，但是同时增加了计算量。

训练参数量为2HC+H^2, 1,610,000

模型的瓶颈为：
由于输入的x中只有一个为1其他为0,所以仅仅选择U的一列，而不必进行全部乘法计算，所以，最大的乘法计算为V，所以希望词汇表尽可能的小。

RNN的初始化方法很特别，原因待定。
word_dim是词汇表的大小
hidden_dim是隐藏层的大小
现在先不考虑bptt_truncate参数，之后在BPTT中会讲解

正向传播:
现在使用上面的公式所定义的正向传播(预测词语的概率)
在正向传播中我们将所有的隐藏层保存在s中
我们在初始的隐藏层增加了一个额外的元素为0。
不仅返回输出还返回隐藏状态
o为单词的概率向量

计算损失函数:
我们可以先不用损失函数直接随机来生成，检验代码是否正确，然后，我们开始训练，当然首先的是损失函数的计算了。

首先，我们通常使用交叉熵来计算损失，如果我们是有N个训练样本(我们文本中的词语)和C个类别(词汇表的大小)，那么我们计算的输出o和实际的标签y之间的误差为:
$$L(y,o) = -\frac{1}{N}\sum_{n \in N} y_n \log o_n$$

这个公式有点复杂，但是它的功能只是对我们的训练实例进行总结，然后根据我们的预测值的偏差来增加损失，我们使用了calculate_loss函数

然后，我们使用了随机梯度下降算法SGD，其中，我们需要计算代价函数对于训练参数的偏导，由于RNN结构的特殊性，我们需要使用BackPropagation Through Time(BPTT)算法。

因为网络中每一步的参数都是共享的，每一步输出的梯度不仅仅取决于当前时刻的计算，也受到之前时刻的影响。


也就是应用了链式法则，我们使用了bptt函数，输入x,y，输出梯度。

梯度检查(这部分略过):
在实现反向传播的时候，最好同时实现梯度检查，这是一种验证实现是否正确的方法。梯度检查的思想是：参数的导数等于该点的斜率，我们可以稍微改变参数的值，然后用损失的变化量除以参数的改变量来近似这个导数。

SGD实施:
使用之前的BPTT算法来计算梯度，实现SGD可以分为两步:
1.函数numpy_sdg_step用于计算梯度并执行一次参数更新
2.外层循环train_with_sgd遍历训练集并调整学习率

In [6]:
def softmax(z):
    """Return the softmax of the 1-D score vector ``z``.

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but prevents ``np.exp`` from overflowing to
    ``inf`` (and the quotient from turning into ``nan``) for large scores.

    Parameters
    ----------
    z : array_like
        Unnormalised scores (logits).

    Returns
    -------
    numpy.ndarray
        Non-negative values with the same shape as ``z`` that sum to 1
        over the whole array.
    """
    z = np.asarray(z)
    if z.size == 0:
        # np.max rejects empty input; keep returning an empty array instead.
        return np.exp(z)
    exp_z = np.exp(z - np.max(z))
    return exp_z / exp_z.sum()

class RNNNumpy:
    """Vanilla recurrent language model implemented with plain NumPy.

    For a sequence of word indices ``x`` the forward pass computes::

        s[t] = tanh(U[:, x[t]] + W . s[t-1])
        o[t] = softmax(V . s[t])

    Attributes
    ----------
    U : ndarray, shape (hidden_dim, word_dim)
        Input-to-hidden weights.
    V : ndarray, shape (word_dim, hidden_dim)
        Hidden-to-output weights.
    W : ndarray, shape (hidden_dim, hidden_dim)
        Hidden-to-hidden (recurrent) weights.
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Store the hyper-parameters and randomly initialise U, V and W.

        Parameters
        ----------
        word_dim : int
            Vocabulary size (length of the one-hot input / output vectors).
        hidden_dim : int
            Size of the hidden state.
        bptt_truncate : int
            How many steps further back than the current one ``bptt``
            propagates each output error.
        """
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters, uniformly within
        # +/- 1/sqrt(n). The U, V, W draw order is kept so that a fixed
        # np.random.seed reproduces the same weights.
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

    def forward_propagation(self, x):
        """Run the network over the word-index sequence ``x``.

        Returns
        -------
        list
            ``[o, s]`` where ``o`` has shape (T, word_dim) and holds the word
            probabilities for every time step, and ``s`` has shape
            (T + 1, hidden_dim) and holds the hidden states. The extra last
            row ``s[-1]`` is the all-zero initial state.
        """
        # Number of time steps
        T = len(x)
        # Row T (i.e. s[-1]) is never written and stays zero: it is the
        # initial hidden state that s[t-1] refers to when t == 0.
        s = np.zeros((T + 1, self.hidden_dim))
        # Output probabilities for every step
        o = np.zeros((T, self.word_dim))
        for t in range(T):
            # Indexing U by x[t] is the same as multiplying U with a one-hot vector.
            s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]

    def predict(self, x):
        """Return, for every step of ``x``, the index of the most probable next word."""
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis=1)

    def calculate_total_loss(self, x, y):
        """Return the summed cross-entropy loss over all sentences.

        ``x`` and ``y`` are parallel sequences of index sequences (inputs and
        target words).
        """
        L = 0
        # For each sentence...
        for i in range(len(y)):
            o, s = self.forward_propagation(x[i])
            # We only care about our prediction of the "correct" words
            correct_word_predictions = o[np.arange(len(y[i])), y[i]]
            # Add to the loss based on how off we were
            L += -1 * np.sum(np.log(correct_word_predictions))
        return L

    def calculate_loss(self, x, y):
        """Return the mean cross-entropy loss per target word.

        ``x`` are the input sequences and ``y`` the true labels.
        """
        # Divide the total loss by the number of target words. The builtin sum
        # is used because np.sum on a generator is deprecated and emits a
        # DeprecationWarning.
        N = sum(len(y_i) for y_i in y)
        return self.calculate_total_loss(x, y)/N

    def bptt(self, x, y):
        """Compute the loss gradients for one sentence with truncated BPTT.

        Parameters
        ----------
        x : sequence of int
            Input word indices.
        y : sequence of int
            Target word indices, same length as ``x``.

        Returns
        -------
        list
            ``[dLdU, dLdV, dLdW]`` with the same shapes as ``U``, ``V``, ``W``.
        """
        T = len(y)
        # Perform forward propagation
        o, s = self.forward_propagation(x)
        # We accumulate the gradients in these variables
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        # Softmax + cross-entropy output error is (o - one_hot(y)); o is local
        # to this call, so it is modified in place.
        delta_o = o
        delta_o[np.arange(len(y)), y] -= 1.
        # For each output backwards...
        for t in reversed(range(T)):
            dLdV += np.outer(delta_o[t], s[t])
            # Initial delta calculation
            delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
            # Backpropagation through time: from step t back to
            # max(0, t - self.bptt_truncate), inclusive.
            for bptt_step in reversed(range(max(0, t - self.bptt_truncate), t + 1)):
                # For bptt_step == 0 this reads s[-1], the zero initial state.
                dLdW += np.outer(delta_t, s[bptt_step-1])
                dLdU[:, x[bptt_step]] += delta_t
                # Update delta for next step
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
        return [dLdU, dLdV, dLdW]

    # Performs one step of SGD.
    def numpy_sdg_step(self, x, y, learning_rate):
        """Update U, V and W in place using the gradients of one sentence."""
        # Calculate the gradients
        dLdU, dLdV, dLdW = self.bptt(x, y)
        # Change parameters according to gradients and learning rate
        self.U -= learning_rate * dLdU
        self.V -= learning_rate * dLdV
        self.W -= learning_rate * dLdW

    # Outer SGD Loop
    def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
        """Train with per-sentence SGD and return the recorded losses.

        The first parameter plays the role of ``self``; its name ``model`` is
        kept unchanged from the original signature.

        Parameters
        ----------
        X_train : sequence
            The training data set (one index sequence per sentence).
        y_train : sequence
            The training data labels.
        learning_rate : float
            Initial learning rate for SGD. It is halved whenever an evaluated
            loss is higher than the previously evaluated one.
        nepoch : int
            Number of times to iterate through the complete dataset.
        evaluate_loss_after : int
            Evaluate (and print) the loss every this many epochs.

        Returns
        -------
        list of (int, float)
            ``(num_examples_seen, loss)`` for every evaluation, in order.
        """
        # We keep track of the losses so the caller can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if epoch % evaluate_loss_after == 0:
                loss = model.calculate_loss(X_train, y_train)
                losses.append((num_examples_seen, loss))
                timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (timestamp, num_examples_seen, epoch, loss))
                # Adjust the learning rate if loss increases
                if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                    learning_rate = learning_rate * 0.5
                    print("Setting learning rate to %f" % learning_rate)
                sys.stdout.flush()
            # For each training example...
            for i in range(len(y_train)):
                # One SGD step
                model.numpy_sdg_step(X_train[i], y_train[i], learning_rate)
                num_examples_seen += 1
        # The notebook assigns the result of this call, so hand the history back.
        return losses

这一步是用来看输入训练数据之后RNN的输出，没有训练，随机参数，下面的代码可以跳过不运行

In [7]:
# Fixed seed so the randomly initialised (untrained) weights are repeatable.
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
# Forward pass for one sentence: o holds the per-step word probabilities,
# s the hidden states.
o, s = model.forward_propagation(x_train[10])
# Show the input indices, then the shape and the values of the output.
for shown in (x_train[10], o.shape, o):
    print(shown)

[0, 72, 63, 13, 124, 5, 26, 1128, 208, 5, 324, 3, 329, 4, 112, 32, 75, 7, 4746, 4, 8, 84, 52, 9, 7, 3155, 1021, 492, 7534, 8, 133, 48, 3096, 4, 10, 95, 51, 4, 128, 17, 37, 314, 577, 2, 40]
(45, 8000)
[[0.00012408 0.0001244  0.00012603 ... 0.00012515 0.00012488 0.00012508]
 [0.00012536 0.00012582 0.00012436 ... 0.00012482 0.00012456 0.00012451]
 [0.00012387 0.0001252  0.00012474 ... 0.00012559 0.00012588 0.00012551]
 ...
 [0.00012471 0.0001243  0.00012524 ... 0.00012475 0.00012522 0.00012623]
 [0.00012564 0.00012431 0.00012481 ... 0.0001244  0.00012609 0.00012486]
 [0.00012447 0.00012509 0.00012469 ... 0.00012473 0.00012506 0.00012641]]


这一步是显示输入训练数据后，输出的字，没有训练，随机参数，可以跳过不运行

In [8]:
# Most probable word index at every step, using the still-untrained model.
predictions = model.predict(x_train[10])
print(predictions.shape, predictions, sep="\n")

(45,)
[1284 5221 7653 7430 1013 3562 7366 1874  224 6601 7299 6722 6892 3198
 4480 5853 2926  261 4073 2371 6299 5376 4146 3761 7051 5981 1549 3765
 4958 1835 6166 5192 2579 5879 4864 5132 6569 2800 2752 6821 4437 7021
 3943 6912 3922]


这一步是显示初始化之后的误差，它和理论计算的误差之间只有较小的差距，说明实现是正确的，可以跳过不运行

In [9]:
# Sanity check: uniform predictions over C words give a per-word
# cross-entropy of log(C), so the untrained model should land close to it.
expected_loss = np.log(vocabulary_size)
print("Expected Loss for random predictions: %f" % expected_loss)
# Only the first 1000 sentences are evaluated.
actual_loss = model.calculate_loss(x_train[:1000], y_train[:1000])
print("Actual loss: %f" % actual_loss)

Expected Loss for random predictions: 8.987197




Actual loss: 8.987393


In [10]:
# Time a single SGD step on one sentence, starting from freshly seeded weights.
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
# Note: numpy_sdg_step updates model.U / model.V / model.W in place, so every
# timed repetition also moves the weights a little further.
%timeit model.numpy_sdg_step(x_train[10], y_train[10], 0.005)

235 ms ± 3.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Seed NumPy so the weight initialisation is repeatable.
np.random.seed(10)
# Train on a small subset of the data to see what happens:
# the first 100 sentences, 10 epochs, loss evaluated after every epoch.
model = RNNNumpy(vocabulary_size)
losses = model.train_with_sgd(
    x_train[:100],
    y_train[:100],
    nepoch=10,
    evaluate_loss_after=1,
)



2020-10-20 20:27:39: Loss after num_examples_seen=0 epoch=0: 8.987280
2020-10-20 20:27:52: Loss after num_examples_seen=100 epoch=1: 8.976046
2020-10-20 20:28:06: Loss after num_examples_seen=200 epoch=2: 8.959871
2020-10-20 20:28:18: Loss after num_examples_seen=300 epoch=3: 8.929739
2020-10-20 20:28:31: Loss after num_examples_seen=400 epoch=4: 8.851977
2020-10-20 20:28:44: Loss after num_examples_seen=500 epoch=5: 6.804030
2020-10-20 20:28:58: Loss after num_examples_seen=600 epoch=6: 6.271116
2020-10-20 20:29:10: Loss after num_examples_seen=700 epoch=7: 6.004432
2020-10-20 20:29:23: Loss after num_examples_seen=800 epoch=8: 5.833906
2020-10-20 20:29:36: Loss after num_examples_seen=900 epoch=9: 5.715240
