In [1]:
import collections
import d2lzh as d2l
from mxnet import gluon, init, nd
from mxnet.contrib import text
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn, utils as gutils
import os
import random
import tarfile

In [2]:
# This function is saved in the d2lzh package for later use
def download_imdb(data_dir='../data'):
    """Fetch the IMDb review archive and unpack it into ``data_dir``.

    Args:
        data_dir: Directory passed to ``gutils.download`` as the download
            target and to ``extractall`` as the extraction target.
    """
    archive_path = gutils.download(
        'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
        data_dir,
        sha1_hash='01ada507287d82875905620988597833ad4e0903')
    # The value returned by gutils.download is used as the archive path
    with tarfile.open(archive_path, 'r') as archive:
        archive.extractall(data_dir)

# Fetch and unpack the IMDb archive using the default data_dir ('../data')
download_imdb()

In [3]:
def read_imdb(folder='train', data_dir='../data'):  # Saved in the d2lzh package for later use
    """Read every review of one dataset split and return them shuffled.

    Args:
        folder: Split sub-directory of ``aclImdb`` to read, e.g. 'train' or
            'test'.
        data_dir: Directory that contains the extracted ``aclImdb`` folder.
            The default '../data' is the same default download_imdb uses,
            so existing calls keep reading from '../data/aclImdb'.

    Returns:
        A list of ``[review, label]`` pairs in random order, where ``review``
        is the lower-cased file content with newline characters removed and
        ``label`` is 1 for the 'pos' sub-folder and 0 for 'neg'.
    """
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_dir, 'aclImdb', folder, label)
        for file_name in os.listdir(folder_name):
            # Read as bytes and decode explicitly so the result does not
            # depend on the platform's default text encoding
            with open(os.path.join(folder_name, file_name), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

# Load both splits; each is a shuffled list of [review, label] pairs
train_data, test_data = read_imdb('train'), read_imdb('test')

In [4]:
import spacy
# Load spaCy's English pipeline; get_tokenized_imdb below uses its tokenizer.
# NOTE(review): the 'en' shortcut name may not be accepted by newer spaCy
# releases — confirm against the installed spaCy version.
spacy_en = spacy.load('en')

# Tokenize reviews with the spaCy tokenizer held in spacy_en
def get_tokenized_imdb(data):  # Saved in the d2lzh package for later use
    """Tokenize the review text of every ``(review, label)`` pair.

    Args:
        data: Iterable of two-element items whose first element is the
            review string; the second element is ignored.

    Returns:
        A list with one entry per review, each entry being the list of
        ``.text`` values of the tokens produced by ``spacy_en.tokenizer``.
    """
    tokenized_reviews = []
    for review, _label in data:
        tokens = [tok.text for tok in spacy_en.tokenizer(review)]
        tokenized_reviews.append(tokens)
    return tokenized_reviews

In [5]:
# Drop tokens that occur fewer than 5 times
def get_vocab_imdb(data):  # Saved in the d2lzh package for later use
    """Build the vocabulary for a collection of ``(review, label)`` pairs.

    The reviews are tokenized with get_tokenized_imdb, token occurrences are
    counted over all reviews, and the counts are handed to
    ``text.vocab.Vocabulary`` with ``min_freq=5`` and '<pad>' as the only
    reserved token.

    Args:
        data: The ``(review, label)`` pairs to build the vocabulary from.

    Returns:
        The ``text.vocab.Vocabulary`` instance built from the token counts.
    """
    token_counts = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    return text.vocab.Vocabulary(
        token_counts, min_freq=5, reserved_tokens=['<pad>'])

# Build the vocabulary from the training reviews only
vocab = get_vocab_imdb(train_data)
# Trailing expression: shown as the cell output (vocabulary size)
'# words in vocab:', len(vocab)

('# words in vocab:', 31278)

In [6]:
def preprocess_imdb(data, vocab, max_l=500):  # Saved in the d2lzh package for later use
    """Convert ``(review, label)`` pairs into fixed-length index features.

    Args:
        data: Sequence of ``(review, label)`` pairs.
        vocab: Vocabulary providing ``to_indices`` and a ``token_to_idx``
            mapping that contains '<pad>'.
        max_l: Length every review is truncated or padded to. Defaults to
            500, the value that was previously fixed inside the function.

    Returns:
        ``(features, labels)``: ``features`` is ``nd.array`` applied to the
        list of length-``max_l`` index lists, ``labels`` is ``nd.array``
        applied to the list of labels, in the same order as ``data``.
    """

    def pad(x):
        # Keep the first max_l indices of long reviews; right-pad short
        # ones with the '<pad>' index until they reach max_l
        return x[:max_l] if len(x) > max_l else x + [
            vocab.token_to_idx['<pad>']] * (max_l - len(x))

    tokenized_data = get_tokenized_imdb(data)
    features = nd.array([pad(vocab.to_indices(x)) for x in tokenized_data])
    labels = nd.array([score for _, score in data])
    return features, labels

In [7]:
batch_size = 64
# Wrap the (features, labels) pair returned by preprocess_imdb as datasets
train_set = gdata.ArrayDataset(*preprocess_imdb(train_data, vocab))
test_set = gdata.ArrayDataset(*preprocess_imdb(test_data, vocab))
# Shuffling is requested only for the training loader
train_iter = gdata.DataLoader(train_set, batch_size, shuffle=True)
test_iter = gdata.DataLoader(test_set, batch_size)

In [8]:
# Peek at the first mini-batch only, to check the feature and label shapes
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
# Trailing expression: shown as the cell output (length of the loader)
'#batches:', len(train_iter)

X (64, 500) y (64,)


('#batches:', 391)

In [9]:
class BiRNN(nn.Block):
    """Sentiment classifier: embedding -> bidirectional LSTM -> dense layer.

    The dense output layer has 2 units, one per sentiment class.
    """

    def __init__(self, vocab, embed_size, num_hiddens, num_layers, **kwargs):
        """Create the embedding, encoder and decoder layers.

        Args:
            vocab: Vocabulary; only ``len(vocab)`` is used, as the number
                of rows of the embedding layer.
            embed_size: Size of each embedding vector and the LSTM input size.
            num_hiddens: Number of hidden units passed to the LSTM.
            num_layers: Number of stacked LSTM layers.
            **kwargs: Forwarded to ``nn.Block.__init__``.
        """
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # bidirectional=True turns the LSTM into a bidirectional recurrent network
        self.encoder = rnn.LSTM(
            num_hiddens,
            num_layers=num_layers,
            bidirectional=True,
            input_size=embed_size,
        )
        self.decoder = nn.Dense(2)

    def forward(self, input):
        """Compute the class scores for a batch of index sequences.

        Args:
            input: Token indices; per the original notes the shape is
                (batch, num_tokens).

        Returns:
            The output of the dense decoder layer for the batch.
        """
        # The LSTM expects the sequence axis first, so the input is
        # transposed to (num_tokens, batch) before the embedding lookup,
        # which yields (num_tokens, batch, embed_size).
        token_vectors = self.embedding(input.T)
        # Only the embeddings are passed to rnn.LSTM (no initial state), so
        # it returns just the last layer's hidden states at every time step:
        # (num_tokens, batch, 2 * num_hiddens).
        hidden_states = self.encoder(token_vectors)
        # Concatenate the hidden states of the first and the last time step
        # as the dense layer's input: (batch, 4 * num_hiddens).
        first_step, last_step = hidden_states[0], hidden_states[-1]
        return self.decoder(nd.concat(first_step, last_step))

In [10]:
# Hyperparameters; ctx is whatever d2l.try_all_gpus() returns
embed_size, num_hiddens, num_layers, ctx = 100, 100, 2, d2l.try_all_gpus()
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)
# Initialize the parameters with the Xavier initializer on ctx
net.initialize(init.Xavier(), ctx=ctx)

In [11]:
# Load pretrained word vectors (GloVe, file glove.6B.100d.txt) for the tokens in vocab
glove_embedding = text.embedding.create(
    'glove', pretrained_file_name='glove.6B.100d.txt', vocabulary=vocab)

In [12]:
# The dimension of the pretrained vectors must match the output size
# (embed_size) of the embedding layer in the model created above
net.embedding.weight.set_data(glove_embedding.idx_to_vec)
# Set grad_req to 'null' so the embedding vectors are no longer updated
net.embedding.collect_params().setattr('grad_req', 'null')

In [13]:
# Learning rate and number of epochs
lr, num_epochs = 0.01, 5
# Adam optimizer over all parameters of the network
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr})
loss = gloss.SoftmaxCrossEntropyLoss()
# d2l.train prints the per-epoch loss and accuracy log shown in the output
d2l.train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs)

training on [gpu(0)]
epoch 1, loss 0.4989, train acc 0.743, test acc 0.840, time 40.3 sec
epoch 2, loss 0.3526, train acc 0.852, test acc 0.857, time 41.9 sec
epoch 3, loss 0.3076, train acc 0.872, test acc 0.862, time 43.3 sec
epoch 4, loss 0.2753, train acc 0.888, test acc 0.860, time 44.0 sec
epoch 5, loss 0.2433, train acc 0.902, test acc 0.852, time 44.2 sec


In [26]:
# This function is saved in the d2lzh package for later use
def predict_sentiment(net, vocab, sentence):
    """Classify one tokenized sentence as 'positive' or 'negative'.

    Args:
        net: Callable model; it receives the index array reshaped to
            ``(1, -1)`` and returns the class scores.
        vocab: Vocabulary whose ``to_indices`` maps the tokens to indices.
        sentence: The sentence as a list of tokens.

    Returns:
        'positive' if the arg-max over axis 1 of the scores equals 1,
        otherwise 'negative'.
    """
    token_ids = nd.array(vocab.to_indices(sentence), ctx=d2l.try_gpu())
    # Reshape to a batch that holds this single sentence
    scores = net(token_ids.reshape((1, -1)))
    predicted = nd.argmax(scores, axis=1).asscalar()
    if predicted == 1:
        return 'positive'
    return 'negative'

In [27]:
# Try the trained model on a tokenized example sentence
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'beautiful'])

'positive'

In [28]:
# Same check with a different final word in the sentence
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'stupid'])

'negative'

In [17]:
import gc 
# Force a full garbage-collection pass; the cell output is gc.collect()'s
# return value (the number of unreachable objects it found)
gc.collect()

186