###  IMDB电影评论情感倾向分类

**基于PaddlePaddle2.0基础API构建模型，利用互联网电影资料库Imdb数据来进行电影评论情感倾向预测。**


*数据集简介：IMDB数据集是一个对电影评论标注为正向评论与负向评论的数据集，共有25000条文本数据作为训练集，25000条文本数据作为测试集。 该数据集的官方地址为： [http://ai.stanford.edu/~amaas/data/sentiment/](http://)*


### 加载数据集

In [1]:
import numpy as np
import paddle

#准备数据
#加载IMDB数据
imdb_train = paddle.text.datasets.Imdb(mode='train') #训练数据集
imdb_test = paddle.text.datasets.Imdb(mode='test') #测试数据集

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  def convert_to_list(value, n, name, dtype=np.int):


In [2]:
#获取字典
word_dict = imdb_train.word_idx

#在字典中增加一个<pad>字符串
word_dict['<pad>'] = len(word_dict)

# 查看句子字典
for k in list(word_dict)[:5]:
    print("{}:{}".format(k.decode('ASCII'), word_dict[k]))
print("...")
for k in list(word_dict)[-5:]:
    print("{}:{}".format(k if isinstance(k, str) else k.decode('ASCII'), word_dict[k]))
print("totally {} words".format(len(word_dict)))

#参数设定
vocab_size = len(word_dict)
embedding_size = 256
hidden_size = 256
n_layers = 2
dropout = 0.5
seq_len = 200
batch_size = 64
epochs = 10
pad_id = word_dict['<pad>']

the:0
and:1
a:2
of:3
to:4
...
virtual:5143
warriors:5144
widely:5145
<unk>:5146
<pad>:5147
totally 5148 words


In [3]:
classes = ['negative', 'positive']

# 生成句子列表
def ids_to_str(ids):
    # print(ids)
    words = []
    for k in ids:
        w = list(word_dict)[k]
        words.append(w if isinstance(w, str) else w.decode('ASCII'))
    return " ".join(words)

# 取出来第一条数据看看样子。
sent = imdb_train.docs[0]
label = imdb_train.labels[0]
print('sentence list id is:', sent)
print('sentence label id is:', label)
print('--------------------------')
print('sentence list is: ', ids_to_str(sent))
print('sentence label is: ', classes[label])


sentence list id is: [5146, 43, 71, 6, 1092, 14, 0, 878, 130, 151, 5146, 18, 281, 747, 0, 5146, 3, 5146, 2165, 37, 5146, 46, 5, 71, 4089, 377, 162, 46, 5, 32, 1287, 300, 35, 203, 2136, 565, 14, 2, 253, 26, 146, 61, 372, 1, 615, 5146, 5, 30, 0, 50, 3290, 6, 2148, 14, 0, 5146, 11, 17, 451, 24, 4, 127, 10, 0, 878, 130, 43, 2, 50, 5146, 751, 5146, 5, 2, 221, 3727, 6, 9, 1167, 373, 9, 5, 5146, 7, 5, 1343, 13, 2, 5146, 1, 250, 7, 98, 4270, 56, 2316, 0, 928, 11, 11, 9, 16, 5, 5146, 5146, 6, 50, 69, 27, 280, 27, 108, 1045, 0, 2633, 4177, 3180, 17, 1675, 1, 2571]
sentence label id is: 0
--------------------------
sentence list is:  <unk> has much in common with the third man another <unk> film set among the <unk> of <unk> europe like <unk> there is much inventive camera work there is an innocent american who gets emotionally involved with a woman he doesnt really understand and whose <unk> is all the more striking in contrast with the <unk> br but id have to say that the third man has a more <u

In [4]:
# 每个样本的单词数量不一样，用Padding使得每个样本输入大小为seq_len
# 常见的处理方式是把数据集中的数据都统一成同样长度的数据。这包括：对于较长的数据进行截断处理，对于较短的数据用特殊的词<pad>进行填充。
def padding(dataset):
    padded_sents = []
    labels = []
    for batch_id, data in enumerate(dataset):
        sent, label = data[0].astype('int64'), data[1].astype('int64')
        padded_sent = np.concatenate([sent[:seq_len], [pad_id] * (seq_len - len(sent))]).astype('int64')
        padded_sents.append(padded_sent)
        labels.append(label)
    return np.array(padded_sents), np.array(labels)

train_x, train_y = padding(imdb_train)
test_x, test_y = padding(imdb_test)
    
class IMDBDataset(paddle.io.Dataset):
    def __init__(self, sents, labels):
        self.sents = sents
        self.labels = labels

    def __getitem__(self, index):
        data = self.sents[index]
        label = self.labels[index]
        return data, label

    def __len__(self):
        return len(self.sents)

train_dataset = IMDBDataset(train_x, train_y)
test_dataset = IMDBDataset(test_x, test_y)

train_loader = paddle.io.DataLoader(train_dataset, return_list=True, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = paddle.io.DataLoader(test_dataset, return_list=True, shuffle=True, batch_size=batch_size, drop_last=True)

### 任务一：构建模型


In [5]:
#构建模型
import paddle.nn as nn

class MyModel(paddle.nn.Layer):
    def __init__(self):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 256)
        self.gru = nn.GRU(256, 256, num_layers=2, direction='bidirectional', dropout=0.5)
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(in_features=256*2, out_features=2)

        
    def forward(self, inputs):
        emb = self.dropout(self.embedding(inputs))
        output, hidden = self.gru(emb)
        hidden = paddle.concat((hidden[-2,:,:], hidden[-1,:,:]), axis=1)
        hidden = self.dropout(hidden)
        return self.linear(hidden)

In [6]:

class Bow(paddle.nn.Layer):
    def __init__(self):
        super(Bow, self).__init__()
        self.emb = paddle.nn.Embedding(vocab_size, 256)
        self.fc = paddle.nn.Linear(in_features=256, out_features=2)
        self.dropout = paddle.nn.Dropout(0.5)

    def forward(self, x):
        x = self.emb(x)
        x = paddle.mean(x, axis=1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

In [7]:
# 定义一个用于情感分类的网络实例，SentimentClassifier
class LSTM(paddle.nn.Layer):
    
    def __init__(self):
        super(LSTM, self).__init__()

        # num_embeddings (int) - 嵌入字典的大小， input中的id必须满足 0 =< id < num_embeddings 。 。
        # embedding_dim (int) - 每个嵌入向量的维度。
        # padding_idx (int|long|None) - padding_idx的配置区间为 [-weight.shape[0], weight.shape[0]，如果配置了padding_idx，那么在训练过程中遇到此id时会被用
        # sparse (bool) - 是否使用稀疏更新，在词嵌入权重较大的情况下，使用稀疏更新能够获得更快的训练速度及更小的内存/显存占用。
        # weight_attr (ParamAttr|None) - 指定嵌入向量的配置，包括初始化方法，具体用法请参见 ParamAttr ，一般无需设置，默认值为None。
        self.embedding = nn.Embedding(vocab_size, 256)

        # input_size (int) - 输入的大小。
        # hidden_size (int) - 隐藏状态大小。
        # num_layers (int，可选) - 网络层数。默认为1。
        # direction (str，可选) - 网络迭代方向，可设置为forward或bidirect（或bidirectional）。默认为forward。
        # time_major (bool，可选) - 指定input的第一个维度是否是time steps。默认为False。
        # dropout (float，可选) - dropout概率，指的是出第一层外每层输入时的dropout概率。默认为0。
        # weight_ih_attr (ParamAttr，可选) - weight_ih的参数。默认为None。
        # weight_hh_attr (ParamAttr，可选) - weight_hh的参数。默认为None。
        # bias_ih_attr (ParamAttr，可选) - bias_ih的参数。默认为None。
        # bias_hh_attr (ParamAttr，可选) - bias_hh的参数。默认为None。
        self.lstm = nn.LSTM(256, 256, num_layers=2, direction='bidirectional',dropout=0.5)

        # in_features (int) – 线性变换层输入单元的数目。
        # out_features (int) – 线性变换层输出单元的数目。
        # weight_attr (ParamAttr, 可选) – 指定权重参数的属性。默认值为None，表示使用默认的权重参数属性，将权重参数初始化为0。具体用法请参见 ParamAttr 。
        # bias_attr (ParamAttr|bool, 可选) – 指定偏置参数的属性。 bias_attr 为bool类型且设置为False时，表示不会为该层添加偏置。 bias_attr 如果设置为True或者None，则表示使用默认的偏置参数属性，将偏置参数初始化为0。具体用法请参见 ParamAttr 。默认值为None。
        # name (str，可选) – 具体用法请参见 Name ，一般无需设置，默认值为None。
        self.linear = nn.Linear(in_features=256*2, out_features=2)

        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        
        emb = self.dropout(self.embedding(inputs))
        
        output, (hidden, _) = self.lstm(emb)
        #output形状大小为[batch_size,seq_len,num_directions * hidden_size]
        #hidden形状大小为[num_layers * num_directions, batch_size, hidden_size]
        #把前向的hidden与后向的hidden合并在一起
        hidden = paddle.concat((hidden[-2,:,:], hidden[-1,:,:]), axis = 1)
        hidden = self.dropout(hidden)
        #hidden形状大小为[batch_size, hidden_size * num_directions]
        return self.linear(hidden) 

In [8]:

# 构建自己的神经网络
class RNN(nn.Layer):
    def __init__(self):
        super(RNN, self).__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=256)
        self.drop1 = nn.Dropout(0.5)
        self.rnn = nn.SimpleRNN(input_size=256, hidden_size=256, num_layers=2, direction='forward', dropout=0.3)
        self.linear = nn.Linear(256*2, 2)
        self.drop2 = nn.Dropout(0.5)

    def forward(self, x):
        x = self.emb(x)
        x = self.drop1(x)
        output, x = self.rnn(x)
        x = paddle.concat((x[-2, :, :], x[-1, :, :]), axis=1)
        x = self.drop2(x)
        x = self.linear(x)
        return x

### 模型训练

In [9]:
#封装模型
model = MyModel()
# model = Bow()
# model = LSTM()
# model = RNN()

W1114 16:36:34.330693  5169 device_context.cc:362] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 10.1, Runtime API Version: 10.1
W1114 16:36:34.334748  5169 device_context.cc:372] device: 0, cuDNN Version: 7.6.


In [10]:

model = paddle.Model(model) #用Model封装模型

#配置模型优化器、损失函数、评估函数
model.prepare(paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()),
              paddle.nn.CrossEntropyLoss(),
              paddle.metric.Accuracy())

# 自定义 存储最优模型参数 回调函数：
class Best_model_checkpoint(paddle.callbacks.Callback):
    def __init__(self, baseline=0, save_dir=None):
        self.baseline = baseline
        self.save_dir = save_dir

    def on_eval_end(self, logs=None):
        acc = logs['acc']
        if acc > self.baseline:
            self.baseline = acc
            path = '{}/best_model'.format(self.save_dir)
            print('save checkpoint at {}'.format(path))
            self.model.save(path)
        print(logs)

plot_callback = paddle.callbacks.VisualDL(log_dir='visualdl_log_dir/RNN')
best_model_checkpoint = Best_model_checkpoint(save_dir='model_params')  # 加入在线显示 曲线部分

#模型训练
model.fit(train_loader,test_loader,
          epochs=epochs,
          eval_freq=1,
          batch_size=batch_size,
          verbose=1,
          callbacks=[plot_callback, best_model_checkpoint]
          )

The loss value printed in the log is the current step, and the metric is the average value of previous step.
Epoch 1/10


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if isinstance(slot[0], (np.ndarray, np.bool, numbers.Number)):
  return (isinstance(seq, collections.Sequence) and


Eval begin...
The loss value printed in the log is the current batch, and the metric is the average value of previous step.
Eval samples: 24960
save checkpoint at model_params/best_model
{'loss': [0.6954046], 'acc': 0.49603365384615383, 'step': 389, 'batch_size': 64}
Epoch 2/10
Eval begin...
The loss value printed in the log is the current batch, and the metric is the average value of previous step.
Eval samples: 24960
save checkpoint at model_params/best_model
{'loss': [0.70869875], 'acc': 0.4997596153846154, 'step': 389, 'batch_size': 64}
Epoch 3/10
Eval begin...
The loss value printed in the log is the current batch, and the metric is the average value of previous step.
Eval samples: 24960
{'loss': [0.6994867], 'acc': 0.49583333333333335, 'step': 389, 'batch_size': 64}
Epoch 4/10
Eval begin...
The loss value printed in the log is the current batch, and the metric is the average value of previous step.
Eval samples: 24960
{'loss': [0.723459], 'acc': 0.4935096153846154, 'step': 389, '

### 任务二：输出测试集的最好精度

In [11]:
#模型评估
# import os
# print(os.getcwd())
model_state_dict = paddle.load('model_params/best_model.pdparams')  # 导入最优模型
# model = MyModel()
model = RNN()

model.set_state_dict(model_state_dict)
model = paddle.Model(model)
model.prepare(metrics=paddle.metric.Accuracy())
 
eval_result = model.evaluate(test_loader, verbose=1)

Eval begin...
The loss value printed in the log is the current batch, and the metric is the average value of previous step.
Eval samples: 24960


In [12]:
!visualdl service upload --logdir ./visualdl_log_dir

Uploading file `./visualdl_log_dir/RNN/vdlrecords.1636878998.log`
Uploading file `./visualdl_log_dir/GRU/vdlrecords.1636875463.log`
Uploading file `./visualdl_log_dir/LSTM/vdlrecords.1636878910.log`
Uploading file `./visualdl_log_dir/Bow/vdlrecords.1636874156.log`
View your visualization results at: `https://paddlepaddle.org.cn/paddle/visualdl/service/app?id=8149280384c94fce7d09162da61e514b`.
