# 10.7 文本情感分类：使用循环神经网络

In [138]:
import collections
import os
import random
import tarfile
import zipfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import MeCab
import re

import sys
sys.path.append("..") 
import d2lzh_pytorch as d2l

os.environ["CUDA_VISIBLE_DEVICES"] = "0"    # the server has several GPUs; select a specific one
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): machine-specific absolute path; presumably needs adjusting before running elsewhere.
DATA_ROOT = "C:\\Users\\CK\\Dive-into-DL-PyTorch\\data"

print(torch.__version__, device)

1.3.1 cuda


## 10.7.1 文本情感分类数据
### 10.7.1.1 读取数据

In [139]:
# Unpack the dataset archive once; skipped when the extracted folder already exists.
fname = os.path.join(DATA_ROOT, "chap5.zip")
if not os.path.exists(os.path.join(DATA_ROOT, "diary_learning_data_DL")):
    print("从压缩包解压...")
    # The archive is named *.zip and tarfile cannot read zip files, so choose the
    # reader from the file content; tarfile stays as the fallback for a real tarball.
    if zipfile.is_zipfile(fname):
        with zipfile.ZipFile(fname, 'r') as f:
            f.extractall(DATA_ROOT)
    else:
        with tarfile.open(fname, 'r') as f:
            f.extractall(DATA_ROOT)

In [140]:
from tqdm import tqdm
def read_diary(folder='train', data_root="C:\\Users\\CK\\Dive-into-DL-PyTorch\\data\\diary_learning_data_DL"):  # this function is also saved in the d2lzh_pytorch package for later use
    """Read the labelled diary entries stored under ``data_root/folder``.

    Every file in the ``pos`` and ``neg`` sub-folders is read as UTF-8, has its
    newlines removed and is lower-cased.

    Returns:
        A shuffled list of ``[text, label]`` pairs, where label is 1 for
        ``pos`` and 0 for ``neg``.
    """
    label_ids = (('pos', 1), ('neg', 0))
    samples = []
    for label_name, label_id in label_ids:
        label_dir = os.path.join(data_root, folder, label_name)
        for entry in tqdm(os.listdir(label_dir)):
            with open(os.path.join(label_dir, entry), 'rb') as handle:
                raw = handle.read()
            text = raw.decode('utf-8').replace('\n', '').lower()
            samples.append([text, label_id])
    random.shuffle(samples)
    return samples

# Derive the dataset folder from DATA_ROOT, the same way the extraction cell does,
# instead of relying on the path duplicated in read_diary's default argument.
diary_root = os.path.join(DATA_ROOT, "diary_learning_data_DL")
train_data, test_data = read_diary('train', diary_root), read_diary('test', diary_root)


  0%|          | 0/317 [00:00<?, ?it/s]
100%|██████████| 317/317 [00:00<00:00, 2456.94it/s]

100%|██████████| 316/316 [00:00<00:00, 3397.09it/s]

  0%|          | 0/317 [00:00<?, ?it/s]
100%|██████████| 317/317 [00:00<00:00, 1886.48it/s]

  0%|          | 0/317 [00:00<?, ?it/s]
100%|██████████| 317/317 [00:00<00:00, 2934.59it/s]


### 10.7.1.2 预处理数据

In [141]:
def get_tokenized_diary(data):
    """Tokenize the text of every ``[string, label]`` pair in ``data``.

    Args:
        data: list of [string, label]

    Returns:
        One list of tokens per entry of ``data``, in the same order.
    """
    tagger = MeCab.Tagger('-Owakati')
    alnum_pattern = re.compile(r'[0-9０-９a-zA-Zａ-ｚＡ-Ｚ]+')
    symbol_pattern = re.compile(r'[\．_－―─！＠＃＄％＾＆\-‐|\\＊\“（）＿■×+α※÷⇒—●★☆〇◎◆▼◇△□(：〜～＋=)／*&^%$#@!~`){}［］…\[\]\"\'\”\’:;<>?＜＞〔〕〈〉？、。・,\./『』【】「」→←○《》≪≫\n\u3000]+')

    def tokenize(text):
        # word segmentation with MeCab
        separated = tagger.parse(text)
        # replace half-width and full-width alphanumerics with a space
        separated = alnum_pattern.sub(" ", separated)
        # remove assorted symbols
        separated = symbol_pattern.sub("", separated)
        # split on spaces and drop the empty pieces
        return [piece for piece in separated.split(" ") if piece != ""]

    tokenized = []
    for review, _ in data:
        tokenized.append(tokenize(review))
    return tokenized

In [142]:
def get_vocab_diary(data):  # this function is also saved in the d2lzh_pytorch package for later use
    """Build a vocabulary from the token counts of ``data``.

    ``data`` is tokenized with ``get_tokenized_diary`` and the token
    frequencies are passed to ``Vocab.Vocab``.
    """
    token_counts = collections.Counter()
    for tokens in get_tokenized_diary(data):
        token_counts.update(tokens)
    return Vocab.Vocab(token_counts)

# Build the vocabulary from the training split.
vocab = get_vocab_diary(train_data)
# Bare tuple as the last expression so the notebook displays the vocabulary size.
'# words in vocab:', len(vocab)

('# words in vocab:', 1497)

In [143]:
def preprocess_diary(data, vocab, max_l=25):  # this function is also saved in the d2lzh_pytorch package for later use
    """Convert diary entries into fixed-length index tensors.

    Args:
        data: list of ``[text, label]`` pairs.
        vocab: vocabulary whose ``stoi`` maps a token to its index.
        max_l: length every entry is truncated or padded to (default 25).

    Returns:
        ``(features, labels)``: ``features`` holds one row of ``max_l`` token
        indices per entry, ``labels`` holds the label of each entry.
    """
    def pad(x):
        # Truncate long sequences; right-pad short ones with index 0.
        # NOTE(review): which token index 0 stands for depends on the vocab's specials - confirm.
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    tokenized_data = get_tokenized_diary(data)
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

### 10.7.1.3 创建数据迭代器

In [144]:
batch_size = 32
# Both splits are indexed with the same vocabulary.
train_set = Data.TensorDataset(*preprocess_diary(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_diary(test_data, vocab))
# Only the training loader shuffles.
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [145]:
# Look at the first batch only, to check the tensor shapes.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    break
# Bare tuple as the last expression so the notebook displays the number of batches.
'#batches:', len(train_iter)

X torch.Size([32, 25]) y torch.Size([32])


('#batches:', 20)

## 10.7.2 使用循环神经网络的模型

In [146]:
class BiRNN(nn.Module):
    """Bidirectional LSTM encoder followed by a 2-way linear classifier.

    Args:
        vocab: vocabulary; only ``len(vocab)`` is used, to size the embedding.
        embed_size: dimension of the word embeddings.
        num_hiddens: hidden units per LSTM direction.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super().__init__()
        vocab_size = len(vocab)
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # bidirectional=True gives a bidirectional recurrent network
        lstm_config = dict(input_size=embed_size,
                           hidden_size=num_hiddens,
                           num_layers=num_layers,
                           bidirectional=True)
        self.encoder = nn.LSTM(**lstm_config)
        # The states of the first and the last time step (2 * num_hiddens each)
        # are the input of the fully connected layer.
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        # inputs has shape (batch size, number of words). The LSTM expects the
        # sequence length as the first dimension, so transpose before the lookup;
        # the embeddings have shape (number of words, batch size, embedding dim).
        seq_first = inputs.permute(1, 0)
        embeddings = self.embedding(seq_first)
        # Only the embeddings are passed in; the first returned value holds the
        # last layer's hidden states at every time step, with shape
        # (number of words, batch size, 2 * num_hiddens). The (h, c) pair is ignored.
        outputs, _ = self.encoder(embeddings)
        # Concatenate the first and last time steps into a
        # (batch size, 4 * num_hiddens) encoding for the classifier.
        first_step, last_step = outputs[0], outputs[-1]
        encoding = torch.cat((first_step, last_step), -1)
        return self.decoder(encoding)

In [147]:
# Embedding dimension, hidden units per LSTM direction, number of stacked LSTM layers.
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

### 10.7.2.1 加载预训练的词向量

In [148]:
# Load the pretrained Japanese word vectors. The vector file and its cache folder are
# both located through DATA_ROOT rather than repeating the absolute path.
jawiki_vocab = Vocab.Vectors(name=os.path.join(DATA_ROOT, 'jawiki_20180420_100d.txt'),
                             cache=os.path.join(DATA_ROOT, "jawiki_20180420_100d"))


In [149]:
def load_pretrained_embedding(words, pretrained_vocab):
    """Extract the vectors for ``words`` from a pretrained vocabulary.

    Args:
        words: sequence of tokens; row ``i`` of the result belongs to ``words[i]``.
        pretrained_vocab: object with ``stoi`` (token -> row index) and
            ``vectors`` (one row per pretrained token).

    Returns:
        A ``(len(words), vector_dim)`` tensor. Rows of tokens that are missing
        from ``pretrained_vocab.stoi`` stay all-zero, and the number of such
        tokens is printed when it is non-zero.
    """
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])  # initialised to 0
    oov_count = 0  # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            # The token has no pretrained vector: keep the zero row and count it.
            oov_count += 1
    if oov_count > 0:
        print("There are %d oov words." % oov_count)
    return embed

# Overwrite the embedding weights with the pretrained vectors, one row per entry of vocab.itos.
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, jawiki_vocab))
net.embedding.weight.requires_grad = False # the pretrained vectors are used as loaded, so they are not updated

### 10.7.2.2 训练并评价模型

In [150]:
lr, num_epochs = 0.01, 20
# Only parameters that still require gradients are handed to Adam.
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
d2l.train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.7178, train acc 0.490, test acc 0.508, time 0.7 sec
epoch 2, loss 0.3049, train acc 0.675, test acc 0.647, time 0.7 sec
epoch 3, loss 0.1560, train acc 0.773, test acc 0.741, time 0.8 sec
epoch 4, loss 0.0880, train acc 0.850, test acc 0.735, time 0.8 sec
epoch 5, loss 0.0526, train acc 0.896, test acc 0.768, time 0.7 sec
epoch 6, loss 0.0386, train acc 0.907, test acc 0.738, time 0.7 sec
epoch 7, loss 0.0234, train acc 0.938, test acc 0.749, time 0.7 sec
epoch 8, loss 0.0182, train acc 0.942, test acc 0.762, time 0.7 sec
epoch 9, loss 0.0108, train acc 0.961, test acc 0.778, time 0.7 sec
epoch 10, loss 0.0052, train acc 0.983, test acc 0.756, time 0.7 sec
epoch 11, loss 0.0057, train acc 0.983, test acc 0.748, time 0.6 sec
epoch 12, loss 0.0043, train acc 0.978, test acc 0.782, time 0.7 sec
epoch 13, loss 0.0013, train acc 0.992, test acc 0.795, time 0.6 sec
epoch 14, loss 0.0146, train acc 0.935, test acc 0.711, time 0.8 sec
epoch 15, loss 0.0085, tr

In [151]:
# This function is also saved in the d2lzh package for later use
def predict_sentiment(net, vocab, sentence):
    """Classify a tokenized sentence as 'positive' or 'negative'.

    ``sentence`` is a list of words. Their indices are looked up in
    ``vocab.stoi``, placed on the device of the network's first parameter and
    fed to ``net`` as a single-row batch.
    """
    target_device = list(net.parameters())[0].device
    token_ids = [vocab.stoi[word] for word in sentence]
    batch = torch.tensor(token_ids, device=target_device).view((1, -1))
    scores = net(batch)
    predicted = torch.argmax(scores, dim=1)
    if predicted.item() == 1:
        return 'positive'
    return 'negative'

In [152]:
# Pre-tokenized sentence, roughly "The cat ran away, but it came back."
predict_sentiment(net, vocab, ['猫', 'が', '脱走', 'し', 'て', 'しまっ', 'た', 'が', '帰っ', 'て', 'き', 'た'])

'negative'

In [153]:
# Pre-tokenized sentence, roughly "Before I knew it, it was midnight; I have an early morning tomorrow, so I regretted it."
predict_sentiment(net, vocab, ['ふと', '気', 'が', '付い', 'たら', '真夜中', 'に', 'なっ', 'て', 'い', 'て', '明日', '朝', '早い', 'ので', '後悔', 'し', 'た'])

'negative'

## 3.16.5 $K$折交叉验证

In [None]:
def get_k_fold_data(k, i, X, y):
    """Return the training and validation data for fold ``i`` of ``k``-fold cross-validation.

    The first ``k * (X.shape[0] // k)`` rows are cut into ``k`` consecutive
    folds of equal size; any remaining rows are not used.

    Args:
        k: number of folds, must be greater than 1.
        i: index of the validation fold, ``0 <= i < k``.
        X: 2-D feature tensor, one row per sample.
        y: label tensor indexed along its first dimension like ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)``.

    Raises:
        AssertionError: if ``k <= 1`` or ``i`` is outside ``[0, k)``.
    """
    assert k > 1
    # Without this check an out-of-range i never selects a validation fold and
    # the function would fail at the return statement with an unbound variable.
    assert 0 <= i < k
    fold_size = X.shape[0] // k
    X_valid, y_valid = None, None
    X_parts, y_parts = [], []
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        else:
            X_parts.append(X_part)
            y_parts.append(y_part)
    # Concatenate once instead of re-copying the growing tensors for every fold.
    X_train = torch.cat(X_parts, dim=0)
    y_train = torch.cat(y_parts, dim=0)
    return X_train, y_train, X_valid, y_valid

In [None]:
def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    """Run ``k``-fold cross-validation and average the final losses.

    For every fold a new network is created with ``get_net`` and trained with
    ``train`` on the split returned by ``get_k_fold_data``. The loss curves of
    the first fold are plotted, and each fold's final losses are printed.

    Returns:
        ``(mean final training loss, mean final validation loss)`` over the folds.
    """
    total_train, total_valid = 0, 0
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        net = get_net(X_train.shape[1])
        train_ls, valid_ls = train(net, *fold_data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        last_train, last_valid = train_ls[-1], valid_ls[-1]
        total_train += last_train
        total_valid += last_valid
        # Only the first fold's curves are plotted.
        if fold == 0:
            epochs = range(1, num_epochs + 1)
            d2l.semilogy(epochs, train_ls, 'epochs', 'rmse',
                         epochs, valid_ls,
                         ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f' % (fold, last_train, last_valid))
    return total_train / k, total_valid / k

## 3.16.6 模型选择

In [None]:
# NOTE(review): train_features and train_labels are not defined in the visible cells of this
# notebook (nor are the get_net/train helpers used by k_fold) - confirm where they come from.
# This line also rebinds num_epochs, lr and batch_size used by the earlier cells.
k, num_epochs, lr, weight_decay, batch_size = 10, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))