# 使用PyTorch做文本分类

CNN虽然解决了全连接网络参数过多的问题，但是CNN学习不到“序列”上的信息：距离稍远一点的依赖关系就很难捕捉到，并且整体的网络效果容易被某些局部信息影响。所以在NLP任务上使用最多的其实是RNN这类能够学习到序列信息的网络。

现在开始我们将逐渐使用工程的思路来做演示，把之前几节课用到的许多工具都做了封装，需要时直接调用，或者在原始的基础上进行修改即可。这里的`utils.tokenizer`和`utils.get_emb`就是封装了之前的Tokenizer和获取词向量的一些类、函数。

In [1]:
import torch
import numpy as np
from utils.tokenizer import Tokenizer
from utils.get_emb import *

现在在notebook里调用就很简洁了，总之我们力求主文件能够比较简单，这样出BUG的时候排查起来也方便。

In [2]:
# Load the pretrained embeddings. get_emb() returns three values; `emb` is used
# below as a mapping (its keys are handed to the tokenizer). The other two are
# presumably the vocabulary size and the vector dimension — confirm in utils.get_emb.
emb, dict_length, emb_size = get_emb()
# Build the tokenizer from the embedding's keys.
tokenizer = Tokenizer(emb.keys())
# Build the embedding matrix; LSTMClassifierNet later loads it through
# nn.Embedding.from_pretrained.
emb_matrix = get_emb_matrix(emb, tokenizer, dict_length, emb_size)

dict_length:  9109
emb_size:  300
UNK 0
， 1
的 2
。 3
、 4
和 5
在 6
年 7
“ 8
了 9


然后我们来建立一个LSTM网络，这里暂时先不涉及各种时序上的机制。我们就用最简单的方法先来做文本分类。

In [10]:
from torch import nn

class LSTMClassifierNet(nn.Module):
    """LSTM text classifier built on pretrained word embeddings.

    Token ids are embedded, run through an LSTM, and the final cell state
    (summed over layers and directions) is fed to a small feed-forward head
    that produces one logit per label.

    Args:
        seq_length: Maximum sequence length; stored on the instance only.
        label_len: Number of target classes.
        hidden_dims: LSTM hidden size. Defaults to the embedding dimension.
        bidirectional: Whether to use a bidirectional LSTM.
        num_layers: Number of stacked LSTM layers.
        embedding_matrix: Optional 2-D array-like of pretrained vectors with
            shape (vocab_size, emb_size). When omitted, the module-level
            ``emb_matrix`` is used, which keeps existing callers working.
    """

    def __init__(self, seq_length, label_len, hidden_dims=None, bidirectional=False, num_layers=1,
                 embedding_matrix=None):
        super().__init__()
        self.seq_length = seq_length
        self.label_len = label_len
        # Controls whether the LSTM is bidirectional
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        # nn.LSTM applies dropout only between stacked layers, so it must be
        # zero for a single-layer network.
        self.lstm_dropout = 0.0 if num_layers == 1 else 0.2
        self.fc_dropout = 0.1

        if embedding_matrix is None:
            # Backward-compatible default: the matrix built earlier in the file
            embedding_matrix = emb_matrix
        # Cast to float32 so the embedding output matches the dtype of the
        # LSTM weights even when the matrix is a float64 NumPy array.
        self.emb = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float)
        )
        self.emb_size = self.emb.embedding_dim
        self.hidden_dims = hidden_dims if hidden_dims is not None else self.emb_size

        # Recurrent layer. Input: (seq_len, batch, input_size) plus optional
        # (h_0, c_0), which default to zeros when not given.
        # Output: (seq_len, batch, num_directions * hidden_size), (h_final, c_final).
        # The hidden state and cell state can be thought of as "short-term"
        # and "long-term" memory respectively.
        self.lstm = nn.LSTM(self.emb_size, self.hidden_dims,
                            num_layers=self.num_layers, dropout=self.lstm_dropout,
                            bidirectional=self.bidirectional)

        # Output head: (batch_size, hidden_dims) -> (batch_size, label_len)
        self.FC_out = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True),
            nn.Dropout(self.fc_dropout),
            nn.Linear(self.hidden_dims, self.label_len)
        )

        # Softmax layer; it is not applied in forward(), which returns raw logits
        self.softmax = nn.Softmax(dim=-1)
        # Cross-entropy loss, computed on the raw logits
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, x, y=None):
        """Classify a batch of token-id sequences.

        Args:
            x: Token ids of shape (batch_size, seq_len).
            y: Optional class indices of shape (batch_size,).

        Returns:
            Logits of shape (batch_size, label_len) when ``y`` is None,
            otherwise the scalar cross-entropy loss against ``y``.
        """
        # Embedding lookup gives (batch_size, seq_len, emb_size)
        embedded = self.emb(x)
        # The LSTM was built without batch_first=True, so it expects
        # (seq_len, batch_size, emb_size)
        embedded = embedded.permute(1, 0, 2)
        # Only the final cell state ("long-term memory") is used for
        # classification; its shape is
        # (num_layers * num_directions, batch_size, hidden_size)
        _, (_, final_c) = self.lstm(embedded)
        # Sum over layers and directions -> (batch_size, hidden_size)
        pooled = final_c.sum(dim=0)

        logits = self.FC_out(pooled)
        if y is None:
            return logits
        return self.loss_fct(logits, y)

In [11]:
# Task: sentiment analysis of hotel reviews, with a maximum length of 30
seq_length = 30
# Sentiment has only two classes: positive and negative
label_len = 2
model = LSTMClassifierNet(seq_length, label_len)
# print() on a module shows the network structure
print(model)

# Count the elements of every parameter that requires gradients
total_trainable_params = sum(
    [weight.numel() for weight in model.parameters() if weight.requires_grad]
)
print(f'{total_trainable_params} parameters is trainable.')

# Move the model to the GPU when one is available
if torch.cuda.is_available():
    model.to(torch.device('cuda'))

LSTMClassifierNet(
  (emb): Embedding(9110, 300)
  (lstm): LSTM(300, 300)
  (FC_out): Sequential(
    (0): Linear(in_features=300, out_features=300, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=300, out_features=2, bias=True)
  )
  (softmax): Softmax(dim=-1)
  (loss_fct): CrossEntropyLoss()
)
813302 parameters is trainable.


In [12]:
from dataset_readers.single_sent_clf import *

# Dataset reader for the ChnSentiCorp classification data; the class comes from
# the star import of dataset_readers.single_sent_clf.
reader = ChnSentiCorp_Clf()
# Training and development examples, later turned into DataLoaders.
train_examples = reader.get_train_examples()
dev_examples = reader.get_dev_examples()

In [13]:
from torch.utils.data import TensorDataset, DataLoader

def generate_dataloader(examples, tokenizer, seq_length, batch_size=16, shuffle=True):
    """Convert examples into a DataLoader of (token ids, labels) batches.

    Args:
        examples: Examples accepted by ``convert_example_to_feature``.
        tokenizer: Tokenizer forwarded to ``convert_example_to_feature``.
        seq_length: Sequence length forwarded to ``convert_example_to_feature``.
        batch_size: Number of samples per batch. Defaults to 16, the value
            that was previously hard-coded.
        shuffle: Whether to reshuffle the data every epoch. Defaults to True,
            the previous behaviour; pass False for a deterministic order,
            e.g. during evaluation.

    Returns:
        A DataLoader yielding ``(ids, label)`` pairs of ``torch.long`` tensors.
    """
    features = convert_example_to_feature(examples, tokenizer, seq_length)
    ids = torch.tensor([f.ids for f in features], dtype=torch.long)
    label = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(ids, label)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [14]:
# Build the batched loaders for the training and development sets; both use
# the same tokenizer and seq_length.
train_dataloader = generate_dataloader(train_examples, tokenizer, seq_length)
dev_dataloader = generate_dataloader(dev_examples, tokenizer, seq_length)

In [15]:
from torch.optim import Adam

# Adam over the model's parameters with a learning rate of 1e-3.
optimizer = Adam(model.parameters(), lr=0.001)
# Printing the optimizer shows its hyper-parameters.
print(optimizer)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)


In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Train for a fixed number of epochs and report the dev-set F1 after each one.
epoch = 10
# Resolve the target device once instead of re-checking for every batch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for i in range(epoch):
    # ---- training pass ----
    model.train()
    total_loss = []
    for ids, label_ids in train_dataloader:
        ids = ids.to(device)
        label_ids = label_ids.to(device)
        optimizer.zero_grad()
        # Called with labels, the model returns the cross-entropy loss
        loss = model(ids, label_ids)
        total_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    print("epoch: %d, loss: %.6f" % (i + 1, sum(total_loss) / len(total_loss)))

    # ---- evaluation pass ----
    model.eval()
    total_gold = []
    total_pred = []
    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time without changing the predictions.
    with torch.no_grad():
        for ids, label_ids in dev_dataloader:
            # Called without labels, the model returns the logits
            logits = model(ids.to(device))
            batch_pred = np.argmax(logits.cpu().numpy(), axis=-1)
            total_pred.extend(batch_pred.tolist())
            # Labels are never moved off the CPU, so convert them directly
            total_gold.extend(label_ids.tolist())
    eval_f1 = f1_score(total_gold, total_pred)
    print("eval_f1: %.2f%%" % (eval_f1 * 100))

epoch: 1, loss: 0.675642
eval_f1: 50.67%
epoch: 2, loss: 0.624533
eval_f1: 67.99%
epoch: 3, loss: 0.526861
eval_f1: 75.23%
epoch: 4, loss: 0.453669
eval_f1: 70.38%
epoch: 5, loss: 0.369783
eval_f1: 74.86%
epoch: 6, loss: 0.276966
eval_f1: 76.32%
epoch: 7, loss: 0.204645
eval_f1: 72.36%
epoch: 8, loss: 0.161313
eval_f1: 75.21%
epoch: 9, loss: 0.143107
eval_f1: 78.28%
epoch: 10, loss: 0.099640
eval_f1: 79.79%
