# 使用Pytorch做文本分类

有没有发现其实我们前面做LSTM进行文本分类时，最大的一部分信息，也就是每个时刻的输出被我们舍弃掉了？

所以我们来尝试使用Attention机制，让这些信息也能够被使用上。

In [1]:
from utils.tokenizer import Tokenizer
from utils.get_emb import *
import torch
import numpy as np

In [2]:
emb, dict_length, emb_size = get_emb()
tokenizer = Tokenizer(emb.keys())
emb_matrix = get_emb_matrix(emb, tokenizer, dict_length, emb_size)

dict_length:  9109
emb_size:  300
UNK 0
， 1
的 2
。 3
、 4
和 5
在 6
年 7
“ 8
了 9


然后我们来建立一个LSTM网络，这个网络比较复杂，用到了attention机制。这种机制能够有效地识别出在序列上哪些位置更“重要”。
![avatar](素材/Bi-LSTM+Attention.jpeg)

In [10]:
from torch import nn

class LSTMAttentionClassifierNet(nn.Module):
    def __init__(self, seq_length, label_len, emb_matrix, hidden_dims=None, bidirectional=False, num_layers=1):
        super(LSTMAttentionClassifierNet, self).__init__()
        self.seq_length = seq_length
        self.label_len = label_len
        # 控制是否使用双向LSTM
        self.bidirectional = bidirectional
        if num_layers == 1:
            self.lstm_dropout = 0.0
        else:
            self.lstm_dropout = 0.2
        self.fc_dropout = 0.1

        self.emb = nn.Embedding.from_pretrained(torch.tensor(emb_matrix))
        self.emb_size = self.emb.embedding_dim
        if hidden_dims is not None:
            self.hidden_dims = hidden_dims
        else:
            self.hidden_dims = self.emb_size

        # 循环神经网络，输入为(seq_len, batch, input_size)，(h_0, c_0), 如果没有给出h_0和c_0则默认为全零
        # 输出为(seq_len, batch, num_directions * hidden_size), (h_final, c_final)
        # 关于hidden_state和cell_state，可以理解为“短期记忆”和“长期记忆”
        self.lstm = nn.LSTM(self.emb_size, self.hidden_dims,
                            num_layers=1, dropout=self.lstm_dropout,
                            bidirectional=self.bidirectional)

        # attention层
        self.attention = nn.Sequential(
            nn.Linear(self.hidden_dims, 1),
            nn.ReLU(inplace=True)
        )

        # 输出层，输入为(batch_size, hidden_dims)，输出为(batch_size, label_len)
        self.FC_out = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True),
            nn.Dropout(self.fc_dropout),
            nn.Linear(self.hidden_dims, self.label_len)
        )

        # softmax分类层
        self.softmax = nn.Softmax(dim=-1)
        # 交叉熵损失函数
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, x, y=None):
        # 通过词嵌入得到词的分布式表示，输出是(batch_size, seq_len, input_size)
        x = self.emb(x)
        # 但是LSTM要的输入是(seq_len, batch_size, input_size)，做一下维度变换
        # 你也可以在建立LSTM网络的时候设置"batch_first = True"，使得LSTM要的输入就是(batch_size, seq_len, input_size)
        x = x.permute(1, 0, 2)
        # 使用LSTM，输出为(seq_len, batch_size, num_directions * hidden_size)
        # LSTM输出的其实是最后一层的每个时刻的“短时记忆”
        x, (final_h, final_c) = self.lstm(x)
        # 重新把维度换成(batch, seq_len, num_directions * hidden_size)
        x = x.permute(1, 0, 2)

        # 双向的话，我们把两个方向的取和，现在x的形状是(batch, seq_len, hidden_size)
        if self.bidirectional:
            x = torch.chunk(x, 2, -1)
            x = x[0] + x[1]

        # 接下来我们计算attention

        # (batch, seq_len, hidden_size)
        x = nn.Tanh()(x)

        # atten_context (batch_size, seq_len, 1)
        atten_context = self.attention(x)
        atten_context = atten_context.permute(0, 2, 1)
        # softmax_w (batch_size, 1, seq_len)
        softmax_w = self.softmax(atten_context)

        # atten_x (batch_size, 1, hidden_dims)
        atten_x = torch.bmm(softmax_w, x)
        # (batch_size, hidden_dims)
        atten_x = atten_x.squeeze(dim=1)
        logits = self.FC_out(atten_x)
        if y is None:
            return logits
        else:
            return self.loss_fct(logits, y)

In [11]:
# 我们做的是酒店评价的情感分析，最长为30
seq_length = 30
# 情感只有正负两类
label_len = 2
model = LSTMClassifierNet(seq_length, label_len)
# 使用print可以打印出网络的结构
print(model)

total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(str(total_trainable_params), 'parameters is trainable.')

if torch.cuda.is_available():
    model.to(torch.device('cuda'))

LSTMClassifierNet(
  (emb): Embedding(9110, 300)
  (lstm): LSTM(300, 300)
  (attention): Sequential(
    (0): Linear(in_features=300, out_features=300, bias=True)
    (1): ReLU(inplace=True)
  )
  (FC_out): Sequential(
    (0): Linear(in_features=300, out_features=300, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=300, out_features=2, bias=True)
  )
  (softmax): Softmax(dim=-1)
  (loss_fct): CrossEntropyLoss()
)
903602 parameters is trainable.


In [12]:
from dataset_readers.single_sent_clf import *

reader = ChnSentiCorp_Clf()
train_examples = reader.get_train_examples()
dev_examples = reader.get_dev_examples()

In [13]:
from torch.utils.data import TensorDataset, DataLoader

def generate_dataloader(examples, tokenizer, seq_length):
    features = convert_example_to_feature(examples, tokenizer, seq_length)
    ids = torch.tensor([f.ids for f in features], dtype=torch.long)
    label = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(ids, label)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
    return dataloader

In [14]:
train_dataloader = generate_dataloader(train_examples, tokenizer, seq_length)
dev_dataloader = generate_dataloader(dev_examples, tokenizer, seq_length)

In [15]:
from torch.optim import Adam

optimizer = Adam(model.parameters(), lr=0.001)
print(optimizer)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)


In [16]:
from sklearn.metrics import precision_score, recall_score, f1_score

epoch = 10
for i in range(epoch):
    model.train()
    total_loss = []
    for ids, label_ids in train_dataloader:
        if torch.cuda.is_available():
            ids = ids.to(torch.device('cuda'))
            label_ids = label_ids.to(torch.device('cuda'))
        optimizer.zero_grad()
        loss = model(ids, label_ids)
        total_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    print("epoch: %d, loss: %.6f" % (i + 1, sum(total_loss) / len(total_loss)))
    
    model.eval()
    total_gold = []
    total_pred = []
    for ids, label_ids in dev_dataloader:
        if torch.cuda.is_available():
            ids = ids.to(torch.device('cuda'))
        logits = model(ids)
        logits = logits.detach().cpu().numpy()
        logits = np.argmax(logits, axis=-1)
        logits = logits.tolist()
        total_pred.extend(logits)
        label_ids = label_ids.numpy().tolist()
        total_gold.extend(label_ids)
    # eval_p = precision_score(total_gold, total_pred)
    # eval_r = recall_score(total_gold, total_pred)
    eval_f1 = f1_score(total_gold, total_pred)
    print("eval_f1: %.2f%%" % (eval_f1 * 100))

epoch: 1, loss: 0.645538
eval_f1: 59.87%
epoch: 2, loss: 0.545199
eval_f1: 64.76%
epoch: 3, loss: 0.497852
eval_f1: 69.59%
epoch: 4, loss: 0.430785
eval_f1: 76.56%
epoch: 5, loss: 0.386537
eval_f1: 79.37%
epoch: 6, loss: 0.334536
eval_f1: 80.21%
epoch: 7, loss: 0.282924
eval_f1: 78.92%
epoch: 8, loss: 0.241811
eval_f1: 82.47%
epoch: 9, loss: 0.197273
eval_f1: 81.44%
epoch: 10, loss: 0.151812
eval_f1: 80.74%
