# 使用PyTorch做文本分类

上一个notebook中，我们使用了最简单的全连接方法来做文本分类。可以看到，仅仅5个字的文本分类就已经需要15万以上的参数，因此全连接层很少出现在NLP模型的前几层，一般只用在最后几层。接下来我们介绍如何用CNN方法做文本分类。

In [1]:
import torch
from matplotlib import pyplot as plt
from IPython import display
import numpy as np
import collections

In [2]:
# NOTE(review): neither list is used in the cells visible here — confirm no
# later cell reads them before removing.
char_list = []
emb_list = []

# Turn one already-split line into the token and its embedding vector.
def get_coefs(word, *arr):
    """Return ``(word, vector)`` for one line of the embedding file.

    ``word`` is passed through unchanged; the remaining positional values
    are converted into a one-dimensional float32 numpy array.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

# Forward slash on purpose: the previous backslash form relied on an invalid
# string escape and only resolved on Windows; the other data paths in this
# notebook already use '/'.
with open('素材/sgns.wiki.char', 'r', encoding='utf-8') as emb_file:
    # The first line holds the vocabulary size and the embedding dimension.
    dict_length, emb_size = emb_file.readline().rstrip().split()
    print('dict_length: ', dict_length)
    print('emb_size: ', emb_size)
    dict_length, emb_size = int(dict_length), int(emb_size)
    # Parse each remaining line into (token, vector) and keep file order.
    # The file is streamed line by line, and blank lines are skipped because
    # get_coefs needs at least the token.
    emb = collections.OrderedDict(
        get_coefs(*line.rstrip().split()) for line in emb_file if line.strip()
    )
# Show the first entry as a quick sanity check.
for k, v in emb.items():
    print(k, v.shape)
    break

dict_length:  9109
emb_size:  300
， (300,)


In [3]:
class Tokenizer:
    """Map tokens (characters or words) to integer ids via an ordered vocabulary."""

    def __init__(self, vocab_list):
        """Build the vocabulary from ``vocab_list`` and print its first ten entries."""
        self.vocab = self.load_vocab(vocab_list)
        # Preview the head of the vocabulary.
        for token, token_id in list(self.vocab.items())[:10]:
            print(token, token_id)

    def load_vocab(self, vocab_list):
        """Return an ordered ``token -> id`` mapping.

        'UNK' stands for out-of-vocabulary tokens and always takes id 0; the
        tokens of ``vocab_list`` are stripped and numbered from 1 in the
        order given. An ordered dict keeps iteration in insertion order.
        """
        vocab = collections.OrderedDict([('UNK', 0)])
        for index, raw_token in enumerate(vocab_list, start=1):
            vocab[raw_token.strip()] = index
        return vocab

    def token_to_id(self, token):
        """Return the id of ``token``, or the 'UNK' id when it is not in the vocabulary."""
        return self.vocab[token] if token in self.vocab else self.vocab['UNK']

    def tokens_to_ids(self, tokens):
        """Return the list of ids for every token in ``tokens``."""
        return [self.token_to_id(token) for token in tokens]

In [4]:
# Build the tokenizer from the tokens of the loaded embedding table, in the
# table's own order; Tokenizer prints the first ten vocabulary entries.
tokenizer = Tokenizer(emb.keys())

UNK 0
， 1
的 2
。 3
、 4
和 5
在 6
年 7
“ 8
了 9


In [5]:
# Allocate an all-zero matrix of shape (dict_length + 1, emb_size); the extra
# row is for the 'UNK' token the tokenizer puts at id 0.
emb_matrix = np.zeros((1 + dict_length, emb_size), dtype='float32')

# The loop variable is `token_id`, not `id`: the original name shadowed the
# builtin id() for every later cell of the notebook.
for word, token_id in tokenizer.vocab.items():
    emb_vector = emb.get(word)
    if emb_vector is not None:
        # Store the vector of the token numbered token_id in row token_id.
        # Tokens with no pretrained vector keep their all-zero row.
        emb_matrix[token_id] = emb_vector
print(emb_matrix.shape)

(9110, 300)


In [7]:
from torch import nn

class CNNClassifierNet(nn.Module):
    """CNN text classifier: embedding -> Conv1d -> average pooling -> two linear layers.

    The embedding weights are taken from the module-level ``emb_matrix``.

    Args:
        seq_length: fixed number of token ids per input sequence.
        label_len: number of target classes.
    """

    def __init__(self, seq_length, label_len):
        super().__init__()
        self.seq_length = seq_length
        self.label_len = label_len
        self.kernel_size = 3
        # Embedding layer: (batch_size, seq_length) -> (batch_size, seq_length, emb_size).
        # from_pretrained turns off gradients for the embedding, i.e. it is not
        # trained; that can be re-enabled by hand if needed.
        self.emb = nn.Embedding.from_pretrained(torch.tensor(emb_matrix))
        self.emb_size = self.emb.embedding_dim
        # ReLU has no parameters, so one instance is shared.
        self.relu = nn.ReLU()

        # Convolution: (batch_size, emb_size, seq_length)
        #           -> (batch_size, out_channels, seq_length - kernel_size + 1).
        # in_channels / kernel_size come from the attributes above instead of
        # the literals 300 / 3, so other embedding widths work as well.
        self.conv1 = nn.Conv1d(
            in_channels=self.emb_size,
            out_channels=100,
            kernel_size=self.kernel_size,
        )
        # Pooling over the whole convolution output length:
        # (batch_size, out_channels, seq_length - kernel_size + 1) -> (batch_size, out_channels, 1).
        self.avg1 = nn.AvgPool1d(kernel_size=self.seq_length - self.kernel_size + 1)
        # Max-pooling alternative:
        # self.max1 = nn.MaxPool1d(kernel_size=self.seq_length-self.kernel_size+1, return_indices=False)
        self.dropout = nn.Dropout(p=0.2)

        # Fully connected: (batch_size, out_channels) -> (batch_size, 20).
        self.linear2 = nn.Linear(self.conv1.out_channels, 20)
        # Fully connected: (batch_size, 20) -> (batch_size, label_len).
        self.linear3 = nn.Linear(20, self.label_len)
        # Softmax over the class dimension, used for prediction only.
        self.softmax = nn.Softmax(dim=-1)
        # CrossEntropyLoss already combines a (log-)softmax with the negative
        # log-likelihood loss, so it is fed the raw scores, not softmax output.
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x, y=None):
        """Run the network.

        Args:
            x: integer token ids of shape (batch_size, seq_length).
            y: optional class indices of shape (batch_size,).

        Returns:
            Class probabilities of shape (batch_size, label_len) when ``y`` is
            None (prediction); otherwise the scalar cross-entropy loss
            (training).
        """
        # (batch_size, seq_length, emb_size)
        x = self.emb(x)
        # Conv1d expects (batch_size, emb_size, seq_length): swap the last two axes.
        x = x.permute(0, 2, 1)
        # Convolution layer.
        x = self.conv1(x)
        # After average pooling: (batch_size, out_channels, 1).
        x = self.avg1(x)
        # Drop the trailing length-1 axis (out of place, leaving the pooled tensor intact).
        x = x.squeeze(dim=-1)

        # (batch_size, out_channels)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.linear3(x)

        # No labels: predicting, so return class probabilities.
        if y is None:
            return self.softmax(x)
        # Labels given: training, so return the loss value.
        return self.loss(x, y)
        
# The task is sentiment analysis of hotel reviews; the maximum length is 30.
seq_length = 30
# Sentiment has only two classes: positive and negative.
label_len = 2

model = CNNClassifierNet(seq_length=seq_length, label_len=label_len)
# Printing the module shows the network structure.
print(model)

# Count only the parameters that receive gradients.
total_trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(str(total_trainable_params), 'parameters is trainable.')

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model.cuda()

CNNClassifierNet(
  (emb): Embedding(9110, 300)
  (relu): ReLU()
  (conv1): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
  (avg1): AvgPool1d(kernel_size=(28,), stride=(28,), padding=(0,))
  (dropout): Dropout(p=0.2, inplace=False)
  (linear2): Linear(in_features=100, out_features=20, bias=True)
  (linear3): Linear(in_features=20, out_features=2, bias=True)
  (softmax): Softmax(dim=-1)
  (loss): CrossEntropyLoss()
)
92162 parameters is trainable.


In [8]:
# Raw text together with its label.
class data_example:
    """One raw sample: the text and the label attached to it."""

    def __init__(self, text, label):
        self.text, self.label = text, label

# Processed data together with its label.
class data_feature:
    """One processed sample: the token ids and the label."""

    def __init__(self, ids, label):
        self.ids, self.label = ids, label

In [10]:
# Read the raw data: one review per line, one file per sentiment label.
# A single loop over (path, label) pairs replaces the two copy-pasted blocks
# (the negative file used to be opened under the name `pos_file`).
examples = []
for path, sentiment in (
    ('素材/sentiment/正面评价.txt', 'positive'),
    ('素材/sentiment/负面评价.txt', 'negative'),
):
    with open(path, 'r', encoding='utf-8') as review_file:
        for line in review_file:
            examples.append(data_example(line.strip(), sentiment))

print('num of example: %d' % len(examples))
# Slice rather than index so that fewer than three examples cannot raise IndexError.
for example in examples[:3]:
    print(example.text, example.label)

num of example: 3920
距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单. positive
商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错! positive
位置离我们单位很近,从价格来说,性价比很高.我要的大床房,168元,前台服务员态度很好,房间硬件一般,但是想想价格也就这样了.还算干净,就是床垫子太硬. positive


In [11]:
# Turn raw examples into model-ready features.
def convert_example_to_feature(examples):
    """Convert examples into ``data_feature`` objects of fixed length.

    Each example's text is mapped to ids with the module-level ``tokenizer``,
    then cut or right-padded with 0 to exactly ``seq_length`` ids. Examples
    whose ids sum to 0 (nothing recognised) are dropped. The label becomes 1
    for 'positive' and 0 for anything else.
    """
    features = []
    for example in examples:
        # Map the string to numeric ids.
        token_ids = tokenizer.tokens_to_ids(example.text)
        # Enforce the fixed length: cut off the excess, then pad what is
        # missing. Padding uses 0, the 'UNK' id; a dedicated [PAD] token is
        # another common choice.
        token_ids = token_ids[:seq_length]
        token_ids = token_ids + [0] * (seq_length - len(token_ids))
        # Give up on strings in which nothing could be recognised.
        if sum(token_ids) == 0:
            continue
        assert len(token_ids) == seq_length
        # Positive reviews are labelled 1, everything else 0.
        label = 1 if example.label == 'positive' else 0
        features.append(data_feature(token_ids, label))
    return features

features = convert_example_to_feature(examples)

# Show the first three converted samples.
for i in range(3):
    sample = features[i]
    print(sample.ids, sample.label)

[550, 658, 388, 551, 319, 149, 246, 320, 38, 50, 13, 319, 913, 220, 2277, 57, 44, 38, 105, 1014, 13, 54, 1720, 931, 107, 54, 2, 412, 38, 52] 1
[849, 1217, 71, 1785, 1071, 1, 1071, 167, 231, 71, 1, 1785, 23, 68, 314, 458, 1, 1152, 318, 1064, 1199, 187, 780, 774, 1614, 57, 1577, 953, 0, 0] 1
[80, 409, 658, 201, 258, 567, 80, 231, 320, 38, 59, 1433, 559, 86, 151, 38, 221, 1433, 155, 231, 181, 76, 201, 128, 2, 71, 1785, 1071, 38, 66] 1


In [12]:
from torch.utils.data import TensorDataset, DataLoader

# Stack all features into two long tensors: the token ids and the labels.
ids = torch.tensor([feature.ids for feature in features], dtype=torch.long)
label = torch.tensor([feature.label for feature in features], dtype=torch.long)

# Pair ids with labels and serve them in shuffled batches of 16.
dataset = TensorDataset(ids, label)
dataloader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)

In [13]:
from torch.optim import Adam

# Adam over all model parameters with a learning rate of 0.001.
optimizer = Adam(params=model.parameters(), lr=1e-3)
print(optimizer)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)


In [14]:
epoch = 9
# Resolve the device once instead of querying CUDA for every batch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Put the model in training mode explicitly, so dropout is active even when
# this cell is re-run after model.eval() was called.
model.train()
for i in range(epoch):
    total_loss = []
    # Batch-local names keep the full-dataset tensors `ids` / `label` intact.
    for batch_ids, batch_label in dataloader:
        batch_ids = batch_ids.to(device)
        batch_label = batch_label.to(device)
        optimizer.zero_grad()
        # The loss is computed inside the model when labels are passed, so
        # there is no separate loss computation here.
        loss = model(batch_ids, batch_label)
        total_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    # Report the mean batch loss of this epoch.
    print("epoch: %d, loss: %.6f" % (i + 1, sum(total_loss) / len(total_loss)))

epoch: 1, loss: 0.672779
epoch: 2, loss: 0.592413
epoch: 3, loss: 0.559138
epoch: 4, loss: 0.543333
epoch: 5, loss: 0.533890
epoch: 6, loss: 0.518462
epoch: 7, loss: 0.510866
epoch: 8, loss: 0.511546
epoch: 9, loss: 0.500470


In [15]:
# Turn the predicted probabilities back into a label string.
def tensor_to_label(logits):
    """Return 'positive' or 'negative' for the first row of ``logits``.

    ``logits`` is a tensor whose last axis holds the per-class scores; the
    first row is 'positive' when its highest score sits at index 1.
    """
    scores = logits.detach().cpu().numpy()
    predicted = np.argmax(scores, axis=-1)
    return 'positive' if predicted[0] == 1 else 'negative'

# The network contains a dropout layer: in eval mode dropout (and BatchNorm,
# had there been any) no longer behaves as it does during training.
model.eval()

# Resolve the device once instead of on every input line.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
while True:
    text = input()
    if text == 'quit':
        break
    # The label is a placeholder; only the ids are used for prediction.
    query_features = convert_example_to_feature([data_example(text, 0)])
    if not query_features:
        # convert_example_to_feature drops inputs in which nothing was
        # recognised; an empty batch would make the model raise, so report
        # it and wait for the next line.
        print('unrecognized input')
        continue
    query_ids = torch.tensor([f.ids for f in query_features], dtype=torch.long).to(device)
    with torch.no_grad():
        predicted_label = tensor_to_label(model(query_ids))
    print(predicted_label)

房间不错但是太贵了！
negative
可以看到海
positive
性价比高
positive
quit
