# 使用 PyTorch 做文本分类

文本分类是自然语言处理中最基本的任务之一：既可以利用已标注类别的文本进行有监督训练，再对新文本进行分类；也可以按照一定的要求对文本进行聚类。

In [1]:
import torch
from matplotlib import pyplot as plt
from IPython import display
import numpy as np
import collections

In [2]:
class Tokenizer:
    """Character-level tokenizer backed by a token -> id vocabulary.

    Id 0 belongs to the ``'UNK'`` entry; tokens taken from ``vocab_list``
    receive consecutive ids starting at 1, in iteration order.
    """

    def __init__(self, vocab_list):
        """Build ``self.vocab`` from an iterable of token strings."""
        self.vocab = self.load_vocab(vocab_list)

    def load_vocab(self, vocab_list):
        """Return an ``OrderedDict`` mapping each stripped token to its id.

        The counter advances for every item of ``vocab_list``, so a token
        that occurs more than once ends up with the id of its last
        occurrence.
        """
        mapping = collections.OrderedDict([('UNK', 0)])
        for position, raw_token in enumerate(vocab_list, start=1):
            mapping[raw_token.strip()] = position
        return mapping

    def tokenize(self, str):
        """Split the stripped input string into a list of single characters."""
        return list(str.strip())

    def token_to_id(self, token):
        """Return the id of ``token``, or the ``'UNK'`` id when it is unknown."""
        return self.vocab.get(token, self.vocab['UNK'])

    def tokens_to_ids(self, tokens):
        """Map every item of ``tokens`` to its id and return them as a list."""
        return [self.token_to_id(single) for single in tokens]

In [3]:
def get_coefs(word, *arr):
    """Pair ``word`` with its coefficients packed into a float32 array.

    ``arr`` holds the remaining positional arguments (the vector components);
    they are converted to a 1-D ``float32`` NumPy array.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

In [4]:
# Load the pretrained character vectors, build the tokenizer from their keys,
# and assemble the embedding matrix indexed by tokenizer id.
# NOTE(review): char_list / emb_list are not referenced anywhere in the visible
# notebook; kept only in case another cell relies on them.
char_list = []
emb_list = []
# The path uses a forward slash: '\s' is not a valid string escape, and a
# backslash separator would only resolve on Windows.
# noinspection JupyterKernel,JupyterKernel
with open('素材/sgns.wiki.char', 'r', encoding='utf-8') as emb_file:
    # First line is a header: "<number of entries> <vector size>".
    dict_length, emb_size = emb_file.readline().rstrip().split()
    dict_length, emb_size = int(dict_length), int(emb_size)
    # Stream the remaining lines (no readlines() copy) and skip blank ones,
    # which would otherwise call get_coefs() without a word.
    emb = collections.OrderedDict(
        get_coefs(*line.rstrip().split()) for line in emb_file if line.strip()
    )
tokenizer = Tokenizer(emb.keys())
# One extra row for id 0, which the tokenizer reserves for 'UNK'; that row is
# never written below and therefore stays all-zero.
emb_matrix = np.zeros((1 + dict_length, emb_size), dtype='float32')
for word, token_id in tokenizer.vocab.items():
    emb_vector = emb.get(word)
    if emb_vector is not None:
        emb_matrix[token_id] = emb_vector
print(emb_matrix.shape)

(9110, 300)


In [5]:
from torch import nn

# Settings used to initialize the network instance
seq_length = 5  # number of token ids per input row
label_len = 2  # number of output classes

class LinearClassifierNet(nn.Module):
    """Feed-forward classifier over flattened pretrained token embeddings.

    Each input row of ``seq_length`` token ids is embedded, flattened to a
    single vector and passed through three linear layers with ReLU between
    them.

    Args:
        seq_length: Number of token ids in every input row.
        label_len: Number of output classes.
        embedding_matrix: Optional 2-D float array of shape
            ``(vocab_size, emb_size)`` holding the pretrained vectors. When
            omitted, the module-level ``emb_matrix`` is used, which keeps the
            original two-argument call working.
    """

    def __init__(self, seq_length, label_len, embedding_matrix=None):
        super().__init__()
        if embedding_matrix is None:
            # Backward-compatible default: fall back to the notebook global.
            embedding_matrix = emb_matrix
        self.seq_length = seq_length
        self.label_len = label_len
        self.emb = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix))
        self.emb_size = self.emb.embedding_dim
        self.relu = nn.ReLU()
        # Size the first layer from the embedding that was actually loaded,
        # not from an unrelated module-level `emb_size` variable.
        self.linear1 = nn.Linear(seq_length * self.emb_size, 100)
        self.linear2 = nn.Linear(100, 20)
        self.linear3 = nn.Linear(20, self.label_len)
        self.softmax = nn.Softmax(dim=-1)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x, y=None):
        """Run the forward pass.

        Args:
            x: Integer tensor of token ids; it is reshaped so that every row
                covers ``seq_length`` embedded tokens.
            y: Optional tensor of target class indices.

        Returns:
            The softmax class probabilities when ``y`` is ``None``; otherwise
            the cross-entropy loss of the raw ``linear3`` output against
            ``y``.
        """
        x = self.emb(x)
        # Flatten the per-token vectors of each row into one feature vector.
        x = x.view(-1, self.seq_length * self.emb_size)
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        if y is None:
            return self.softmax(x)
        # CrossEntropyLoss receives the un-normalised scores, not the softmax.
        return self.loss(x, y)


net = LinearClassifierNet(seq_length, label_len)
# print() shows the structure of the network
print(net)

# Move the model to the GPU when CUDA is available
if torch.cuda.is_available():
    net.to(torch.device('cuda'))

LinearClassifierNet(
  (emb): Embedding(9110, 300)
  (relu): ReLU()
  (linear1): Linear(in_features=1500, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=20, bias=True)
  (linear3): Linear(in_features=20, out_features=2, bias=True)
  (softmax): Softmax(dim=-1)
  (loss): CrossEntropyLoss()
)


In [6]:
class sentiment_example:
    """A raw text sample together with its sentiment label.

    In this notebook the label is 1 for entries of the positive word list
    and 0 for entries of the negative word list.
    """

    def __init__(self, text, label):
        self.text, self.label = text, label

class sentiment_feature:
    """A model-ready sample: a list of token ids plus its sentiment label."""

    def __init__(self, ids, label):
        self.ids, self.label = ids, label
        

In [7]:
# Read both word lists into one list of labelled examples. A single
# data-driven loop replaces the two copy-pasted loaders (the second of which
# still called the negative file's handle `pos_file`).
examples = []
for word_list_path, word_list_label in (
    ('素材/sentiment/正面情感词语（中文）.txt', 1),  # positive words -> label 1
    ('素材/sentiment/负面情感词语（中文）.txt', 0),  # negative words -> label 0
):
    with open(word_list_path, 'r', encoding='gbk') as word_file:
        for line in word_file:
            examples.append(sentiment_example(line.strip(), word_list_label))


def convert_example_to_feature(examples):
    """Convert examples into fixed-length ``sentiment_feature`` objects.

    The text of every example is mapped to ids with the module-level
    ``tokenizer``, cut to at most ``seq_length`` ids and right-padded with 0.
    Examples whose ids sum to zero (nothing but id 0) are dropped, so the
    returned list can be shorter than the input.
    """
    converted = []
    for example in examples:
        token_ids = tokenizer.tokens_to_ids(example.text)[:seq_length]
        # No-op when the slice above already filled all seq_length slots.
        token_ids = token_ids + [0] * (seq_length - len(token_ids))
        if sum(token_ids) != 0:
            assert len(token_ids) == seq_length
            converted.append(sentiment_feature(token_ids, example.label))
    return converted

# Preview the first raw examples; slicing (rather than indexing 0..2) keeps
# this cell from raising IndexError when fewer than three items exist.
for example in examples[:3]:
    print(example.text, example.label)

# Convert every example, then preview the first resulting id features.
features = convert_example_to_feature(examples)
for feature in features[:3]:
    print(feature.ids, feature.label)

爱 1
爱不忍释 1
爱不释手 1
[332, 0, 0, 0, 0] 1
[332, 57, 2028, 2070, 0] 1
[332, 57, 2070, 444, 0] 1


In [8]:
from torch.utils.data import TensorDataset, DataLoader

# Stack the id lists and labels of all features into int64 tensors
ids = torch.tensor([f.ids for f in features], dtype=torch.long)
label = torch.tensor([f.label for f in features], dtype=torch.long)

# Pair every id row with its label and serve shuffled mini-batches of 16
dataset = TensorDataset(ids, label)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [9]:
from torch.optim import Adam

optimizer = Adam(net.parameters(), lr=0.002)

# Batches have to live on the same device as the model. Look it up once
# instead of re-checking CUDA availability for every batch.
device = next(net.parameters()).device

epoch = 10
for i in range(epoch):
    total_loss = []  # per-batch loss values of the current epoch
    # Distinct loop names keep the full `ids` / `label` tensors intact.
    for batch_ids, batch_label in dataloader:
        batch_ids = batch_ids.to(device)
        batch_label = batch_label.to(device)
        # Passing labels makes the network return the loss directly.
        loss = net(batch_ids, batch_label)
        total_loss.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Guard against an empty dataloader instead of dividing by zero.
    mean_loss = sum(total_loss) / len(total_loss) if total_loss else float('nan')
    print("epoch: %d, loss: %.6f" % (i + 1, mean_loss))


epoch: 1, loss: 0.451051
epoch: 2, loss: 0.276323
epoch: 3, loss: 0.204856
epoch: 4, loss: 0.147314
epoch: 5, loss: 0.119233
epoch: 6, loss: 0.083407
epoch: 7, loss: 0.077122
epoch: 8, loss: 0.059252
epoch: 9, loss: 0.044757
epoch: 10, loss: 0.036423


In [12]:
def tensor_to_label(logits):
    """Translate network output into ``'positive'`` or ``'negative'``.

    The arg-max over the last axis is taken for every row, but only the first
    row decides the result: class index 1 yields ``'positive'``, anything else
    ``'negative'``.
    """
    scores = logits.detach().cpu().numpy()
    predicted = np.argmax(scores, axis=-1)
    return 'positive' if predicted[0] == 1 else 'negative'


# Interactive prediction: read one line at a time until the user types 'quit'.
# Inputs are moved to whatever device the model currently lives on.
model_device = next(net.parameters()).device
while True:
    text = input()
    if text == 'quit':
        break
    # The label is a placeholder; only the text is used for prediction.
    feats = convert_example_to_feature([sentiment_example(text, 0)])
    if not feats:
        # convert_example_to_feature drops inputs whose ids are all 0 (empty
        # line or only out-of-vocabulary characters); feeding the resulting
        # empty batch to the network would raise.
        print('unknown: input contains no characters from the vocabulary')
        continue
    input_ids = torch.tensor([f.ids for f in feats], dtype=torch.long)
    with torch.no_grad():
        prediction = tensor_to_label(net(input_ids.to(model_device)))
    print(prediction)

你好
positive
再见
positive
凄凄惨惨戚戚
negative
杯具
positive
悲剧
negative
quit
