# TextCNN
TextCNN就是使用CNN来处理文本的神经网络，具体来说，虽然文本的序列是一个长度为l的一维序列，但是由于每个文本会被词嵌入算法嵌入为一个固定长度K的词嵌入向量，因此可以把整段文本看作是一个只有一个通道的$l\times k$的特征图，在这个特征图上进行($l'\times k$)的卷积操作，（其中l‘大约可取3-5），就可以提取出文本特征图的局部信息，构成一个新的特征图c，把这个特征图输入一个全连接神经网络来得到对文本的分类

In [1]:
# Prepares training data, uses IMDB dataset from torchtext to train a text classification network
import random
import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader


print("Start Loading datasets")
data_iter = torchtext.datasets.IMDB(split='train') # Iterator of WikiText dataset
data_set = to_map_style_dataset(data_iter) # Array-like variable containing all strings in train set
tokenizer = get_tokenizer('basic_english') # Converts string into list of words

data_set = list(data_set)
random.shuffle(data_set)
data_set = data_set[:10000] # Shrink down dataset because the original one is too large
train_set = data_set[:9000] 
dev_set = data_set[9000:]

# Build a vocabulary
# Feeds whole train set into the vocabulary
# Words appearing less than 3 times are not included
print("Building vocabulary...")
def yield_tokens(dataset):
    for _, text in dataset:
        yield tokenizer(text)
vocab = build_vocab_from_iterator(yield_tokens(data_set), specials=["<unk>", "<pad>"], min_freq=3)
vocab.set_default_index(vocab['<unk>'])
print("Counted %d words from train dataset"%(len(vocab)))

Start Loading datasets
Building vocabulary...
Counted 25988 words from train dataset


In [194]:
# Setup train_loader and dev_loader
import torch

batch_size = 50
length_size = 300

# Replace all 'neg', 'pos' tags with 0 and 1
# Trim all text longer than length_size to length_size
# Expand all text short than length_size to length_size with "<pad>"
# Convert all text into indexes 
# The input: list of items in train_set
def collate_batch(batch):
    # Replace tags with 0-1 tensor
    batch_size = len(batch)
    classes = [1 if tag == "pos" else 0 for tag, _ in batch]
    classes = torch.tensor(classes)

    # turn texts into tensor(batch_size, length_size)
    inputs = torch.zeros((batch_size, length_size))
    for i, sample in enumerate(batch):
            _, text = sample
            tokens = tokenizer(text)
            for j in range(length_size):
                if j < len(tokens):
                    inputs[i, j] = vocab[tokens[j]]
                else:
                    inputs[i, j] = vocab["<pad>"]

        
    return classes, inputs.long()


# train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size, collate_fn=collate_batch)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
dev_loader = DataLoader(dev_set, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

## TextCNN模型
下面是模型示意图：

<img src="https://image.panwenbo.icu/blog20210719103042.png" alt="截屏2021-07-19 上午10.30.32" style="zoom:50%;" />

首先，我们把一段文本进行词嵌入处理后得到一张$n \times k$的原始特征图，原文中作者同时使用来重新生成的词嵌入（可训练的）和预训练的Word2Vec词嵌入模型（不可训练）来构成了一个两层的输入，我们这里不使用预训练的输入。

接着这个特征图会分别进行$(h \times k), h \in \{3,4,5\}$大小的卷积操作，每个卷积操作有100个卷积核，也就是产生了300张新的特征图$(n-h+1, 1)$。接着，每个特征图会被一个全局最大池化压缩到一个值，最后我们把这一个300维的向量输入逻辑回归模型来对文本进行一个二分类。


In [195]:
# Defines the TextCNN module
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.activation import LogSoftmax


class TextCNN(nn.Module):
    """Convolutional Neural Network for text classification.

    :param filters: The list of all convs, containing all height and output channels of convs.
    :type filters: List of filters, e.g. [(3, 100), (4, 100), (5, 100)]
    :param embedding_dim: The dimension of word embedding
    :param n_classes: The number of output classes, defaults to 2
    """
    def __init__(self, filters, embedding_dim, vocab_size, n_classes=2, dropout_p=0.5):
        super().__init__()
        
        self.n_features = sum([ch for _, ch in filters]) # the total number of output conv filters
        self.embedding_dim = embedding_dim
        self.n_classes = n_classes
        self.vocab_size = vocab_size

        # Must use nn.ModuleList here, because only registered parameters can be updated.
        # Torch searches for every member of class instance and registeres them.
        # The parameter in list will not be detected and rigistered, ModuleList solved that.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, ch, (h, embedding_dim)) for h, ch in filters
        ])
        self.dropout = nn.Dropout(dropout_p)
        self.out = nn.Linear(self.n_features, n_classes)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, input):
        """Forward function

        :param input: the indexes tensors of input texts with mini-batch
        :type input: tensor(batch_size, sentence_length)
        """
        embedded = self.embedding(input)
        batch_size = input.size(0)
        embedded = embedded.unsqueeze(dim=1) # (batch_size, channel_num=1, sentence_length, embedding_dim)

        # features: the list of vectors of all filter's global max-pool outputs.
        features = []
        for conv in self.convs:
            feature = F.relu(conv(embedded)) # (batch_size, channel_num=3, sentence_length - h + 1, 1)
            feature = F.max_pool2d(feature, (feature.size(2), feature.size(3)))
            features.append(feature[:,:,0,0])
        stacked_feature = torch.cat(features, dim=1)
        droped_feature = self.dropout(stacked_feature)
        output = self.out(droped_feature)
        return self.softmax(output)


In [200]:
# Runs on dev set to evaluate accuracy
from tqdm import tqdm

def evaluate():
    total_correct = 0
    for batch in tqdm(dev_loader):
        target, input = batch
        output = model(input)
        total_correct += torch.sum(output.argmax(dim=1) == target)
    return total_correct / len(dev_set)

In [None]:
# Training on IMDB dataset
from utils import Timer
filters = [(3, 50), (4, 50), (5, 50)]
num_epoches = 16
embedding_dim = 100
log_interval = 100

model = TextCNN(filters, embedding_dim, len(vocab))
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

Timer.Start(len(train_loader) * num_epoches)
for epoch in range(num_epoches):
    running_loss = 0
    for idx, batch in enumerate(train_loader):
        target, input = batch
        output = model(input)
        
        # back propagation
        loss = criterion(output, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        Timer.Step()

        # Log progress
        running_loss += loss.item()
        if idx % log_interval == log_interval - 1:
            print("Epoch: %d\tStep: %d\tLoss=%.3f\t%s\tAcc: %.3f" % (
                epoch + 1,
                idx,
                running_loss / log_interval,
                Timer.Remain(),
                evaluate()
            ))
            running_loss = 0