<h1>Text classification with a convolutional neural network</h1>

Adapted from https://github.com/pytorch/tutorials/blob/master/beginner_source/text_sentiment_ngrams_tutorial.py

In [2]:
import sys
import numpy as np

import torch
from torchtext.datasets import text_classification

In [3]:
NGRAMS = 1
import os
if not os.path.isdir('./.data'):
	os.mkdir('./.data')
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=NGRAMS, vocab=None)
BATCH_SIZE = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

120000lines [00:05, 20391.13lines/s]
120000lines [00:11, 10891.14lines/s]
7600lines [00:00, 12822.96lines/s]


In [83]:
import torch.nn as nn
import torch.nn.functional as F
class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv1d(embed_dim, embed_dim, 3)
        #self.conv2 = nn.Conv1d(embed_dim, embed_dim, 3)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text, masks, lens):
        embedded = self.embedding(text)
        masked_emb = embedded * masks.unsqueeze(-1)
        x = masked_emb.permute(0, 2, 1)
        x = self.conv1(x)
        x = F.relu(x)
        #x = self.conv2(x)
        #x = F.relu(x)
        x = x.permute(0, 2, 1)
        
        # global avg pool
        x = torch.sum(x, 1) / lens.unsqueeze(-1)
        
        # global max pool
        #x, _ = torch.max(x, 1)
        return self.fc(x)

In [84]:
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUN_CLASS = len(train_dataset.get_labels())
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS).to(device)

In [85]:
from torch.utils.data import DataLoader

# Generate the pad id
pad_id = train_dataset.get_vocab()['<pad>']

def pad_data(data):
    # Find max length of the mini-batch
    lens = list(zip(*data))[0]
    max_len = max(lens)
    label_list = list(zip(*data))[2]
    txt_list = list(zip(*data))[3]
    padded_tensors = torch.stack([torch.cat((txt, \
            torch.tensor([pad_id] * (max_len - len(txt))).long())) \
            for txt in txt_list])
    lens = torch.LongTensor(lens)
    mask = torch.arange(max_len)[None, :] < lens[:, None]
    return padded_tensors, torch.LongTensor(label_list), mask, lens

def get_data_loader(dataset_, **kwargs):
    # Generate a list of tuples of text length, index, label, text
    dataset_sorted = [(len(txt), idx, label, txt) for idx, (label, txt) in enumerate(dataset_)]
    dataset_sorted.sort() # sort by length and pad sequences with similar lengths
    return DataLoader(dataset_sorted, collate_fn=pad_data, **kwargs)

In [86]:
def train_func(sub_train_):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = get_data_loader(sub_train_, batch_size=BATCH_SIZE, shuffle=True)
    for i, (text, labels, masks, lens) in enumerate(data):
        optimizer.zero_grad()
        text, labels, masks, lens = (v.to(device) for v in (text, labels, masks, lens))
        output = model(text, masks, lens)
        loss = criterion(output, labels)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == labels).sum().item()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)

def test(data_):
    loss = 0
    acc = 0
    data = get_data_loader(data_, batch_size=BATCH_SIZE)
    for text, labels, masks, lens in data:
        text, labels, masks, lens = (v.to(device) for v in (text, labels, masks, lens))
        with torch.no_grad():
            output = model(text, masks, lens)
            loss = criterion(output, labels)
            loss += loss.item()
            acc += (output.argmax(1) == labels).sum().item()

    return loss / len(data_), acc / len(data_)

In [87]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 7
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters())

train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')


Epoch: 1  | time in 0 minutes, 29 seconds
	Loss: 0.0304(train)	|	Acc: 82.4%(train)
	Loss: 0.0001(valid)	|	Acc: 88.1%(valid)
Epoch: 2  | time in 0 minutes, 29 seconds
	Loss: 0.0163(train)	|	Acc: 91.4%(train)
	Loss: 0.0001(valid)	|	Acc: 89.6%(valid)
Epoch: 3  | time in 0 minutes, 31 seconds
	Loss: 0.0128(train)	|	Acc: 93.2%(train)
	Loss: 0.0001(valid)	|	Acc: 90.5%(valid)
Epoch: 4  | time in 0 minutes, 30 seconds
	Loss: 0.0106(train)	|	Acc: 94.4%(train)
	Loss: 0.0001(valid)	|	Acc: 90.7%(valid)
Epoch: 5  | time in 0 minutes, 29 seconds
	Loss: 0.0090(train)	|	Acc: 95.3%(train)
	Loss: 0.0000(valid)	|	Acc: 90.5%(valid)
Epoch: 6  | time in 0 minutes, 27 seconds
	Loss: 0.0077(train)	|	Acc: 96.0%(train)
	Loss: 0.0000(valid)	|	Acc: 90.8%(valid)
Epoch: 7  | time in 0 minutes, 29 seconds
	Loss: 0.0065(train)	|	Acc: 96.6%(train)
	Loss: 0.0000(valid)	|	Acc: 90.5%(valid)


In [89]:
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0001(test)	|	Acc: 91.3%(test)


In [90]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text, model, vocab, ngrams):
    tokenizer = get_tokenizer("basic_english")
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in ngrams_iterator(tokenizer(text), ngrams)])
        text_len = text.size()[0]
        text = text.unsqueeze(0)
        mask = torch.BoolTensor([[True]*text_len])
        len_tensor = torch.LongTensor([text_len])
        output = model(text, mask, len_tensor)
        return output.argmax(1).item() + 1

ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

vocab = train_dataset.get_vocab()
model = model.to("cpu")

print("This is a %s news" %ag_news_label[predict(ex_text_str, model, vocab, 2)])


This is a Sci/Tec news
