## Prepare for Data

In [1]:
import logging
from torchtext.utils import extract_archive, unicode_csv_reader
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets.text_classification import *
from torchtext.datasets.text_classification import _csv_iterator,_create_data_from_iterator

def _setup_datasets(dataset_name, dataset_tar, root='.data', ngrams=1, vocab=None, include_unk=False):
#     dataset_tar = download_from_url(URLS[dataset_name], root=root)
    extracted_files = extract_archive(dataset_tar)

    for fname in extracted_files:
        if fname.endswith('train.csv'):
            train_csv_path = fname
        if fname.endswith('test.csv'):
            test_csv_path = fname

    if vocab is None:
        logging.info('Building Vocab based on {}'.format(train_csv_path))
        vocab = build_vocab_from_iterator(_csv_iterator(train_csv_path, ngrams))
    else:
        if not isinstance(vocab, Vocab):
            raise TypeError("Passed vocabulary is not of type Vocab")
    logging.info('Vocab has {} entries'.format(len(vocab)))
    logging.info('Creating training data')
    train_data, train_labels = _create_data_from_iterator(
        vocab, _csv_iterator(train_csv_path, ngrams, yield_cls=True), include_unk)
    logging.info('Creating testing data')
    test_data, test_labels = _create_data_from_iterator(
        vocab, _csv_iterator(test_csv_path, ngrams, yield_cls=True), include_unk)
    if len(train_labels ^ test_labels) > 0:
        raise ValueError("Training and test labels don't match")
    return (TextClassificationDataset(vocab, train_data, train_labels),
            TextClassificationDataset(vocab, test_data, test_labels))

In [2]:
from torch.utils.data.dataset import random_split

def split_train_set(train_set):
    train_set_size = int(len(train_set) * 0.80)
    val_set_size = len(train_set) - train_set_size
    
    return random_split(train_set, [train_set_size, val_set_size])

In [3]:
def prepare_data(dataset_name, dataset_tar, ngrams=1):
    print("Preparing data...")
    
    # Gets train and test sets.
    train_set, test_set = _setup_datasets(dataset_name=dataset_name, dataset_tar=dataset_tar, ngrams=ngrams)
    
    vocab_size = len(train_set.get_vocab())
    cls_num = len(train_set.get_labels())
    
    # Splits train set into train set and val set.
    train_set, val_set = split_train_set(train_set)
    
    return train_set, val_set, test_set, vocab_size, cls_num

## Model

In [4]:
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, vocab_size, embed_dim, cls_num):
        super().__init__()
        
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)  # sparse=True: significantly increase speed
        
        self.fc = nn.Linear(embed_dim, cls_num)
        self.softmax = nn.Softmax(dim=1)
        
        self.init_weights()
    
    def forward(self, texts, offsets):
        embeds = self.embedding(texts, offsets)
        
        outputs = self.fc(embeds)
        outputs = self.softmax(outputs)
        
        return outputs
    
    def init_weights(self):
        init_range = 0.5
        
        self.embedding.weight.data.uniform_(-init_range, init_range)
        
        self.fc.weight.data.uniform_(-init_range, init_range)
        self.fc.bias.data.zero_()

# Training and validating

In [5]:
def collate_fn(batch):
    labels = torch.tensor([entry[0] for entry in batch], device=device)
    texts = [entry[1] for entry in batch]  # entry[1] are tensors. torch.tensor(tensor) is not valid
    
    offsets = torch.tensor([0] + [len(text) for text in texts[:-1]], device=device)
    
    # Concats texts.
    texts = torch.cat(texts, dim=0).to(device)
    # Calculates offsets.
    offsets = offsets.cumsum(dim=0)

    return texts, labels, offsets

In [6]:
from torch.utils.data import DataLoader

def _train(model, train_set, criterion, optimizer, scheduler, batch_size):

    data_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, 
                     collate_fn=collate_fn)
    
    train_loss = 0
    train_acc = 0
    
    for i, (texts, labels, offsets) in enumerate(data_loader):
        # Zeros grad.
        optimizer.zero_grad()

        # Foreward.
        outputs = model(texts, offsets)
        loss = criterion(outputs, labels)

        # Backward.
        loss.backward()
        
        # Updates params.
        optimizer.step()
        
        train_loss += loss
        train_acc += (outputs.argmax(dim=1) == labels).sum().item()
        
    # Updates lr.
    scheduler.step()
    
    return train_loss / len(train_set), train_acc / len(train_set)

In [7]:
def _valid(model, valid_set, criterion, batch_size):
    
    data_loader = DataLoader(dataset=valid_set, batch_size=batch_size, shuffle=False, 
                     collate_fn=collate_fn)
    
    valid_loss = 0
    valid_acc = 0
    
    for texts, labels, offsets in data_loader:
        with torch.no_grad():  # no grad
            # Foreward.
            outputs = model(texts, offsets)
            loss = criterion(outputs, labels)

            valid_loss += loss
            valid_acc += (outputs.argmax(dim=1) == labels).sum().item()
        
    return valid_loss / len(valid_set), valid_acc / len(valid_set)

In [8]:
import torch.optim as optim

def train(train_set, valid_set, model, criterion, optimizer, scheduler, n_epochs=10, batch_size=10):
    print("Training...")
    
    for epoch in range(1, n_epochs + 1):
        train_loss, train_acc = _train(model, train_set, criterion, optimizer, scheduler, batch_size)
        valid_loss, valid_acc = _valid(model, valid_set, criterion, batch_size)
        
        print(f"epoch: {epoch}")
        print(f"train loss: {train_loss:.3f}\ttrain acc: {train_acc:.6%}")
        print(f"valid loss: {valid_loss:.3f}\tvalid acc: {valid_acc:.6%}")

## Testing

In [9]:
def test(test_set, model, criterion, batch_size=10):
    print("Testing...")
    
    test_loss, test_acc = _valid(model, test_set, criterion, batch_size=batch_size)
    
    print(f"test loss: {test_loss:.3f}\ttest acc: {test_acc:.6%}")

## Main

In [10]:
import torch

if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     device = torch.device("cpu")
    
    # Gets datasets.
    ngrams = 3
#     train_set, val_set, test_set, vocab_size, cls_num = prepare_data(
#         dataset_name="SogouNews", dataset_tar=".data/sogou_news_csv.tar.gz", ngrams=ngrams)
    train_set, val_set, test_set, vocab_size, cls_num = prepare_data(
        dataset_tar='./.data/ag_news_csv.tar.gz',dataset_name="AG_NEWS", ngrams=ngrams)
    
    # Model.
    embed_dim = 30  # if it's set too big, cuda will be out of memory
    model = Model(vocab_size, embed_dim, cls_num).to(device)

    # Criterion.
    criterion = nn.CrossEntropyLoss()

    # Optimizer.
    learning_rate = 5
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    gamma = 0.9
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=gamma)

    # Training and validating.
    batch_size = 16
    train(train_set, val_set, model, criterion, optimizer, scheduler, batch_size=batch_size)

    # Testing.
    test(test_set, model, criterion, batch_size=batch_size)

Preparing data...


120000lines [00:10, 11163.69lines/s]
120000lines [00:18, 6435.63lines/s]
7600lines [00:01, 6810.58lines/s]


Training...
epoch: 1
train loss: 0.064	train acc: 73.325000%
valid loss: 0.055	valid acc: 88.237500%
epoch: 2
train loss: 0.053	train acc: 90.813542%
valid loss: 0.053	valid acc: 89.829167%
epoch: 3
train loss: 0.051	train acc: 93.593750%
valid loss: 0.053	valid acc: 90.500000%
epoch: 4
train loss: 0.050	train acc: 95.010417%
valid loss: 0.052	valid acc: 90.708333%
epoch: 5
train loss: 0.049	train acc: 95.785417%
valid loss: 0.052	valid acc: 91.037500%
epoch: 6
train loss: 0.049	train acc: 96.230208%
valid loss: 0.052	valid acc: 90.950000%
epoch: 7
train loss: 0.049	train acc: 96.536458%
valid loss: 0.052	valid acc: 91.091667%
epoch: 8
train loss: 0.049	train acc: 96.709375%
valid loss: 0.052	valid acc: 91.137500%
epoch: 9
train loss: 0.049	train acc: 96.853125%
valid loss: 0.052	valid acc: 91.216667%
epoch: 10
train loss: 0.049	train acc: 96.961458%
valid loss: 0.052	valid acc: 91.225000%
Testing...
test loss: 0.058	test acc: 80.921053%
