In [1]:
import torch
import torchtext
from torchtext.data import Field, Dataset, BPTTIterator
from torchtext.datasets import WikiText2
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math 
import numpy as np
from time import time

import re

In this notebook, a recurrent neural net of LSTM units is used for prediction of next items in sequences of text data from wikitext corpus. The text sequences here split to single symbols, so the task is to predict what symbol should go next based on several previous values.

In [2]:
# Tokenizer for splitting texts.
tokenize = lambda x: re.findall(".", x)
# torchtext.data.Field variable, necessary for preprocessing.
TEXT = Field(sequential=True, tokenize=tokenize, eos_token="<eos>", lower=True)
# Splitting WikiText2 dataset to train, validation and test sets.
train, valid, test = WikiText2.splits(TEXT)
# Building of vocabulary for embeddings.
TEXT.build_vocab(train, vectors="glove.6B.200d")
# Vocabulary check. Each symbol assigned an id.
print("Vocabulary length: ", len(list(TEXT.vocab.stoi.items())))
print(list(TEXT.vocab.stoi.items())[:30])

Vocabulary length:  245
[('<unk>', 0), ('<pad>', 1), ('<eos>', 2), (' ', 3), ('e', 4), ('t', 5), ('a', 6), ('n', 7), ('i', 8), ('o', 9), ('r', 10), ('s', 11), ('h', 12), ('d', 13), ('l', 14), ('u', 15), ('c', 16), ('m', 17), ('f', 18), ('g', 19), ('p', 20), ('w', 21), ('b', 22), ('y', 23), ('k', 24), (',', 25), ('.', 26), ('v', 27), ('<', 28), ('>', 29)]


In [3]:
# Variables initializing. Length of sequence set to 30.
batch_size = 128
sequence_length = 30

In [4]:
# Makes BPTTIterator for splitting corpus to sequential batches with target, shifted by 1.
train_iter, valid_iter, test_iter = BPTTIterator.splits((train, valid, test),
                                                         batch_size=batch_size,
                                                         bptt_len=sequence_length,    
                                                         shuffle=True)

In [5]:
# More variables initializing.
eval_batch_size = 128
grad_clip = 0.1
best_val_loss = None
log_interval = 100

weight_matrix = TEXT.vocab.vectors
ntokens = weight_matrix.shape[0] # Number of tokens for embedding.
nfeatures = weight_matrix.shape[1]

In [6]:
class RNNModel(nn.Module):
    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5, lnorm=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp) # Shape: (vocabulary length, number of features).
        self.lnorm = None
        if lnorm:
            self.lnorm = nn.LayerNorm(ninp)        
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)  # Shape: (number of features, hidden state units, layers).        
        self.decoder = nn.Linear(nhid, ntoken) # Conversion to vocabulary tokens for final multilabel classification task.
        self.init_weights()        
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):        
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, hidden=None):
        emb = self.drop(self.encoder(x)) # Output shape: (sequence length, batch size, number of features)
        if self.lnorm is not None:
            emb = self.lnorm(emb)        
        output, hidden = self.rnn(emb, hidden) 
        # output shape: (sequence length, batch size, hidden size)
        # hidden shape: (2 * (number of layers), batch size, hidden size). 1st for hidden state, 2nd for cell state.
        output = self.drop(output)
        # decoder input shape: (batch size * sequence length, hidden size).
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        # decoder output shape: (batch size * sequence length, vocabulary length).
        # returns: (sequence length, batch size, vocabulary length).
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):        
        weight = next(self.parameters()).data        
        return (weight.new(self.nlayers, bsz, self.nhid).zero_(),
                weight.new(self.nlayers, bsz, self.nhid).zero_())        

In [7]:
# Validation function.
def evaluate(data_iter):
    model.eval()
    total_loss = 0        
    hidden = model.init_hidden(eval_batch_size)
    for i, batch_data in enumerate(data_iter):        
        data, targets = batch_data.text, batch_data.target # data and targets from torchtext.data.BPTTIterator.        
        output, hidden = model(data) # Net ouput.
        output_flat = output.view(-1, ntokens)        
        total_loss += criterion(output_flat, targets.view(-1)).item() # Cumulative loss.    
    return total_loss / len(data_iter) # returns mean loss.

In [8]:
# Train function.
def train():
    model.train()
    total_loss = 0    
    for batch, batch_data in enumerate(train_iter):
        data, targets = batch_data.text, batch_data.target        
        optimizer.zero_grad()        
        output, hidden = model(data) # Net output.        
        loss = criterion(output.view(-1, ntokens), targets.view(-1)) # Loss and backprop.
        loss.backward()        
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) # Clipping gradient to counter RNNs exploding gradient problem.        
        optimizer.step()

        # Logging.
        total_loss += loss.item()        
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            print("| epoch {:3d} | {:5d}/{:5d} batches | loss {:5.2f} | ppl {:8.2f}".format(
                epoch, batch, len(train_iter), cur_loss, math.exp(cur_loss)))
            total_loss = 0

In [9]:
model = RNNModel(ntokens, 128, 128, 2, 0.3, lnorm=True)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=1e+2, weight_decay=0.)

In [10]:
# Sequence generation function.
def generate(n=50, temp=1.):
    model.eval()
    # First random symbol.
    x = torch.rand(1, 1).mul(ntokens).long()
    hidden = None
    out = []
    # Making n length sequence.
    for i in range(n):
        output, hidden = model(x, hidden)        
        s_weights = output.squeeze().data.div(temp).exp() # Gets distribution (with temperature) for next symbol.        
        s_idx = torch.multinomial(s_weights, 1)[0] # Samples next symbol index.
        x.data.fill_(s_idx)        
        s = TEXT.vocab.itos[s_idx] # Index to symbol and appends sequence.
        out.append(s)
    # returns string.
    return "".join(out)

Model training and sequence generation. First sequence is generated by untrained net, and consist of random symbols. After first epoch the model starts to produce real word looking samples. Cross entropy was used as loss function, and perplexity as measure of model quality.

$ppl = 2^{-\sum_i{p_i\log{q_i}}}$

In [11]:
# Train, validation and samples output.
with torch.no_grad():
    print("sample:\n", generate(50), "\n") # prints generated sample.

for epoch in range(1, 6):
    start_time = time()
    train()
    val_loss = evaluate(valid_iter)
    print("-" * 89)
    print("| end of epoch {:3d} | valid loss {:5.2f} | valid ppl {:8.2f}".format(
        epoch, val_loss, math.exp(val_loss)))
    print("-" * 89)    
    with torch.no_grad():
        print("sample:\n", generate(50), "\n")
    print("Epoch time: {}".format((time()-start_time)/60.))


sample:
 ¡ต礮аgลbс⁄ī£ćãửκ殻³(čヴ♯µoṃþ〉²£~隊ァxッ攻ル*тcầuხ्яμắ”#კłử 

| epoch   1 |   100/ 2808 batches | loss  2.93 | ppl    18.79
| epoch   1 |   200/ 2808 batches | loss  2.18 | ppl     8.87
| epoch   1 |   300/ 2808 batches | loss  2.06 | ppl     7.86
| epoch   1 |   400/ 2808 batches | loss  2.00 | ppl     7.37
| epoch   1 |   500/ 2808 batches | loss  1.95 | ppl     7.03
| epoch   1 |   600/ 2808 batches | loss  1.92 | ppl     6.82
| epoch   1 |   700/ 2808 batches | loss  1.89 | ppl     6.64
| epoch   1 |   800/ 2808 batches | loss  1.88 | ppl     6.52
| epoch   1 |   900/ 2808 batches | loss  1.86 | ppl     6.44
| epoch   1 |  1000/ 2808 batches | loss  1.85 | ppl     6.37
| epoch   1 |  1100/ 2808 batches | loss  1.83 | ppl     6.24
| epoch   1 |  1200/ 2808 batches | loss  1.83 | ppl     6.26
| epoch   1 |  1300/ 2808 batches | loss  1.82 | ppl     6.16
| epoch   1 |  1400/ 2808 batches | loss  1.80 | ppl     6.05
| epoch   1 |  1500/ 2808 batches | loss  1.80 | ppl     6.05
| epoch 

-----------------------------------------------------------------------------------------
sample:
  it was such as astant workments of cape of the fi 

Epoch time: 15.59484405517578
| epoch   5 |   100/ 2808 batches | loss  1.71 | ppl     5.54
| epoch   5 |   200/ 2808 batches | loss  1.69 | ppl     5.42
| epoch   5 |   300/ 2808 batches | loss  1.69 | ppl     5.45
| epoch   5 |   400/ 2808 batches | loss  1.70 | ppl     5.45
| epoch   5 |   500/ 2808 batches | loss  1.69 | ppl     5.45
| epoch   5 |   600/ 2808 batches | loss  1.69 | ppl     5.43
| epoch   5 |   700/ 2808 batches | loss  1.69 | ppl     5.44
| epoch   5 |   800/ 2808 batches | loss  1.69 | ppl     5.43
| epoch   5 |   900/ 2808 batches | loss  1.69 | ppl     5.41
| epoch   5 |  1000/ 2808 batches | loss  1.69 | ppl     5.43
| epoch   5 |  1100/ 2808 batches | loss  1.69 | ppl     5.40
| epoch   5 |  1200/ 2808 batches | loss  1.69 | ppl     5.43
| epoch   5 |  1300/ 2808 batches | loss  1.69 | ppl     5.43
| epoch   5 