## Demo 3 — next word prediction via LSTM

In [1]:
import sys
import os
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np

### Create a class for loading the text data.

In [3]:
from nltk.tokenize import WordPunctTokenizer
import random

class TextLoader(object):
    def __init__(self, filename):
        thefile = open(filename, "r")
        
        tokenizer = WordPunctTokenizer()
        
        self.sentences = []
        self.predicted = []
        for line in thefile:
            sentence = tokenizer.tokenize(line)
            if sentence:
                # List of tokenized sentences with the start symbol
                # and parallel list of each word's next word and
                # the end symbol.
                self.sentences.append(["<s>"] + sentence)
                self.predicted.append(sentence + ["<e>"])
        

        uniquevocab = list(set(sum(self.sentences, []))) + ["<e>"]
        self.vocabindex = {}
        for i in range(len(uniquevocab)):
            self.vocabindex[uniquevocab[i]] = i
        
        # Turn the sentences into lists of indices into the vocab.
        self.int_sentences = [[self.vocabindex[x] for x in y] for 
                              y in self.sentences]
        self.int_predicted = [[self.vocabindex[x] 
                               for x in y] for 
                              y in self.predicted]
        self.pairs = list(zip(self.int_sentences, self.int_predicted))
    
    # Get it randomized at every round.
    def shuffle(self):
        random.shuffle(self.pairs)
        int_sentences, int_predicted = zip(*self.pairs)
        self.int_sentences = list(int_sentences)
        self.int_predicted = list(int_predicted)
    
    def __getitem__(self, n):
        return self.pairs[n]
    
    def __iter__(self):
        return iter(self.pairs)

In [5]:
print(sentences)

NameError: name 'sentences' is not defined

In [3]:
tl = TextLoader("snark.txt")

In [4]:
tl[300]

([1924, 1448, 138, 2258, 2046, 748, 138, 2258, 1411, 878, 415, 1851],
 [1448, 138, 2258, 2046, 748, 138, 2258, 1411, 878, 415, 1851, 2267])

Every item in the `TextLoader` is now a pair consisting of the sentence, and the next words for each word.  Note that 64 is the start symbol and 2267 is the end symbol.

### Define the model.

In [5]:
class LSTMFun(nn.Module):
    def __init__(self, vocab_size, input_size, hidden_size, lstm_layers=2, dropout=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, input_size)
        # We need at least two LSTM layers for the LSTM dropout to be meaningful.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=lstm_layers, dropout=dropout)
        self.linear = nn.Linear(hidden_size, vocab_size)
        # The dimension that we run the softmax on is really important.  We want the dimension that
        # corresponds to a distribution over the vocabulary.
        self.softmax = nn.LogSoftmax(dim=2)
        
        self.lstm_layers = lstm_layers
        self.hidden_size = hidden_size
        
    def forward(self, sentence):
        init_hidden = self.init_hidden(len(sentence[0]))
        
        # The sentence as indices goes directly into the embedding layer,
        # which selects randomly-initialized vectors corresponding to the
        # indices.
        output = self.emb(sentence)
        output, hidden = self.lstm(output, init_hidden)
        output = self.linear(output)
        return self.softmax(output)
    
    def set_dev(self, dev):
        self.dev = dev
    
    # We need an initial pair of hidden weights and memory state.
    def init_hidden(self, sen_len):
        return (torch.zeros(self.lstm_layers, sen_len, self.hidden_size).to(self.dev),
                torch.zeros(self.lstm_layers, sen_len, self.hidden_size).to(self.dev))

### Test the forward pass to make sure we have the dimensions right.

Remember that we have never called `shuffle` so everything should be in the file order.

In [6]:
sentence = tl[300]

In [7]:
len(sentence[0]) # Remember that each sentence is a pair whose first member is the input.

12

In [8]:
model = LSTMFun(len(tl.vocabindex), 300, 200)
model.set_dev(torch.device('cpu'))

In this example, we're only seeing batch sizes of 1. So the hidden layer needs to be initialized to the length of that single sentence, which is 12.

In [9]:
hidden = model.init_hidden(12)

In [10]:
hidden

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]]),
 tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]],
 
         [[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
       

In [11]:
hidden[0].size()

torch.Size([2, 12, 200])

Why do the hidden weights come in pairs? Because it is a two-layer network, so we need to feed an initial state with two layers.

In [12]:
output = model.emb(torch.LongTensor([sentence[0]]))

The network is expecting batches of sentences, so we need to enclose the single sentence in an outer dimension to represent a batch of 1.

In [13]:
output

tensor([[[ 2.2732,  0.7312, -0.4062,  ..., -0.1858,  0.0316,  0.3229],
         [ 1.7671, -1.3043, -0.0223,  ..., -1.0204,  1.2957, -0.1316],
         [ 0.7743, -1.4921, -0.3188,  ...,  0.0933, -0.5053,  1.0362],
         ...,
         [-0.8322, -1.4579,  0.7607,  ...,  1.4547,  1.1233, -2.9324],
         [ 0.5830, -0.3188, -2.6546,  ...,  0.4145,  0.4094,  1.1061],
         [-0.6901, -0.1143, -0.0835,  ...,  1.7085,  0.0943, -1.2241]]],
       grad_fn=<EmbeddingBackward>)

In [14]:
output.size()

torch.Size([1, 12, 300])

In [15]:
output2, hidden2 = model.lstm(output, hidden)

In [16]:
output2.size(), hidden2[0].size(), hidden2[1].size()

(torch.Size([1, 12, 200]), torch.Size([2, 12, 200]), torch.Size([2, 12, 200]))

In [17]:
output3 = model.linear(output2)

In [18]:
output3

tensor([[[-0.0235, -0.0082,  0.0413,  ...,  0.0039,  0.0900,  0.0044],
         [-0.0186, -0.0338,  0.0847,  ...,  0.0130,  0.0580,  0.0289],
         [-0.0251, -0.0095,  0.0593,  ..., -0.0152,  0.0425,  0.0078],
         ...,
         [-0.0128,  0.0034,  0.0747,  ..., -0.0004,  0.0801,  0.0282],
         [-0.0299,  0.0040,  0.0498,  ..., -0.0129,  0.0791,  0.0272],
         [-0.0193, -0.0144,  0.0602,  ...,  0.0071,  0.0572,  0.0212]]],
       grad_fn=<AddBackward0>)

In [19]:
output3.size()

torch.Size([1, 12, 2268])

We went from 300-dimension embeddings to a pair of 200-dimensional LSTM layers, and then the linear layer "decompresses" the 200 dimensions to the full size of the vocabulary, 2268.  Note that there is a batch dimension, but the batch size is 1, just as we started.

In [20]:
finaloutput = model.softmax(output3)

In [21]:
finaloutput.size()

torch.Size([1, 12, 2268])

Again, remember that there's a batch size of 1. We have to use `finaloutput[0]` to access the 12 vectors that correspond to probability distributions for each word.

In [22]:
len(tl.vocabindex)

2268

In [23]:
finaloutput

tensor([[[-7.7509, -7.7356, -7.6861,  ..., -7.7234, -7.6373, -7.7230],
         [-7.7459, -7.7611, -7.6426,  ..., -7.7142, -7.6692, -7.6984],
         [-7.7524, -7.7368, -7.6680,  ..., -7.7425, -7.6848, -7.7196],
         ...,
         [-7.7402, -7.7240, -7.6527,  ..., -7.7277, -7.6473, -7.6992],
         [-7.7572, -7.7232, -7.6775,  ..., -7.7402, -7.6482, -7.7001],
         [-7.7466, -7.7417, -7.6671,  ..., -7.7202, -7.6701, -7.7061]]],
       grad_fn=<LogSoftmaxBackward>)

In [24]:
finaloutput[0].size()

torch.Size([12, 2268])

In [25]:
# Remember, we applied LogSoftmax...
finalprobs = torch.exp(finaloutput[0]) 

In [26]:
finalprobs

tensor([[0.0004, 0.0004, 0.0005,  ..., 0.0004, 0.0005, 0.0004],
        [0.0004, 0.0004, 0.0005,  ..., 0.0004, 0.0005, 0.0005],
        [0.0004, 0.0004, 0.0005,  ..., 0.0004, 0.0005, 0.0004],
        ...,
        [0.0004, 0.0004, 0.0005,  ..., 0.0004, 0.0005, 0.0005],
        [0.0004, 0.0004, 0.0005,  ..., 0.0004, 0.0005, 0.0005],
        [0.0004, 0.0004, 0.0005,  ..., 0.0004, 0.0005, 0.0005]],
       grad_fn=<ExpBackward>)

In [27]:
sum(finalprobs[2])

tensor(1.0000, grad_fn=<AddBackward0>)

We get a true probability distribution, since it sums to 1.

In [28]:
criterion = nn.NLLLoss() # Takes the LogSoftmax output.

In [29]:
# Remember that we started with sentence 300.
target = sentence[1]
target = torch.LongTensor(target)

In [30]:
target

tensor([1448,  138, 2258, 2046,  748,  138, 2258, 1411,  878,  415, 1851, 2267])

In [31]:
target.size()

torch.Size([12])

This is a little confusing. `nn.NLLLoss` takes as the target the *indices* of the output words; it constructs the one-hot vectors/distributions itself. Easy to forget if you're used to sklearn, keras, etc. 

In [32]:
loss = criterion(finaloutput[0], target)

In [33]:
loss

tensor(7.7129, grad_fn=<NllLossBackward>)

Since we're not actually doing the loop, we don't backpropagate.

### Write the training loop.

In [34]:
import torch.optim as optim

def train(tl, epochs=3):
    model = LSTMFun(len(tl.vocabindex), 300, 200)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.NLLLoss()
    
    dev = torch.device('cuda:3')
    model = model.to(dev)
    model.set_dev(dev)
    
    for i in range(epochs):
        tl.shuffle() # Shuffle at every epoch.
                
        for input_sent, output_sent in tl: 
            # This is extremely inefficient, a waste of GPU to send 
            # each instance individually, since we're not using
            # batching.
            input_tensor = torch.LongTensor(np.array([input_sent])).to(dev)
            output_tensor = torch.LongTensor(np.array(output_sent)).to(dev)
            optimizer.zero_grad()
            result = model(input_tensor)
            loss = criterion(result[0], output_tensor)
            loss.backward()
            optimizer.step()
            
    return model

In [35]:
trained = train(tl)

### Use the model to obtain the next word of a prefix sentence.

In [36]:
def predict(model, prefix):
    dev = torch.device("cuda:3")
    indices = [tl.vocabindex[word] for word in prefix.split()]
    indices = torch.LongTensor(np.array([indices])).to(dev)
    result = model(indices)
    return result

To test it, we need to use strictly words that will be in the vocabulary, otherwise it will raise an exception.

In [37]:
predictions = predict(trained, "<s> Baker believes woe")

In [38]:
predictions[0][-1].topk(1)[1].item()

2267

In [39]:
rev_vocab = {v: k for k, v in tl.vocabindex.items()}

In [40]:
rev_vocab[2267]

'<e>'

It predicted the end symbol. But does it always do that? In the first place, it's been trained on very little data and with very few epochs, so it's quite possible!

In [41]:
def generate(model, prefix, length=6):
    dev = torch.device("cuda:3")
    
    for i in range(length):
        # Iteratively get the next several words by concatenating
        # predictions.
        indices = [tl.vocabindex[word] for word in prefix.split()]
        indices = torch.LongTensor(np.array([indices])).to(dev)
        result = model(indices)
        nextword = rev_vocab[result[0][-1].topk(1)[1].item()]
        prefix = prefix + " " + nextword
        
    return prefix

In [42]:
generate(trained, "<s> believes the profit and")

'<s> believes the profit and hope ; <e> , <e> .'

In [43]:
generate(trained, "the Captain unfailing and also")

'the Captain unfailing and also a Snark , <e> , <e>'

In [44]:
generate(trained, "friend staff <s> quivering dens corrupt")

'friend staff <s> quivering dens corrupt , <e> , <e> . <e>'

So it does behave creatively. Let's train another model for a few more epochs.

In [45]:
trained2 = train(tl, epochs=30)

In [49]:
generate(trained2, "<s> believes the profit and")

'<s> believes the profit and hope ; <e> , <e> ,'

In [50]:
generate(trained2, "the Captain unfailing and also")

'the Captain unfailing and also seems a Snark , <e> ,'

In [51]:
generate(trained2, "friend staff <s> quivering dens corrupt")

'friend staff <s> quivering dens corrupt data , <e> , <e> ,'

It gives us more, but it's still too little data to be very good.