# Deep Learning with PyTorch


Author: [Anand Saha](http://teleported.in/)

### 4. Sequence Models - RNN for text generation

We will create a language model based on Shakespeare's writings, and will then generate new text similar to Shakespeare's.

A simplified version of https://github.com/pytorch/examples/tree/master/word_language_model

In [1]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.cuda as cuda
import torch.optim as optim

import numpy as np

**Helper class to read in the texts, convert the words to integer indexes and provide lookup tables to convert any word to its index and vice versa.**

In [2]:
import os
import torch

class Dictionary(object):
    """Two-way vocabulary: word -> integer index (`word2idx`) and index -> word (`idx2word`)."""

    def __init__(self):
        # idx2word[i] is the word with index i; word2idx is the inverse lookup.
        self.idx2word = []
        self.word2idx = {}

    def add_word(self, word):
        """Register `word` if it has not been seen before; return its index either way."""
        try:
            return self.word2idx[word]
        except KeyError:
            # New word: it takes the next free slot at the end of idx2word.
            new_index = len(self.idx2word)
            self.idx2word.append(word)
            self.word2idx[word] = new_index
            return new_index

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)


class Corpus(object):
    """
    Used to load text files (training and validation), generate dictionary from contents,
    and tokenize each file (words --> integers).

    Attributes set by the constructor:
      dictionary -- `Dictionary` holding every word seen in either file, plus '<eos>'
      whitelist  -- list of the characters kept when reading text; all others are dropped
      train      -- 1-D LongTensor of word IDs for `train.txt`
      valid      -- 1-D LongTensor of word IDs for `valid.txt`
    """
    def __init__(self, path):
        """
        :param path: directory containing `train.txt` and `valid.txt`
        """
        self.dictionary = Dictionary()

        # This is very English language specific.
        # We will ingest only these characters (printable ASCII, codes 32..126):
        self.whitelist = [chr(i) for i in range(32, 127)]

        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))

    def tokenize(self, path):
        """
        Tokenizes a text file. Converts each word in source text to integer ID, adding
        unseen words to `self.dictionary` on the way. Every line is terminated with the
        special '<eos>' token.

        :param path: path of the UTF-8 text file to read
        :returns: 1-D LongTensor containing the sequence of word IDs
        :raises FileNotFoundError: if `path` does not exist
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not os.path.exists(path):
            raise FileNotFoundError(path)

        # Set membership is O(1); testing against the list costs up to 95 comparisons
        # per character.
        allowed = frozenset(self.whitelist)
        add_word = self.dictionary.add_word

        # Single pass: add_word() returns the index of both new and known words, so the
        # file does not have to be read a second time to look the IDs up.
        ids = []
        with open(path, 'r', encoding="utf8") as f:
            for line in f:
                line = ''.join(c for c in line if c in allowed)
                ids.extend(add_word(word) for word in line.split())
                ids.append(add_word('<eos>'))

        return torch.tensor(ids, dtype=torch.long)

In [3]:
!ls data/shakespear

train.txt  valid.txt


In [4]:
# Build the dictionary and tokenize train.txt / valid.txt found in this directory
corpus = Corpus('./data/shakespear')

In [5]:
# Verify that dictionary works
print(corpus.dictionary.idx2word[10])
print(corpus.dictionary.word2idx['That'])

That
10


In [6]:
# About a million words in training set, 63k in validation set
print(corpus.train.size())
print(corpus.valid.size())

torch.Size([1039900])
torch.Size([63420])


In [7]:
# Which word sits at index 112 of the training set?
# (named `word_id` rather than `id` so the builtin `id()` is not shadowed)
word_id = corpus.train[112]
corpus.dictionary.idx2word[word_id]

'else'

In [8]:
vocab_size = len(corpus.dictionary)
print(vocab_size)

74010


**Added by Ryan to deal with out-of-memory errors**

In [9]:
# Configure PyTorch's CUDA caching allocator: per the PyTorch CUDA memory-management
# notes, max_split_size_mb stops the allocator from splitting blocks larger than the
# given size (here 512 MB), which can reduce fragmentation.
# NOTE(review): presumably this only takes effect if set before the first CUDA
# allocation in the process -- confirm against the PyTorch documentation.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
# Release cached GPU memory that is currently unused
torch.cuda.empty_cache()

**The RNN model (using GRU cells)**

In [10]:
import torch.nn as nn
from torch.autograd import Variable

class RNNModel(nn.Module):
    """
    Word-level language model: embedding -> dropout -> stacked GRU -> dropout -> linear
    decoder that produces one score per dictionary word at every time step.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5):
        """
        :param vocab_size: number of words in dictionary
        :param embed_size: see word embedding technique, vectors used to represent words
            are of this length
        :param hidden_size: number of features in the hidden state of each GRU layer
        :param num_layers: number of recurrent layers, i.e. number of stacked GRUs
        :param dropout: dropout probability, applied after the embedding, between the
            stacked GRU layers and before the decoder
        """
        super(RNNModel, self).__init__()

        # Will map an ID to a vector of dims embed_size. Is trained via backpropagation like
        # everything else.
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.drop1 = nn.Dropout(dropout)
        self.drop2 = nn.Dropout(dropout)
        # nn.GRU only applies dropout between stacked layers; with a single layer a
        # non-zero value has no effect and just triggers a warning, so pass 0 then.
        rnn_dropout = dropout if num_layers > 1 else 0
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers, dropout=rnn_dropout)
        self.decoder = nn.Linear(hidden_size, vocab_size)

        self.init_weights()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Used to print the tensor shapes once, on the first forward() call only.
        self.forward_called = False

    def init_weights(self):
        """Initialise encoder/decoder weights uniformly in [-0.1, 0.1], decoder bias to 0."""
        initrange = 0.1
        # nn.init.* modify the parameters in place without recording autograd history,
        # replacing the deprecated `.data` access.
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input_data, hidden):
        """
        Standard function to apply NN to input.

        :param input_data: input features. A 2d tensor of batch data (word IDs). Go down
            the rows to advance through sequences, across the columns to switch sequence.
        :param hidden: hidden features from previous time step, shaped
            (num_layers, number of columns in input_data, hidden_size)
        :returns: (scores of shape (rows, columns, vocab_size), updated hidden state)
        """
        emb = self.drop1(self.encoder(input_data))
        output, hidden = self.rnn(emb, hidden)
        if not self.forward_called:
            self.forward_called = True
            print("first forward() call, input shape is", input_data.shape)
            print("hidden shape is", hidden.shape)
            print("emb shape is", emb.shape)
            print("output shape is", output.shape)
        output = self.drop2(output)
        # nn.Linear acts on the last dimension and keeps the leading ones, so the
        # (rows, columns, hidden_size) tensor can be decoded without flattening first.
        decoded = self.decoder(output)
        return decoded, hidden

    def init_hidden(self, batch_size):
        """
        Return an all-zero initial hidden state of shape
        (num_layers, batch_size, hidden_size) with the dtype/device of the model parameters.
        """
        weight = next(self.parameters())
        return weight.new_zeros(self.num_layers, batch_size, self.hidden_size)
    

In [11]:
def batchify(data, batch_size):
    """
    Split data into batches, each of length batch_size.
    If batch size is n, then the data will be divided into n subsequences,
    with a subsequence being a contiguous series of words from the
    original text.

    Example:
    subsequence 0: "once upon a time..."
    subsequence 1: "a good king and..."

    Batch 0 contains first token from each subsequence, batch 1 contains
    second token, and so on.

    Trailing tokens that do not fill a complete row are dropped. If `data`
    holds fewer than `batch_size` tokens the result has zero rows.

    :param data: data as 1D tensor
    :param batch_size: size of a single batch (must be positive)
    :return: a 2D tensor of shape (len(data) // batch_size, batch_size) in which
        each row is a batch; moved to the GPU when CUDA is available
    :raises ValueError: if batch_size is not positive
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))
    # Work out how cleanly we can divide the dataset into batch_size parts.
    nbatch = data.size(0) // batch_size
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * batch_size)
    # Evenly divide the data across the batch_size subsequences (one per column after
    # the transpose). The explicit `nbatch` (instead of -1) keeps this working when
    # nbatch == 0: view() cannot infer a -1 dimension for a tensor with 0 elements.
    data = data.view(batch_size, nbatch).t().contiguous()
    if cuda.is_available():
        data = data.cuda()
    return data

In [12]:
# Demonstrate batchify() on a tiny sentence split into 2 subsequences (columns).
dummy_data = "Once upon a time there was a good king and a queen"
dummy_data_idx = list(map(corpus.dictionary.word2idx.__getitem__, dummy_data.split()))
dummy_tensor = torch.LongTensor(dummy_data_idx)
op = batchify(dummy_tensor, 2)

# Each row of `op` is one batch: the same time step taken from both subsequences.
print(f"{len(op)} batches: the words")
for left, right in op.tolist():
    print("%10s %10s" % (corpus.dictionary.idx2word[left], corpus.dictionary.idx2word[right]))

print(f"\n{len(op)} batches: the indexes")
for left, right in op.tolist():
    print("%10d %10d" % (left, right))
op.shape

6 batches: the words
      Once          a
      upon       good
         a       king
      time        and
     there          a
       was      queen

6 batches: the indexes
      9917         46
       845       1171
        46       2463
        23         90
       994         46
      1538       5574


torch.Size([6, 2])

In [13]:
bs_train = 20       # batch size for training set
bs_valid = 10       # batch size for validation set
# CRASHISSUE: the original value was 35; presumably lowered to 25 to avoid
# out-of-memory crashes (TODO confirm).
bptt_size = 25      # number of time steps to unroll the graph for back propagation through time
clip = 0.25         # max gradient norm used for clipping, to keep exploding gradients in check

embed_size = 200    # size of the embedding vector
hidden_size = 200   # size of the hidden state in the RNN 
num_layers = 2      # number of stacked RNN layers to use
dropout_pct = 0.5   # dropout probability used for regularization

In [14]:
# Reshape the flat token streams into 2D tables with one column per subsequence
# (see batchify()): bs_train columns for training, bs_valid for validation.
train_data = batchify(corpus.train, bs_train)
val_data = batchify(corpus.valid, bs_valid)

In [15]:
# ~52,000 batches (rows), each of size 20 (columns)
train_data.shape

torch.Size([51995, 20])

**Experiments added by Ryan**
*(feel free to delete or whatever)*

In [16]:
dumb = torch.tensor([[0, 1], [2, 3], [4, 5]])
dumb.shape
dumb_flattened = dumb.view(-1)
dumb_flattened

tensor([0, 1, 2, 3, 4, 5])

In [17]:
# Build the GRU language model with the hyperparameters defined above.
# vocab_size: 74010, embed_size: 200, hidden_size: 200, num_layers: 2
model = RNNModel(vocab_size, embed_size, hidden_size, num_layers, dropout_pct)

# Move the model's parameters to the GPU when one is available.
if cuda.is_available():
    model.cuda()

In [18]:
# Loss used by train() and evaluate(): cross-entropy between the decoder's per-word
# scores and the IDs of the words that actually come next.
criterion = nn.CrossEntropyLoss()

In [19]:
def get_batch_series(source, i, evaluation=False, bptt=None):
    """
    Given a tensor containing batch data, extract a sequence starting at index i (i.e. batch i).
    Sequence length is the unroll length (`bptt`, or the global `bptt_size`) or less.

    Again, each row of returned data is a batch, each column of row belongs to a particular
    sequence of words. Technically, this returns a table of batch data.

    The returned 'target' is the same window shifted one row forward and flattened: a 1-D
    tensor containing the contents of the second row in returned `data`, followed by the
    contents of the third row, etc., ending with the row just past the end of `data`.

    The goal of this whole thing is to discover a relationship between words in one
    batch and words in the next batch.

    :param source: 2D tensor of batch data, as produced by batchify()
    :param i: index of the first row to extract
    :param evaluation: unused; kept so existing callers that pass it keep working
    :param bptt: maximum number of rows to extract; defaults to the global `bptt_size`
    :returns: (extracted sequence as tensor, a same-length sequence but one time step forward
        and flattened)
    """
    max_len = bptt_size if bptt is None else bptt
    # Sequence length is whatever is smaller: number of unrollings or remaining entries in
    # batch, past index i. One row is held back because the last row has no target.
    seq_len = min(max_len, len(source) - 1 - i)
    # Plain tensors are used directly: the `Variable` wrapper has been a no-op since
    # PyTorch 0.4.
    data = source[i:i + seq_len]
    # reshape() also handles a non-contiguous `source`, where view() would raise.
    target = source[i + 1:i + 1 + seq_len].reshape(-1)
    if cuda.is_available():
        data = data.cuda()
        target = target.cuda()
    return data, target

In [20]:
data, target = get_batch_series(train_data, 1)

In [21]:
# Num batches x batch size
print(f"data.shape = {data.shape[0]} x {data.shape[1]}")
print("target.shape =", target.shape)

data.shape = 25 x 20
target.shape = torch.Size([500])


In [22]:
target.shape

torch.Size([500])

**More experiments by Ryan**

In [30]:
# Walk over the first few batch series and, for series 0 and 1, show the first ten
# targets plus the first ten words of the first five sequences (columns).
print("range: ", range(0, 10))
for batch_series_num, i in enumerate(range(0, train_data.size(0) - 1, bptt_size)):
    print(f"batch series: {batch_series_num}, i: {i}")
    if batch_series_num in (0, 1):
        data, targets = get_batch_series(train_data, i)
        idx2word = corpus.dictionary.idx2word
        prefix = f"batch series {batch_series_num}/index {i}"
        # First 10 target words, quoted and comma-separated
        targets_str = ", ".join(f"'{idx2word[targets[t]]}'" for t in range(10))
        print(f"{prefix}, targets {targets_str}")
        # First 5 columns, first 10 rows of each
        for seq in range(5):
            row_text = " ".join(idx2word[data[w][seq]] for w in range(10))
            print(f"{prefix}, seq {seq}; words are: {row_text}")
    # Stop after announcing series 10; only the first two are printed in detail.
    if batch_series_num >= 10:
        break

range:  range(0, 10)
batch series: 0, i: 0
batch series 0/index 0, targets 'THE', 'discontents', 'will', 'imprison'd.', '<eos>', 'like', 'the', 'be', 'USHER', 'am'
batch series 0/index 0, seq 0; words are: <eos> THE SONNETS <eos> <eos> 1 <eos> <eos> From fairest
batch series 0/index 0, seq 1; words are: The discontents repair, and men's reports <eos> Give him much
batch series 0/index 0, seq 2; words are: beg will not become me. My <eos> way is to
batch series 0/index 0, seq 3; words are: she imprison'd. All <eos> Is outward sorrow, though I think
batch series 0/index 0, seq 4; words are: <eos> <eos> PLAYER QUEEN. <eos> So many journeys may the
batch series: 1, i: 25
batch series 1/index 25, targets 'riper', 'the', 'love', 'FIRST', 'is', 'than', 'is', '<eos>', 'the', 'them'
batch series 1/index 25, seq 0; words are: the riper should by time decease, <eos> His tender heir
batch series 1/index 25, seq 1; words are: from the primal state <eos> That he which is was
batch series 1/index 25,

**Model Training**

In [31]:
def train(data_source, lr):
    """
    Run one pass (epoch) over the training data, updating the global `model` in place.

    Relies on the notebook globals `model`, `criterion`, `bs_train`, `bptt_size` and `clip`.

    :param data_source: the training data, split into bs_train subsequences (columns), as
        described in batchify() docstring
    :param lr: learning rate for the Adam optimizer
    :returns: average loss per time step (row) over the whole pass, as a Python float;
        0.0 if data_source is too short to yield a single input/target pair
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.0
    total_steps = 0
    hidden = model.init_hidden(bs_train)
    # NOTE: a new optimizer is created on every call, so Adam's running moment estimates
    # start from scratch each epoch.
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for i in range(0, data_source.size(0) - 1, bptt_size):
        data, targets = get_batch_series(data_source, i)

        # Starting each batch series, we detach the hidden state from how it was previously
        # produced. If we didn't, the model would try backpropagating all the way to start
        # of the dataset. init_hidden() already puts it on the model's device.
        hidden = hidden.detach()

        optimizer.zero_grad()

        output, hidden = model(data, hidden)
        # Flatten (rows, columns, vocab) -> (rows * columns, vocab) to line up with the
        # flattened targets.
        loss = criterion(output.view(-1, output.size(-1)), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        # .item() gives a plain float, so no tensor (or autograd history) is kept alive
        # and the function also works when the loop body never runs.
        total_loss += len(data) * loss.item()
        total_steps += len(data)

    # Normalise by the number of rows actually scored (len(data_source) - 1: the last row
    # only ever serves as a target), rather than by len(data_source).
    return total_loss / total_steps if total_steps else 0.0

In [32]:

def evaluate(data_source):
    """
    Compute the average loss of the global `model` on `data_source` without updating it.

    Relies on the notebook globals `model`, `criterion`, `bs_valid` and `bptt_size`.

    :param data_source: validation data split into bs_valid subsequences (columns), as
        described in batchify() docstring
    :returns: average loss per time step (row), as a Python float; 0.0 if data_source is
        too short to yield a single input/target pair
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.0
    total_steps = 0
    hidden = model.init_hidden(bs_valid)

    # No gradients are needed for validation; disabling autograd avoids building a graph
    # through the carried-over hidden state and so reduces memory use.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt_size):
            data, targets = get_batch_series(data_source, i, evaluation=True)

            output, hidden = model(data, hidden)
            # Flatten (rows, columns, vocab) -> (rows * columns, vocab) to line up with
            # the flattened targets.
            output_flat = output.view(-1, output.size(-1))

            total_loss += len(data) * criterion(output_flat, targets).item()
            total_steps += len(data)

    # Normalise by the number of rows actually scored (len(data_source) - 1: the last row
    # only ever serves as a target), rather than by len(data_source).
    return total_loss / total_steps if total_steps else 0.0


In [33]:
# Lowest validation loss seen so far (None until the first epoch finishes). run() updates
# it and saves the model whenever the validation loss improves.
best_val_loss = None

In [34]:
from datetime import datetime

def run(epochs, lr):
    """
    Train for `epochs` epochs, reporting timing and losses after each one.

    After every epoch the model is evaluated on `val_data`; whenever the validation loss
    beats the global `best_val_loss`, that value is updated and the model's state dict is
    saved to ./4.model.pth.

    :param epochs: number of passes over the training data
    :param lr: learning rate handed to train()
    """
    global best_val_loss

    for epoch in range(epochs):
        print("Beginning training for epoch:", epoch)
        start_time = datetime.now()
        train_loss = train(train_data, lr)
        val_loss = evaluate(val_data)
        elapsed = int((datetime.now() - start_time).total_seconds())
        # divmod keeps the minutes integral (plain `/` printed e.g. 2.4166666666666665)
        # and the seconds are zero-padded to two digits, giving m:ss.
        minutes, seconds = divmod(elapsed, 60)
        print(f"Finished epoch {epoch} in {minutes}:{seconds:02d}")
        print("Train Loss: ", train_loss, "Valid Loss: ", val_loss)

        # `is None` rather than truthiness, so a legitimate best loss of 0.0 is not
        # mistaken for "no best loss yet".
        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "./4.model.pth")


In [None]:
# Release cached GPU memory that is currently unused before starting the run
torch.cuda.empty_cache()
# Train for 5 epochs with a learning rate of 0.001
run(5, 0.001)

Beginning training for epoch: 0
first forward() call, input shape is torch.Size([25, 20])
hidden shape is torch.Size([2, 20, 200])
emb shape is torch.Size([25, 20, 200])
output shape is torch.Size([25, 20, 200])
Finished epoch 0 in 2.4166666666666665: 25
Train Loss:  6.846469011443408 Valid Loss:  6.875562963181961
Beginning training for epoch: 1


In [30]:
run(5, 0.001)

Train Loss:  5.9627584383113765 Valid Loss:  6.722295928926206
Train Loss:  5.916144581209732 Valid Loss:  6.725216069457584
Train Loss:  5.878729925954419 Valid Loss:  6.75531858542258
Train Loss:  5.848820199057601 Valid Loss:  6.768370831362346
Train Loss:  5.825219372055005 Valid Loss:  6.7805644660596025


**Text Generation**

In [24]:
num_words = 200   # number of words to generate
temperature = 1   # sampling temperature; a later cell in this notebook reassigns it to 0.8

In [27]:
# Rebuild the architecture and restore the best weights saved by run().
model = RNNModel(vocab_size, embed_size, hidden_size, num_layers, dropout_pct)
# map_location="cpu" lets a checkpoint written from a GPU run be loaded on a machine
# without CUDA; the model is moved to the GPU below when one is available.
model.load_state_dict(torch.load("./4.model.pth", map_location="cpu"))

if cuda.is_available():
    model.cuda()

# Evaluation mode disables dropout for text generation.
model.eval()

RNNModel(
  (encoder): Embedding(74010, 200)
  (drop1): Dropout(p=0.5)
  (drop2): Dropout(p=0.5)
  (rnn): GRU(200, 200, num_layers=2, dropout=0.5)
  (decoder): Linear(in_features=200, out_features=74010, bias=True)
)

In [26]:
# https://nlp.stanford.edu/blog/maximum-likelihood-decoding-with-rnns-the-good-the-bad-and-the-ugly/
# Which sample is better? It depends on your personal taste. The high temperature 
# sample displays greater linguistic variety, but the low temperature sample is 
# more grammatically correct. Such is the world of temperature sampling - lowering 
# the temperature allows you to focus on higher probability output sequences and 
# smooth over deficiencies of the model.

# If we set a high temperature, we can get more entropic (*noisier*) probabilities
# Often we want to sample with low temperatures to produce sharp probabilities
temperature = 0.8

In [32]:
# Generate `num_words` words, one at a time, starting from the seed word 'I'.
hidden = model.init_hidden(1)
idx = corpus.dictionary.word2idx['I']
# A 1x1 tensor (one time step, one sequence) on the same device as the model. Named
# `input_ids` rather than `input` so the builtin `input()` is not shadowed.
input_ids = torch.tensor([[idx]], dtype=torch.long, device=next(model.parameters()).device)

print(corpus.dictionary.idx2word[idx], '', end='')

# `volatile=True` has had no effect since PyTorch 0.4; torch.no_grad() is its replacement
# and stops autograd from tracking the hidden state across all generated words.
with torch.no_grad():
    for _ in range(num_words):
        output, hidden = model(input_ids, hidden)
        # Temperature-scaled softmax: the same sampling distribution as exp(score / T),
        # but computed without risk of overflow for large scores.
        word_weights = torch.softmax(output.squeeze() / temperature, dim=-1).cpu()
        word_idx = torch.multinomial(word_weights, 1)[0].item()
        # Feed the sampled word back in as the next input.
        input_ids.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]

        if word == '<eos>':
            print('')
        else:
            print(word + ' ', end='')

I am a cog, 
A trumpet with his sham'd. 
SECOND MURDERER. I have say make you be of your now. 
PETER. He shall not be afeard against my life, for her, to the 
music of my name to a thence is the Queen of the 
tanner of my own posterity, 
WOLSEY. I know the matter, Is a Boult, an steward 
I'll hear the mercy when they are you? 
The valour is the Empress' innocent 
At heaven unto the flesh, have a Antony, 
Yet, like the man of high all men will am 
And nothing to faith, the happiness of his war. 
FIRST MURDERER. What goes the thousand king of his man's minds in 
reason. 

Come on him here. 

Enter PISANIO and drum and LORD 

CAPULET. 
My poor lord, I are a letter in 
To see them. Let the King be absent. 
I am that woman, can not into a weapon 
To us a thousand more. 

Set of the master? 

HAMLET. 
How bright my council lie up to the exceeds and cross 
bade 

**Homework**

* Play with the hyperparameters
* Play with the model architecture
* Run this on a different dataset
* Look up: perplexity