# Sequence Models - RNN for Text Generation

In [1]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.cuda as cuda
import torch.optim as optim

import numpy as np

## Read Text and Convert to Integer Indexes

In [6]:
import os
import torch

class Dictionary(object):
    """Two-way mapping between tokens and contiguous integer ids.

    ``word2idx`` maps a token to its id; ``idx2word`` is the inverse, a list
    indexed by id. Ids are assigned in first-seen order starting at 0.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register ``word`` if it is new and return its id either way."""
        known_id = self.word2idx.get(word)
        if known_id is not None:
            return known_id
        # The next free id is simply the current vocabulary size.
        new_id = len(self.idx2word)
        self.idx2word.append(word)
        self.word2idx[word] = new_id
        return new_id

    def __len__(self):
        """Number of distinct tokens registered so far."""
        return len(self.idx2word)
    
class Corpus(object):
    """Load ``train.txt`` and ``valid.txt`` from a directory as tensors of word ids.

    Both splits are tokenized against the same ``Dictionary``, so a word has
    the same id in ``self.train`` and ``self.valid``.
    """

    def __init__(self, path):
        """Build the vocabulary and tokenize both splits found under ``path``."""
        self.dictionary = Dictionary()

        # Printable ASCII only (space through '~'). Anything else -- including
        # tab and newline characters -- is removed from a line before splitting.
        self.whitelist = [chr(i) for i in range(32,127)]

        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))

    def tokenize(self, path):
        """Convert a text file into a 1-D LongTensor of word ids.

        Each line is filtered to whitelisted characters, split on whitespace,
        and terminated with an ``<eos>`` token. Unseen words are added to
        ``self.dictionary`` as they are encountered.

        Args:
            path: path of the UTF-8 text file to read.

        Returns:
            torch.LongTensor with one id per token, in file order.

        Raises:
            AssertionError: if ``path`` does not exist.
        """
        assert os.path.exists(path)

        # Set membership is O(1) per character; scanning the list is not.
        allowed = set(self.whitelist)

        # Single pass: add_word returns the id for both new and known words,
        # so the file does not need to be read a second time to look ids up.
        ids = []
        with open(path, 'r', encoding='utf8') as f:
            for line in f:
                line = ''.join(c for c in line if c in allowed)
                for word in line.split() + ['<eos>']:
                    ids.append(self.dictionary.add_word(word))

        return torch.tensor(ids, dtype=torch.long)
    

In [7]:
# NOTE(review): `!cd` runs in a throwaway subshell, so this line does not change
# the kernel's working directory; the next cell still resolves './data' relative
# to the directory the notebook was started from. Use `%cd` if a persistent
# directory change is ever intended.
!cd data

In [8]:
# Reads ./data/train.txt and ./data/valid.txt (relative to the kernel's working
# directory) and builds the shared vocabulary.
corpus = Corpus('./data')

In [9]:
print(corpus.dictionary.idx2word[10])
print(corpus.dictionary.word2idx['That'])

That
10


In [10]:
print(corpus.train.size())
print(corpus.valid.size())

torch.Size([1039900])
torch.Size([63420])


In [11]:
# Look up the word stored at position 112 of the training id stream.
# Named `word_id` (not `id`) so the builtin id() is not shadowed for the rest of
# the kernel session; .item() turns the 0-dim tensor into a plain int index.
word_id = corpus.train[112].item()
corpus.dictionary.idx2word[word_id]

'else'

In [12]:
# Number of distinct tokens in the shared dictionary; later used to size the
# model's embedding table and output layer.
vocab_size = len(corpus.dictionary)
print(vocab_size)

74010


## RNN Model using GRU Cells

In [38]:
import torch.nn as nn
from torch.autograd import Variable

class RNNModel(nn.Module):
    """Word-level language model: embedding -> dropout -> GRU -> dropout -> linear.

    Tensors are time-major, i.e. inputs are ``(seq_len, batch)`` id tensors,
    because the GRU is created with its default ``batch_first=False``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5):
        """
        Args:
            vocab_size: number of distinct tokens (embedding rows / output logits).
            embed_size: dimensionality of the word embeddings.
            hidden_size: dimensionality of the GRU hidden state.
            num_layers: number of stacked GRU layers.
            dropout: drop probability used after the embedding, between GRU
                layers, and on the GRU output.
        """
        super(RNNModel, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Attribute names and creation order are kept stable: they determine the
        # state_dict keys that saved checkpoints are loaded against.
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.drop1 = nn.Dropout(dropout)
        self.drop2 = nn.Dropout(dropout)
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers, dropout=dropout)
        self.decoder = nn.Linear(hidden_size, vocab_size)

        self.init_weights()

    def init_weights(self):
        """Re-initialise embedding and decoder weights uniformly in [-0.1, 0.1]; zero the decoder bias."""
        initrange = 0.1
        # nn.init runs in-place under no_grad, replacing manual `.data` mutation.
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input, hidden):
        """Run one window through the network.

        Args:
            input: LongTensor of token ids, shape ``(seq_len, batch)``.
            hidden: GRU state, shape ``(num_layers, batch, hidden_size)``.

        Returns:
            (logits, hidden): logits has shape ``(seq_len, batch, vocab_size)``;
            hidden is the updated GRU state.
        """
        emb = self.drop1(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop2(output)

        # Flatten time and batch so the decoder sees a plain 2-D matrix, then
        # restore the (seq_len, batch, vocab) layout.
        seq_len, batch_size, feat_size = output.shape
        decoded = self.decoder(output.reshape(seq_len * batch_size, feat_size))
        return decoded.view(seq_len, batch_size, decoded.size(1)), hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial GRU state on the same device/dtype as the parameters."""
        weight = next(self.parameters())
        return weight.new_zeros(self.num_layers, batch_size, self.hidden_size)

In [39]:
def batchify(data, batch_size):
    """Arrange a 1-D id stream into ``batch_size`` parallel columns.

    Trailing elements that do not fill a complete row are discarded. The stream
    is cut into ``batch_size`` equal consecutive chunks, and chunk ``k`` becomes
    column ``k`` of the result, so the output has shape
    ``(len(data) // batch_size, batch_size)``. The result is moved to the GPU
    when one is available.
    """
    usable = (data.size(0) // batch_size) * batch_size
    # view() lays the chunks out as rows; transposing turns them into columns.
    columns = data[:usable].view(batch_size, -1).t().contiguous()
    return columns.cuda() if cuda.is_available() else columns

In [40]:
# Sanity check for batchify: with batch_size=2 the 12-word sentence is cut into
# two halves, and each half runs down one column of the result.
dummy_data = 'Once upon a time there was a good king and a queen'
dummy_data_idx = [corpus.dictionary.word2idx[w] for w in dummy_data.split()]
dummy_tensor = torch.LongTensor(dummy_data_idx)
op = batchify(dummy_tensor, 2)
for row in op:
    first_col, second_col = row[0], row[1]
    words = (corpus.dictionary.idx2word[first_col], corpus.dictionary.idx2word[second_col])
    print('%10s %10s' % words)

      Once          a
      upon       good
         a       king
      time        and
     there          a
       was      queen


In [41]:
# Number of parallel sequences (columns) in the batchified training data
bs_train = 20

# Number of parallel sequences (columns) in the batchified validation data
bs_valid = 10

# Maximum number of time steps per window, i.e. how far the graph is unrolled
# for truncated back-propagation through time
bptt_size = 35

# Maximum gradient norm passed to gradient clipping in the training loop
clip = 0.25

# Size of each word-embedding vector
embed_size = 200

# Size of the hidden state in each RNN layer
hidden_size = 200

# Number of stacked RNN layers
num_layers = 2

# Dropout probability used for regularization (a fraction in [0, 1], not a percentage)
dropout_pct = 0.5

In [42]:
# Fold each 1-D id stream into columns: one column per parallel sequence.
train_data = batchify(corpus.train, bs_train)
val_data = batchify(corpus.valid, bs_valid)

In [43]:
train_data.shape

torch.Size([51995, 20])

In [44]:
# Instantiate the language model from the hyperparameters defined earlier.
model = RNNModel(vocab_size, embed_size, hidden_size, num_layers, dropout_pct)

# Move the parameters to the GPU when one is available.
if cuda.is_available():
    model.cuda()

In [45]:
# Loss applied to the flattened (positions, vocab_size) logits and the
# next-word target ids in train() and evaluate().
criterion = nn.CrossEntropyLoss()

if cuda.is_available():
    criterion.cuda()

In [48]:
def get_batch(source, i, evaluation=False):
    """Cut one BPTT window and its next-token targets out of batchified data.

    Args:
        source: 2-D id tensor laid out as (time steps, parallel sequences).
        i: row at which the window starts.
        evaluation: unused; kept so existing call sites continue to work.

    Returns:
        (data, target): ``data`` is ``source[i:i+seq_len]``; ``target`` is the
        same window shifted one row later and flattened to 1-D, so that
        ``target`` lines up with logits reshaped to ``(-1, vocab_size)``.
        ``seq_len`` is ``bptt_size`` except for the final, shorter window.
    """
    # The last row has no successor, so it can only ever be a target.
    seq_len = min(bptt_size, len(source) - 1 - i)
    data = source[i:i + seq_len]
    # reshape (not view) also works if `source` happens to be non-contiguous.
    target = source[i + 1:i + 1 + seq_len].reshape(-1)
    if cuda.is_available():
        data = data.cuda()
        target = target.cuda()
    return data, target

In [49]:
data, target = get_batch(train_data, 0)

In [50]:
data.shape

torch.Size([35, 20])

In [51]:
target.shape

torch.Size([700])

## Model Training

In [68]:
def train(data_source, lr):
    """Run one epoch of truncated-BPTT training and return the average loss.

    Uses the notebook globals ``model``, ``criterion``, ``bptt_size``,
    ``vocab_size`` and ``clip``.

    Args:
        data_source: batchified training ids, shape (time steps, batch).
        lr: learning rate for the Adam optimizer created for this call.

    Returns:
        float: sum over windows of (window length * window loss), divided by
        ``len(data_source)``.
    """
    # Training mode -> dropout enabled.
    model.train()
    total_loss = 0
    # Batch size is read from the data so the state always matches its width.
    hidden = model.init_hidden(data_source.size(1))
    if cuda.is_available():
        hidden = hidden.cuda()

    # NOTE(review): a new optimizer per call means Adam's running moment
    # estimates restart at every epoch.
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for i in range(0, data_source.size(0) - 1, bptt_size):
        data, targets = get_batch(data_source, i)

        # Keep the hidden values but cut the graph, so gradients do not flow
        # back into previous windows.
        hidden = hidden.detach()

        optimizer.zero_grad()

        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, vocab_size), targets)
        loss.backward()

        # Rescale gradients to at most `clip` in norm to prevent exploding
        # gradients. The trailing-underscore form is the non-deprecated API.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        # Weight by window length: the final window may be shorter.
        total_loss += len(data) * loss.detach()

    # float() handles both the accumulated 0-dim tensor and the plain int 0
    # left when the loop body never ran (the old `.item()` crashed on an int).
    # NOTE(review): the windows cover len(data_source) - 1 target rows in total,
    # so this denominator is one row larger than the number of rows scored.
    return float(total_loss) / len(data_source)

In [69]:
def evaluate(data_source):
    """Compute the average loss of ``model`` over ``data_source`` without training.

    Uses the notebook globals ``model``, ``criterion``, ``bptt_size`` and
    ``vocab_size``.

    Args:
        data_source: batchified ids, shape (time steps, batch).

    Returns:
        float: sum over windows of (window length * window loss), divided by
        ``len(data_source)``.
    """
    # Eval mode -> dropout disabled.
    model.eval()
    total_loss = 0
    # Batch size is read from the data so any batchified split can be scored.
    hidden = model.init_hidden(data_source.size(1))
    if cuda.is_available():
        hidden = hidden.cuda()

    # No autograd graph is recorded inside this block, which saves memory and
    # makes detaching the hidden state between windows unnecessary.
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt_size):
            data, targets = get_batch(data_source, i, evaluation=True)

            output, hidden = model(data, hidden)
            output_flat = output.view(-1, vocab_size)

            # Weight by window length: the final window may be shorter.
            total_loss += len(data) * criterion(output_flat, targets)

    # float() handles both the accumulated 0-dim tensor and the plain int 0
    # left when the loop body never ran (the old `.item()` crashed on an int).
    # NOTE(review): the windows cover len(data_source) - 1 target rows in total,
    # so this denominator is one row larger than the number of rows scored.
    return float(total_loss) / len(data_source)

In [70]:
# Lowest validation loss seen so far; None until run() has finished its first epoch.
# Kept at module level so repeated run() calls compare against the same best value.
best_val_loss = None

In [71]:
def run(epochs, lr):
    """Train for ``epochs`` epochs, checkpointing whenever validation loss improves.

    Uses the notebook globals ``train_data``, ``val_data`` and ``model``, and
    updates the global ``best_val_loss``.

    Args:
        epochs: number of passes over the training data.
        lr: learning rate forwarded to ``train``.
    """
    global best_val_loss

    for _ in range(epochs):
        train_loss = train(train_data, lr)
        val_loss = evaluate(val_data)
        print('Train Loss: ', train_loss, 'Valid Loss: ', val_loss)

        # `is None` rather than truthiness: a stored best of 0.0 is a real
        # value and must not be treated as "no best yet".
        if best_val_loss is None or val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), './4.model1.pth')

In [72]:
run(5, 0.001)

  This is separate from the ipykernel package so we can avoid doing imports until


Train Loss:  6.37492126646793 Valid Loss:  6.800919219094923
Train Loss:  6.218556471776132 Valid Loss:  6.744746703524125
Train Loss:  6.11726007308395 Valid Loss:  6.730186036542101
Train Loss:  6.0457069189345125 Valid Loss:  6.743603531023337
Train Loss:  5.987822146360227 Valid Loss:  6.745710639388205


## Text Generation

In [73]:
# Number of words sampled by each generation cell below.
num_words = 200
# Sampling temperature: logits are divided by this value before exponentiation.
# NOTE: reassigned in later cells (0.8, then 0.3) before any sampling is run.
temperature = 1

In [76]:
# Rebuild the architecture and restore the weights checkpointed by run().
model = RNNModel(vocab_size, embed_size, hidden_size, num_layers, dropout_pct)
# map_location='cpu' lets a checkpoint written during a GPU run be loaded on a
# CPU-only machine; the model is moved to the GPU just below when one exists.
model.load_state_dict(torch.load('./4.model1.pth', map_location='cpu'))

if cuda.is_available():
    model.cuda()

# Eval mode -> dropout disabled for generation. Left as the cell's last
# expression so the module summary is displayed.
model.eval()

RNNModel(
  (encoder): Embedding(74010, 200)
  (drop1): Dropout(p=0.5, inplace=False)
  (drop2): Dropout(p=0.5, inplace=False)
  (rnn): GRU(200, 200, num_layers=2, dropout=0.5)
  (decoder): Linear(in_features=200, out_features=74010, bias=True)
)

In [77]:
#High temperature displays greater linguistic variety, low temperature is more grammatically correct
temperature = 0.8

In [86]:
# Sample `num_words` words from the model, one at a time, seeded with 'The'.
hidden = model.init_hidden(1)
idx = corpus.dictionary.word2idx['The']
# Shape (seq_len=1, batch=1). Named `input_ids` so the builtin input() is not
# shadowed for the rest of the kernel session.
input_ids = torch.LongTensor([[idx]])

if cuda.is_available():
    input_ids = input_ids.cuda()

print(corpus.dictionary.idx2word[idx], '', end='')

# Sampling needs no gradients; skipping the autograd graph keeps memory flat
# even though `hidden` is threaded through every step.
with torch.no_grad():
    for i in range(num_words):
        output, hidden = model(input_ids, hidden)
        # softmax(logits / T) is the same distribution as normalised
        # exp(logits / T), but cannot overflow to inf at low temperatures.
        word_weights = torch.softmax(output.squeeze() / temperature, dim=-1).cpu()
        word_idx = torch.multinomial(word_weights, 1).item()
        # Feed the sampled word back in as the next input.
        input_ids.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]

        if word == '<eos>':
            print('')
        else:
            print(word + ' ', end='')

The seas 
Were what their another. 
ALONSO. O, you shall speak the ambition's 
Out of the yon of my state, that now 
Will show the people in the phrase of death. 
ANTONY. Her own, and his gentleman had not neer all? 
PHEBE. I will be the young pause. 
KING HENRY. Nay, you have chance 
That care you may be funeral, 
The gods it make his ears of true these eyes! 
Or, thou art your face, in both his Thats 
And to alter by his royal hands 
To do you treachery: loss. 
CHARMIAN. Then, that you have been made 
As it is the brings of your epitaph 
And in his English intend and marvellous age, 
MIRANDA. Do you be off the letter] 
QUICKLY. I am a man for his faithful wife and the 
grief, of Rome will there's the cause to trust your of? 
FIRST WATCH. You cannot be weary, but slight my sight and 
yours. 
EVANS. Mine was my gentleman if it shall be a times of 
the sheep. 
ANTONY. No, my lord, is a hungry hand. 


In [91]:
temperature = 0.3

In [92]:
# Sample `num_words` words from the model, one at a time, seeded with 'The'.
hidden = model.init_hidden(1)
idx = corpus.dictionary.word2idx['The']
# Shape (seq_len=1, batch=1). Named `input_ids` so the builtin input() is not
# shadowed for the rest of the kernel session.
input_ids = torch.LongTensor([[idx]])

if cuda.is_available():
    input_ids = input_ids.cuda()

print(corpus.dictionary.idx2word[idx], '', end='')

# Sampling needs no gradients; skipping the autograd graph keeps memory flat
# even though `hidden` is threaded through every step.
with torch.no_grad():
    for i in range(num_words):
        output, hidden = model(input_ids, hidden)
        # softmax(logits / T) is the same distribution as normalised
        # exp(logits / T), but cannot overflow to inf at low temperatures.
        word_weights = torch.softmax(output.squeeze() / temperature, dim=-1).cpu()
        word_idx = torch.multinomial(word_weights, 1).item()
        # Feed the sampled word back in as the next input.
        input_ids.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]

        if word == '<eos>':
            print('')
        else:
            print(word + ' ', end='')

The King of the 
world and that I have been a man in the King. 

Enter SIR SIR MESSENGER 

Enter the KING, and the MESSENGER 

FIRST MURDERER. How is a man of this of the 
King of the King of the King of the world of his 
own great man of the King's own life and the man of the 
next of the world of the Duke of the place of his 
own own eyes and the man of the King and the King of the 
King's own eyes and the Duke of the world and the of the 
man of the King of the King of the King of the poor 
DUKE OF FRANCE. The King is a man and the Duke of the 
whole of the Duke of the new whole of a man of the 
company of the other of the King of the King of the 
French of the King of the other of the King and the man of 
the of the King of the man of the King of the King's 
Wales, and the man of 