In [5]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

In [7]:
# Pick the compute device first: CUDA when a GPU is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the default tensor type at the top, following the same CUDA/CPU choice.
torch.set_default_tensor_type(
    torch.cuda.FloatTensor if device.type == 'cuda' else torch.FloatTensor
)

In [20]:
# Import text data, Alice in Wonderland, from the local directory.
path = "./aiw.txt"

# The context manager closes the file handle; the explicit encoding keeps the
# curly quotes in the corpus readable regardless of the platform default.
with open(path, encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Number of characters in the corpus (previously referenced an undefined `data`).
print(len(text))

144348


In [23]:
# Peek at the first 500 characters of the corpus
text[:500]

'CHAPTER I. Down the Rabbit-Hole\n\nAlice was beginning to get very tired of sitting by her sister on the\nbank, and of having nothing to do: once or twice she had peeped into the\nbook her sister was reading, but it had no pictures or conversations in\nit, ‘and what is the use of a book,’ thought Alice ‘without pictures or\nconversations?’\n\nSo she was considering in her own mind (as well as she could, for the\nhot day made her feel very sleepy and stupid), whether the pleasure\nof making a daisy-chain w'

In [34]:
"""The vocabulary is all the unique symbols used in the text. This is the benefit of 
working with a character level RNN."""

# Every distinct symbol in the corpus, in sorted order, forms the vocabulary
chars = list(set(text))
chars.sort()
vocab_size = len(chars)
print(vocab_size)

71


In [42]:
# Preview the character -> index mapping (position of each character in `chars`)
dict(zip(chars, range(len(chars))))

{'\n': 0,
 ' ': 1,
 '!': 2,
 '(': 3,
 ')': 4,
 '*': 5,
 ',': 6,
 '-': 7,
 '.': 8,
 ':': 9,
 ';': 10,
 '?': 11,
 'A': 12,
 'B': 13,
 'C': 14,
 'D': 15,
 'E': 16,
 'F': 17,
 'G': 18,
 'H': 19,
 'I': 20,
 'J': 21,
 'K': 22,
 'L': 23,
 'M': 24,
 'N': 25,
 'O': 26,
 'P': 27,
 'Q': 28,
 'R': 29,
 'S': 30,
 'T': 31,
 'U': 32,
 'V': 33,
 'W': 34,
 'X': 35,
 'Y': 36,
 'Z': 37,
 '[': 38,
 ']': 39,
 '_': 40,
 'a': 41,
 'b': 42,
 'c': 43,
 'd': 44,
 'e': 45,
 'f': 46,
 'g': 47,
 'h': 48,
 'i': 49,
 'j': 50,
 'k': 51,
 'l': 52,
 'm': 53,
 'n': 54,
 'o': 55,
 'p': 56,
 'q': 57,
 'r': 58,
 's': 59,
 't': 60,
 'u': 61,
 'v': 62,
 'w': 63,
 'x': 64,
 'y': 65,
 'z': 66,
 '‘': 67,
 '’': 68,
 '“': 69,
 '”': 70}

In [46]:
# Lookup tables in both directions: character --> index and index --> character
c_to_idx = dict(zip(chars, range(len(chars))))
idx_to_c = dict(enumerate(chars))

In [51]:
"""Convert whole text to indicies. Want each character to be 
represented by its index in the vocabulary. This is how we will feed to RNN"""



'Convert whole text to indicies. Want each character to be \nrepresented by its index in the vocabulary. This is how we will feed to RNN'

In [61]:
# Encode the whole corpus as vocabulary indices (raises KeyError on an unknown character)
text_idx = list(map(c_to_idx.__getitem__, text))
text_len = len(text_idx)
text_idx[:10]

[14, 19, 12, 27, 31, 16, 29, 1, 20, 8]

In [63]:
# Sanity check: decoding the indices must reproduce the original slice of text

print(text[25:100])
print("--------")
print(''.join(idx_to_c[i] for i in text_idx[25:100]))

t-Hole

Alice was beginning to get very tired of sitting by her sister on t
--------
t-Hole

Alice was beginning to get very tired of sitting by her sister on t


In [66]:
# Settings for building the DataLoader.
# seq_len: number of characters passed to the RNN at a time; this dictates the
#          length of the unrolled model (number of timesteps).
# batch_size: affects how the raw data is split as well as the model architecture.

seq_len, batch_size = 8, 512

In [67]:
# Want a non-overlapping set of inputs and outputs. Each X is one window of seq_len
# characters; the matching Y is the same window shifted by 1. The range stops one
# window early so that every X still has a full-length, shifted Y.

idx_in_data = [
    text_idx[start:start + seq_len]
    for start in range(0, text_len - 1 - seq_len, seq_len)
]

In [77]:
# Turn the input windows into a numpy array and inspect it. The dimensions are
# (total number of sequences in the corpus, sequence length).

inp = np.asarray(idx_in_data)
print(inp.shape)
print(inp[:3])

(18043, 8)
[[14 19 12 27 31 16 29  1]
 [20  8  1 15 55 63 54  1]
 [60 48 45  1 29 41 42 42]]


In [78]:
# Do the same thing for Y: identical windows, but starting one character later

idx_out_data = [
    text_idx[start:start + seq_len]
    for start in range(1, text_len - seq_len, seq_len)
]


In [79]:
# Confirm that the target array is the input array shifted by 1: the model will be
# predicting the next character at every position in the sequence.

outp = np.asarray(idx_out_data)
print(outp.shape)
print(outp[:3])

(18043, 8)
[[19 12 27 31 16 29  1 20]
 [ 8  1 15 55 63 54  1 60]
 [48 45  1 29 41 42 42 49]]


In [83]:
def train_test_split(inp_data, out_data, train_fraction, seed=None):
    '''Randomly split paired input/target rows into training and test sets.

    Every row is assigned to the training set independently with probability
    ``train_fraction``, so the realised split sizes are only approximately
    ``train_fraction`` / ``1 - train_fraction``.

    Args:
        inp_data: array-like of input rows (converted with ``np.asarray``).
        out_data: array-like of target rows, paired row-for-row with ``inp_data``.
        train_fraction: probability that a row lands in the training set.
        seed: optional integer seed. When given, the split is reproducible and
            does not touch NumPy's global random state. When ``None`` the mask
            is drawn from the global state, as before.

    Returns:
        Four numpy arrays: training input, training targets, test input,
        test targets.
    '''
    # Accept plain lists as well as arrays; boolean masking needs ndarrays.
    inp_data = np.asarray(inp_data)
    out_data = np.asarray(out_data)

    n_rows = len(inp_data)
    if seed is None:
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.RandomState(seed).rand(n_rows)

    # One boolean per row: True -> training set, False -> test set.
    trn_idx = draws < train_fraction

    inp_trn = inp_data[trn_idx]
    inp_test = inp_data[~trn_idx]

    # The same mask is applied to the targets so inputs and targets stay paired.
    outp_trn = out_data[trn_idx]
    outp_test = out_data[~trn_idx]
    return inp_trn, outp_trn, inp_test, outp_test
    

In [84]:
# 90% of the sequences go to training, 10% to validation.
# This ratio should be bigger with a larger corpus.

x_trn, y_trn, x_val, y_val = train_test_split(inp, outp, train_fraction=0.9)

In [85]:
class CharSeqDataset(Dataset):
    '''PyTorch Dataset for character-level text generation.

    Holds two parallel, indexable collections: ``X`` (input sequences) and
    ``Y`` (target sequences). Item ``idx`` is the pair ``(X[idx], Y[idx])``;
    the dataset length is taken from ``X``.
    '''

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Input sequence and its target, returned together as a tuple
        return self.X[idx], self.Y[idx]

In [86]:
# Wrap the training and validation splits in datasets

train_ds = CharSeqDataset(X=x_trn, Y=y_trn)
val_ds = CharSeqDataset(X=x_val, Y=y_val)

In [87]:
# Turn these into PyTorch dataloaders with batch size = batch_size.
# The training loader takes care of shuffling and batching. The validation loader
# only batches: evaluation gains nothing from shuffling, and a fixed order keeps
# repeated evaluations of the same model comparable.

train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False)

In [93]:
# A couple of experiments with DataLoaders:
#   1. The X and Y values are paired. Show that shuffling keeps them lined up.
#   2. You get a different order whenever you iterate over a dataloader.
exp_iter = iter(train_dl)
x_exp, y_exp = next(exp_iter)

# Exp 1: first two input rows, then the first two target rows of the same batch
print(x_exp.shape)  # batch size by sequence length
print(type(x_exp))
print(x_exp[:2])
print("*****")
print(y_exp.shape)
print(type(y_exp))
print(y_exp[:2])

torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[45, 45,  8,  1, 67, 20, 68, 53],
        [61, 54, 47, 58, 65,  1, 46, 55]])
*****
torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[45,  8,  1, 67, 20, 68, 53,  1],
        [54, 47, 58, 65,  1, 46, 55, 58]])


In [94]:
# Exp 2: a fresh iterator over the same shuffled loader starts with a different batch
exp_iter2 = iter(train_dl)
x_exp2, y_exp2 = next(exp_iter2)

print(x_exp2.shape)  # batch size by sequence length
print(type(x_exp2))
print(x_exp2[:2])
print("*****")
print(y_exp2.shape)
print(type(y_exp2))
print(y_exp2[:2])

torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[63, 49, 60, 48,  1, 60, 48, 45],
        [53, 55, 61, 59, 45,  6,  1, 41]])
*****
torch.Size([512, 8])
<class 'torch.Tensor'>
tensor([[49, 60, 48,  1, 60, 48, 45,  0],
        [55, 61, 59, 45,  6,  1, 41, 46]])


# Character Level RNN model class using PyTorch

In [95]:
# emb_dim: dimension of each character's learned embedding.
# n_hidden: number of hidden units in the RNN.
emb_dim, n_hidden = 42, 256


In [96]:
class CharRnn(nn.Module):
    '''
    Character-level RNN: embedding layer -> nn.RNN -> fully connected layer ->
    log-softmax over the vocabulary.

    A couple of tricky points:
    - The final hidden state of each forward pass is kept in ``self.h`` and used as
      the initial state of the next call. It is detached first, so BPTT stops at the
      start of the current batch instead of going back through every earlier batch.
    - The per-step predictions are a rank 3 tensor of batch_size x seq_len x
      vocab_size (a prediction over the vocab for each char in the sequence and for
      each sequence in the minibatch). They are reshaped into a
      (batch_size * seq_len) x vocab_size tensor so they line up with targets
      flattened via ``targets.view(-1)``.

    Args:
        vocab_size: number of distinct characters (embedding rows / output classes).
        emb_dim: size of the learned character embeddings.
        bs: batch size used to allocate the initial hidden state.
        n_hidden: number of hidden units in the RNN. When omitted, the notebook-level
            ``n_hidden`` variable is used, which is what the original three-argument
            call relied on.
    '''
    def __init__(self, vocab_size, emb_dim, bs, n_hidden=None):
        super().__init__()
        if n_hidden is None:
            # Backward compatibility with CharRnn(vocab_size, emb_dim, bs):
            # fall back to the module-level setting.
            n_hidden = globals()['n_hidden']
        # Keep the sizes on the instance so forward()/init_h() do not depend on globals.
        self.vocab_size = vocab_size
        self.n_hidden = n_hidden

        self.e = nn.Embedding(vocab_size, emb_dim)  # vocab size down to embedding size
        # nn.RNN runs over all sequence steps of its input in a single call.
        self.rnn = nn.RNN(emb_dim, n_hidden)  # embedding size to number of hidden units
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.h = self.init_h(bs)

    def forward(self, cs):
        '''Return log-probabilities of shape (batch_size * seq_len, vocab_size).

        ``cs`` is a (batch_size, seq_len) tensor of character indices.
        '''
        bs = cs.shape[0]
        # Start from a fresh zero state when the stored one cannot be reused:
        # different batch size (e.g. last partial batch) or a different device.
        if self.h.shape[1] != bs or self.h.device != cs.device:
            self.h = self.init_h(bs, device=cs.device)
        inp = self.e(cs)
        # nn.RNN defaults to sequence-first input: (seq_len, batch, features).
        inp = torch.transpose(inp, 0, 1)
        outp, h = self.rnn(inp, self.h)
        # Save hidden values for the next forward pass, cut out of the autograd graph.
        self.h = h.detach()
        outp = F.log_softmax(self.l_out(outp), dim=-1)
        # Back to batch-first so flattening yields rows ordered (sequence, timestep),
        # the same order as targets.view(-1) on a (batch, seq_len) target tensor.
        outp = torch.transpose(outp, 0, 1)
        return outp.contiguous().view(-1, self.vocab_size)

    def init_h(self, bs, device=None):
        '''Zero initial hidden state of shape (1, bs, n_hidden) on ``device``.'''
        return torch.zeros(1, bs, self.n_hidden, device=device)

In [97]:
def train(model, opt, crit, train_loader):
    '''Run one epoch (one pass through the data) of training.

    Args:
        model: the network to train; it is put into training mode.
        opt: optimizer stepping the model's parameters.
        crit: loss function called as ``crit(outputs, targets)``.
        train_loader: iterable of ``(inputs, targets)`` batches. Targets are
            flattened with ``view(-1)`` before the loss is computed.

    Returns:
        A list with one detached scalar loss tensor per batch.
    '''
    model.train()

    # Send batches to wherever the model's weights live instead of relying on a
    # notebook-level `device` variable being defined.
    first_param = next(model.parameters(), None)

    losses = []
    for inputs, targets in train_loader:
        targets = targets.view(-1)
        if first_param is not None:
            inputs = inputs.to(first_param.device)
            targets = targets.to(first_param.device)

        opt.zero_grad()
        outputs = model(inputs)
        loss = crit(outputs, targets)
        loss.backward()
        opt.step()

        # Detach so the stored value does not keep the autograd graph alive.
        losses.append(loss.detach())
    return losses

In [98]:
# Test function calculates average loss over all the test data.
def test(model, test_loader, crit):
    # Put model in evaluation mode. Read up on what it does
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs.to(device))
            targets = targets.view(-1).to(device)
#             l = F.nll_loss(outputs, targets, reduction='sum').item() / len(targets)# sum up batch loss
            l = crit(outputs, targets)
            test_loss += l.item()
            pred = outputs.max(1, keepdim=True)[1] # get the index of the max log-probability (char index)
            correct += pred.eq(targets.view_as(pred)).sum().item()
    test_loss /= len(test_loader)
    return test_loss

In [99]:
model = CharRnn(vocab_size, emb_dim, batch_size)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# CharRnn.forward already returns log-probabilities (it applies log_softmax), so the
# matching criterion is NLLLoss. CrossEntropyLoss would apply log_softmax a second
# time, which yields the same value but is redundant.
criterion = nn.NLLLoss()


In [100]:
# Train for a few epochs, evaluating on the validation set after each one.
# The reported training loss is tr_loss[-1], i.e. the loss of the epoch's last batch.
epochs = 4
for ep in range(epochs):
    tr_loss = train(model=model, opt=optimizer, crit=criterion, train_loader=train_dl)
    test_loss = test(model=model, test_loader=val_dl, crit=criterion)
    print(f'Epoch: {ep+1} / {epochs}, Training Loss: {tr_loss[-1]:.4f}, Validation Loss: {test_loss:.4f}')

Epoch: 1 / 4, Training Loss: 2.8930, Validation Loss: 2.8178
Epoch: 2 / 4, Training Loss: 2.5932, Validation Loss: 2.5552
Epoch: 3 / 4, Training Loss: 2.4434, Validation Loss: 2.4203
Epoch: 4 / 4, Training Loss: 2.3539, Validation Loss: 2.3269


# How well did it work?

In [101]:

def next_letter(my_model, inp):
    '''
    Given an input and a trained model, do a forward pass and predict the next
    character in the input sequence.

    Args:
        my_model: callable mapping a (1, len(inp)) tensor of character indices to
            log-probabilities with one row per input position.
        inp: non-empty list of character indices.

    Returns:
        The sampled next character as its integer index in the vocabulary.

    Raises:
        ValueError: if ``inp`` is empty.
    '''
    if len(inp) == 0:
        raise ValueError("next_letter needs at least one input character index")

    # Build the batch-of-one input on the same device as the model's weights when
    # the model exposes parameters; otherwise use the default device.
    params = getattr(my_model, 'parameters', None)
    first_param = next(params(), None) if callable(params) else None
    target_device = None if first_param is None else first_param.device
    batch = torch.tensor([inp], dtype=torch.long, device=target_device)

    # Inference only: no need to record an autograd graph.
    with torch.no_grad():
        model_out = my_model(batch)

    # Grab the last row of the model output (the prediction after the final input
    # character) and sample from the vocabulary according to those probabilities.
    # Sampling makes the result non-deterministic: the next letter can vary between
    # calls, especially when several characters get similar probabilities.
    next_idx = torch.multinomial(model_out[-1].exp(), 1).item()

    # return the next character index in the sequence
    return next_idx

In [102]:
# Encode a short prompt as vocabulary indices and sample the character that follows it
mytext = [c_to_idx[ch] for ch in "thos"]
nl = next_letter(model, mytext)
print(nl, idx_to_c[nl])

52 l


In [103]:

def gen_text(my_model, inp, num_chars):
    '''
    Generate text one character at a time, starting from the prompt ``inp``.

    On every step the next character is sampled with next_letter, appended to the
    running text, and the input window slides forward by one: its oldest index is
    dropped and the new prediction is added. Repeats num_chars times, then prints
    the prompt followed by everything generated.
    '''
    text = inp
    window = [c_to_idx[ch] for ch in inp]
    for _ in range(num_chars):
        sampled = next_letter(my_model, window)
        text += idx_to_c[sampled]
        # Slide the window: drop the oldest index, append the newest prediction
        window = window[1:] + [sampled]
    print(text)

In [104]:
# Generate 400 characters continuing the prompt "Hello"
gen_text(my_model=model, inp="Hello", num_chars=400)


Helloint--ove? ‘YVNam!’

ures il
ore. of en:a dot Allaryntow, madtas ce, and thir thu fadliec it af che cit ous of thedriter ab
t:e. ans bto he ca Iinp whas oilgeet bertehyftoting wals teot. ‘Sy!’’

‘I
B Thery eupnf her’t anr herome

‘Thit,’ver,AKit saidl!!: l, pthe Heare is ireen -h*o to agrin.
‘wNoutfomu gor
ucbynbin
tha beab atosaingit Alihk I’ sy-e’ shatleaot, ail dfon  ailas  are onne toibt, qouse


Step 1 Summary
It looks like it's starting to work all right, especially considering it hasn't trained for very long. I found that the benefits of continued training started to level off after about 30 epochs.

I learned something important here though. When I split up the corpus into sequences of length 8 (sequence length / bptt length), characters 1 - 8 are the first training example in batch 1, 9 - 16 are the second etc. What that means is that the hidden states after the forward pass are meaningless for the next batch. There's no information gained about the previous sequence to help you out with the current sequence!

Here's a different idea -> What if characters 1 - 8 make up the first training example of the first batch, then characters 9 - 16 make up the first training example of the second batch. That way (since we're saving activation values) when character 9 gets passed in as the first step to the RNN, the activations correspond to what came out after character 8, which was the last character of example 1 in the previous minibatch.