## Character level LSTM in PyTorch

In [2]:
#from sklearn import datasets
#import pixiedust
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from itertools import islice
train_on_gpu = torch.cuda.is_available()
print(train_on_gpu)

True


In [3]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))


## Load in data

In [3]:
# with open('data/anna.txt','r') as f:
#     text=f.read()
with open('data/The_Alchemist.txt','r') as f:
    text=f.read()

In [4]:
text[:100]

'The Alchemist\n Paulo Coelho\nTranslated by Alan R. Clarke. Published 1992. ISBN 0-7225-3293-8.\n\n\nCONT'

### Tokenization

In [6]:
# encode the text and map each character to an integer and vice versa

# we create two dictionaries:
# 1. int2char, which maps integers to characters
# 2. char2int, which maps characters to unique integers

chars=tuple(set(text))
int2char= dict(enumerate(chars)) # key-> integers, value-> charac
print(take(10, int2char.items()))
char2int = {value:key for key, value in int2char.items()} # key-> characters, value-> integers
print(take(10, char2int.items()))
#encode the text
encoded= np.array([char2int[ch] for ch in text])
      
      

[(0, 'y'), (1, '9'), (2, '`'), (3, 'c'), (4, '\n'), (5, 'U'), (6, '2'), (7, 'f'), (8, '6'), (9, '3')]
[('y', 0), ('9', 1), ('`', 2), ('c', 3), ('\n', 4), ('U', 5), ('2', 6), ('f', 7), ('6', 8), ('3', 9)]


In [7]:
encoded[:100]

array([35, 71, 15, 47, 55, 60, 30, 79, 41,  4,  4,  4, 17, 15, 47, 47,  0,
       79,  7, 15, 54, 76, 58, 76, 60, 50, 79, 15, 30, 60, 79, 15, 58, 58,
       79, 15, 58, 76, 42, 60, 65, 79, 60, 59, 60, 30,  0, 79, 80, 29, 71,
       15, 47, 47,  0, 79,  7, 15, 54, 76, 58,  0, 79, 76, 50, 79, 80, 29,
       71, 15, 47, 47,  0, 79, 76, 29, 79, 76, 55, 50, 79, 61, 74, 29,  4,
       74, 15,  0, 73,  4,  4, 27, 59, 60, 30,  0, 55, 71, 76, 29])

## Preprocessing Data

In [8]:
def one_hot_encode(arr, n_classes):
    #initialize
    one_hot= np.zeros((arr.size,n_classes), dtype=np.float32)
    
    one_hot[np.arange(one_hot.shape[0]),arr.flatten()]=1.
    
    one_hot = one_hot.reshape((*arr.shape, n_classes))

    return one_hot
    
    #fill appropriate with ones
    

In [9]:
test_seq = np.array([[0,3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


## MAking Training mini-bataches

In [10]:
#from IPython.core.debugger import set_trace
def get_batches(arr, batch_size, seq_length):
    
    batch_size_total= batch_size * seq_length
    
    n_batches= len(arr)//batch_size_total #floor division
    
    #keep only enough characters to make full batches
    arr=arr[:n_batches * batch_size_total]
    
    #reshape into batch_size rows
    arr=arr.reshape((batch_size,-1))
    
    #iterate throught the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        #the features
        x=arr[:,n:n+seq_length]
        #the targts shifted by 1
        y=np.zeros_like(x)
        try:
            y[:,:-1],y[:,-1]=x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:,:-1], y[:,-1]=x[:, 1:], arr[:, 0]
        yield x,y

In [13]:

arr=np.array([1,2,3,4,5,6,7,8,9,10,11,12])
batch_size=2
seq_length=3

batches= get_batches(arr, batch_size, seq_length)


x, y = next(batches)
print('x',x)
print('y',y)

# batches = get_batches(encoded, 8, 50)
# x, y = next(batches)

x [[1 2 3]
 [7 8 9]]
y [[ 2  3  4]
 [ 8  9 10]]


In [14]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [16]:
#printing first 10 items of a sequence
print("x\n",x[:10, :10])
print("\ny\n",y[:10,:10])

x
 [[35 71 15 47 55 60 30 79 41  4]
 [50 61 29 79 55 71 15 55 79 15]
 [60 29 38 79 61 30 79 15 79  7]
 [50 79 55 71 60 79  3 71 76 60]
 [79 50 15 74 79 71 60 30 79 55]
 [ 3 80 50 50 76 61 29 79 15 29]
 [79 44 29 29 15 79 71 15 38 79]
 [66 48 58 61 29 50 42  0 73 79]]

y
 [[71 15 47 55 60 30 79 41  4  4]
 [61 29 79 55 71 15 55 79 15 55]
 [29 38 79 61 30 79 15 79  7 61]
 [79 55 71 60 79  3 71 76 60  7]
 [50 15 74 79 71 60 30 79 55 60]
 [80 50 50 76 61 29 79 15 29 38]
 [44 29 29 15 79 71 15 38 79 50]
 [48 58 61 29 50 42  0 73 79 63]]


In [17]:
# check if gpu available
train_on_gpu= torch.cuda.is_available()
if(train_on_gpu):
    print("Training on gpu")
else:
    print("no gpu available, trainig on CPU, consider making epochs very small,")

Training on gpu


In [34]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        ## TODO: define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## TODO: define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## TODO: define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        ## TODO: Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)
        
        ## TODO: pass through a dropout layer
        out = self.dropout(r_output)
        
        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)
        
        ## TODO: put x through the fully-connected layer
        out = self.fc(out)
        
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden
        

## Train Function

In [35]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if(train_on_gpu):
        net.cuda()
    
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                
                    val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterationg through validation data
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

## Instantiating Model

In [36]:
# define and print the net
n_hidden=512
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


## Training a model

In [37]:
batch_size = 128
seq_length = 100
n_epochs = 20 # start smaller if you are just testing initial behavior

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

Epoch: 1/20... Step: 10... Loss: 3.2668... Val Loss: 3.2154
Epoch: 1/20... Step: 20... Loss: 3.1573... Val Loss: 3.1403
Epoch: 1/20... Step: 30... Loss: 3.1472... Val Loss: 3.1260
Epoch: 1/20... Step: 40... Loss: 3.1159... Val Loss: 3.1194
Epoch: 1/20... Step: 50... Loss: 3.1466... Val Loss: 3.1175
Epoch: 1/20... Step: 60... Loss: 3.1191... Val Loss: 3.1147
Epoch: 1/20... Step: 70... Loss: 3.1079... Val Loss: 3.1113
Epoch: 1/20... Step: 80... Loss: 3.1185... Val Loss: 3.1028
Epoch: 1/20... Step: 90... Loss: 3.1006... Val Loss: 3.0814
Epoch: 1/20... Step: 100... Loss: 3.0499... Val Loss: 3.0342
Epoch: 1/20... Step: 110... Loss: 2.9701... Val Loss: 2.9425
Epoch: 1/20... Step: 120... Loss: 2.8585... Val Loss: 2.8554
Epoch: 1/20... Step: 130... Loss: 2.7692... Val Loss: 2.7337
Epoch: 2/20... Step: 140... Loss: 2.6697... Val Loss: 2.6285
Epoch: 2/20... Step: 150... Loss: 2.5947... Val Loss: 2.5301
Epoch: 2/20... Step: 160... Loss: 2.5260... Val Loss: 2.4825
Epoch: 2/20... Step: 170... Loss:

Epoch: 10/20... Step: 1350... Loss: 1.3848... Val Loss: 1.4147
Epoch: 10/20... Step: 1360... Loss: 1.3938... Val Loss: 1.4163
Epoch: 10/20... Step: 1370... Loss: 1.3811... Val Loss: 1.4153
Epoch: 10/20... Step: 1380... Loss: 1.4157... Val Loss: 1.4087
Epoch: 10/20... Step: 1390... Loss: 1.4214... Val Loss: 1.4085
Epoch: 11/20... Step: 1400... Loss: 1.4382... Val Loss: 1.4079
Epoch: 11/20... Step: 1410... Loss: 1.4424... Val Loss: 1.4061
Epoch: 11/20... Step: 1420... Loss: 1.4211... Val Loss: 1.4060
Epoch: 11/20... Step: 1430... Loss: 1.3889... Val Loss: 1.4078
Epoch: 11/20... Step: 1440... Loss: 1.4162... Val Loss: 1.4023
Epoch: 11/20... Step: 1450... Loss: 1.3494... Val Loss: 1.4017
Epoch: 11/20... Step: 1460... Loss: 1.3716... Val Loss: 1.4009
Epoch: 11/20... Step: 1470... Loss: 1.3724... Val Loss: 1.4006
Epoch: 11/20... Step: 1480... Loss: 1.3807... Val Loss: 1.3951
Epoch: 11/20... Step: 1490... Loss: 1.3781... Val Loss: 1.3932
Epoch: 11/20... Step: 1500... Loss: 1.3553... Val Loss:

Epoch: 20/20... Step: 2660... Loss: 1.2305... Val Loss: 1.2794
Epoch: 20/20... Step: 2670... Loss: 1.2400... Val Loss: 1.2762
Epoch: 20/20... Step: 2680... Loss: 1.2174... Val Loss: 1.2805
Epoch: 20/20... Step: 2690... Loss: 1.2137... Val Loss: 1.2795
Epoch: 20/20... Step: 2700... Loss: 1.2277... Val Loss: 1.2727
Epoch: 20/20... Step: 2710... Loss: 1.1948... Val Loss: 1.2780
Epoch: 20/20... Step: 2720... Loss: 1.1991... Val Loss: 1.2806
Epoch: 20/20... Step: 2730... Loss: 1.1998... Val Loss: 1.2773
Epoch: 20/20... Step: 2740... Loss: 1.1899... Val Loss: 1.2764
Epoch: 20/20... Step: 2750... Loss: 1.1924... Val Loss: 1.2817
Epoch: 20/20... Step: 2760... Loss: 1.1872... Val Loss: 1.2781
Epoch: 20/20... Step: 2770... Loss: 1.2212... Val Loss: 1.2736
Epoch: 20/20... Step: 2780... Loss: 1.2564... Val Loss: 1.2736


## Checkpoint

In [38]:
# change the name, for saving multiple files
model_name = 'rnn_20_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)
    

## Making  Predictions

In [39]:
def predict(net, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2char[char], h

## priming and generating text

In [40]:
def sample(net, size, prime='The', top_k=None):
        
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [41]:
print(sample(net, 1000, prime='Anna', top_k=5))

Anna
Parlinona, who went on, and said, she
was senting her arrangements of his cletker. The strength of
husband, had been told his beds of hating, and with the position
there will consequent him.

"Well, that is true that you're so in a strentthing of you,"
he went on.

"You can't teel you, that's the much of the forest. But I wanted to
be the same terrible times is at those passionate. The descousting of his
fater sides to be dreamfully done that if I don't know, then an answer
and much a chanted of all the same as a poss that they well the cheerful arranges
who want to thought that it seems to be still so that they cannot be an
and saw a lawy arm in the first contrary that he would carg out and
so in society women. As it was not alone to
himself. This is so stepping and went out of her son and the
side and three other, and her eyes, that sideless concemiditic
with their house shark of this sister and a personal
paids; he was not strenct to have breathing, and that he can from his bar

## Loading a checkpoint

In [42]:
# Here we have loaded in a model that trained over 20 epochs `rnn_20_epoch.net`
with open('rnn_20_epoch.net', 'rb') as f:
    checkpoint = torch.load(f)
    
loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [43]:
# Sample using a loaded model
print(sample(loaded, 2000, top_k=5, prime="And Levin said"))

And Levin said a governor.

Alexey Alexandrovitch's eyes, their straight true the prace. He
had been talked in the same princess. He sat down again. "I say in
her, and I should not tell you, bringing the matter and his baby
steps only they too was not in, and that I can'bl have been suddenly
said at the tine of considerity and that here with her for the son
she supposes.
Why should you know hut and such a man who did not be and the
candect of hastical than so much then in host for the
same time, but, intelectual for the more and sound of the princess
would have thought into a child of terrible of the princips. At tomatis mother is
ill at once that were true, but I had sating to me. He doesn't be a man at once
therefores--and how to see the problem to him?" said Levin, smiling.

After the man, world. As she wonded the chest, he houred, but he
consequently settred him that the soft together he went out,
and all her samisty steps saw her houre should say nothing than enecgy. The
laster on