In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F


In [2]:

# open text file and read in data as `text`
with open('data/anna.txt', 'r') as f:
    text = f.read()

In [3]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [4]:
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])

In [5]:
encoded[:100]

array([68, 61, 32, 30, 65, 38, 29,  9, 50, 11, 11, 11,  5, 32, 30, 30, 75,
        9, 14, 32, 24, 39,  0, 39, 38, 59,  9, 32, 29, 38,  9, 32,  0,  0,
        9, 32,  0, 39, 67, 38, 81,  9, 38, 77, 38, 29, 75,  9, 35,  7, 61,
       32, 30, 30, 75,  9, 14, 32, 24, 39,  0, 75,  9, 39, 59,  9, 35,  7,
       61, 32, 30, 30, 75,  9, 39,  7,  9, 39, 65, 59,  9, 76, 18,  7, 11,
       18, 32, 75,  4, 11, 11, 48, 77, 38, 29, 75, 65, 61, 39,  7])

In [6]:

def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [7]:
# check that the function works as expected
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [8]:
def get_batches(arr,batch_size,seq_length):
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [9]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

Training on GPU!


In [10]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
       
        self.dropout = nn.Dropout(drop_prob)
        
        
        self.fc = nn.Linear(n_hidden, len(self.chars))
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        
        r_output, hidden = self.lstm(x, hidden)
        
        
        out = self.dropout(r_output)
        
        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)
        
        
        out = self.fc(out)
        
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [11]:

def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if(train_on_gpu):
        net.cuda()
    
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                
                    val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterationg through validation data
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [12]:
# define and print the net
n_hidden=512
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [13]:
batch_size = 128
seq_length = 100
n_epochs = 20

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

Epoch: 1/20... Step: 10... Loss: 3.2504... Val Loss: 3.1824
Epoch: 1/20... Step: 20... Loss: 3.1375... Val Loss: 3.1305
Epoch: 1/20... Step: 30... Loss: 3.1370... Val Loss: 3.1207
Epoch: 1/20... Step: 40... Loss: 3.1095... Val Loss: 3.1184
Epoch: 1/20... Step: 50... Loss: 3.1384... Val Loss: 3.1167
Epoch: 1/20... Step: 60... Loss: 3.1148... Val Loss: 3.1133
Epoch: 1/20... Step: 70... Loss: 3.1016... Val Loss: 3.1070
Epoch: 1/20... Step: 80... Loss: 3.1038... Val Loss: 3.0915
Epoch: 1/20... Step: 90... Loss: 3.0734... Val Loss: 3.0569
Epoch: 1/20... Step: 100... Loss: 3.0148... Val Loss: 2.9796
Epoch: 1/20... Step: 110... Loss: 2.9094... Val Loss: 2.8793
Epoch: 1/20... Step: 120... Loss: 2.8318... Val Loss: 2.8203
Epoch: 1/20... Step: 130... Loss: 2.7828... Val Loss: 2.7476
Epoch: 2/20... Step: 140... Loss: 2.7091... Val Loss: 2.6658
Epoch: 2/20... Step: 150... Loss: 2.6118... Val Loss: 2.5841
Epoch: 2/20... Step: 160... Loss: 2.5379... Val Loss: 2.5023
Epoch: 2/20... Step: 170... Loss:

Epoch: 10/20... Step: 1340... Loss: 1.4091... Val Loss: 1.4282
Epoch: 10/20... Step: 1350... Loss: 1.3976... Val Loss: 1.4256
Epoch: 10/20... Step: 1360... Loss: 1.3950... Val Loss: 1.4254
Epoch: 10/20... Step: 1370... Loss: 1.3921... Val Loss: 1.4225
Epoch: 10/20... Step: 1380... Loss: 1.4316... Val Loss: 1.4206
Epoch: 10/20... Step: 1390... Loss: 1.4340... Val Loss: 1.4210
Epoch: 11/20... Step: 1400... Loss: 1.4375... Val Loss: 1.4174
Epoch: 11/20... Step: 1410... Loss: 1.4463... Val Loss: 1.4178
Epoch: 11/20... Step: 1420... Loss: 1.4273... Val Loss: 1.4109
Epoch: 11/20... Step: 1430... Loss: 1.4042... Val Loss: 1.4172
Epoch: 11/20... Step: 1440... Loss: 1.4345... Val Loss: 1.4105
Epoch: 11/20... Step: 1450... Loss: 1.3577... Val Loss: 1.4102
Epoch: 11/20... Step: 1460... Loss: 1.3852... Val Loss: 1.4056
Epoch: 11/20... Step: 1470... Loss: 1.3701... Val Loss: 1.4073
Epoch: 11/20... Step: 1480... Loss: 1.3934... Val Loss: 1.4035
Epoch: 11/20... Step: 1490... Loss: 1.3857... Val Loss:

Epoch: 19/20... Step: 2640... Loss: 1.2514... Val Loss: 1.3102
Epoch: 20/20... Step: 2650... Loss: 1.2499... Val Loss: 1.3060
Epoch: 20/20... Step: 2660... Loss: 1.2494... Val Loss: 1.3122
Epoch: 20/20... Step: 2670... Loss: 1.2683... Val Loss: 1.3096
Epoch: 20/20... Step: 2680... Loss: 1.2459... Val Loss: 1.3094
Epoch: 20/20... Step: 2690... Loss: 1.2454... Val Loss: 1.3096
Epoch: 20/20... Step: 2700... Loss: 1.2499... Val Loss: 1.3063
Epoch: 20/20... Step: 2710... Loss: 1.2219... Val Loss: 1.3081
Epoch: 20/20... Step: 2720... Loss: 1.2226... Val Loss: 1.3066
Epoch: 20/20... Step: 2730... Loss: 1.2220... Val Loss: 1.3035
Epoch: 20/20... Step: 2740... Loss: 1.2202... Val Loss: 1.3017
Epoch: 20/20... Step: 2750... Loss: 1.2199... Val Loss: 1.3050
Epoch: 20/20... Step: 2760... Loss: 1.2171... Val Loss: 1.3023
Epoch: 20/20... Step: 2770... Loss: 1.2565... Val Loss: 1.3011
Epoch: 20/20... Step: 2780... Loss: 1.2699... Val Loss: 1.2999


In [14]:
# change the name, for saving multiple files
model_name = 'rnn_20_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

In [15]:
def predict(net, char, h=None, top_k=None):
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2char[char], h

In [16]:
def sample(net, size, prime='The', top_k=None):
        
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [23]:
print(sample(net, 1000, prime='Anna', top_k=4))

Anna, a complete of the conversation.

"I didn't know that the province of a long whole same to be done to meet you, and I will not be taken on the creak to
be anything."

"You say.

It was as a strick man think, I she will not
see him. To speak of intimace from me. A don't trie terrible to be about."

"Oh, yes! Well, and I wanted to did you and have all the commister," she said to himself. "I shill set to be soletions in his wife thanked it on the carrea to the
concert and
service, and so much friends is a sound of and something as
in the
servants.
This wors of calling
all times sense of a peasant and togith, that her face. They could never say about that to the sore of the same, that
he could not his love, and then we must had to go and see that there will be any of their sunshining, and shall be seeing her face in seeing them. At the marsh had been a second being so always and the painfess to the
study of the
moment of their cares and a lady
again, and he came into him and have thin

In [24]:
# Here we have loaded in a model that trained over 20 epochs `rnn_20_epoch.net`
with open('rnn_20_epoch.net', 'rb') as f:
    checkpoint = torch.load(f)
    
loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [27]:
# Sample using a loaded model
print(sample(loaded, 2000, prime="And Levin said", top_k=5))

And Levin said,
that a long while he had said that the
sick man and to her sorts of clothes, showing the continual
countess, and where this consticuanished to hume in any tark was
nurse.

Anna, to bring him an inspated on the crup of a clight,
trusing hands, and said, and that there he seemed to
show some side of the came
and doing a service. The stath of wind so and saying that she was always always been for the party and and were sertaing him from the prince, and she saw
a summer sight of the stretthe that he was dount, with which suddenly with his brother's capitill of the sofa instant the steps of which she was still an erection of her face
and the footman.

"I am not
giving the minute."

"No, it's a country sat difter, than all hands are any one alone. How all you must
be an hare. And how don't you know! I did not go on it...."

She wanted to have so much for her
husband; and was taking off the face, he sat the
staricies and said,
so went out into the driving with said of the
deat