In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

np.random.seed(42)

In [2]:
with open('anna.txt', 'r') as f:
    text = f.read()

In [3]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

# Tokenization

In [4]:
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])

In [5]:
encoded[:100]

array([ 1, 70,  2, 30,  7, 76,  0, 67, 37, 21, 21, 21, 19,  2, 30, 30, 57,
       67,  5,  2, 16, 69, 64, 69, 76, 51, 67,  2,  0, 76, 67,  2, 64, 64,
       67,  2, 64, 69, 31, 76, 65, 67, 76, 62, 76,  0, 57, 67, 75, 72, 70,
        2, 30, 30, 57, 67,  5,  2, 16, 69, 64, 57, 67, 69, 51, 67, 75, 72,
       70,  2, 30, 30, 57, 67, 69, 72, 67, 69,  7, 51, 67, 10, 36, 72, 21,
       36,  2, 57, 74, 21, 21, 50, 62, 76,  0, 57,  7, 70, 69, 72])

# Pre-process the data

In [6]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    
    return one_hot

In [7]:
# check the function works well

test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [8]:
def get_batches(arr, batch_size, seq_length):
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [9]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [10]:
print('x\n', x[:10, :10])
print('y\n', y[:10, :10])

x
 [[ 1 70  2 30  7 76  0 67 37 21]
 [51 10 72 67  7 70  2  7 67  2]
 [76 72 12 67 10  0 67  2 67  5]
 [51 67  7 70 76 67 78 70 69 76]
 [67 51  2 36 67 70 76  0 67  7]
 [78 75 51 51 69 10 72 67  2 72]
 [67 22 72 72  2 67 70  2 12 67]
 [17 42 64 10 72 51 31 57 74 67]]
y
 [[70  2 30  7 76  0 67 37 21 21]
 [10 72 67  7 70  2  7 67  2  7]
 [72 12 67 10  0 67  2 67  5 10]
 [67  7 70 76 67 78 70 69 76  5]
 [51  2 36 67 70 76  0 67  7 76]
 [75 51 51 69 10 72 67  2 72 12]
 [22 72 72  2 67 70  2 12 67 51]
 [42 64 10 72 51 31 57 74 67 32]]


In [11]:
train_on_gpu = torch.cuda.is_available()
if (train_on_gpu):
    print('Training on GPU')
else:
    print('Training on CPU')

Training on GPU


In [12]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        ## TODO: define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        ## TODO: define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        ## TODO: define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        ## TODO: Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)
        
        ## TODO: pass through a dropout layer
        out = self.dropout(r_output)
        
        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)
        
        ## TODO: put x through the fully-connected layer
        out = self.fc(out)
        
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

# Training

In [13]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if(train_on_gpu):
        net.cuda()
    
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                
                    val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterationg through validation data
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [15]:
n_hidden=512
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [16]:

batch_size = 128
seq_length = 100
n_epochs = 20

In [17]:
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

Epoch: 1/20... Step: 10... Loss: 3.2609... Val Loss: 3.2012
Epoch: 1/20... Step: 20... Loss: 3.1542... Val Loss: 3.1350
Epoch: 1/20... Step: 30... Loss: 3.1447... Val Loss: 3.1233
Epoch: 1/20... Step: 40... Loss: 3.1158... Val Loss: 3.1196
Epoch: 1/20... Step: 50... Loss: 3.1432... Val Loss: 3.1175
Epoch: 1/20... Step: 60... Loss: 3.1206... Val Loss: 3.1155
Epoch: 1/20... Step: 70... Loss: 3.1111... Val Loss: 3.1133
Epoch: 1/20... Step: 80... Loss: 3.1174... Val Loss: 3.1070
Epoch: 1/20... Step: 90... Loss: 3.1073... Val Loss: 3.0915
Epoch: 1/20... Step: 100... Loss: 3.0685... Val Loss: 3.0537
Epoch: 1/20... Step: 110... Loss: 3.0101... Val Loss: 2.9868
Epoch: 1/20... Step: 120... Loss: 2.8713... Val Loss: 2.8525
Epoch: 1/20... Step: 130... Loss: 2.8071... Val Loss: 2.7745
Epoch: 2/20... Step: 140... Loss: 2.6802... Val Loss: 2.6289
Epoch: 2/20... Step: 150... Loss: 2.6046... Val Loss: 2.5412
Epoch: 2/20... Step: 160... Loss: 2.5342... Val Loss: 2.4892
Epoch: 2/20... Step: 170... Loss:

# Checkpoint

In [18]:
model_name = 'rnn_20_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

# Prediction

In [19]:
def predict(net, char, h=None, top_k=None):
    x = np.array([[net.char2int[char]]])
    x = one_hot_encode(x, len(net.chars))
    inputs = torch.from_numpy(x)

    if train_on_gpu:
        inputs = inputs.cuda()

    h = tuple([each.data for each in h])
    out, h = net(inputs, h)

    p = F.softmax(out, dim=1).data
    if train_on_gpu:
        p = p.cpu()

    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().squeeze()

    p = p.numpy().squeeze()
    char = np.random.choice(top_ch, p=p/p.sum())

    return net.int2char[char], h

In [20]:
def sample(net, size, prime='The', top_k=None):
    if train_on_gpu:
        net.cuda()
    else:
        net.cpu()
    
    net.eval()

    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)

    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [23]:
print(sample(net, 1000, prime='Anna', top_k=5))

Anna
Alexandrovna's health of all she was simply to him. He saw
at that tears with the possible weading, and whoseled with the
precious time to the chairs in his house.

"I had all more all more because they make him to think about have
it at once to speak to her," he said, at the state of his
study, and the day work of the country had said his head and struggle
of her.

"What is to be the carringer?" thought Levin, sighed to his
hands, he had seriously asking a bread. Tringing to this
trees. A signisication had bare arminated, and he were
all sides and transears, but as they had to do the sould of
this are complete chueling, and he had no stayed of his head and
considered this shaming, and so as his candle and he foonmed the profised of his
world, but the straish on stending over weads and her face.

"I'll see him!"

"No doubt it can be interested," said Anna, at the point of the
corridor, and he heard with his walk of a chalk to the ball
wait and conceaded angry, and smiling, stord a

# Loading a checkpoint

In [25]:
with open('rnn_20_epoch.net', 'rb') as f:
    checkpoint = torch.load(f)
    
loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

<All keys matched successfully>

In [26]:
print(sample(loaded, 2000, top_k=5, prime="And Nike said"))

And Nike said he
had now serious to him all the point. In the works was so much a man
walked out of the same start.

After a smell of an end of a consideration of sense of the cracks. The
cricking hat had not been that the whole means was so fal at the
point of anything to a sign. And had been as soon and to his side
of since he was so delightful, that she had been so life, his mistres,
tho good nature and all hereess and herestly and husband and simple of
attitude. She had an idea would not breath the same
at the same time in the station of the perform, and he went up
herself at the signess and at once that she was so good to all
the person he had been saying, he had the sick man, had that it was
never seen. She stood at him.

"Yes, you can't be in acquaintance too. I'm all to help the streagon
of the mare. This some ofting me, and see her in the stairs,
it could not conscious of his sun had setted herself and step and
seeing anything, that you will strange his sense of most forest to