In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

In [2]:
with open('data/anna.txt', 'r') as f:
    text = f.read()

In [3]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [4]:
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text
encoded = np.array([char2int[ch] for ch in text])
encoded

array([53, 67, 61, ..., 30,  8, 47])

In [5]:
len(encoded), len(text)

(1985223, 1985223)

In [6]:
def one_hot_encode(arr, n_labels):
    
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    print(one_hot)
    print(f'===================================')
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    print(one_hot)
    print(f'=============={one_hot.shape}=====================')
    
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    print(one_hot.shape)
    
    return one_hot

In [7]:
test_seq = np.array([[3, 5, 1]])

one_hot = one_hot_encode(test_seq, 8)
print(one_hot)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]
[[0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]]
(1, 3, 8)
[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [49]:
def get_batches(arr, batch_size, seq_length):
    
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    batch_size_total = batch_size * seq_length
    ## TODO: Get the number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    ## TODO: Keep only enough characters to make full batches
    arr = arr[:n_batches*batch_size_total]
    
    ## TODO: Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    ## TODO: Iterate over the batches using a window of size seq_length
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n + seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

In [50]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [51]:
x

array([[53, 67, 61, 12, 32, 63, 60, 68, 71, 47, 47, 47, 46, 61, 12, 12,
        50, 68,  4, 61, 22,  5,  0,  5, 63, 30, 68, 61, 60, 63, 68, 61,
         0,  0, 68, 61,  0,  5, 44, 63, 55, 68, 63, 26, 63, 60, 50, 68,
        57, 45],
       [30, 18, 45, 68, 32, 67, 61, 32, 68, 61, 32, 32, 60, 61, 43, 32,
        63, 64, 68, 67, 63, 60, 68, 61, 32, 32, 63, 45, 32,  5, 18, 45,
        68, 75, 61, 30, 68, 67, 63, 60, 68, 67, 57, 30,  7, 61, 45, 64,
         8, 68],
       [63, 45, 64, 68, 18, 60, 68, 61, 68,  4, 18, 63, 65, 68, 67, 63,
        68, 61, 26, 18,  5, 64, 63, 64, 68, 67,  5, 30, 68,  4, 61, 32,
        67, 63, 60,  8, 68, 46, 63, 47,  0, 18, 18, 44, 63, 64, 68, 60,
        18, 57],
       [30, 68, 32, 67, 63, 68, 43, 67,  5, 63,  4, 68, 32, 67, 18, 57,
        48, 67, 68, 67,  5, 64, 64, 63, 45, 47,  5, 45, 32, 63, 60, 63,
        30, 32, 68, 18,  4, 68, 67,  5, 30, 68,  0,  5,  4, 63, 65, 68,
        18,  4],
       [68, 30, 61, 75, 68, 67, 63, 60, 68, 32, 63, 61, 60, 37, 30, 

In [52]:
y

array([[67, 61, 12, 32, 63, 60, 68, 71, 47, 47, 47, 46, 61, 12, 12, 50,
        68,  4, 61, 22,  5,  0,  5, 63, 30, 68, 61, 60, 63, 68, 61,  0,
         0, 68, 61,  0,  5, 44, 63, 55, 68, 63, 26, 63, 60, 50, 68, 57,
        45, 67],
       [18, 45, 68, 32, 67, 61, 32, 68, 61, 32, 32, 60, 61, 43, 32, 63,
        64, 68, 67, 63, 60, 68, 61, 32, 32, 63, 45, 32,  5, 18, 45, 68,
        75, 61, 30, 68, 67, 63, 60, 68, 67, 57, 30,  7, 61, 45, 64,  8,
        68,  9],
       [45, 64, 68, 18, 60, 68, 61, 68,  4, 18, 63, 65, 68, 67, 63, 68,
        61, 26, 18,  5, 64, 63, 64, 68, 67,  5, 30, 68,  4, 61, 32, 67,
        63, 60,  8, 68, 46, 63, 47,  0, 18, 18, 44, 63, 64, 68, 60, 18,
        57, 45],
       [68, 32, 67, 63, 68, 43, 67,  5, 63,  4, 68, 32, 67, 18, 57, 48,
        67, 68, 67,  5, 64, 64, 63, 45, 47,  5, 45, 32, 63, 60, 63, 30,
        32, 68, 18,  4, 68, 67,  5, 30, 68,  0,  5,  4, 63, 65, 68, 18,
         4, 68],
       [30, 61, 75, 68, 67, 63, 60, 68, 32, 63, 61, 60, 37, 30, 32, 

In [53]:
x.shape, y.shape

((8, 50), (8, 50))

In [54]:
train_on_gpu = torch.cuda.is_available()

if train_on_gpu:
    print(f'GPU is available')
else:
    print(f'GPU is not available')

GPU is not available


In [62]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2,
                 drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.lr = lr
        self.n_hidden = n_hidden
        
        #creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        ## TODO: define the layer of the model
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(drop_prob)
        
        self.fc = nn.Linear(n_hidden, len(self.chars))
        
    def forward(self, x, hidden):
        
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
            
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
            
        return hidden

In [69]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, 
          lr=0.001, clip=5, val_frac=0.1, print_every=10):
    
    net.train()
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if train_on_gpu:
        net.cuda()
        
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            # One hot the data and turn them into tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if (train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()
                
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
            
            net.zero_grad()
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                
                for x, y in get_batches(val_data, batch_size, seq_length):
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    #creating new variables for the hidden state, otherwise
                    #we'd backpropagate through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if train_on_gpu:
                        inputs, targets = inputs.cuda(), targets.cuda()
                        
                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                    
                    val_losses.append(val_loss.item())
                    
                net.train()
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

In [70]:
# define the model
n_hidden = 512
n_layers = 2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [71]:
batch_size = 128
seq_length = 100
n_epochs = 20

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size,
      seq_length=seq_length, lr=0.001, print_every=2)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(128, 100, 83)




[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(128, 100, 83)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
(128, 100, 83)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(128, 100, 83)
[[0. 0. 0. ... 0. 0

KeyboardInterrupt: 