In [4]:
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F

## Load In Data

In [5]:
with open('data/anna.txt', 'r') as f:
    text = f.read()

In [6]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

## Tokenization

In [7]:
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: integer for integer, ch in int2char.items()}

encoded = np.array([char2int[ch] for ch in text])

In [8]:
encoded[:100]

array([ 3, 72, 63, 47, 67, 57, 65, 23, 66, 35, 35, 35,  0, 63, 47, 47, 45,
       23, 43, 63, 33, 30, 68, 30, 57, 46, 23, 63, 65, 57, 23, 63, 68, 68,
       23, 63, 68, 30, 24, 57, 56, 23, 57, 25, 57, 65, 45, 23, 44, 52, 72,
       63, 47, 47, 45, 23, 43, 63, 33, 30, 68, 45, 23, 30, 46, 23, 44, 52,
       72, 63, 47, 47, 45, 23, 30, 52, 23, 30, 67, 46, 23, 40, 39, 52, 35,
       39, 63, 45, 76, 35, 35, 54, 25, 57, 65, 45, 67, 72, 30, 52])

## One hot encoding the data

In [13]:
def one_hot_encode(arr, num_labels):
    # declare the one hot array
    one_hot = np.zeros((arr.size, num_labels), dtype=np.float32)
    # print(one_hot.shape)
    
    # fill up the appropriate place with 1
    # print(np.arange(one_hot.shape[0]))
    # print(arr.flatten())
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, num_labels))
    # print(one_hot.shape)
    return one_hot

In [12]:
# check that the function works as expected
test_seq = np.array([[2, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [None]:
a = np.arange(12)
a.reshape(2,-1)

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11]])

In [23]:
def get_batches(arr, batch_size, seq_length):
    
    batch_size_total = batch_size * seq_length
    # Get the number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[: n_batches * batch_size_total]
    
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    #print(arr)
    
    # Iterating over the batches using a window of size seq_length
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        
        yield x, y

In [24]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

In [25]:
# printing out the first 10 items in a sequence
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[ 3 72 63 47 67 57 65 23 66 35]
 [46 40 52 23 67 72 63 67 23 63]
 [57 52 49 23 40 65 23 63 23 43]
 [46 23 67 72 57 23 18 72 30 57]
 [23 46 63 39 23 72 57 65 23 67]
 [18 44 46 46 30 40 52 23 63 52]
 [23 55 52 52 63 23 72 63 49 23]
 [53 28 68 40 52 46 24 45 76 23]]

y
 [[72 63 47 67 57 65 23 66 35 35]
 [40 52 23 67 72 63 67 23 63 67]
 [52 49 23 40 65 23 63 23 43 40]
 [23 67 72 57 23 18 72 30 57 43]
 [46 63 39 23 72 57 65 23 67 57]
 [44 46 46 30 40 52 23 63 52 49]
 [55 52 52 63 23 72 63 49 23 46]
 [28 68 40 52 46 24 45 76 23 51]]


In [26]:
train_on_gpu = torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU!')
else:
    print('No GPU is available, training on CPU!')

Training on GPU!


In [27]:
class CharRNN(nn.Module):
    def __init__(self, tokens, num_hidden=256, num_layers=2, drop_prob=0.25, lr=0.01):
        super().__init__()
        self.drop_prob = drop_prob
        self.lr=lr
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        
        # creating the necessary character dictionary
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii,ch in self.int2char.items()}
        
        # create the layers of the Model
        
        # LSTM layer
        self.LSTM = nn.LSTM(len(self.chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)
        
        # Dropot layer
        self.dropout = nn.Dropout(drop_prob)
        
        # define the fully connected layer
        self.fc = nn.Linear(num_hidden, len(self.chars))
        
    def forward(self, x, hidden):
        
        r_output, hidden  = self.LSTM(x, hidden)
        out = self.dropout(r_output)
        # print('before contigous',out.shape)
        out = out.contiguous().view(-1, self.num_hidden)
        # print('after contigous',out.shape)
        out = self.fc(out)
        
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        
        weight = next(self.parameters()).data
        
        if(train_on_gpu):
            hiddenstate_cellstate = (
            weight.new(self.num_layers, batch_size, self.num_hidden).zero_().cuda(),
                weight.new(self.num_layers, batch_size, self.num_hidden).zero_().cuda()
            )
            
        else:
            
            hiddenstate_cellstate = (
            weight.new(self.num_layers, batch_size, self.num_hidden).zero_(),
                weight.new(self.num_layers, batch_size, self.num_hidden).zero_()
            )
            
        return hiddenstate_cellstate
        

In [28]:
def train(model, data, epochs=10, batch_size=10, seq_length=50, lr = 0.01, clip=5, val_frac=0.1, print_every= 10):
        ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
        '''
        # take the model to training mode
        model.train()
        
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        
        
        # creating the training and validation datasets
        val_idx = int(len(data) * (1 - val_frac))
        train_data, valid_data = data[:val_idx], data[val_idx:]
        
        if(train_on_gpu):
            model.cuda() # we take the parameters to the gpu
            
        counter = 0
        num_chars = len(model.chars)
        
        # the epoch loop starsts
        for e in range(epochs):
            
            # initialize the hidden state
            h = model.init_hidden(batch_size)
            
            # we start the batch loop
            for x, y in get_batches(train_data, batch_size, seq_length):
                counter += 1
                
                # first step is to one hot encode the input data
                x = one_hot_encode(x, num_chars)
                
                # we convert the data to pytorch tensor
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
                
                # if cuda is available we take the tensors to GPU
                if(train_on_gpu):
                    inputs, targets = inputs.cuda(), targets.cuda()
                
                # we set the variable for the hidden state
                h = tuple([each.data for each in h])
                
                
                # then we zero out any gradients
                model.zero_grad()
                
                
                # get the output from the model
                output, h = model(inputs, h)
#                 print('output shape after lstm',output.shape)
#                 print('target shape',targets.shape)
#                 print('taget shape after modification', targets.view(batch_size*seq_length).long().shape)
                # calculate the lass
                loss = criterion(output, targets.view(batch_size * seq_length).long())
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()
                
                
                # loss stats
                if counter % print_every == 0:
                    
                    # validation loop
                    val_losses = []
                    
                    # going to evaluation mode here
                    model.eval()
                    
                    val_h = model.init_hidden(batch_size)
                    
                    for x,y in get_batches(valid_data, batch_size, seq_length):
                        x = one_hot_encode(x, num_chars)
                        
                        inputs, targets= torch.from_numpy(x), torch.from_numpy(y)
                        
                        val_h = tuple([each.data for each in val_h])
                        
                        if(train_on_gpu):
                            inputs, targets = inputs.cuda(), targets.cuda()
                        
                        output, h = model(inputs, val_h)
                        val_loss = criterion(output, targets.view(batch_size * seq_length).long())
                        val_losses.append(val_loss.item())
                    
                    # reseeting to train mode again
                    model.train()
                    
                    
                    print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
                    

In [29]:
## TODO: set your model hyperparameters
# define and print the net
n_hidden= 512
n_layers= 2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (LSTM): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.25)
  (dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [30]:
batch_size = 64
seq_length = 100
n_epochs = 20 # start small if you are just testing initial behavior

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

Epoch: 1/20... Step: 10... Loss: 3.2202... Val Loss: 3.2502
Epoch: 1/20... Step: 20... Loss: 3.1351... Val Loss: 3.1678
Epoch: 1/20... Step: 30... Loss: 3.1204... Val Loss: 3.1526
Epoch: 1/20... Step: 40... Loss: 3.1380... Val Loss: 3.1477
Epoch: 1/20... Step: 50... Loss: 3.1210... Val Loss: 3.1466
Epoch: 1/20... Step: 60... Loss: 3.1316... Val Loss: 3.1458
Epoch: 1/20... Step: 70... Loss: 3.1263... Val Loss: 3.1437
Epoch: 1/20... Step: 80... Loss: 3.1266... Val Loss: 3.1437
Epoch: 1/20... Step: 90... Loss: 3.1041... Val Loss: 3.1368
Epoch: 1/20... Step: 100... Loss: 3.1139... Val Loss: 3.1294
Epoch: 1/20... Step: 110... Loss: 3.0801... Val Loss: 3.1129
Epoch: 1/20... Step: 120... Loss: 3.0545... Val Loss: 3.0712
Epoch: 1/20... Step: 130... Loss: 2.9953... Val Loss: 3.0338
Epoch: 1/20... Step: 140... Loss: 2.9145... Val Loss: 2.9484
Epoch: 1/20... Step: 150... Loss: 2.8168... Val Loss: 2.9281
Epoch: 1/20... Step: 160... Loss: 2.7206... Val Loss: 2.7850
Epoch: 1/20... Step: 170... Loss:

In [33]:
# change the name, for saving multiple files
model_name = 'rnn_20_epoch.net'

checkpoint = {'n_hidden': net.num_hidden,
              'n_layers': net.num_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

# Top K prediciton

In [53]:
def predict(net, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        # print(x)
        x = one_hot_encode(x, len(net.chars))
        # print(x)
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)
        # print('out.shape: ', out.shape)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        # print('P shape: ', p.shape)
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            # print('p:',p.shape)
            # print('top_ch',top_ch)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2char[char], h

In [49]:
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    # chars = [ch for ch in prime]
    h = net.init_hidden(1)
    char, h = predict(net, 'T', h, top_k=5)


[[77]]
[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
   0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]]
out.shape:  torch.Size([1, 83])
P shape:  torch.Size([1, 83])
p: torch.Size([1, 5])
top_ch tensor([[44, 72, 63, 57, 67]])


In [57]:
def sample(net, size, prime='The', top_k=None):
        
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    print(chars)
    
    # Now pass in the previous character and get a new one
    for ii in range(1500):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [58]:
print(sample(net, 1000, prime='love', top_k=5))

['l', 'o', 'v', 'e', '!']
love! It's night.
I see you are, and your decision and all meanings."

These letter in the wedding, when she went up to the stopping out of the
country, taking his hand, sent him to secort, which had so set, to
service him abone to too with the peasants, but he had stopped him
with the baby. But when the princess and Anna had been taking off the
conversation with her husband with a bad horse, and the sound of talk of the
presching chair of the short head and still maked his hand and shook his
head on the room.



 Chapter 15


At this moment it seemed to him that some soul to see the proofs without the
stalls. He was not a cheat former in the marsh; and took
off his chair, and studging the sun of some service. The stern
way there was another word, the chief seconds of this considerations
of her husband, she was not made and make up his heart, her shame of the sun was stood at
her, and again she did not know, that this was harrer of happiness and was she too,
t