In [1]:
import torch
from torch import nn
import numpy as np
import torch.nn.functional as F

## Load In Data

In [2]:
with open('data/anna.txt', 'r') as f:
    text = f.read()

In [3]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

## Tokenization

In [4]:
chars = tuple(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: integer for integer, ch in int2char.items()}

encoded = np.array([char2int[ch] for ch in text])

In [6]:
encoded[:100]

array([68, 73, 25, 15, 57, 61, 19,  9, 49, 66, 66, 66, 76, 25, 15, 15, 45,
        9, 47, 25,  0, 27, 63, 27, 61,  3,  9, 25, 19, 61,  9, 25, 63, 63,
        9, 25, 63, 27, 14, 61, 28,  9, 61, 79, 61, 19, 45,  9, 13, 36, 73,
       25, 15, 15, 45,  9, 47, 25,  0, 27, 63, 45,  9, 27,  3,  9, 13, 36,
       73, 25, 15, 15, 45,  9, 27, 36,  9, 27, 57,  3,  9,  6, 34, 36, 66,
       34, 25, 45, 65, 66, 66, 29, 79, 61, 19, 45, 57, 73, 27, 36])

## One hot encoding the data

In [30]:
def one_hot_encode(arr, num_labels):
    # declare the one hot array
    one_hot = np.zeros((arr.size, num_labels), dtype=np.float32)
    print(one_hot.shape)
    
    # fill up the appropriate place with 1
    print(np.arange(one_hot.shape[0]))
    print(arr.flatten())
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, num_labels))
    print(one_hot.shape)
    return one_hot

In [31]:
# check that the function works as expected
test_seq = np.array([[3, 5, 1]])
one_hot = one_hot_encode(test_seq, 8)

print(one_hot)

(3, 8)
[0 1 2]
[3 5 1]
(1, 3, 8)
[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [38]:
a = np.arange(12)
a.reshape(2,-1)

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11]])

In [69]:
def get_batches(arr, batch_size, seq_length):
    
    batch_size_total = batch_size * seq_length
    # Get the number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[: n_batches * batch_size_total]
    
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    print(arr)
    
    # Iterating over the batches using a window of size seq_length
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[: 0]
        
        yield x, y

In [70]:
batches = get_batches(encoded, 8, 50)
x, y = next(batches)

[[68 73 25 ... 15 61 19]
 [ 3  6 36 ... 47 19 27]
 [61 36 23 ... 73 25 15]
 ...
 [18 13  3 ... 27 18 73]
 [ 9 67 36 ... 25 57  9]
 [74 38 63 ... 57  6  9]]


In [71]:
# printing out the first 10 items in a sequence
print('x\n', x[:10, :10])
print('\ny\n', y[:10, :10])

x
 [[68 73 25 15 57 61 19  9 49 66]
 [ 3  6 36  9 57 73 25 57  9 25]
 [61 36 23  9  6 19  9 25  9 47]
 [ 3  9 57 73 61  9 18 73 27 61]
 [ 9  3 25 34  9 73 61 19  9 57]
 [18 13  3  3 27  6 36  9 25 36]
 [ 9 67 36 36 25  9 73 25 23  9]
 [74 38 63  6 36  3 14 45 65  9]]

y
 [[73 25 15 57 61 19  9 49 66 66]
 [ 6 36  9 57 73 25 57  9 25 57]
 [36 23  9  6 19  9 25  9 47  6]
 [ 9 57 73 61  9 18 73 27 61 47]
 [ 3 25 34  9 73 61 19  9 57 61]
 [13  3  3 27  6 36  9 25 36 23]
 [67 36 36 25  9 73 25 23  9  3]
 [38 63  6 36  3 14 45 65  9 81]]


In [72]:
train_on_gpu = torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU!')
else:
    print('No GPU is available, training on CPU!')

Training on GPU!


In [108]:
class CharRNN(nn.Module):
    def __init__(self, tokens, num_hidden=256, num_layers=2, drop_prob=0.25, lr=0.01):
        super().__init__()
        self.drop_prob = drop_prob
        self.lr=lr
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        
        # creating the necessary character dictionary
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii,ch in self.int2char.items()}
        
        # create the layers of the Model
        
        # LSTM layer
        self.LSTM = nn.LSTM(len(self.chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)
        
        # Dropot layer
        self.dropout = nn.Dropout(drop_prob)
        
        # define the fully connected layer
        self.fc = nn.Linear(num_hidden, len(self.chars))
        
    def forward(self, x, hidden):
        
        r_output, hidden  = self.LSTM(x, hidden)
        out = self.dropout(r_output)
        print('before contigous',out.shape)
        out = out.contiguous().view(-1, self.num_hidden)
        print('after contigous',out.shape)
        out = self.fc(out)
        
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        
        weight = next(self.parameters()).data
        
        if(train_on_gpu):
            hiddenstate_cellstate = (
            weight.new(self.num_layers, batch_size, self.num_hidden).zero_().cuda(),
                weight.new(self.num_layers, batch_size, self.num_hidden).zero_().cuda()
            )
            
        else:
            
            hiddenstate_cellstate = (
            weight.new(self.num_layers, batch_size, self.num_hidden).zero_(),
                weight.new(self.num_layers, batch_size, self.num_hidden).zero_()
            )
            
        return hiddenstate_cellstate
        

In [125]:
def train(model, data, epochs=10, batch_size=10, seq_length=50, lr = 0.01, clip=5, val_frac=0.1, print_every= 10):
        ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
        '''
        # take the model to training mode
        model.train()
        
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        
        
        # creating the training and validation datasets
        val_idx = int(len(data) * (1 - val_frac))
        train_data, valid_data = data[:val_idx], data[val_idx:]
        
        if(train_on_gpu):
            model.cuda() # we take the parameters to the gpu
            
        counter = 0
        num_chars = len(model.chars)
        
        # the epoch loop starsts
        for e in range(epochs):
            
            # initialize the hidden state
            h = model.init_hidden(batch_size)
            
            # we start the batch loop
            for x, y in get_batches(train_data, batch_size, seq_length):
                counter += 1
                
                # first step is to one hot encode the input data
                x = one_hot_encode(x, num_chars)
                
                # we convert the data to pytorch tensor
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
                
                # if cuda is available we take the tensors to GPU
                if(train_on_gpu):
                    inputs, targets = inputs.cuda(), targets.cuda()
                
                # we set the variable for the hidden state
                h = tuple([each.data for each in h])
                
                
                # then we zero out any gradients
                model.zero_grad()
                
                
                # get the output from the model
                output, h = model(inputs, h)
#                 print('output shape after lstm',output.shape)
#                 print('target shape',targets.shape)
#                 print('taget shape after modification', targets.view(batch_size*seq_length).long().shape)
                # calculate the lass
                loss = criterion(output, targets.view(batch_size * seq_length).long())
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), clip)
                optimizer.step()
                
                
                # loss stats
                if counter % print_every == 0:
                    
                    # validation loop
                    val_losses = []
                    
                    # going to evaluation mode here
                    model.eval()
                    
                    val_h = model.init_hidden(batch_size)
                    
                    for x,y in get_batches(valid_data, batch_size, seq_length):
                        x = one_hot_encode(x, num_chars)
                        
                        inputs, targets= torch.from_numpy(x), torch.from_numpy(y)
                        
                        val_h = tuple([each.data for each in val_h])
                        
                        if(train_on_gpu):
                            inputs, targets = inputs.cuda(), targets.cuda()
                        
                        output, h = model(inputs, val_h)
                        val_loss = criterion(output, targets.view(batch_size * seq_length).long())
                        val_losses.append(val_loss.item())
                    
                    # reseeting to train mode again
                    model.train()
                    
                    
                    print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
                    

In [126]:
## TODO: set your model hyperparameters
# define and print the net
n_hidden= 512
n_layers= 2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (LSTM): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.25)
  (dropout): Dropout(p=0.25, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [127]:
batch_size = 64
seq_length = 100
n_epochs = 20 # start small if you are just testing initial behavior

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

[[68 73 25 ... 25 79 61]
 [66 36 61 ...  6 36 25]
 [63  9 34 ... 36 23  9]
 ...
 [ 9 57  6 ... 25  9 24]
 [61 57 19 ...  9 13 15]
 [ 9 57  6 ... 36  9 73]]
(6400, 83)
[   0    1    2 ... 6397 6398 6399]
[68 73 25 ... 61 79 27]
(64, 100, 83)
before contigous torch.Size([64, 100, 512])
after contigous torch.Size([6400, 512])


RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`