# How to teach an RNN to count chars

In [1]:
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
from torch.autograd import Variable
from pprint import pprint

## Create a dataset

In [2]:
def make_string(len=10):
    """Build the strings ``a^n X b^n .`` for n = 1 .. ``len``.

    Each string is n copies of 'a', the separator 'X', n copies of 'b'
    and a closing '.'.

    Args:
        len: largest n to generate. The name shadows the builtin inside
            this function; it is kept because callers pass it by keyword.

    Returns:
        List with one string per n, ordered by increasing n (empty when
        ``len`` is smaller than 1).
    """
    return ['a' * n + 'X' + 'b' * n + '.' for n in range(1, len + 1)]

def make_tensor(data_string, vocab_encoder):
    """Encode strings as one zero-padded tensor of per-character vectors.

    Args:
        data_string: list of strings to encode.
        vocab_encoder: mapping from a character to its encoding vector.

    Returns:
        Tensor of size (len(data_string), longest string length,
        len(vocab_encoder)). Positions past the end of a string stay zero.
    """
    longest = max(len(text) for text in data_string)
    encoded = torch.zeros(len(data_string), longest, len(vocab_encoder))
    for row, text in enumerate(data_string):
        for col, symbol in enumerate(text):
            encoded[row, col] = vocab_encoder[symbol]
    return encoded

In [6]:
vocab = ['a', 'X', 'b', '.']
# Generate the strings for n = 1..10 and leave out positions 4 and 6
# (the n=5 and n=7 strings), so 8 strings remain.
# NOTE(review): presumably these two are held out on purpose; the notebook
# does not say why -- confirm.
data_string = [
    string
    for position, string in enumerate(make_string(len=10))
    if position not in (4, 6)
]
pprint(data_string)

['aXb.',
 'aaXbb.',
 'aaaXbbb.',
 'aaaaXbbbb.',
 'aaaaaaXbbbbbb.',
 'aaaaaaaaXbbbbbbbb.',
 'aaaaaaaaaXbbbbbbbbb.',
 'aaaaaaaaaaXbbbbbbbbbb.']


We must convert our vocabulary to mathematical objects (vectors, i.e. tensors with `dim() == 1`) so that it can be used by our RNN.

Thus, we use a one-hot encoding: each character is mapped to a vector that is 1 at the character's own index and 0 everywhere else.

In [7]:
# One-hot vector per symbol. The hot index follows the order
# 'a', 'X', 'b', '.', i.e. the symbol's position in `vocab`.
vocab_encoder = {
    symbol: one_hot.clone()
    for symbol, one_hot in zip(['a', 'X', 'b', '.'], torch.eye(4))
}

def output_decoder(output, vocab):
    """Return the vocabulary symbol with the highest score in ``output``.

    Args:
        output: 2-D model output (rows x one score per symbol); only the
            first row is decoded.
        vocab: list of symbols, indexed like the columns of ``output``.

    Returns:
        The element of ``vocab`` at the arg-max column of the first row.
    """
    _, argmax = output.data.max(1)
    # Flatten before indexing: depending on the PyTorch version, max(1)
    # returns the indices with or without the reduced dimension, and
    # ``argmax[0][0]`` only works when that dimension is kept.
    best_index = int(argmax.view(-1)[0])
    return vocab[best_index]

# Encode the whole dataset as one zero-padded tensor of size
# (number of strings, longest string length, vocabulary size).
data_tensor = make_tensor(data_string, vocab_encoder)
print(data_tensor)


(0 ,.,.) = 
   1   0   0   0
   0   1   0   0
   0   0   1   0
   0   0   0   1
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0

(1 ,.,.) = 
   1   0   0   0
   1   0   0   0
   0   1   0   0
   0   0   1   0
   0   0   1   0
   0   0   0   1
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0

(2 ,.,.) = 
   1   0   0   0
   1   0   0   0
   1   0   0   0
   0   1   0   0
   0   0   1   0
   0   0   1   0
   0   0   1   0
   0   0   0   1
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0 

For convenience, we are going to use the utility classes from `torch.utils.data` to wrap our dataset.

In [8]:
class Dataset(data.Dataset):
    """Next-character prediction pairs built from a padded, encoded tensor."""

    def __init__(self, data_tensor):
        """Store ``data_tensor``; its first dimension indexes the sequences."""
        super().__init__()
        self.data_tensor = data_tensor

    def __len__(self):
        """Number of sequences, i.e. the size of the first dimension."""
        return self.data_tensor.size(0)

    def __getitem__(self, index):
        """Return ``(input, target)`` for the sequence at ``index``.

        ``target`` has the same size as ``input`` and holds the input
        shifted one step earlier: row t of the target is row t+1 of the
        input (the next char to predict). Its last row is left at zero.
        """
        sequence = self.data_tensor[index]
        next_chars = torch.zeros(sequence.size())
        next_chars[:-1] = sequence[1:]
        return sequence, next_chars

In [9]:
# Sanity check: index the dataset directly and print every
# (input, target) pair.
dataset = Dataset(data_tensor)
for i, (input, target) in enumerate(dataset):
    print(input, target)


    1     0     0     0
    0     1     0     0
    0     0     1     0
    0     0     0     1
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
[torch.FloatTensor of size 22x4]
 
    0     1     0     0
    0     0     1     0
    0     0     0     1
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    0     0     0     0
    

In [10]:
# Wrap the dataset in a DataLoader: with batch_size=1 every batch holds a
# single sequence (the tensors gain a leading dimension of 1), and
# shuffle=True draws the sequences in random order.
dataloader = data.DataLoader(dataset, batch_size=1, shuffle=True)
for i, (input, target) in enumerate(dataloader):
    print(input, target)


(0 ,.,.) = 
   1   0   0   0
   1   0   0   0
   0   1   0   0
   0   0   1   0
   0   0   1   0
   0   0   0   1
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
[torch.FloatTensor of size 1x22x4]
 
(0 ,.,.) = 
   1   0   0   0
   0   1   0   0
   0   0   1   0
   0   0   1   0
   0   0   0   1
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
   0   0   0   0
[torch.FloatTensor of size 1x22x4]


(0 ,.,.) = 
   1   0   0   0
   1   0   0   0
   1   0   0   0
   0   1   0   0
   0   0   1   0
   0   0   1   0
   0   0   1   0
   0   0   0   1
   0 

## Create a model

We want an RNN that takes a vector representing a char ('a', 'b', 'X' or '.') and outputs a distribution over the vocabulary (i.e. a probability vector of size 4).

Thus, we create a model which has:

- a **recurrent cell** to compute the new state from the current input and the old state,
- a **linear module** (weight, bias) to compute the output vector from the new state,
- a **softmax module** to compute the output probabilities from the output vector.

In [11]:
class RNNCell(nn.Module):
    """One recurrent step: GRU cell -> linear layer -> softmax.

    Attributes:
        input_size, state_size, output_size: the sizes given at construction.
        rnn_cell: ``nn.GRUCell(input_size, state_size)``.
        linear: ``nn.Linear(state_size, output_size)``.
        softmax: ``nn.Softmax()``.
    """

    def __init__(self, input_size, state_size, output_size):
        super().__init__()
        self.input_size, self.state_size, self.output_size = (
            input_size, state_size, output_size)
        self.rnn_cell = nn.GRUCell(input_size, state_size)
        self.linear = nn.Linear(state_size, output_size)
        self.softmax = nn.Softmax()

    def forward(self, input, state):
        """Advance the cell by one time step.

        Args:
            input: current input, fed to the GRU cell.
            state: previous hidden state, given to the GRU cell as ``hx``.

        Returns:
            ``(output, state)``: the softmax of the linear projection of
            the new state, and the new state itself.
        """
        new_state = self.rnn_cell(input, state)
        probabilities = self.softmax(self.linear(new_state))
        return probabilities, new_state
    
# Input and output sizes equal the vocabulary size (one-hot char in, one
# score per char out); the hidden state has 30 units.
model = RNNCell(len(vocab), 30, len(vocab))
print(model)
pprint(model.state_dict())

RNNCell (
  (rnn_cell): GRUCell(4, 30)
  (linear): Linear (30 -> 4)
  (softmax): Softmax ()
)
OrderedDict([('rnn_cell.weight_ih',
              
-0.1282  0.0630 -0.0753 -0.1236
-0.0173 -0.0657  0.0838 -0.1129
-0.0865 -0.0858  0.1532 -0.1579
-0.1368 -0.1708  0.0898  0.0283
-0.0299  0.0977  0.1377 -0.1317
 0.0061  0.1825 -0.0579  0.1820
 0.1714  0.1094  0.1376 -0.0618
-0.0895 -0.1296 -0.1571  0.0907
 0.0039 -0.0358 -0.1163  0.0937
 0.1686  0.0469 -0.1674 -0.1068
 0.1700  0.0659 -0.0446 -0.1098
 0.0658 -0.0213  0.0457 -0.0140
 0.0633 -0.0917 -0.0563  0.0525
 0.0145  0.0535 -0.0859 -0.1672
-0.1202  0.0300  0.0064  0.0019
 0.0393  0.1528  0.0758 -0.0361
-0.0302  0.0223 -0.0536  0.0292
 0.0935 -0.0404  0.1824 -0.0091
 0.0031 -0.1077 -0.0299 -0.0878
 0.0803 -0.1629 -0.1078 -0.0294
-0.0129 -0.0849 -0.1166  0.0094
-0.1449  0.1165 -0.0739 -0.0860
 0.0599 -0.1818  0.0064 -0.0210
-0.1378  0.0331 -0.0129  0.1719
-0.0402 -0.1532  0.0835 -0.1178
 0.0654  0.0289 -0.0861 -0.1076
-0.1451  0.1701 -0.0682

In [None]:
?nn.RNNCell

In [None]:
?nn.Linear

In [None]:
?nn.Softmax

## Choose a loss function

We want a loss function that produces an error value from the output of the model and the expected target (the loss function must be differentiable). This error value will be backpropagated through the model to compute the derivatives (gradients) of the model parameters.

For the sake of this tutorial, we will choose MSE (mean squared error), although NLL (negative log-likelihood) is the usual choice for classification tasks.

In [12]:
# loss = nn.CrossEntropyLoss()
# Mean squared error; train_epoch applies it between the model output and
# the target row of the same time step.
loss = nn.MSELoss()

In [None]:
?nn.MSELoss

In [None]:
?nn.NLLLoss

In [None]:
?nn.CrossEntropyLoss

## Choose an optimizer

After having computed the gradients of the model parameters, we want an optimizer to update the model parameters by a certain amount (controlled by the learning rate).

I personally find Adam easier to tune than classical SGD (stochastic gradient descent). Thus, we will use it in this tutorial.

In [13]:
#optimizer = optim.SGD(model.parameters(), lr=0.00001, momentum=0.7)
# Adam over all model parameters with a learning rate of 1e-5.
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [None]:
?optim.SGD

In [None]:
?optim.Adam

## Train the RNN

In [14]:
def train_epoch(dataloader, model, loss, optimizer):
    """Run one pass over ``dataloader``, updating ``model`` after each sequence.

    Args:
        dataloader: iterable of ``(input, target)`` batches. Each batch is
            assumed to hold a single sequence (batch size 1), indexed as
            (1, time step, vocabulary).
        model: module called as ``model(input_t, state)`` that returns
            ``(output, state)``; it must expose ``state_size``.
        loss: callable ``loss(output, target_t)`` returning the step error.
        optimizer: optimizer that updates the model parameters.

    Returns:
        float: the sum, over the sequences, of their mean per-step error.
        Sequences without any scored step are skipped.
    """
    model.train()

    error_epoch = 0.0

    # iterate over one sequence at a time
    for input_data, target_data in dataloader:

        # convert data to variable for torch computational graph
        input = Variable(input_data)
        target = Variable(target_data)
        # every sequence starts from a zero hidden state
        state = Variable(torch.zeros(1, model.state_size))

        # initialize the error to 0
        error_seq = Variable(torch.zeros(1))

        # iterate over the sequence
        seq_len = 0
        for t in range(input.size(1)):
            input_t = input[:, t]

            # an all-zero row is padding: the sequence is over
            if input_t.data.sum() == 0:
                break

            # compute the char at time t (model forward)
            output, state = model(input_t, state)

            # compute the error at time t (loss forward), only when the
            # input has a 1 at index 2 or at index 3
            # NOTE(review): the mask looks at the *input* row, so a step
            # whose target row is all zeros can be scored too; confirm this
            # is intended rather than a check on the target row.
            if input_t.data[0][2] == 1 or input_t.data[0][3] == 1:
                error_seq = error_seq + loss(output, target[:, t])
                seq_len += 1

        # No step was scored: there is nothing to average or to
        # backpropagate (dividing by 0 and calling backward() would fail).
        if seq_len == 0:
            continue

        # takes the sequence length in count (average)
        error_seq = error_seq / seq_len
        # float() keeps the running total a plain Python number whatever
        # type ``.data[0]`` yields in the installed PyTorch version
        error_epoch += float(error_seq.data[0])

        # compute the gradients over the computational graph
        # and update the model parameters
        optimizer.zero_grad()
        error_seq.backward()
        optimizer.step()
    return error_epoch
        

In [16]:
# Train for 2000 epochs and print the value returned by train_epoch
# every 100 epochs.
for epoch in range(2000):
    error = train_epoch(dataloader, model, loss, optimizer)
    if epoch%100==0:
        print('epoch: {}\t error: {}'.format(epoch, error))

epoch: 0	 error: 0.6059910617768764
epoch: 100	 error: 0.587552335113287
epoch: 200	 error: 0.5692928358912468
epoch: 300	 error: 0.5515261851251125
epoch: 400	 error: 0.5344775877892971
epoch: 500	 error: 0.5183918662369251
epoch: 600	 error: 0.5031605623662472
epoch: 700	 error: 0.48881886526942253
epoch: 800	 error: 0.47523349337279797
epoch: 900	 error: 0.462339760735631
epoch: 1000	 error: 0.450111897662282
epoch: 1100	 error: 0.4384175520390272
epoch: 1200	 error: 0.42724326625466347
epoch: 1300	 error: 0.41658078506588936
epoch: 1400	 error: 0.40642048977315426
epoch: 1500	 error: 0.39678394608199596
epoch: 1600	 error: 0.3874529730528593
epoch: 1700	 error: 0.3786028064787388
epoch: 1800	 error: 0.37034676037728786
epoch: 1900	 error: 0.3620837051421404


## Try out the model

In [25]:
# Start from a zero hidden state and switch the model to evaluation mode.
state = Variable(torch.zeros(1, model.state_size), requires_grad=False)
model.eval()

# Number of 'a' (and then 'b') chars fed to the model; 11 is one more than
# the longest string produced by make_string(len=10).
nb_a = 11

# Feed the 'a's one at a time, carrying the hidden state along. The decoded
# prediction is computed but only the input char is printed here.
for i in range(nb_a):
    input_char = 'a'
    input = Variable(vocab_encoder[input_char].view(1, -1), requires_grad=False)
    output, state = model(input, state)
    output_char = output_decoder(output, vocab)
    #print("input->output\t{}->{}".format(input_char, output_char))
    print("input->output\t{}".format(input_char))
    
# Feed the separator and show the prediction with its probabilities.
input_char = 'X'
input = Variable(vocab_encoder[input_char].view(1, -1), requires_grad=False)
output, state = model(input, state)
output_char = output_decoder(output, vocab)
print("input->output\t{}: {}->{}".format(1, input_char, output_char))
print(output.data)

# Feed the 'b's; the printed step counter continues from the 'X' step
# (hence i+2).
for i in range(nb_a):
    input_char = 'b'
    input = Variable(vocab_encoder[input_char].view(1, -1), requires_grad=False)
    output, state = model(input, state)
    output_char = output_decoder(output, vocab)
    print("input->output\t{}: {}->{}".format(i+2, input_char, output_char))
    print(output.data)




input->output	a
input->output	a
input->output	a
input->output	a
input->output	a
input->output	a
input->output	a
input->output	a
input->output	a
input->output	a
input->output	a
input->output	1: X->b

 0.0000  0.0000  0.9988  0.0012
[torch.FloatTensor of size 1x4]

input->output	2: b->b

 0.0001  0.0000  0.9980  0.0019
[torch.FloatTensor of size 1x4]

input->output	3: b->b

 0.0001  0.0001  0.9969  0.0029
[torch.FloatTensor of size 1x4]

input->output	4: b->b

 0.0001  0.0001  0.9951  0.0047
[torch.FloatTensor of size 1x4]

input->output	5: b->b

 0.0002  0.0002  0.9918  0.0077
[torch.FloatTensor of size 1x4]

input->output	6: b->b

 0.0004  0.0004  0.9854  0.0137
[torch.FloatTensor of size 1x4]

input->output	7: b->b

 0.0010  0.0008  0.9710  0.0273
[torch.FloatTensor of size 1x4]

input->output	8: b->b

 0.0025  0.0021  0.9330  0.0624
[torch.FloatTensor of size 1x4]

input->output	9: b->b

 0.0080  0.0065  0.8240  0.1615
[torch.FloatTensor of size 1x4]

input->output	10: b->b

 0.0265 