# Machine Translation
It is a Natural Language processing task of converting sequence of text from one language to another.
It uses Encoder-Decoder and Attention mechanism for achieving the same

In [1]:
import collections
import io
import math
from mxnet import autograd, gluon, init, nd
from mxnet.contrib import text
from mxnet.gluon import data as gdata, loss as gloss, nn, rnn

PAD, BOS, EOS = '<pad>', '<bos>', '<eos>'

  from ._conv import register_converters as _register_converters


## Dataset
Specify the type of dataset and the file paths

In [2]:
dataset_num=2
file_path1 = 'fr-en-small.txt'
file_path2 = 'tiny.europarl-v7.fr-en.en'
file_path3 = 'tiny.europarl-v7.fr-en.fr'

### Data Pre-processing

In [3]:
# Given a particular sequence, we store all the words in all the tokens to subsequently construct the dictionary.
# Add PAD to the sequence, until the length becomes max_seq_len
# Store the padded sequence in all_seqs
def process_one_seq(seq_tokens, all_tokens, all_seqs, max_seq_len):
    all_tokens.extend(seq_tokens)
    seq_tokens += [EOS] + [PAD] * (max_seq_len - len(seq_tokens) - 1)
    all_seqs.append(seq_tokens)

# Use all the tokens to construct a dictionary
# Construct an NDArray by converting words in all sequences to indices
def build_data(all_tokens, all_seqs):
    vocab = text.vocab.Vocabulary(collections.Counter(all_tokens),
                                  reserved_tokens=[PAD, BOS, EOS])
    indices = [vocab.to_indices(seq) for seq in all_seqs]
    return vocab, nd.array(indices)

While reading the data, append "&lt;eos&gt;” at the end of the sentence，and if needed, make the length of each sequecnce equal to `max_seq_len` by adding the “&lt;pad&gt;” symbol.

#### Seperation of concerns
We create seperate dictionaries for French and English words. The index of French words is independent of the index of English words

In [4]:
def read_data(max_seq_len):
    # in, out are abbreviations of input and output, respectively
    in_tokens, out_tokens, in_seqs, out_seqs = [], [], [], []
    if(dataset_num==1):
        with io.open(file_path1) as f:
            lines = f.readlines()
        for line in lines:
            in_seq, out_seq = line.rstrip().split('\t')
            in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
            if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
                continue  # If a sequence is longer than the max_seq_len after adding EOS, this example will be ignored.
            process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
            process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
        in_vocab, in_data = build_data(in_tokens, in_seqs)
        out_vocab, out_data = build_data(out_tokens, out_seqs)
        return in_vocab, out_vocab, gdata.ArrayDataset(in_data, out_data)
    else:
        with io.open(file_path2) as f_en, io.open(file_path3) as f_fr:
            for english_lines, french_lines in zip(f_en, f_fr):
                in_seq = english_lines.strip()
                out_seq = french_lines.strip()
                in_seq_tokens, out_seq_tokens = in_seq.split(' '), out_seq.split(' ')
                if max(len(in_seq_tokens), len(out_seq_tokens)) > max_seq_len - 1:
                    continue  # If a sequence is longer than the max_seq_len after adding EOS, this example will be ignored.
                process_one_seq(in_seq_tokens, in_tokens, in_seqs, max_seq_len)
                process_one_seq(out_seq_tokens, out_tokens, out_seqs, max_seq_len)
            in_vocab, in_data = build_data(in_tokens, in_seqs)
            out_vocab, out_data = build_data(out_tokens, out_seqs)
        return in_vocab, out_vocab, gdata.ArrayDataset(in_data, out_data)

Set the maximum length of the sequence to 7

In [6]:
max_seq_len = 7
in_vocab, out_vocab, dataset = read_data(max_seq_len)
dataset[0]

(
 [137.  24.   5.  82.   3.   1.   1.]
 <NDArray 7 @cpu(0)>, 
 [135.   5.   6.  87.   3.   1.   1.]
 <NDArray 7 @cpu(0)>)

## Encoder-Decoder with Attention Mechanism

### Encoder

Encoder uses word embedding layer to obtain a feature index from the word index of the input language and then input it into a multi-level gated recurrent unit (GRU). 


Gluon's `rnn.GRU` instance also returns the multi-layer hidden states of the output and final time steps after forward calculation. 
Here, the output refers to the hidden state of the hidden layer of the last layer at each time step, and it does not involve output layer calculation. The attention mechanism uses these output as key items and value items.

In [7]:
class Encoder(nn.Block):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 drop_prob=0, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.GRU(num_hiddens, num_layers, dropout=drop_prob)
        print("Encoder initialized")

    def forward(self, inputs, state):
        # The input shape is (batch size, number of time steps). Change the example dimension and time step dimension of the output.
        embedding = self.embedding(inputs).swapaxes(0, 1)
        return self.rnn(embedding, state)

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)

Next, we create mini-batch sequence input with a batch size of 4 and 7 time steps. We assume the number of hidden layers of the gated recurrent unit is 2 and the number of hidden units is 16. The output shape returned by the encoder after performing forward calculation on the input is (number of time steps, batch size, number of hidden units). The shape of the multi-layer hidden state of the gated recurrent unit in the final time step is (number of hidden layers, batch size, number of hidden units). For the gated recurrent unit, the `state` list contains only one element, which is the hidden state.

In [8]:
encoder = Encoder(vocab_size=10, embed_size=8, num_hiddens=16, num_layers=2)
encoder.initialize()
output, state = encoder(nd.zeros((4, 7)), encoder.begin_state(batch_size=4))
output.shape, state[0].shape

Encoder initialized


((7, 4, 16), (2, 4, 16))

### Attention Mechanism

`flatten` option for `Dense` instance : 
When the input dimension is greater than 2, by default, the `Dense` instance will treat all dimensions other than the first dimension (example dimension) as feature dimensions that require affine transformation, and will automatically convert the input into a two-dimensional matrix with rows of behavioral examples and columns of features.

Here, we set the `flatten` option to `False` so that the fully connected layer only performs affine transformation on the last dimension of the input, therefore, only the last dimension of the output shape becomes the number of outputs of the fully connected layer, i.e. 2.

In [9]:
dense = nn.Dense(2, flatten=False)
dense.initialize()
dense(nd.zeros((3, 5, 7))).shape

(3, 5, 2)

We implement the function $a$ in the Attention Mechanism to transform the concatenated input through a multilayer perceptron with a single hidden layer. The input of the hidden layer is a one-to-one concatenation between the hidden state of the decoder and the hidden state of the encoder on all time steps, which uses tanh as the activation function. The number of outputs of the output layer is 1. Neither `Dense` instance use a bias and they set `flatten=False`. Here, the length of the vector $\boldsymbol{v}$ in the $a$ function definition is a hyper-parameter, i.e. `attention_size`.

In [10]:
def attention_model(attention_size):
    model = nn.Sequential()
    model.add(nn.Dense(attention_size, activation='tanh', use_bias=False,
                       flatten=False),
              nn.Dense(1, use_bias=False, flatten=False))
    return model

### Hardmax
Hardmax takes the input data and converts it into a corresponding one-hot encoded tensor along the rows

In [11]:
def hardmax(data):
    return data.argmax(-1).one_hot(data.shape[-1])

The inputs of the attention model include query items, key items, and value items. Assume the encoder and decoder have the same number of hidden units. The query item here is the hidden state of the decoder in the previous time step, with a shape of (batch size, number of hidden units); the key and the value items are the hidden states of the encoder at all time steps, with a shape of (number of time steps, batch size, number of hidden units). The attention model returns the context variable of the current time step, and the shape is (batch size, number of hidden units).

In [18]:
def attention_forward(model, enc_states, dec_state, is_hardmax):
    dec_states = nd.broadcast_axis(
        dec_state.expand_dims(0), axis=0, size=enc_states.shape[0])
    enc_and_dec_states = nd.concat(enc_states, dec_states, dim=2)
    e = model(enc_and_dec_states)  # Shape is (number of time steps, batch size, 1)
    if(is_hardmax):
        alpha = hardmax(e)  # Perform hardmax operationo
    else:
        alpha = nd.softmax(e, axis=0) # Perform hardmax operation on timestep dimension
    return (alpha * enc_states).sum(axis=0)  # return context variable

In the example below, the encoder has 10 time steps and a batch size of 4. Both the encoder and the decoder have 8 hidden units. The attention model returns a mini-batch of context vectors, and the length of each context vector is equal to the number of hidden units of the encoder. Therefore, the output shape is (4, 8).

In [13]:
# seq_len, batch_size, num_hiddens = 10, 4, 8
# model = attention_model(10)
# model.initialize()
# enc_states = nd.zeros((seq_len, batch_size, num_hiddens))
# dec_state = nd.zeros((batch_size, num_hiddens))
# attention_forward(model, enc_states, dec_state).shape

### Decoder with Attention Mechanism

We directly use the hidden state of the encoder in the final time step as the initial hidden state of the decoder. This requires that the encoder and decoder RNNs have the same numbers of layers and hidden units.

In forward calculation of the decoder, we first calculate and obtain the context vector of the current time step by using the attention model introduced above. Since the input of the decoder comes from the word index of the output language, we obtain the feature expression of the input through the word embedding layer, and then concatenate the context vector in the feature dimension. We calculate the output and hidden state of the current time step through the gated recurrent unit, using the concatenated results and the hidden state of the previous time step. Finally, we use the fully connected layer to transform the output into predictions for each output word, with the shape of (batch size, output dictionary size).

In [14]:
class Decoder(nn.Block):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 attention_size, drop_prob=0, is_hardmax=True, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.attention = attention_model(attention_size)
        self.rnn = rnn.GRU(num_hiddens, num_layers, dropout=drop_prob)
        self.out = nn.Dense(vocab_size, flatten=False)
        self.is_hardmax = is_hardmax
        print("Decoder initialized")
        print("Hardmax used:"+str(self.is_hardmax))

    def forward(self, cur_input, state, enc_states):
        # Use the attention mechanism to calculate the context vector.
        c = attention_forward(self.attention, enc_states, state[0][-1], self.is_hardmax)
        # The embedded input and the context vector are concatenated in the feature dimension.
        input_and_c = nd.concat(self.embedding(cur_input), c, dim=1)
        # Add a time step dimension, with 1 time step, for the concatenation of the input and the context vector.
        output, state = self.rnn(input_and_c.expand_dims(0), state)
        # Remove the time step dimension, so the output shape is (batch size, output dictionary size).
        output = self.out(output).squeeze(axis=0)
        return output, state

    def begin_state(self, enc_state):
        # Directly use the hidden state of the final time step of the encoder as the initial hidden state of the decoder.
        return enc_state

## Training

We first implement the `batch_loss` function to calculate the loss of a mini-batch. The input of the decoder in the initial time step is the special character `BOS`. After that, the input of the decoder in a given time step is the word from the example output sequence in the previous time step, that is, teacher forcing. Also, use mask variables to avoid the impact of padding on loss function calculations.

In [15]:
def batch_loss(encoder, decoder, X, Y, loss):
    batch_size = X.shape[0]
    enc_state = encoder.begin_state(batch_size=batch_size)
    enc_outputs, enc_state = encoder(X, enc_state)
    # Initialize the hidden state of the decoder.
    dec_state = decoder.begin_state(enc_state)
    # The input of decoder at the initial time step is BOS.
    dec_input = nd.array([out_vocab.token_to_idx[BOS]] * batch_size)
    # Use the mask variable to ignore the loss when the label is PAD.
    mask, num_not_pad_tokens = nd.ones(shape=(batch_size,)), 0
    l = nd.array([0])
    for y in Y.T:
        dec_output, dec_state = decoder(dec_input, dec_state, enc_outputs)
        l = l + (mask * loss(dec_output, y)).sum()
        dec_input = y  # Use teacher forcing.
        num_not_pad_tokens += mask.sum().asscalar()
        # When we encounter EOS, words after the sequence will all be PAD and the mask for the corresponding position is set to 0.
        mask = mask * (y != out_vocab.token_to_idx[EOS])
    return l / num_not_pad_tokens

In the training function, we need to update the model parameters of the encoder and the decoder at the same time.

In [16]:
def train(encoder, decoder, dataset, lr, batch_size, num_epochs):
    print("Training:")
    encoder.initialize(init.Xavier(), force_reinit=True)
    decoder.initialize(init.Xavier(), force_reinit=True)
    enc_trainer = gluon.Trainer(encoder.collect_params(), 'adam',
                                {'learning_rate': lr})
    dec_trainer = gluon.Trainer(decoder.collect_params(), 'adam',
                                {'learning_rate': lr})
    loss = gloss.SoftmaxCrossEntropyLoss()
    data_iter = gdata.DataLoader(dataset, batch_size, shuffle=True)
    for epoch in range(num_epochs):
        l_sum = 0
        for X, Y in data_iter:
            with autograd.record():
                l = batch_loss(encoder, decoder, X, Y, loss)
            l.backward()
            enc_trainer.step(1)
            dec_trainer.step(1)
            l_sum += l.asscalar()
        if (epoch + 1) % 10 == 0:
            print("epoch %d, loss %.3f" % (epoch + 1, l_sum / len(data_iter)))

Next, we create a model instance and set hyper-parameters. Then, we can train the model.

In [None]:
Hardmax

In [16]:
embed_size, num_hiddens, num_layers = 64, 64, 2
attention_size, drop_prob, lr, batch_size, num_epochs = 10, 0.5, 0.01, 2, 50
encoder = Encoder(len(in_vocab), embed_size, num_hiddens, num_layers,
                  drop_prob)
decoder = Decoder(len(out_vocab), embed_size, num_hiddens, num_layers,
                  attention_size, drop_prob)
train(encoder, decoder, dataset, lr, batch_size, num_epochs)

Encoder initialized
Decoder initialized
Training:
epoch 10, loss 3.888
epoch 20, loss 3.536
epoch 30, loss 3.369
epoch 40, loss 3.343
epoch 50, loss 3.245


In [19]:
embed_size, num_hiddens, num_layers = 64, 64, 2
attention_size, drop_prob, lr, batch_size, num_epochs = 10, 0.5, 0.01, 2, 50
encoder = Encoder(len(in_vocab), embed_size, num_hiddens, num_layers,
                  drop_prob)
decoder = Decoder(len(out_vocab), embed_size, num_hiddens, num_layers,
                  attention_size, drop_prob, False)
train(encoder, decoder, dataset, lr, batch_size, num_epochs)

Encoder initialized
Decoder initialized
Hardmax used:False
Training:
epoch 10, loss 3.431
epoch 20, loss 3.120
epoch 30, loss 3.104
epoch 40, loss 3.049
epoch 50, loss 3.009
