In [None]:
# modified version from https://www.tensorflow.org/tutorials/text/nmt_with_attention

In [None]:
!mkdir data
https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/translation/data/ar_data.txt -O data/ar_data.txt
https://raw.githubusercontent.com/ARBML/tkseem/master/tasks/translation/data/en_data.txt -O data/en_data.txt 

In [None]:
!pip install tkseem
!pip install tnkeeh

In [1]:
import re
import nltk
import time
import numpy as np
import tkseem as tk
import tnkeeh as tn
import tensorflow as tf
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt

### Data Preprocessing

In [2]:
tn.clean_data('data/ar_data.txt','data/ar_clean_data.txt', remove_diacritics=True)
tn.clean_data('data/en_data.txt','data/en_clean_data.txt')

tn.split_parallel_data('data/ar_clean_data.txt', 'data/en_clean_data.txt', split_ratio=0.95)
train_inp_text, train_tar_text, test_inp_text, test_tar_text = tn.read_data(mode = 2)

Remove diacritics
Remove Tatweel
Saving to data/ar_clean_data.txt
Remove Tatweel
Saving to data/en_clean_data.txt
Split data
Save to data
Read data  ['ar_clean_data.txt', 'ar_data.txt', 'en_clean_data.txt', 'en_data.txt', 'test_inp_data.txt', 'test_tar_data.txt', 'train_inp_data.txt', 'train_tar_data.txt']


### Tokenization

In [None]:
ar_tokenizer = tk.SentencePieceTokenizer(special_tokens=['<s>', '</s>'])
ar_tokenizer.train('data/train_inp_data.txt')

en_tokenizer = tk.SentencePieceTokenizer(special_tokens=['<s>', '</s>'])
en_tokenizer.train('data/train_tar_data.txt')

train_inp_data = ar_tokenizer.encode_sentences(train_inp_text, boundries = ('<s>', '</s>'))
train_tar_data = en_tokenizer.encode_sentences(train_tar_text, boundries = ('<s>', '</s>'))

### Create Dataset

In [None]:
BATCH_SIZE = 64
BUFFER_SIZE = len(train_inp_data)

dataset = tf.data.Dataset.from_tensor_slices((train_inp_data, train_tar_data)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

### Encoder, Decoder

In [None]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights



def get_loss_object():
    return  tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 1))
    loss_ = get_loss_object()(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

Initialize models

In [None]:
units = 1024
BATCH_SIZE = 64
embedding_dim = 256
max_length_inp = train_inp_data.shape[1]
max_length_tar = train_tar_data.shape[1]
steps_per_epoch = len(train_inp_data)//BATCH_SIZE
vocab_inp_size = ar_tokenizer.vocab_size
vocab_tar_size = en_tokenizer.vocab_size

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

### Training Procedure

In [None]:
def get_train_step():
    @tf.function
    def train_step(inp, targ, enc_hidden, encoder, decoder, optimizer, en_tokenizer, BATCH_SIZE = 64, ):
        loss = 0

        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, enc_hidden)

            dec_hidden = enc_hidden

            dec_input = tf.expand_dims([en_tokenizer.token_to_id('<s>')] * BATCH_SIZE, 1)

            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

                loss += loss_function(targ[:, t], predictions)

                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = (loss / int(targ.shape[1]))

        variables = encoder.trainable_variables + decoder.trainable_variables

        gradients = tape.gradient(loss, variables)

        optimizer.apply_gradients(zip(gradients, variables))

        return batch_loss
    return train_step

def train(epochs = 10, verbose = 0 ):
    optimizer = tf.keras.optimizers.Adam()

    for epoch in range(epochs):
        start = time.time()

        enc_hidden = encoder.initialize_hidden_state()
        total_loss = 0

        for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
            batch_loss = get_train_step()(inp, targ, enc_hidden, encoder, decoder, optimizer, en_tokenizer)
            total_loss += batch_loss

            if batch % 100 == 0 and verbose:
                print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                           batch,
                                                           batch_loss.numpy()))

        if verbose:
            print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                              total_loss / steps_per_epoch))
            print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Start training

In [None]:
train(epochs = 10, verbose = 1)

Training SentencePiece ...
Training SentencePiece ...
Epoch 1 Batch 0 Loss 8.9047
Epoch 1 Batch 100 Loss 1.5026
Epoch 1 Batch 200 Loss 1.4206
Epoch 1 Batch 300 Loss 1.3276
Epoch 1 Batch 400 Loss 1.2741
Epoch 1 Batch 500 Loss 1.2128
Epoch 1 Batch 600 Loss 1.2709
Epoch 1 Batch 700 Loss 1.1027
Epoch 1 Batch 800 Loss 1.0506
Epoch 1 Batch 900 Loss 1.0894
Epoch 1 Batch 1000 Loss 1.1731
Epoch 1 Batch 1100 Loss 1.0621
Epoch 1 Batch 1200 Loss 0.9565
Epoch 1 Batch 1300 Loss 0.9814
Epoch 1 Batch 1400 Loss 1.1400
Epoch 1 Loss 1.1646
Time taken for 1 epoch 23225.82354784012 sec

Epoch 2 Batch 0 Loss 1.0153
Epoch 2 Batch 100 Loss 0.9624
Epoch 2 Batch 200 Loss 0.8185


### Test

In [4]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_tar, max_length_inp))

    inputs = ar_tokenizer.encode_sentences([sentence], boundries = ('<s>', '</s>'), 
                                  out_length = max_length_inp)
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_tokenizer.token_to_id('<s>')], 0)

    for t in range(max_length_tar):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        result += en_tokenizer.id_to_token(predicted_id) + ' '

        if en_tokenizer.id_to_token(predicted_id) == '</s>':
            return result, sentence

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence

def translate(sentences, translations, verbose = 1):
    inputs = sentences
    outputs = []
    
    for i, sentence in enumerate(sentences):
        result, sentence = evaluate(sentence)
        result = ar_tokenizer.detokenize(result)
        result = result.replace('<s>', '').replace('</s>', '')
        result = re.sub(' +', ' ', result)
        outputs.append(result)
        if verbose:
            print('inpt: %s' % (sentence))
            print('pred: {}'.format(result))
            print('true: {}'.format(translations[i]))

In [None]:
translate(test_inp_text[:50], test_tar_text[:50], verbose = 1)