# Packages

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import unicodedata
import re
import numpy as np
import os
import io
import time

In [None]:
# Show which TensorFlow release this notebook is running against.
print(tf.__version__)

# Constants

In [None]:
# When True, a second "diacritics" input stream is loaded, tokenised and fed to
# the two-input Encoder variant; when False only the plain Arabic stream is used.
USE_DIACS = False
# Number of examples per training batch (also the encoder's initial-state batch size).
BATCH_SIZE = 256
# Number of passes the training loop makes over the dataset.
EPOCHS = 10
# Width of the embedding layers in the Encoder and Decoder.
EMBEDDINGS_DIM = 100
# Number of GRU units in the Encoder and Decoder.
UNITS = 256

# Data Preparation

In [None]:
def remove_diacritics(text):
    """Return *text* with every Arabic diacritic mark in the list below deleted."""
    marks = ['َ', 'ً', 'ُ', 'ٌ', 'ِ', 'ٍ', 'ّ', 'ْ']
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = {ord(mark): None for mark in marks}
    return text.translate(deletion_table)

In [None]:
def extract_diacritics(text):
    """Return only the diacritic marks found in *text*, in their original order.

    When *text* contains none of the marks, the placeholder ``'<none>'`` is
    returned instead of an empty string.
    """
    marks = {'َ', 'ً', 'ُ', 'ٌ', 'ِ', 'ٍ', 'ّ', 'ْ'}
    found = ''.join(symbol for symbol in text if symbol in marks)
    return found if found else '<none>'

In [None]:
def create_dataset():
    """Load the BPE training corpora and wrap every line in sentence markers.

    Reads ``data_dir/ar.bpe.train`` and ``data_dir/en.bpe.train``. When the
    module-level ``USE_DIACS`` flag is true, ``data_dir/ar-diac.bpe.train`` is
    read as well and each whitespace-separated token of every line is reduced
    to its diacritics with ``extract_diacritics``.

    Returns:
        ``(ar_lines, en_lines)``, or ``(ar_lines, ar_diac_lines, en_lines)``
        when ``USE_DIACS`` is true. Each element is a list of strings of the
        form ``'<start> ... <end>'``.
    """
    def read_lines(path):
        # The context manager closes the handle (the previous bare open() never
        # did). The encoding is pinned so Arabic text decodes the same way on
        # every platform -- assumes the corpora are stored as UTF-8.
        with open(path, encoding='utf-8') as corpus:
            return corpus.read().strip().split('\n')

    def wrap(line):
        return '<start> ' + line + ' <end>'

    ar_lines = [wrap(line.strip()) for line in read_lines('data_dir/ar.bpe.train')]
    en_lines = [wrap(line.strip()) for line in read_lines('data_dir/en.bpe.train')]

    if USE_DIACS:
        # One diacritics token per input token, so the stream keeps one entry
        # for each token of the diacritised line.
        ar_diac_lines = [
            wrap(' '.join(extract_diacritics(token) for token in line.split()))
            for line in read_lines('data_dir/ar-diac.bpe.train')
        ]
        return ar_lines, ar_diac_lines, en_lines

    return ar_lines, en_lines

In [None]:
# Keep the raw, marker-wrapped corpora around for inspection.
if not USE_DIACS:
    ar, en = create_dataset()
else:
    ar, ar_diac, en = create_dataset()

In [None]:
def max_length(tensor):
    """Return the length of the longest row in *tensor*."""
    return max(map(len, tensor))

In [None]:
def tokenize(lang):
    """Fit a Keras tokenizer on *lang* and return its padded id matrix.

    Args:
        lang: list of sentences (strings) to fit on and encode.

    Returns:
        ``(tensor, tokenizer)`` where ``tensor`` is the result of encoding
        *lang* with the fitted tokenizer and post-padding the sequences, and
        ``tokenizer`` is the fitted ``Tokenizer``.
    """
    # filters='' disables the tokenizer's default character filtering.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(lang)

    id_sequences = tokenizer.texts_to_sequences(lang)
    padded = tf.keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')
    return padded, tokenizer

In [None]:
def load_dataset():
    """Load the corpora via ``create_dataset`` and tokenise each of them.

    Returns:
        All padded tensors followed by all tokenizers, in the order the
        corpora come back from ``create_dataset``:
        ``(ar_tensor, en_tensor, ar_tokenizer, en_tokenizer)``, or with
        ``USE_DIACS`` enabled
        ``(ar_tensor, ar_diac_tensor, en_tensor,
        ar_tokenizer, ar_diac_tokenizer, en_tokenizer)``.
    """
    # create_dataset already yields two or three corpora depending on
    # USE_DIACS, so every corpus gets the same treatment here.
    encoded = [tokenize(corpus) for corpus in create_dataset()]
    tensors = tuple(tensor for tensor, _ in encoded)
    tokenizers = tuple(tokenizer for _, tokenizer in encoded)
    return tensors + tokenizers

In [None]:
# Tokenised tensors plus their tokenizers; note that ``*_lang`` names hold the
# fitted tokenizers, not raw text.
if not USE_DIACS:
    ar_tensor, en_tensor, ar_lang, en_lang = load_dataset()
else:
    ar_tensor, ar_diac_tensor, en_tensor, ar_lang, ar_diac_lang, en_lang = load_dataset()
    max_length_ar_diac = max_length(ar_diac_tensor)

max_length_ar = max_length(ar_tensor)
max_length_en = max_length(en_tensor)

if USE_DIACS:
    print(max_length_ar, max_length_ar_diac, max_length_en)
else:
    print(max_length_ar, max_length_en)

In [None]:
def convert(lang, tensor):
    """Print an ``id ----> word`` line for every non-zero id in *tensor*.

    Args:
        lang: object exposing an ``index_word`` mapping from id to word.
        tensor: iterable of integer token ids; zeros are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            continue
        print("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [None]:
# Show the id -> word mapping of the first training example of every stream.
_samples = [
    ("AR Language; index to word mapping", ar_lang, ar_tensor[0]),
    ("EN Language; index to word mapping", en_lang, en_tensor[0]),
]
if USE_DIACS:
    _samples.append(("AR DIAC Language; index to word mapping", ar_diac_lang, ar_diac_tensor[0]))

for _position, (_title, _tokenizer, _row) in enumerate(_samples):
    if _position:
        # Blank separator line between consecutive listings.
        print()
    print(_title)
    convert(_tokenizer, _row)

In [None]:
# Shuffle over the whole corpus and cut it into full batches only.
BUFFER_SIZE = len(ar_tensor)
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
# +1 because id 0 is the padding value and is not part of word_index.
vocab_ar_size = 1 + len(ar_lang.word_index)
vocab_en_size = 1 + len(en_lang.word_index)

if not USE_DIACS:
    dataset = tf.data.Dataset.from_tensor_slices((ar_tensor, en_tensor))
else:
    vocab_ar_diac_size = 1 + len(ar_diac_lang.word_index)
    dataset = tf.data.Dataset.from_tensor_slices((ar_tensor, ar_diac_tensor, en_tensor))

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Pull a single batch to sanity-check the shapes coming out of the pipeline.
if not USE_DIACS:
    example_ar_batch, example_en_batch = next(iter(dataset))
    print(example_ar_batch.shape, example_en_batch.shape)
else:
    example_ar_batch, example_ar_diac_batch, example_en_batch = next(iter(dataset))
    print(example_ar_batch.shape, example_ar_diac_batch.shape, example_en_batch.shape)

# The Model

In [None]:
# Two Encoder variants share one name; which one is defined depends on
# USE_DIACS. They differ in constructor and call signatures, so every call
# site branches on the same flag.
if USE_DIACS:
    class Encoder(tf.keras.Model):
        """GRU encoder over two parallel input streams: Arabic tokens and diacritics tokens."""

        def __init__(self, ar_vocab_size, ar_diac_vocab_size, embedding_dim, units, batch_size):
            """Create one embedding per stream and a single GRU.

            Args:
                ar_vocab_size: size of the Arabic token embedding table.
                ar_diac_vocab_size: size of the diacritics embedding table.
                embedding_dim: width of each embedding.
                units: number of GRU units.
                batch_size: batch size used by ``initialize_hidden_state``.
            """
            super(Encoder, self).__init__()
            self.batch_size = batch_size
            self.units = units
            self.ar_embedding = tf.keras.layers.Embedding(ar_vocab_size,
                                                          embedding_dim,
                                                          embeddings_initializer='glorot_uniform')
            self.ar_diac_embedding = tf.keras.layers.Embedding(ar_diac_vocab_size,
                                                               embedding_dim,
                                                               embeddings_initializer='glorot_uniform')
            self.gru = tf.keras.layers.GRU(self.units,
                                           return_sequences=True,
                                           return_state=True,
                                           recurrent_initializer='glorot_uniform')

        def call(self, ar, ar_diac, hidden):
            """Embed both streams, concatenate the embeddings and run the GRU.

            Returns:
                ``(output, state)``: the per-step GRU outputs and the final state.
            """
            ar = self.ar_embedding(ar)
            ar_diac = self.ar_diac_embedding(ar_diac)
            # NOTE(review): the concatenation presumably requires both streams to
            # have the same sequence length -- confirm against the padded tensors.
            output, state = self.gru(tf.keras.layers.concatenate([ar, ar_diac]), initial_state=hidden)
            return output, state

        def initialize_hidden_state(self):
            """Return an all-zero initial state of shape ``(batch_size, units)``."""
            return tf.zeros((self.batch_size, self.units))
else:
    class Encoder(tf.keras.Model):
        """GRU encoder over the Arabic token stream only."""

        def __init__(self, ar_vocab_size, embedding_dim, units, batch_size):
            """Create the token embedding and the GRU.

            Args:
                ar_vocab_size: size of the Arabic token embedding table.
                embedding_dim: width of the embedding.
                units: number of GRU units.
                batch_size: batch size used by ``initialize_hidden_state``.
            """
            super(Encoder, self).__init__()
            self.batch_size = batch_size
            self.units = units
            self.ar_embedding = tf.keras.layers.Embedding(ar_vocab_size,
                                                          embedding_dim,
                                                          embeddings_initializer='glorot_uniform')
            self.gru = tf.keras.layers.GRU(self.units,
                                           return_sequences=True,
                                           return_state=True,
                                           recurrent_initializer='glorot_uniform')

        def call(self, ar, hidden):
            """Embed the tokens and run the GRU.

            Returns:
                ``(output, state)``: the per-step GRU outputs and the final state.
            """
            ar = self.ar_embedding(ar)
            output, state = self.gru(ar, initial_state=hidden)
            return output, state

        def initialize_hidden_state(self):
            """Return an all-zero initial state of shape ``(batch_size, units)``."""
            return tf.zeros((self.batch_size, self.units))

In [None]:
# The constructor signature differs between the two Encoder variants.
if not USE_DIACS:
    encoder = Encoder(vocab_ar_size, EMBEDDINGS_DIM, UNITS, BATCH_SIZE)
else:
    encoder = Encoder(vocab_ar_size, vocab_ar_diac_size, EMBEDDINGS_DIM, UNITS, BATCH_SIZE)

In [None]:
# Run the example batch through the encoder once and report the result shapes.
sample_hidden = encoder.initialize_hidden_state()
if not USE_DIACS:
    sample_output, sample_hidden = encoder(example_ar_batch, sample_hidden)
else:
    sample_output, sample_hidden = encoder(example_ar_batch, example_ar_diac_batch, sample_hidden)

print('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

In [None]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention: scores are ``V(tanh(W1(values) + W2(query)))``."""

    def __init__(self, units):
        """Create the two projection layers (``units`` wide) and the scoring layer."""
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Attend over *values* using *query*.

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            *values* over axis 1, and the softmax weights over that axis.
        """
        projected_values = self.W1(values)
        # Insert a length-1 axis at position 1 so the query broadcasts against
        # every position of ``values`` in the sum below.
        projected_query = self.W2(tf.expand_dims(query, 1))
        score = self.V(tf.nn.tanh(projected_values + projected_query))

        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

In [None]:
# Smoke-test the attention layer on the encoder's sample output. The width of
# 10 is only used for this throwaway instance; the Decoder builds its own.
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

In [None]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder that attends over the encoder output."""

    def __init__(self, en_vocab_size, embedding_dim, units, batch_size):
        """Build the embedding, GRU, output projection and attention layers.

        Args:
            en_vocab_size: size of the target vocabulary; used for both the
                embedding table and the width of the output logits.
            embedding_dim: width of the target-token embedding.
            units: number of GRU units (also the attention width).
            batch_size: stored on the instance; not otherwise used here.
        """
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.units = units
        self.embedding = tf.keras.layers.Embedding(en_vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        # Was Dense(vocab_size): that name does not exist in this scope and
        # raised NameError on construction. The projection must be as wide as
        # the target vocabulary.
        self.fc = tf.keras.layers.Dense(en_vocab_size)
        self.attention = BahdanauAttention(self.units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        Args:
            x: ids of the previous target token, one column per example.
            hidden: decoder state used as the attention query.
            enc_output: per-step encoder outputs to attend over.

        Returns:
            ``(logits, state, attention_weights)``; ``logits`` has one row per
            example and ``en_vocab_size`` columns.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        # Prepend the context vector to the token embedding along the feature axis.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # NOTE(review): the GRU is called without initial_state, so ``hidden``
        # only reaches it through the context vector -- confirm this is intended.
        output, state = self.gru(x)
        # Flatten the time axis into the batch axis before the projection.
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights

In [None]:
decoder = Decoder(vocab_en_size, EMBEDDINGS_DIM, UNITS, BATCH_SIZE)
# The dummy input must have the same batch dimension as sample_hidden and
# sample_output, which were produced from a BATCH_SIZE batch. The previous
# hard-coded 64 made the concat inside Decoder.call fail on mismatched shapes.
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)
print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

In [None]:
# Adam with its default hyper-parameters.
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so loss_function can mask
# padded positions before averaging; from_logits=True because the decoder's
# final Dense layer has no activation.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy of *pred* against *real*, with padded targets zeroed out.

    Positions where ``real == 0`` contribute a loss of zero; the mean is then
    taken over all positions, the zeroed ones included.
    """
    per_position = loss_object(real, pred)
    is_padding = tf.math.equal(real, 0)
    keep = tf.cast(tf.math.logical_not(is_padding), dtype=per_position.dtype)
    return tf.reduce_mean(per_position * keep)

In [None]:
# Checkpoints are written under ./training_checkpoints with the file prefix "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer together with both models so all three are saved and
# restored as a unit.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# Training

In [None]:
@tf.function
def train_step(sequences, enc_hidden):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        sequences: one batch from ``dataset``: ``(ar, en)``, or
            ``(ar, ar_diac, en)`` when ``USE_DIACS`` is true.
        enc_hidden: initial encoder state.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0

    with tf.GradientTape() as tape:
        # The encoder call signature depends on USE_DIACS; the target batch is
        # always the last element of ``sequences``.
        if USE_DIACS:
            enc_output, enc_hidden = encoder(sequences[0], sequences[1], enc_hidden)
            en = sequences[2]
        else:
            enc_output, enc_hidden = encoder(sequences[0], enc_hidden)
            en = sequences[1]

        # The decoder starts from the encoder's final state.
        dec_hidden = enc_hidden

        # First decoder input: a column of '<start>' ids, one per example.
        dec_input = tf.expand_dims([en_lang.word_index['<start>']] * BATCH_SIZE, 1)

        # Teacher forcing: at every step the ground-truth token at position t
        # is both the prediction target and the next decoder input.
        for t in range(1, en.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(en[:, t], predictions)

            dec_input = tf.expand_dims(en[:, t], 1)

    # Reported loss is normalised by the target length; gradients below are
    # taken with respect to the un-normalised sum.
    batch_loss = (loss / int(en.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
# Main training loop: EPOCHS passes over the dataset, steps_per_epoch batches each.
for epoch in range(EPOCHS):
    start = time.time()

    # Every batch in the epoch starts from the same all-zero encoder state.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, sequences) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(sequences, enc_hidden)
        total_loss += batch_loss

        # Progress report every 100 batches.
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))

    # Save a checkpoint after every 5th epoch.
    if (epoch + 1) % 5 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Evaluation

In [None]:
def _encode_sentence(sentence, tokenizer, maxlen):
    """Map a space-separated sentence to a post-padded ``(1, maxlen)`` id tensor."""
    token_ids = [tokenizer.word_index[token] for token in sentence.split(' ')]
    padded = tf.keras.preprocessing.sequence.pad_sequences([token_ids],
                                                           maxlen=maxlen,
                                                           padding='post')
    return tf.convert_to_tensor(padded)


def evaluate(sequences):
    """Greedily decode a translation for one input example.

    Args:
        sequences: list of space-separated token strings without sentence
            markers: ``[ar]``, or ``[ar, ar_diac]`` when ``USE_DIACS`` is
            true. The list passed in is not modified.

    Returns:
        ``(result, sequences, attention_plot)`` where ``result`` is the
        predicted words, each followed by a single space; ``sequences`` is a
        new list holding the inputs wrapped in ``'<start> '``/``' <end>'``;
        and ``attention_plot`` is a ``(max_length_en, max_length_ar)`` array
        whose row ``t`` holds the attention weights of decoding step ``t``.

    Raises:
        KeyError: if an input token is missing from the tokenizer's
            ``word_index``, or a predicted id is missing from
            ``en_lang.index_word``.
    """
    attention_plot = np.zeros((max_length_en, max_length_ar))

    # Build a fresh list rather than writing the markers back into the
    # caller's list, which the previous in-place loop did.
    sequences = ['<start> ' + sequence + ' <end>' for sequence in sequences]

    ar = _encode_sentence(sequences[0], ar_lang, max_length_ar)

    hidden = [tf.zeros((1, UNITS))]
    if USE_DIACS:
        ar_diac = _encode_sentence(sequences[1], ar_diac_lang, max_length_ar_diac)
        enc_out, enc_hidden = encoder(ar, ar_diac, hidden)
    else:
        enc_out, enc_hidden = encoder(ar, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([en_lang.word_index['<start>']], 0)

    result = ''
    for t in range(max_length_en):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # Flatten the weights so they fit one row of the plot matrix.
        attention_plot[t] = tf.reshape(attention_weights, (-1, )).numpy()

        # Greedy choice: highest-scoring vocabulary entry.
        predicted_id = tf.argmax(predictions[0]).numpy()
        predicted_word = en_lang.index_word[predicted_id]
        result += predicted_word + ' '

        if predicted_word == '<end>':
            break

        # The prediction becomes the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sequences, attention_plot

In [None]:
def plot_attention(attention, sentence, predicted_sentence):
    """Show an attention matrix as a heat map with token labels on both axes.

    Args:
        attention: 2-D array; rows are labelled with *predicted_sentence*,
            columns with *sentence*.
        sentence: source tokens, one per column of *attention*.
        predicted_sentence: predicted tokens, one per row of *attention*.
    """
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 14}

    # Pin exactly one tick per column and per row before labelling. With only
    # set_*ticklabels the default locator decides where ticks go, so on longer
    # sentences it skips positions and the labels no longer line up with
    # their cells.
    ax.set_xticks(list(range(len(sentence))))
    ax.set_yticks(list(range(len(predicted_sentence))))
    ax.set_xticklabels(list(sentence), fontdict=fontdict, rotation=90)
    ax.set_yticklabels(list(predicted_sentence), fontdict=fontdict)

    plt.show()

In [None]:
def translate(sequences, plot_att=False):
    """Translate one example, print the prediction and optionally plot attention.

    Args:
        sequences: input passed straight to ``evaluate``.
        plot_att: when true, also draw the attention heat map.

    Returns:
        The predicted translation string produced by ``evaluate``.
    """
    result, sequences, attention_plot = evaluate(sequences)
    print('Predicted translation: {}'.format(result))

    if not plot_att:
        return result

    predicted_tokens = result.split(' ')
    source_tokens = sequences[0].split(' ')
    # Crop the plot matrix to the tokens actually present.
    cropped = attention_plot[:len(predicted_tokens), :len(source_tokens)]
    plot_attention(cropped, source_tokens, predicted_tokens)

    return result

In [None]:
# Load the most recent checkpoint found in checkpoint_dir into the optimizer,
# encoder and decoder tracked by ``checkpoint``.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
# Translate every line of the test file and write the predictions next to it.
test_file = 'data_dir/%s.bpe.test'
if USE_DIACS:
    test_file = test_file % 'ar-diac'
else:
    test_file = test_file % 'ar'

# Explicit encoding so Arabic text is read the same way on every platform
# (assumes the test file is stored as UTF-8).
with open(test_file, 'r', encoding='utf-8') as file:
    test_lines = file.readlines()

result = list()
for line in tqdm(test_lines):
    line = line.strip()

    if USE_DIACS:
        sequences = [
            remove_diacritics(line),
            # One diacritics token per input token, space-separated, exactly
            # as create_dataset builds the training stream. Extracting from
            # the whole line at once produced a single fused token that does
            # not match the training vocabulary or the token count of the
            # Arabic stream.
            ' '.join(extract_diacritics(token) for token in line.split()),
        ]
    else:
        sequences = [line]

    result.append(translate(sequences))

with open(test_file + '.predictions', 'w', encoding='utf-8') as file:
    file.write('\n'.join(result))