# Stage 1: Import Everything

In [None]:
import numpy as np
import math
import re
import time
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
import xml.etree.ElementTree as ET
import pickle
from lxml import etree


# Stage 2: Data preprocessing

In [None]:
filePath = "Cleaned CCMatrix v1- EN to AR Dataset.tmx"

In [None]:
def clean_control_characters(chunk):
    # Remove control characters except for tab, newline, and carriage return
    chunk = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', chunk)
    chunk = re.sub(r'\ufffe', '', chunk)  # Remove the 0xFFFE character
    return chunk

In [None]:
BUFFER_SIZE_FILE = 1024 * 1024  # 1MB

with open("CCMatrix v1- EN to AR Dataset.tmx", mode='r', encoding='utf-8') as f_src, \
        open(filePath, mode='w', encoding='utf-8') as f_dst:
    while True:
        chunk = f_src.read(BUFFER_SIZE_FILE)
        if not chunk:
            break
        cleaned_chunk = clean_control_characters(chunk)
        f_dst.write(cleaned_chunk)
print("Finished")

In [None]:
def extract_tu_elements(tu):
    ar_text = ""
    en_text = ""
    for tuv in tu.findall("tuv"):
        lang = tuv.get("{http://www.w3.org/XML/1998/namespace}lang")
        seg_text = tuv.findtext("seg")
        if lang == "ar":
            ar_text = seg_text
        elif lang == "en":
            en_text = seg_text
    return ar_text, en_text

In [None]:
ar_texts = []
en_texts = []

counter = 0
limit = 50000  # Change the number of sentences to read
flag = True  # True, stop at limit. False, ignore limit

context = etree.iterparse(filePath, events=('end',), tag='tu')
for event, elem in context:
    ar_text, en_text = extract_tu_elements(elem)
    if ar_text != "" and en_text != "":
        ar_texts.append(ar_text)
        en_texts.append(en_text)
        counter += 1
    # clear the element to free up memory
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]
    if flag and counter == limit:
        break
print("Arabic:", len(ar_texts))
print("English:", len(en_texts))

## Tokenize the data

In [None]:
tokenizer_en = Tokenizer(oov_token='<OOV>')
tokenizer_en.fit_on_texts(en_texts)
word_index_en = tokenizer_en.word_index

tokenizer_ar = Tokenizer(oov_token='<OOV>')
tokenizer_ar.fit_on_texts(ar_texts)
word_index_ar = tokenizer_ar.word_index
#tokenizer.fit_on_texts(data_clean)

In [20]:
VOCAB_SIZE_EN = len(word_index_en) + 2
print(VOCAB_SIZE_EN)

VOCAB_SIZE_AR = len(word_index_ar) + 2
print(VOCAB_SIZE_AR)


22428
59080


In [25]:
START_TOKEN_EN = VOCAB_SIZE_EN - 2
END_TOKEN_EN = VOCAB_SIZE_EN - 1
inputs = [[START_TOKEN_EN] + tokenizer_en.texts_to_sequences([sentence])[0] + [END_TOKEN_EN] for sentence in en_texts]

START_TOKEN_AR = VOCAB_SIZE_AR - 2
END_TOKEN_AR = VOCAB_SIZE_AR - 1

outputs = [[START_TOKEN_AR] + tokenizer_en.texts_to_sequences([sentence])[0] + [END_TOKEN_AR] for sentence in en_texts]

### Check the tokenized data

In [30]:
print(inputs[:5])
print(outputs[:5])

print(len(inputs))
print(len(outputs))

[[22426, 38, 2, 11709, 5, 2, 654, 8906, 11710, 4348, 30, 1597, 22427], [22426, 11, 108, 38, 13, 7573, 22427], [22426, 95, 5, 2, 600, 31, 663, 1230, 144, 55, 6, 103, 41, 4, 57, 4127, 2, 4594, 22427], [22426, 1388, 76, 11, 4349, 52, 155, 22427], [22426, 2, 601, 56, 3770, 36, 898, 375, 505, 13, 22427]]
[[59078, 38, 2, 11709, 5, 2, 654, 8906, 11710, 4348, 30, 1597, 59079], [59078, 11, 108, 38, 13, 7573, 59079], [59078, 95, 5, 2, 600, 31, 663, 1230, 144, 55, 6, 103, 41, 4, 57, 4127, 2, 4594, 59079], [59078, 1388, 76, 11, 4349, 52, 155, 59079], [59078, 2, 601, 56, 3770, 36, 898, 375, 505, 13, 59079]]
50000
50000


## Remove long sentences

In [33]:
MAX_LENGTH = 20
indices_to_remove = [indx for indx, sent in enumerate(inputs)
                     if len(sent) > MAX_LENGTH]
# Remove from the last, since doing it in the normal way would fuck up the length making the indices shift by one to the left, so deleting from the right is safe
for idx in reversed(indices_to_remove):
    del inputs[idx]
    del outputs[idx]

# do the same but for arabic    
indices_to_remove = [indx for indx, sent in enumerate(outputs)
                     if len(sent) > MAX_LENGTH]
for idx in reversed(indices_to_remove):
    del inputs[idx]
    del outputs[idx]

print(len(inputs))
print(len(outputs))

43099
43099


## Input/Output Creation

In [34]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)

outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs, value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

In [37]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

datasets = tf.data.Dataset.from_tensor_slices((inputs, outputs))

datasets = datasets.cache()  # Speed training, but does nothing else kek
datasets = datasets.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
datasets = datasets.prefetch(tf.data.experimental.AUTOTUNE)  # Speed training, but does nothing else kek


# Stage 3: Model Building

## Positional Encoding

In [41]:
class PositionalEncoding(layers.Layer):
    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def get_angles(self, pos, i, d_model):  # pos is (seq_ength,1) and i is (1,d_model), hence the return 
        # pos and i are arrays
        angles = 1 / np.power(10000., (2 * (i // 2)) / np.float32(d_model))
        return pos * angles  # returns (seq_length, d_model)

    def call(self, inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_length)[:, np.newaxis],
                                 np.arange(d_model)[np.newaxis, :],
                                 d_model)
        angles[:, 0::2] = np.sin(angles[:, 0::2])  # even
        angles[:, 1::2] = np.cos(angles[:, 1::2])  # odd
        pos_encoding = angles[np.newaxis, ...]
        return inputs + tf.cast(pos_encoding, tf.float32)

## Attention

In [42]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)
    if mask is not None:
        scaled_product += (mask * -1e9)
    attention = tf.matmul(tf.nn.softmax(scaled_product, axis=-1), values)

    return attention

In [43]:
class MultiHeadAttention(layers.Layer):
    def __init__(self, nb_projection):
        super(MultiHeadAttention, self).__init__()
        self.nb_projection = nb_projection

    def build(self, input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.nb_projection == 0

        self.d_proj = self.d_model // self.nb_projection

        self.query_lin = layers.Dense(units=self.d_model)

        self.key_lin = layers.Dense(units=self.d_model)

        self.value_lin = layers.Dense(units=self.d_model)

        self.final_lin = layers.Dense(units=self.d_model)

    def split_proj(self, inputs, batch_size):  # inputs: (batch_size,seq_length,d_model)
        shape = (batch_size,
                 -1,
                 self.nb_projection,
                 self.d_proj)
        splitted_inputs = tf.reshape(inputs, shape=shape)  #(batch_size, seq_length, nb_proj, d_proj)

        return tf.transpose(splitted_inputs, perm=[0, 2, 1, 3])  # (batch_size, nb_proj, seq_length, d_proj)

    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]
        queries = self.query_lin(queries)
        keys = self.key_lin(keys)
        values = self.value_lin(values)

        queries = self.split_proj(queries, batch_size)
        keys = self.split_proj(keys, batch_size)
        values = self.split_proj(values, batch_size)

        attention = scaled_dot_product_attention(queries, keys, values, mask)

        attention = tf.transpose(attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(attention, shape=(batch_size, -1, self.d_model))

        outputs_att = self.final_lin(concat_attention)

        return outputs_att

## Encoder

In [46]:
class EncoderLayer(layers.Layer):
    def __init__(self, FFN_units, nb_proj, dropout):
        super(EncoderLayer, self).__init__()

        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout = dropout

    def build(self, input_shape):
        self.d_model = input_shape[-1]
        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        self.dense_1 = layers.Dense(units=self.FFN_units, activation='relu')
        self.dense_2 = layers.Dense(units=self.d_model, activation='relu')

        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, mask, training=False):
        attention = self.multi_head_attention(inputs, inputs, inputs, mask)
        attention = self.dropout_1(attention, training=training)
        attention = self.norm_1(attention + inputs)

        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs)
        outputs = self.norm_2(outputs + attention)

        return outputs



In [48]:
class Encoder(layers.Layer):
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name='encoder'):
        super(Encoder, self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        self.enc_layers = [EncoderLayer(FFN_units, nb_proj, dropout) for _ in range(nb_layers)]

    def call(self, inputs, mask, training=False):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)

        for i in range(self.nb_layers):
            outputs = self.enc_layers[i](outputs, mask, training)

        return outputs

## Decoder

In [68]:
class DecoderLayer(layers.Layer):
    def __init__(self,
                 FFN_units,
                 nb_proj,
                 dropout):
        super(DecoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout = dropout

    def build(self, input_shape):
        self.m_model = input_shape[-1]
        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        self.dense_1 = layers.Dense(units=self.FFN_units, activation='relu')
        self.dense_2 = layers.Dense(units=self.m_model, activation='relu')
        self.dropout_3 = layers.Dropout(rate=self.dropout)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, enc_outputs, mask_1, mask_2, training=False):
        attention = self.multi_head_attention_1(inputs, inputs, inputs, mask_1)
        attention = self.dropout_1(attention, training)
        attention = self.norm_1(attention + inputs)

        attention_2 = self.multi_head_attention_2(attention,
                                                  enc_outputs,
                                                  enc_outputs,
                                                  mask_2)

        attention_2 = self.dropout_2(attention_2, training)
        attention_2 = self.norm_2(attention_2 + attention)

        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs, training)
        outputs = self.norm_3(outputs + attention_2)

        return outputs




In [69]:
class Decoder(layers.Layer):
    def __init__(self, nb_layers,
                 FFN_units,
                 nb_projc,
                 droupout,
                 vocab_size,
                 d_model,
                 name='decoder'):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.nb_layers = nb_layers

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.post_encoding = PositionalEncoding()
        self.dropout_1 = layers.Dropout(rate=droupout)

        self.decoder_layers = [DecoderLayer(FFN_units, nb_projc, droupout) for _ in range(nb_layers)]

    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.post_encoding(outputs)
        outputs = self.dropout_1(outputs, training)

        for i in range(self.nb_layers):
            outputs = self.decoder_layers[i](outputs, enc_outputs, mask_1, mask_2, training)
        return outputs

## Transformer

In [70]:
class Transformer(tf.keras.Model):
    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 name='transformer'):
        super(Transformer, self).__init__(name=name)
        self.encoder = Encoder(nb_layers, FFN_units, nb_proj, dropout, vocab_size_enc, d_model)
        self.decoder = Decoder(nb_layers, FFN_units, nb_proj, dropout, vocab_size_dec, d_model)

        self.last_linear = layers.Dense(units=vocab_size_dec)

    def create_padding_mask(self, seq):  # seq = (batch_size, seq_length)
        mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        seq_len = tf.shape(seq)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return look_ahead_mask

    def call(self, enc_inputs, dec_inputs, training):
        enc_mask = self.create_padding_mask(enc_inputs)
        dec_mask_1 = tf.maximum(self.create_padding_mask(dec_inputs),
                                self.create_look_ahead_mask(dec_inputs)
                                )
        dec_mask_2 = self.create_padding_mask(enc_inputs)
        enc_outputs = self.encoder(enc_inputs, enc_mask, training)
        dec_outputs = self.decoder(dec_inputs, enc_outputs, dec_mask_1, dec_mask_2, training)

        outputs = self.last_linear(dec_outputs)

        return outputs


# Stage 4: Training

In [71]:
tf.keras.backend.clear_session()
# Hyper-Parameters
D_MODEL = 128  # 512
NB_LAYERS = 4  # 6
FFN_UNITS = 512  # 2048
NB_PROJ = 8  # 8
DROPOUT = 0.1  # 0.1

transfomer = Transformer(vocab_size_enc=VOCAB_SIZE_EN,
                         vocab_size_dec=VOCAB_SIZE_AR,
                         d_model=D_MODEL,
                         nb_layers=NB_LAYERS,
                         FFN_units=FFN_UNITS,
                         nb_proj=NB_PROJ,
                         dropout=DROPOUT,
                         )

In [72]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")


def loss_function(target, pred):
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    loss_ = loss_object(target, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)


train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')


In [73]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)

        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)



In [79]:
checkpoint_path = "./MODEL"

ckpt = tf.train.Checkpoint(transfomer=transfomer, optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [80]:
EPOCHS = 1
for epoch in range(EPOCHS):
    print(f"Starting of epoc{epoch + 1}")
    start = time.time()
    train_loss.reset_states()
    train_accuracy.reset_states()

    for (batch, (enc_inputs, targets)) in enumerate(datasets):
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]
        with tf.GradientTape() as tape:
            prediction = transfomer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, prediction)

        gradients = tape.gradient(loss, transfomer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transfomer.trainable_variables))

        train_loss(loss)
        train_accuracy(dec_outputs_real, prediction)

        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f} Accuracy{:.4f}".format(epoch + 1, batch, train_loss.result(),
                                                                        train_accuracy.result()))
            break
    ckpt_save_path = ckpt_manager.save()
    print(f"Saving checkpoint for epoch {epoch + 1} at {ckpt_save_path}")
    print(f"Time taken for 1 epoch  {time.time() - start} seconds ")

Starting of epoc1
Epoch 1 Batch 0 Loss 4.1729 Accuracy0.1308
Saving checkpoint for epoch 1 at ./MODEL\ckpt-1
Time taken for 1 epoch  3.6235220432281494 seconds 


In [84]:
def evaluate(inp_sentence):
    inp_sentence = [START_TOKEN_EN] + tokenizer_en.texts_to_sequences([inp_sentence])[0] + [END_TOKEN_EN]
    enc_inputs = tf.expand_dims(inp_sentence, axis=0)
    outputs = tf.expand_dims([START_TOKEN_AR], axis=0)

    for _ in range(MAX_LENGTH):
        predictions = transfomer(enc_inputs, outputs, False)  # (1, Seq Length, vocab_size_ar)
        prediction = predictions[:, -1, :]

        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)
        if predicted_id == END_TOKEN_AR:
            return tf.squeeze(outputs, axis=0)
        predicted_id = tf.expand_dims(predicted_id, -1)  # Expand dimensions to make it [1,1]
        outputs = tf.concat([outputs, predicted_id], axis=-1)
    return tf.squeeze(outputs, axis=0)


In [85]:
def translate(sentence):
    outputs = evaluate(sentence).numpy()
    filtered_outputs = [i for i in outputs if i < START_TOKEN_AR]

    # Decode the filtered token IDs back to text
    decoded_text = tokenizer_ar.sequences_to_texts([filtered_outputs])[0]
    print(f"Input: {sentence}, Output: {decoded_text}")

In [90]:
translate("Why Not")

Input: Why Not, Output: لا لا
