# Stage 1: Import Everything

In [None]:
import numpy as np
import math
import re
import time
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
import xml.etree.ElementTree as ET
import pickle
from lxml import etree


# Stage 2: Data preprocessing

In [None]:
filePath = "Cleaned CCMatrix v1- EN to AR Dataset.tmx"

In [None]:
def clean_control_characters(chunk):
    # Remove control characters except for tab, newline, and carriage return
    chunk = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', chunk)
    chunk = re.sub(r'\ufffe', '', chunk)  # Remove the 0xFFFE character
    return chunk

In [None]:
BUFFER_SIZE_FILE = 1024 * 1024  # 1MB

with open("CCMatrix v1- EN to AR Dataset.tmx", mode='r', encoding='utf-8') as f_src, \
        open(filePath, mode='w', encoding='utf-8') as f_dst:
    while True:
        chunk = f_src.read(BUFFER_SIZE_FILE)
        if not chunk:
            break
        cleaned_chunk = clean_control_characters(chunk)
        f_dst.write(cleaned_chunk)
print("Finished")

In [None]:
def extract_tu_elements(tu):
    ar_text = ""
    en_text = ""
    for tuv in tu.findall("tuv"):
        lang = tuv.get("{http://www.w3.org/XML/1998/namespace}lang")
        seg_text = tuv.findtext("seg")
        if lang == "ar":
            ar_text = seg_text
        elif lang == "en":
            en_text = seg_text
    return ar_text, en_text

In [None]:
ar_texts = []
en_texts = []

counter = 0
limit = 50000  # Change the number of sentences to read
flag = True  # True, stop at limit. False, ignore limit

context = etree.iterparse(filePath, events=('end',), tag='tu')
for event, elem in context:
    ar_text, en_text = extract_tu_elements(elem)
    if ar_text != "" and en_text != "":
        ar_texts.append(ar_text)
        en_texts.append(en_text)
        counter += 1
    # clear the element to free up memory
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]
    if flag and counter == limit:
        break
print("Arabic:", len(ar_texts))
print("English:", len(en_texts))

## Tokenize the data

In [None]:
tokenizer_en = Tokenizer(oov_token='<OOV>')
tokenizer_en.fit_on_texts(en_texts)
word_index_en = tokenizer_en.word_index

tokenizer_ar = Tokenizer(oov_token='<OOV>')
tokenizer_ar.fit_on_texts(ar_texts)
word_index_ar = tokenizer_ar.word_index
#tokenizer.fit_on_texts(data_clean)

In [20]:
VOCAB_SIZE_EN = len(word_index_en) + 2
print(VOCAB_SIZE_EN)

VOCAB_SIZE_AR = len(word_index_ar) + 2
print(VOCAB_SIZE_AR)


22428
59080


In [25]:
START_TOKEN_EN = VOCAB_SIZE_EN - 2
END_TOKEN_EN = VOCAB_SIZE_EN - 1
inputs = [[START_TOKEN_EN] + tokenizer_en.texts_to_sequences([sentence])[0] + [END_TOKEN_EN] for sentence in en_texts]

START_TOKEN_AR = VOCAB_SIZE_AR - 2
END_TOKEN_AR = VOCAB_SIZE_AR - 1

outputs = [[START_TOKEN_AR] + tokenizer_en.texts_to_sequences([sentence])[0] + [END_TOKEN_AR] for sentence in en_texts]

### Check the tokenized data

In [30]:
print(inputs[:5])
print(outputs[:5])

print(len(inputs))
print(len(outputs))

[[22426, 38, 2, 11709, 5, 2, 654, 8906, 11710, 4348, 30, 1597, 22427], [22426, 11, 108, 38, 13, 7573, 22427], [22426, 95, 5, 2, 600, 31, 663, 1230, 144, 55, 6, 103, 41, 4, 57, 4127, 2, 4594, 22427], [22426, 1388, 76, 11, 4349, 52, 155, 22427], [22426, 2, 601, 56, 3770, 36, 898, 375, 505, 13, 22427]]
[[59078, 38, 2, 11709, 5, 2, 654, 8906, 11710, 4348, 30, 1597, 59079], [59078, 11, 108, 38, 13, 7573, 59079], [59078, 95, 5, 2, 600, 31, 663, 1230, 144, 55, 6, 103, 41, 4, 57, 4127, 2, 4594, 59079], [59078, 1388, 76, 11, 4349, 52, 155, 59079], [59078, 2, 601, 56, 3770, 36, 898, 375, 505, 13, 59079]]
50000
50000


## Remove long sentences

In [33]:
MAX_LENGTH = 20
indices_to_remove = [indx for indx, sent in enumerate(inputs)
                     if len(sent) > MAX_LENGTH]
# Remove from the last, since doing it in the normal way would fuck up the length making the indices shift by one to the left, so deleting from the right is safe
for idx in reversed(indices_to_remove):
    del inputs[idx]
    del outputs[idx]

# do the same but for arabic    
indices_to_remove = [indx for indx, sent in enumerate(outputs)
                     if len(sent) > MAX_LENGTH]
for idx in reversed(indices_to_remove):
    del inputs[idx]
    del outputs[idx]

print(len(inputs))
print(len(outputs))

43099
43099


## Input/Output Creation

In [34]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)

outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs, value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

In [37]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

datasets = tf.data.Dataset.from_tensor_slices((inputs, outputs))

datasets = datasets.cache()  # Speed training, but does nothing else kek
datasets = datasets.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
datasets = datasets.prefetch(tf.data.experimental.AUTOTUNE)  # Speed training, but does nothing else kek


# Stage 3: Model Building

## Positional Encoding

In [41]:
class PositionalEncoding(layers.Layer):
    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def get_angles(self, pos, i, d_model):  # pos is (seq_ength,1) and i is (1,d_model), hence the return 
        # pos and i are arrays
        angles = 1 / np.power(10000., (2 * (i // 2)) / np.float32(d_model))
        return pos * angles  # returns (seq_length, d_model)

    def call(self, inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_length)[:, np.newaxis],
                                 np.arange(d_model)[np.newaxis, :],
                                 d_model)
        angles[:, 0::2] = np.sin(angles[:, 0::2])  # even
        angles[:, 1::2] = np.cos(angles[:, 1::2])  # odd
        pos_encoding = angles[np.newaxis, ...]
        return inputs + tf.cast(pos_encoding, tf.float32)