# Stage 1: Import Everything

In [None]:
import numpy as np
import math
import re
import time
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
import xml.etree.ElementTree as ET
import pickle

# Stage 2: Data preprocessing

In [None]:
def clean_control_characters(chunk):
    """Return *chunk* with disallowed control characters stripped out.

    Two removal passes are applied in order:

    1. ASCII control characters in the ranges 0x00-0x08, 0x0B, 0x0C and
       0x0E-0x1F. Tab (0x09), newline (0x0A) and carriage return (0x0D)
       are deliberately left in place.
    2. The U+FFFE character.

    All other characters are returned unchanged.
    """
    patterns = (
        r'[\x00-\x08\x0b\x0c\x0e-\x1f]',
        r'\ufffe',
    )
    for pattern in patterns:
        chunk = re.sub(pattern, '', chunk)
    return chunk

In [None]:
# Number of characters requested per read() call (the files are opened in
# text mode, so this is a character count rather than a byte count).
BUFFER_SIZE = 1024 * 1024  # 1MB

filePath = "Cleaned CCMatrix v1- EN to AR Dataset.tmx"

# Copy the raw TMX file to `filePath` chunk by chunk, passing every chunk
# through clean_control_characters() on the way.
start_time = time.time()
with open("CCMatrix v1- EN to AR Dataset.tmx", mode='r', encoding='utf-8') as f_src, \
     open(filePath, mode='w', encoding='utf-8') as f_dst:
    # read() returns '' at end of file; that value is the sentinel that
    # terminates the iteration.
    for chunk in iter(lambda: f_src.read(BUFFER_SIZE), ''):
        f_dst.write(clean_control_characters(chunk))
end_time = time.time()

time_taken = end_time - start_time
print(f"Time taken to process the file: {time_taken:.2f} seconds")


In [None]:
def extract_tu_elements(tu):
    """Extract the Arabic and English segment texts from one ``<tu>`` element.

    Every direct ``<tuv>`` child is inspected; its ``xml:lang`` attribute
    selects which of the two results its ``<seg>`` text is stored in. If
    several ``<tuv>`` children share a language, the last one wins.

    Args:
        tu: An element exposing the ElementTree API (``findall``, ``get``,
            ``findtext``) that represents a translation unit.

    Returns:
        A ``(ar_text, en_text)`` tuple of strings. A side is ``""`` when no
        matching ``<tuv>`` exists, or when its ``<seg>`` is missing or has
        no text.
    """
    ar_text = ""
    en_text = ""
    for tuv in tu.findall("tuv"):
        lang = tuv.get("{http://www.w3.org/XML/1998/namespace}lang")
        # Without a default, findtext() returns None when <seg> is absent,
        # which would slip past callers that only compare against "".
        seg_text = tuv.findtext("seg", default="")
        if lang == "ar":
            ar_text = seg_text
        elif lang == "en":
            en_text = seg_text
    return ar_text, en_text

In [None]:
from lxml import etree


# Parallel lists: ar_texts[i] and en_texts[i] come from the same <tu>.
ar_texts = []
en_texts = []

counter = 0  # Number of sentence pairs collected so far
limit = 5000000  # Maximum number of sentence pairs to collect (used only when flag is True)
flag = False  # True, stop at limit. False, ignore limit

start_time = time.time()  # Record start time
# Stream the file one <tu> element at a time instead of loading the whole
# tree into memory.
context = etree.iterparse(filePath, events=('end',), tag='tu')
for event, elem in context:
    ar_text, en_text = extract_tu_elements(elem)
    # Truthiness rejects both "" and None, so a <tuv> without a <seg>
    # (for which findtext() yields None) cannot put None into the corpus.
    if ar_text and en_text:
        ar_texts.append(ar_text)
        en_texts.append(en_text)
        counter += 1
    # clear the element to free up memory
    elem.clear()
    # Also drop already-processed siblings that the parent still references.
    while elem.getprevious() is not None:
        del elem.getparent()[0]
    if flag and counter >= limit:
        break
end_time = time.time()  # Record end time
print("Time taken to parse and extract: {:.2f} seconds".format(end_time - start_time))

print("Arabic:", len(ar_texts))
print("English:", len(en_texts))

In [None]:
# Build the tokenizer: num_words is set to 2 ** 16 (65536) and '<OOV>' is
# passed as the out-of-vocabulary token.
tokenizer = Tokenizer(
    num_words=2 ** 16,
    oov_token='<OOV>',
)
#tokenizer.fit_on_texts(data_clean)