# `Read the data`

In [5]:
import numpy as np
import pandas as pd

# Location of the tab-separated parallel corpus (English, Arabic, and a third "comments" column).
data_path = '/content/ara.txt'

# Read the file, discard the "comments" column and keep at most the first 9000 rows.
data = (
    pd.read_table(data_path, names=['source', 'target', 'comments'])
    .drop('comments', axis=1)
    .iloc[:9000]
)
print(f"Shape of the data: {data.shape}")
data.head()

Shape of the data: (7866, 2)


Unnamed: 0,source,target
0,Hi.,مرحبًا.
1,Run!,اركض!
2,Duck!,اخفض رأسك!
3,Duck!,اخفضي رأسك!
4,Duck!,اخفضوا رؤوسكم!


# `Dataset Cleaning`

In [6]:
import re
import string
from string import digits

In [7]:
# Shared lookup structures for the cleaning passes below.
special_characters = set(string.punctuation)   # ASCII punctuation only
num_digits = str.maketrans('', '', digits)     # translation table that deletes ASCII digits


def _scrub(sentence):
    """Drop apostrophes, ASCII punctuation and ASCII digits, then tidy the spacing."""
    sentence = re.sub("'", '', sentence)
    sentence = ''.join(char for char in sentence if char not in special_characters)
    sentence = sentence.translate(num_digits)
    # Trim the ends first, then squeeze runs of spaces into one.
    return re.sub(" +", " ", sentence.strip())


# English side: lowercase (only the source column is lowercased), scrub, and
# finally discard every character that is not an ASCII letter, a space, '-' or '9'.
# NOTE(review): the "-9" in the class looks like a typo for "0-9"; it is kept
# as written because digits and '-' have already been removed by _scrub.
data.source = data.source.map(
    lambda x: re.sub("[^-9A-Za-z ]", "", _scrub(x.lower()))
)

# Arabic side: same scrub, without lowercasing or the final ASCII-only filter.
# NOTE(review): apostrophes are gone from both columns after this cell, so any
# later step that matches on "'" (e.g. contraction expansion) will find none.
data.target = data.target.map(_scrub)

In [8]:
def clean_text_english(text):
    '''Lowercase an English sentence, expand common contractions and strip punctuation.

    The rules are applied in order, so specific forms ("won't", "can't") are
    handled before the generic "n't" rule. All contraction rules match on the
    apostrophe, so they only have an effect on text that still contains it.

    Args:
        text: the sentence to normalise (str).

    Returns:
        The normalised sentence (str).
    '''
    # (pattern, replacement) pairs, applied top to bottom.
    contraction_rules = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"it's", "it is"),
        (r"that's", "that is"),
        (r"what's", "what is"),   # was wrongly expanded to "that is"
        (r"where's", "where is"),
        (r"how's", "how is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),            # dropped-g forms such as "goin'"
        (r"'bout", "about"),
        (r"'til", "until"),
    )

    text = text.lower()
    for pattern, replacement in contraction_rules:
        text = re.sub(pattern, replacement, text)

    # Remove the remaining punctuation and symbols.
    return re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)

# Expand contractions and strip punctuation on the English column.
data.source = data.source.apply(clean_text_english)
# Remove any symbols that are still left. The pattern is a raw string because
# "\/" is not a valid Python string escape (it triggers a SyntaxWarning on
# recent Python versions); the regex itself is unchanged.
data.source = data.source.apply(lambda x: re.sub(r"[.?#@%^&*()@!;:'\/!*]", "", x))

In [9]:
# Delete unwanted characters from the Arabic column, one character class per pass:
# digit characters, Arabic punctuation / tatweel, then ASCII lowercase letters.
for _pattern in (
    "[٠١٢٣٤٥٦٧٨٩۱۹۰]",
    "[ـ،؛؟٫٬٠]",
    "[abcdefghijklmnopqrstuvwxyz]",
):
    data.target = data.target.apply(lambda x, p=_pattern: re.sub(p, "", x))

# adapted from https://github.com/bakrianoo/aravec
# function to clean and normalize text
def clean_text(text):
    """Normalise a sentence: strip tashkeel, shorten elongations, apply a replacement table.

    Steps, in order:
      1. remove the marks in U+0617-U+061A and U+064B-U+0652;
      2. shrink any run of a repeated character to exactly two;
      3. reduce three specific doubled letters to a single letter;
      4. apply each (search, replace) pair below, in list order;
      5. strip leading and trailing whitespace.
    """
    search = ["أ", "إ", "آ", "ة", "_", "-", "/", ".", "،", " و ", " يا ", '"', "ـ", "'", "ى", "\\", '\n', '\t', '"', '?', '؟', '!']
    replace = ["ا", "ا", "ا", "ه", " ", " ", "", "", "", " و", " يا", "", "", "", "ي", "", ' ',  ' ', ' ', ' ? ', ' ؟ ', ' ! ']

    # 1. tashkeel removal
    text = re.sub(re.compile(r'[\u0617-\u061A\u064B-\u0652]'), "", text)
    # 2. elongation: "aaaa" -> "aa"
    text = re.sub(re.compile(r'(.)\1+'), r"\1\1", text)
    # 3. doubled letters that should be single
    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        text = text.replace(doubled, single)

    # 4. table-driven replacements (the two lists are parallel)
    for old, new in zip(search, replace):
        text = text.replace(old, new)

    return text.strip()

# Normalise every Arabic sentence with clean_text.
data.target = data.target.map(clean_text)

In [10]:
# Characters deleted by remove_diacritics, written as explicit code points so the
# pattern does not depend on invisible combining marks surviving copy/paste or
# editor normalisation. Compiled once instead of on every call.
_ARABIC_DIACRITICS = re.compile(
    "["
    "\u0651"  # Shadda (tashdid)
    "\u064E"  # Fatha
    "\u064B"  # Tanwin fath
    "\u064F"  # Damma
    "\u064C"  # Tanwin damm
    "\u0650"  # Kasra
    "\u064D"  # Tanwin kasr
    "\u0652"  # Sukun
    "\u0640"  # Tatweel / kashida
    "]"
)


def remove_diacritics(text):
    """Return ``str(text)`` with Arabic diacritics and tatweel removed.

    Args:
        text: value to clean; it is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        The cleaned string.
    """
    return _ARABIC_DIACRITICS.sub('', str(text))

# Strip diacritics and tatweel from every Arabic sentence.
data.target = data.target.map(remove_diacritics)

# `Design Encoder & Decoder, Preprocessing & Tokenization`

In [11]:
# Encoder Input  :: English Sentences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Cleaned English sentences, one string per row.
eng_lines = data.source.tolist()

# Word-level vocabulary; index 0 is reserved for padding.
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(eng_lines)
tokenized_eng_lines = tokenizer_eng.texts_to_sequences(eng_lines)

# Longest tokenised sentence defines the padded width.
length_list = [len(token_seq) for token_seq in tokenized_eng_lines]
max_eng_input_length = max(length_list)

padded_eng_lines = pad_sequences(tokenized_eng_lines, maxlen=max_eng_input_length, padding='post')

# Token ids must stay exact integers. float16 represents integers exactly only
# up to 2048, so with a larger vocabulary the higher ids would be rounded to a
# different word; use an integer dtype instead.
encoder_input_data = np.array(padded_eng_lines, dtype='int32')

# +1 accounts for the padding index 0.
eng_vocab_size = len(tokenizer_eng.word_index) + 1

print('English sentence max length:', max_eng_input_length)
print('Encoder input shape:', encoder_input_data.shape)
print('English vocab size:', eng_vocab_size)

English sentence max length: 9
Encoder input shape: (7866, 9)
English vocab size: 2845


In [12]:
# Decoder Input  :: Arabic Sentences

# Wrap every Arabic sentence in start/end markers. The Tokenizer's default
# filters drop '<' and '>' and lowercase the text, so the markers end up in the
# vocabulary as the words 'start' and 'end'.
arb_lines = ['<START> ' + line + ' <END>' for line in data.target]

# Word-level vocabulary; index 0 is reserved for padding.
tokenizer_arb = Tokenizer()
tokenizer_arb.fit_on_texts(arb_lines)
tokenized_arb_lines = tokenizer_arb.texts_to_sequences(arb_lines)

# Longest tokenised sentence (markers included) defines the padded width.
length_list = [len(token_seq) for token_seq in tokenized_arb_lines]
max_arb_input_length = max(length_list)

padded_arb_lines = pad_sequences(tokenized_arb_lines, maxlen=max_arb_input_length, padding='post')

# Token ids must stay exact integers. float16 represents integers exactly only
# up to 2048, so with a larger vocabulary the higher ids would be rounded to a
# different word; use an integer dtype instead.
decoder_input_data = np.array(padded_arb_lines, dtype='int32')

# +1 accounts for the padding index 0.
num_arb_token = len(tokenizer_arb.word_index) + 1

print('Arabic sentence max length:', max_arb_input_length)
print('Decoder input shape:', decoder_input_data.shape)
print('Arabic vocab size:', num_arb_token)

Arabic sentence max length: 11
Decoder input shape: (7866, 11)
Arabic vocab size: 6377


In [13]:
# Decoder Output  :: Arabic Sentences -Preprocessed-

from tensorflow.keras.utils import to_categorical

# Targets are the decoder inputs shifted left by one step: drop the first
# token of every sequence so position t holds the word to predict at step t.
decoder_output_data = [token[1:] for token in tokenized_arb_lines]

# Pad back to the decoder width, then one-hot encode over the Arabic vocabulary.
padded_arb_lines = pad_sequences(
    decoder_output_data,
    maxlen=max_arb_input_length,
    padding='post',
)
onehot_arb_lines = to_categorical(padded_arb_lines, num_classes=num_arb_token)

# One-hot values are only 0/1, stored here as float16.
decoder_output_data = np.array(onehot_arb_lines, dtype='float16')

print('Decoder output shape:', decoder_output_data.shape)

Decoder output shape: (7866, 11, 6377)


# `Build The model`

In [19]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.activations import softmax, relu

# Encoder: embed the English token ids and keep only the final LSTM states.
# NOTE: layers are created in the same order as before; the inference code
# addresses them by position in model.layers.
encoder_inputs = Input(shape=(None,))
encoder_embedding_layer = Embedding(eng_vocab_size, 256, mask_zero=True)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(128, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embed the Arabic token ids and run an LSTM that starts from the
# encoder's final states and emits one output per time step.
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(num_arb_token, 256, mask_zero=True)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(128, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Projection onto the Arabic vocabulary.
decoder_dense = Dense(num_arb_token, activation=softmax)
output = decoder_dense(decoder_outputs)

# Training model: (English ids, Arabic ids) -> per-step distribution over Arabic words.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

# `Train the model`

In [None]:
# Train: the decoder is fed the Arabic sequence and learns to predict the same
# sequence shifted by one step; 20% of the samples are held out for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_output_data,
    validation_split=0.2,
    epochs=200,
    batch_size=16,
)

Epoch 1/200
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 15ms/step - loss: 6.0528 - val_loss: 5.5111
Epoch 2/200
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - loss: 4.6893 - val_loss: 5.3797
Epoch 3/200
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - loss: 4.5340 - val_loss: 5.2926
Epoch 4/200
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - loss: 4.4124 - val_loss: 5.2163
Epoch 5/200
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - loss: 4.2874 - val_loss: 5.1502
Epoch 6/200
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - loss: 4.1974 - val_loss: 5.1069
Epoch 7/200
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 14ms/step - loss: 4.0927 - val_loss: 5.0784
Epoch 8/200
[1m450/450[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - loss: 3.9854 - val_loss: 5.0305
Epoch 9/200
[1m450/4

<keras.src.callbacks.history.History at 0x7bfc642406d0>

In [None]:
# Persist the trained model in the current working directory.
model.save('eng2arb.keras')

# `Inference`

In [14]:
from tensorflow.keras.models import load_model
# Load the model from the same relative path it was saved to, so the notebook
# does not depend on the working directory being "/content".
model_m = load_model("eng2arb.keras")

In [15]:
# Show the architecture of the reloaded model.
model_m.summary()

In [None]:
# List the layers in order; make_reference_models addresses them by these positions.
model_m.layers

[<InputLayer name=input_layer, built=True>,
 <InputLayer name=input_layer_1, built=True>,
 <Embedding name=embedding, built=True>,
 <Embedding name=embedding_1, built=True>,
 <LSTM name=lstm, built=True>,
 <LSTM name=lstm_1, built=True>,
 <Dense name=dense, built=True>]

In [16]:
def make_reference_models(model):
    """Split the trained seq2seq model into separate encoder and decoder models for inference.

    The layers are picked by position in ``model.layers``; the expected order is
    [input, input, encoder embedding, decoder embedding, encoder LSTM,
    decoder LSTM, dense].

    Args:
        model: the trained (or reloaded) training model with two inputs.

    Returns:
        (encoder_model, decoder_model) where
        - encoder_model maps English token ids to the LSTM states ``[h, c]``;
        - decoder_model maps ``[token ids, h, c]`` to ``[word probabilities, h, c]``.
    """
    # Imported locally so this works after a kernel restart in which only the
    # saved model was loaded and the model-building cell was not run.
    from tensorflow.keras.layers import Input
    from tensorflow.keras.models import Model

    # Encoder: reuse the trained graph up to the encoder LSTM's final states.
    encoder_inputs = model.input[0]
    _, state_h_enc, state_c_enc = model.layers[4].output  # encoder LSTM
    encoder_model_reference = Model(encoder_inputs, [state_h_enc, state_c_enc])

    # Decoder: rebuild around fresh state inputs so it can be stepped one token at a time.
    decoder_inputs = model.input[1]
    decoder_embedding_layer = model.layers[3]
    decoder_lstm = model.layers[5]
    decoder_dense = model.layers[6]

    # Take the state size from the layer itself instead of hard-coding it.
    state_size = decoder_lstm.units
    decoder_state_h = Input(shape=(state_size,))
    decoder_state_c = Input(shape=(state_size,))
    decoder_states_inputs = [decoder_state_h, decoder_state_c]

    decoder_embedding = decoder_embedding_layer(decoder_inputs)
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding, initial_state=decoder_states_inputs
    )
    decoder_outputs = decoder_dense(decoder_outputs)

    decoder_model_reference = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs, state_h, state_c],
    )

    return encoder_model_reference, decoder_model_reference

In [17]:
def str_to_tokens(sentence: str):
  """Convert a raw English sentence into a padded sequence of encoder token ids.

  The sentence is lowercased and reduced to ASCII letters and spaces, which is
  what the training sentences look like after cleaning. Words that are not in
  the English vocabulary are skipped instead of raising KeyError.

  Args:
    sentence: the English sentence typed by the user.

  Returns:
    Array of shape (1, max_eng_input_length) as produced by pad_sequences.
  """
  normalized = re.sub("[^a-z ]", "", sentence.lower())
  vocabulary = tokenizer_eng.word_index
  tokens_list = [vocabulary[word] for word in normalized.split() if word in vocabulary]

  return pad_sequences([tokens_list], maxlen=max_eng_input_length, padding='post')

# `Test`

In [None]:
enc_model, dec_model = make_reference_models(model_m)

# Encode the user's sentence into the initial decoder states.
states_values = enc_model.predict(str_to_tokens(input('Enter an English Sentence: ')))

# Decoding starts from the 'start' marker.
empty_target_seq = np.zeros((1, 1))
empty_target_seq[0, 0] = tokenizer_arb.word_index['start']
decoded_words = []

# Greedy decoding, one word per step. The loop is bounded by the number of
# steps, so it always terminates even if the model keeps predicting padding.
for _ in range(max_arb_input_length):
  dec_outputs, h, c = dec_model.predict([empty_target_seq] + states_values)
  sampled_word_index = int(np.argmax(dec_outputs[0, -1, :]))
  # Direct reverse lookup; index 0 (padding) has no word and yields None.
  sampled_word = tokenizer_arb.index_word.get(sampled_word_index)

  # Stop on the 'end' marker (not part of the translation) or on padding.
  if sampled_word is None or sampled_word == 'end':
    break
  decoded_words.append(sampled_word)

  # Feed the predicted word and the updated states into the next step.
  empty_target_seq = np.zeros((1, 1))
  empty_target_seq[0, 0] = sampled_word_index
  states_values = [h, c]

decoded_translation = ' '.join(decoded_words)
print(decoded_translation)