In [34]:
file_path = './Dataset/ara_.txt'

with open(file_path, 'r', encoding='utf-8') as f:
    data = f.read()

lines = data.strip().split('\n')
sentence_pairs = [line.split('\t') for line in lines if '\t' in line]

english_sentences, arabic_sentences = zip(*sentence_pairs)

<h3>Preprocessing data</h3>


In [35]:
import re

def preprocess_arabic(sent):
    sent = sent.strip()
    sent = re.sub(r'[^\u0600-\u06FF\s]', '', sent)
    sent = re.sub(r'\s+', ' ', sent)
    return sent

def preprocess_english(sent):
    sent = sent.lower().strip()
    sent = re.sub(r"([?.!,¿])", r" \1 ", sent)
    sent = re.sub(r'[" "]+', " ", sent)
    sent = re.sub(r"[^a-zA-Z?.!,¿ء-ي]+", " ", sent)
    sent = sent.strip()
    return sent

# Apply preprocessing
preprocessed_english = ['<start> ' + preprocess_english(s) + ' <end>' for s in english_sentences]
preprocessed_arabic = ['<start> ' + preprocess_arabic(s) + ' <end>' for s in arabic_sentences]

<h3>Tokenization and Padding</h3>


In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer_en = Tokenizer(filters='')
tokenizer_en.fit_on_texts(preprocessed_english)
en_sequence = tokenizer_en.texts_to_sequences(preprocessed_english)
en_sequence = pad_sequences(en_sequence, padding='post')

tokenizer_ar = Tokenizer(filters='')
tokenizer_ar.fit_on_texts(preprocessed_arabic)
ar_sequence = tokenizer_ar.texts_to_sequences(preprocessed_arabic)
ar_sequence = pad_sequences(ar_sequence, padding='post')


vocab_size_en = len(tokenizer_en.word_index) + 1
vocab_size_ar = len(tokenizer_ar.word_index) + 1


<h3>Preparing the model</h3>


In [39]:
import keras_nlp
import keras
from tensorflow.keras import layers
from keras_nlp.layers import TransformerEncoder, TransformerDecoder

embed_dim = 128
num_heads = 2
ff_dim = 256

encoder_inputs = layers.Input(shape=(None,), dtype="int64", name="encoder_inputs")
decoder_inputs = layers.Input(shape=(None,), dtype="int64", name="decoder_inputs")


encoder_embedding = layers.Embedding(input_dim=vocab_size_ar, output_dim=embed_dim)(encoder_inputs)
decoder_embedding = layers.Embedding(input_dim=vocab_size_en, output_dim=embed_dim)(decoder_inputs)


encoder_embedding = layers.LayerNormalization()(encoder_embedding)
decoder_embedding = layers.LayerNormalization()(decoder_embedding)


transformer_encoder = TransformerEncoder(
    intermediate_dim=ff_dim,
    num_heads=num_heads
)
encoder_outputs = transformer_encoder(encoder_embedding)


transformer_decoder = TransformerDecoder(
    intermediate_dim=ff_dim,
    num_heads=num_heads
)
decoder_outputs = transformer_decoder(
    decoder_embedding,
    encoder_outputs
)


outputs = layers.Dense(vocab_size_en, activation="softmax")(decoder_outputs)


model = keras.Model([encoder_inputs, decoder_inputs], outputs)


<h3>Prepare inputs&outputs</h3>


In [40]:
from sklearn.model_selection import train_test_split

input_train, input_test, target_train, target_test = train_test_split(
    ar_sequence, en_sequence, test_size=0.2, random_state=42)

In [41]:
# decoder data train
decoder_input_train = target_train[:, :-1]
decoder_target_train = target_train[:, 1:]

#decoder data test
decoder_input_test = target_test[:, :-1]
decoder_target_test = target_test[:, 1:]

<h3>Train the Model</h3>


In [12]:


model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


model.fit(
    [input_train, decoder_input_train],
    decoder_target_train,
    batch_size=64,
    epochs=10,
    validation_split=0.1
)

Epoch 1/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 224ms/step - accuracy: 0.9718 - loss: 0.1808 - val_accuracy: 0.9679 - val_loss: 0.1884
Epoch 2/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 232ms/step - accuracy: 0.9829 - loss: 0.1016 - val_accuracy: 0.9676 - val_loss: 0.2006
Epoch 3/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 232ms/step - accuracy: 0.9889 - loss: 0.0611 - val_accuracy: 0.9649 - val_loss: 0.2157
Epoch 4/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 240ms/step - accuracy: 0.9920 - loss: 0.0402 - val_accuracy: 0.9625 - val_loss: 0.2308
Epoch 5/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 232ms/step - accuracy: 0.9940 - loss: 0.0296 - val_accuracy: 0.9611 - val_loss: 0.2451
Epoch 6/10
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 242ms/step - accuracy: 0.9950 - loss: 0.0230 - val_accuracy: 0.9590 - val_loss: 0.2596
Epoch 7/10

<keras.src.callbacks.history.History at 0x1d095cce8d0>

## Evaluate


In [13]:


loss, accuracy = model.evaluate(
    [input_test, decoder_input_test],
    decoder_target_test
)

print(f"🔍 Test Loss: {loss:.4f}")
print(f"✅ Test Accuracy (token-level): {accuracy:.4f}")

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step - accuracy: 0.9543 - loss: 0.2965
🔍 Test Loss: 0.2892
✅ Test Accuracy (token-level): 0.9550


In [43]:
def translate(sentence):
    sentence = preprocess_arabic(sentence)
    sentence = "<start> " + sentence + " <end>"
    sequence = tokenizer_ar.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=ar_sequence.shape[1], padding='post')

    output = [tokenizer_en.word_index['<start>']]

    for i in range(en_sequence.shape[1]):
        decoder_input = pad_sequences([output], maxlen=en_sequence.shape[1], padding='post')
        prediction = model.predict([sequence, decoder_input])
        next_token = prediction[0, i].argmax()
        output.append(next_token)
        if tokenizer_en.index_word.get(next_token) == "<end>":
            break

    translated = ' '.join([tokenizer_en.index_word.get(i, '') for i in output[1:] if i > 0])
    return translated.replace('<end>', '').strip()


In [24]:
for arabic_sen, english_sen in zip(arabic_sentences[500:600], english_sentences[500:600]):
    print(f"Arabic: {arabic_sen}")
    print(f"English: {english_sen}")
    print(f"Translated: {translate(arabic_sen)}")
    print("-" * 50)


Arabic: أنا مشغول.
English: I'm not free.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 398ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Translated: i m busy .
--------------------------------------------------
Arabic: لست نحيفا.
English: I'm not thin.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Translated: i m not thin .
----------------------

In [45]:
translate("انا حزين")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40

'provided hamburgers couple sharpening exploded tufts planes plain accept hard customs eighty eggs classmate whistled like neither hong hopeless dna older shadow argentine upstairs kids conviction aim math outside shock southeastern know carried letting insurance dumb factors concern ten ask rate gloomy'