In [80]:
file_path = './Dataset/ara_.txt'

with open(file_path, 'r', encoding='utf-8') as f:
    data = f.read()

lines = data.strip().split('\n')
sentence_pairs = [line.split('\t') for line in lines if '\t' in line]

english_sentences, arabic_sentences = zip(*sentence_pairs)

In [81]:
import re

def preprocess_arabic(sent):
    sent = sent.strip()
    return sent

def preprocess_english(sent):
    sent = sent.lower().strip()
    sent = re.sub(r"([?.!,¿])", r" \1 ", sent)
    sent = re.sub(r'[" "]+', " ", sent)
    sent = re.sub(r"[^a-zA-Z?.!,¿]+", " ", sent)
    sent = sent.strip()
    return sent

# Apply preprocessing
preprocessed_english = ['<start> ' + preprocess_english(s) + ' <end>' for s in english_sentences]
preprocessed_arabic = ['<start> ' + preprocess_arabic(s) + ' <end>' for s in arabic_sentences]

<h3>Tokenization and Padding</h3>


In [82]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer_en = Tokenizer(filters='')
tokenizer_en.fit_on_texts(preprocessed_english)
tensor_en = tokenizer_en.texts_to_sequences(preprocessed_english)
tensor_en = pad_sequences(tensor_en, padding='post')

tokenizer_ar = Tokenizer(filters='')
tokenizer_ar.fit_on_texts(preprocessed_arabic)
tensor_ar = tokenizer_ar.texts_to_sequences(preprocessed_arabic)
tensor_ar = pad_sequences(tensor_ar, padding='post')


vocab_size_en = len(tokenizer_en.word_index) + 1
vocab_size_ar = len(tokenizer_ar.word_index) + 1


<h3>Tokenization and Padding</h3>


In [83]:
import keras_nlp
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


embed_dim = 128
num_heads = 2
ff_dim = 256




encoder_inputs = layers.Input(shape=(None,), dtype="int64", name="encoder_inputs")
decoder_inputs = layers.Input(shape=(None,), dtype="int64", name="decoder_inputs")


encoder_embedding = layers.Embedding(input_dim=vocab_size_ar, output_dim=embed_dim)(encoder_inputs)
decoder_embedding = layers.Embedding(input_dim=vocab_size_en, output_dim=embed_dim)(decoder_inputs)


encoder_embedding = layers.LayerNormalization()(encoder_embedding)
decoder_embedding = layers.LayerNormalization()(decoder_embedding)


transformer_decoder = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=ff_dim,
    num_heads=num_heads
)


x = transformer_decoder(
    decoder_embedding,
    encoder_embedding,
    decoder_padding_mask=None,
    encoder_padding_mask=None
)

outputs = layers.Dense(vocab_size_en, activation="softmax")(x)

model = keras.Model([encoder_inputs, decoder_inputs], outputs)


<h3>Train the Model</h3>


In [87]:
from sklearn.model_selection import train_test_split

input_tensor_train, input_tensor_test, target_tensor_train, target_tensor_test = train_test_split(
    tensor_ar, tensor_en, test_size=0.1)

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


decoder_input_data = target_tensor_train[:, :-1]
decoder_target_data = target_tensor_train[:, 1:]

model.fit(
    [input_tensor_train, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=10,
    validation_split=0.1
)

Epoch 1/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 208ms/step - accuracy: 0.8571 - loss: 1.1273 - val_accuracy: 0.8746 - val_loss: 0.7467
Epoch 2/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 217ms/step - accuracy: 0.8818 - loss: 0.6854 - val_accuracy: 0.8878 - val_loss: 0.6575
Epoch 3/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 241ms/step - accuracy: 0.9030 - loss: 0.5374 - val_accuracy: 0.8964 - val_loss: 0.6083
Epoch 4/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 381ms/step - accuracy: 0.9212 - loss: 0.4201 - val_accuracy: 0.8997 - val_loss: 0.5917
Epoch 5/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 517ms/step - accuracy: 0.9393 - loss: 0.3131 - val_accuracy: 0.9038 - val_loss: 0.5880
Epoch 6/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 521ms/step - accuracy: 0.9551 - loss: 0.2257 - val_accuracy: 0.9046 - val_loss: 0.6012
Epoch 7/10

<keras.src.callbacks.history.History at 0x26dd99e7490>

<h3>Test the Model</h3>


In [88]:
decoder_input_test = target_tensor_test[:, :-1]
decoder_target_test = target_tensor_test[:, 1:]

loss, accuracy = model.evaluate(
    [input_tensor_test, decoder_input_test],
    decoder_target_test
)

print(f"🔍 Test Loss: {loss:.4f}")
print(f"✅ Test Accuracy (token-level): {accuracy:.4f}")

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.9085 - loss: 0.6670
🔍 Test Loss: 0.6644
✅ Test Accuracy (token-level): 0.9097


In [None]:
def translate(sentence):
    sentence = preprocess_arabic(sentence)
    sentence = '<start> ' + sentence + ' <end>'
    sequence = tokenizer_ar.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=tensor_ar.shape[1], padding='post')

    output = [tokenizer_en.word_index['<start>']]
    
    for _ in range(tensor_en.shape[1]):
        decoder_input = pad_sequences([output], maxlen=tensor_en.shape[1], padding='post')
        predictions = model.predict([sequence, decoder_input], verbose=0)
        next_token = predictions[0, len(output)-1].argmax()
        output.append(next_token)
        if tokenizer_en.index_word.get(next_token) == '<end>':
            break

    translated = [tokenizer_en.index_word.get(i, '') for i in output if i > 0]
    return ' '.join(translated).replace('<start>', '').replace('<end>', '').strip()


In [89]:
for arabic_sen, english_sen in zip(arabic_sentences[:1000], preprocessed_english[:1000]):
    print(f"Arabic: {arabic_sen}")
    print(f"English: {english_sen}")
    print(f"Translated: {translate(arabic_sen)}")
    print("-" * 50)


Arabic: مرحبًا.
English: <start> hi . <end>
Translated: it s get a little .
--------------------------------------------------
Arabic: اركض!
English: <start> run ! <end>
Translated: run !
--------------------------------------------------
Arabic: النجدة!
English: <start> help ! <end>
Translated: help !
--------------------------------------------------
Arabic: اقفز!
English: <start> jump ! <end>
Translated: jump !
--------------------------------------------------
Arabic: قف!
English: <start> stop ! <end>
Translated: stop !
--------------------------------------------------
Arabic: داوم.
English: <start> go on . <end>
Translated: go on .
--------------------------------------------------
Arabic: استمر.
English: <start> go on . <end>
Translated: go on .
--------------------------------------------------
Arabic: مرحباً.
English: <start> hello ! <end>
Translated: hello !
--------------------------------------------------
Arabic: تعجّل!
English: <start> hurry ! <end>
Translated: he made me