In [1]:
file_path = '/content/ara_.txt'

with open(file_path, 'r', encoding='utf-8') as f:
    data = f.read()

lines = data.strip().split('\n')
sentence_pairs = [line.split('\t') for line in lines if '\t' in line]

english_sentences, arabic_sentences = zip(*sentence_pairs)

In [2]:
import re

def preprocess_arabic(sent):
    sent = sent.strip()
    sent = re.sub(r'[^\u0600-\u06FF\s]', '', sent)
    sent = re.sub(r'\s+', ' ', sent)
    return sent

def preprocess_english(sent):
    sent = sent.lower().strip()
    sent = re.sub(r"([?.!,¿])", r" \1 ", sent)
    sent = re.sub(r'[" "]+', " ", sent)
    sent = re.sub(r"[^a-zA-Z?.!,¿ء-ي]+", " ", sent)
    sent = sent.strip()
    return sent

# Apply preprocessing
preprocessed_english = ['<start> ' + preprocess_english(s) + ' <end>' for s in english_sentences]
preprocessed_arabic = ['<start> ' + preprocess_arabic(s) + ' <end>' for s in arabic_sentences]

<h3>Tokenization and Padding</h3>

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer_en = Tokenizer(filters='')
tokenizer_en.fit_on_texts(preprocessed_english)
tensor_en = tokenizer_en.texts_to_sequences(preprocessed_english)
tensor_en = pad_sequences(tensor_en, padding='post')

tokenizer_ar = Tokenizer(filters='')
tokenizer_ar.fit_on_texts(preprocessed_arabic)
tensor_ar = tokenizer_ar.texts_to_sequences(preprocessed_arabic)
tensor_ar = pad_sequences(tensor_ar, padding='post')


vocab_size_en = len(tokenizer_en.word_index) + 1
vocab_size_ar = len(tokenizer_ar.word_index) + 1


<h3>Preparing the model</h3>

In [6]:
import keras_nlp
import keras
from tensorflow.keras import layers
from keras_nlp.layers import TransformerEncoder, TransformerDecoder

embed_dim = 128
num_heads = 2
ff_dim = 256

encoder_inputs = layers.Input(shape=(None,), dtype="int64", name="encoder_inputs")
decoder_inputs = layers.Input(shape=(None,), dtype="int64", name="decoder_inputs")


encoder_embedding = layers.Embedding(input_dim=vocab_size_ar, output_dim=embed_dim)(encoder_inputs)
decoder_embedding = layers.Embedding(input_dim=vocab_size_en, output_dim=embed_dim)(decoder_inputs)


encoder_embedding = layers.LayerNormalization()(encoder_embedding)
decoder_embedding = layers.LayerNormalization()(decoder_embedding)


transformer_encoder = TransformerEncoder(
    intermediate_dim=ff_dim,
    num_heads=num_heads
)
encoder_outputs = transformer_encoder(encoder_embedding)


transformer_decoder = TransformerDecoder(
    intermediate_dim=ff_dim,
    num_heads=num_heads
)
decoder_outputs = transformer_decoder(
    decoder_embedding,
    encoder_outputs
)


outputs = layers.Dense(vocab_size_en, activation="softmax")(decoder_outputs)


model = keras.Model([encoder_inputs, decoder_inputs], outputs)


<h3>Train the Model</h3>

In [8]:
from sklearn.model_selection import train_test_split

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    tensor_ar, tensor_en, test_size=0.1)

model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


decoder_input_data = target_tensor_train[:, :-1]
decoder_target_data = target_tensor_train[:, 1:]

model.fit(
    [input_tensor_train, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=10,
    validation_split=0.1
)

Epoch 1/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 83ms/step - accuracy: 0.8740 - loss: 0.7484 - val_accuracy: 0.8907 - val_loss: 0.6420
Epoch 2/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - accuracy: 0.8950 - loss: 0.5842 - val_accuracy: 0.8994 - val_loss: 0.5798
Epoch 3/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9163 - loss: 0.4437 - val_accuracy: 0.9078 - val_loss: 0.5490
Epoch 4/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9364 - loss: 0.3190 - val_accuracy: 0.9103 - val_loss: 0.5440
Epoch 5/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9526 - loss: 0.2272 - val_accuracy: 0.9136 - val_loss: 0.5442
Epoch 6/10
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.9674 - loss: 0.1515 - val_accuracy: 0.9135 - val_loss: 0.5673
Epoch 7/10
[1m136/1

<keras.src.callbacks.history.History at 0x799b864a5090>

## Evaluate

In [9]:
decoder_input_test = target_tensor_val[:, :-1]
decoder_target_test = target_tensor_val[:, 1:]

loss, accuracy = model.evaluate(
    [target_tensor_val, decoder_input_test],
    decoder_target_test
)

print(f"🔍 Test Loss: {loss:.4f}")
print(f"✅ Test Accuracy (token-level): {accuracy:.4f}")

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.8562 - loss: 1.3659
🔍 Test Loss: 1.3444
✅ Test Accuracy (token-level): 0.8572


In [12]:
def translate(sentence):
    sentence = preprocess_arabic(sentence)
    sentence = "<start> " + sentence + " <end>"
    sequence = tokenizer_ar.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=tensor_ar.shape[1], padding='post')

    output = [tokenizer_en.word_index['<start>']]

    for i in range(tensor_en.shape[1]):
        decoder_input = pad_sequences([output], maxlen=tensor_en.shape[1], padding='post')
        prediction = model.predict([sequence, decoder_input])
        next_token = prediction[0, i].argmax()
        output.append(next_token)
        if tokenizer_en.index_word.get(next_token) == "<end>":
            break

    translated = ' '.join([tokenizer_en.index_word.get(i, '') for i in output[1:] if i > 0])
    return translated.replace('<end>', '').strip()


In [14]:
for arabic_sen, english_sen in zip(arabic_sentences[:100], english_sentences[:100]):
    print(f"Arabic: {arabic_sen}")
    print(f"English: {english_sen}")
    print(f"Translated: {translate(arabic_sen)}")
    print("-" * 50)


Arabic: مرحبًا.
English: Hi.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Translated: hi .
--------------------------------------------------
Arabic: اركض!
English: Run!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
Translated: run !
--------------------------------------------------
Arabic: النجدة!
English: Help!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Translated: help !
--------------------------------------------------
Arabic: اقفز!
English: Jump!
[1m1/1[0m [32m━━━━━━━━━