In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 1- اقرأ الداتا من الملف
english_sentences = []
arabic_sentences = []

with open('/content/drive/MyDrive/translated_captions_cleaned .csv', 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split(',')
        if len(parts) >= 2:
            english_sentences.append(parts[0])
            arabic_sentences.append(parts[1])

In [4]:
import re
import unicodedata

# دالة لحذف التشكيل من النصوص العربية
def remove_tashkeel(text):
    return re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)

# دالة لتنظيف النصوص الإنجليزية والعربية
def clean_text(text):
    # إزالة التشكيل في النصوص العربية
    text = remove_tashkeel(text)

    # تحويل النصوص الإنجليزية إلى lowercase
    text = text.lower()

    # إزالة الرموز غير المطلوبة (مثل علامات الترقيم غير المفيدة)
    text = re.sub(r'[^\w\s.,?!ء-ي]', '', text)

    # إزالة المسافات الزائدة بين الكلمات
    text = re.sub(r'\s+', ' ', text)

    # إزالة المسافات الزائدة في بداية ونهاية النص
    text = text.strip()

    return text

# تنظيف الجمل الإنجليزية والعربية
english_sentences = [clean_text(sentence) for sentence in english_sentences]
arabic_sentences = [clean_text(sentence) for sentence in arabic_sentences]


In [5]:

# ⚡ أول حاجة: ضيفي <start> و <end> لكل جملة عربية
arabic_sentences = [f"<start> {sentence.strip()} <end>" for sentence in arabic_sentences]

# معلمات
num_samples = 24000
max_vocab_size = 10000
max_encoder_seq_length = 30
max_decoder_seq_length = 30

# تجهيز التوكنز
eng_tokenizer = Tokenizer(num_words=max_vocab_size, filters='', lower=True)
eng_tokenizer.fit_on_texts(english_sentences)
input_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
input_sequences = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')

ar_tokenizer = Tokenizer(num_words=max_vocab_size, filters='', lower=False)
ar_tokenizer.fit_on_texts(arabic_sentences)
target_sequences = ar_tokenizer.texts_to_sequences(arabic_sentences)
target_sequences = pad_sequences(target_sequences, maxlen=max_decoder_seq_length, padding='post')

input_vocab_size = len(eng_tokenizer.word_index) + 1
target_vocab_size = len(ar_tokenizer.word_index) + 1


In [6]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

# بناء موديل Seq2Seq
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
encoder_embedding = Embedding(input_vocab_size, 256, name='encoder_embedding')(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,), name='decoder_inputs')
decoder_embedding = Embedding(target_vocab_size, 256, name='decoder_embedding')(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(target_vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

# تجميع الموديل
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# كمبايل
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# تجهيز بيانات الديكودر
decoder_input_data = target_sequences[:, :-1]
decoder_target_data = target_sequences[:, 1:]
decoder_target_data = decoder_target_data[..., None]

# حفظ أفضل موديل
checkpoint = ModelCheckpoint(
    '/content/drive/MyDrive/seq2seq_translation_model.keras',
    save_best_only=True,
    monitor='val_loss',
    verbose=1
)
# تدريب
history = model.fit(
    [input_sequences, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=50,
    validation_split=0.2,
    callbacks=[checkpoint]
)


Epoch 1/50
[1m312/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 67ms/step - accuracy: 0.7172 - loss: 3.0207
Epoch 1: val_loss improved from inf to 1.68947, saving model to /content/drive/MyDrive/seq2seq_translation_model.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 107ms/step - accuracy: 0.7174 - loss: 3.0150 - val_accuracy: 0.7705 - val_loss: 1.6895
Epoch 2/50
[1m312/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 68ms/step - accuracy: 0.7577 - loss: 1.7383
Epoch 2: val_loss improved from 1.68947 to 1.67069, saving model to /content/drive/MyDrive/seq2seq_translation_model.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 86ms/step - accuracy: 0.7577 - loss: 1.7383 - val_accuracy: 0.7704 - val_loss: 1.6707
Epoch 3/50
[1m312/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 69ms/step - accuracy: 0.7585 - loss: 1.7027
Epoch 3: val_loss improved from 1.67069 to 1.61692, saving model to /content/drive/MyDri

In [7]:
model = tf.keras.models.load_model('/content/drive/MyDrive/seq2seq_translation_model.keras')

In [8]:
import pickle

# حفظ التوكنيزر الإنجليزي بعد التدريب
with open('/content/drive/MyDrive/eng_tokenizer.pkl', 'wb') as f:
    pickle.dump(eng_tokenizer, f)

# حفظ التوكنيزر العربي بعد التدريب
with open('/content/drive/MyDrive/ar_tokenizer.pkl', 'wb') as f:
    pickle.dump(ar_tokenizer, f)


In [10]:
encoder_model = Model(encoder_inputs, encoder_states)


In [11]:
# مكان جديد لمدخل الحالات الخفية
decoder_state_input_h = Input(shape=(256,), name='input_h')
decoder_state_input_c = Input(shape=(256,), name='input_c')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# نعيد استخدام الـ embedding والـ LSTM من نفس الموديل
decoder_embedding2 = decoder_embedding
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    decoder_embedding2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)


In [12]:
def decode_sequence(input_seq):
    # استخراج الحالات الخفية من الانكودر
    states_value = encoder_model.predict(input_seq)

    # تحضير البداية
    target_seq = np.zeros((1, 1))
    # Changed target_tokenizer to ar_tokenizer
    target_seq[0, 0] = ar_tokenizer.word_index['<start>']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # Changed target_tokenizer to ar_tokenizer
        sampled_word = ar_tokenizer.index_word.get(sampled_token_index, '')

        if sampled_word == '<end>' or len(decoded_sentence.split()) > max_decoder_seq_length:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        states_value = [h, c]

    return decoded_sentence.strip()


In [13]:
# مثال على جملة
test_sentence = "A dog is running in the snow"
seq = eng_tokenizer.texts_to_sequences([test_sentence]) # Changed input_tokenizer to eng_tokenizer
seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_encoder_seq_length, padding='post')

translated = decode_sequence(seq)
print("الترجمة:", translated)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
الترجمة: الكلب يركض في الثلج.


In [14]:
# مثال على جملة
test_sentence = "Three people hang out on top of a big hill"
seq = eng_tokenizer.texts_to_sequences([test_sentence]) # Changed input_tokenizer to eng_tokenizer
seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_encoder_seq_length, padding='post')

translated = decode_sequence(seq)
print("الترجمة:", translated)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
الترجمة: ثلاثة أشخاص على قمة تلة كبيرة.


In [15]:
# مثال على جملة
test_sentence = "A girls plays in the surf"
seq = eng_tokenizer.texts_to_sequences([test_sentence]) # Changed input_tokenizer to eng_tokenizer
seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_encoder_seq_length, padding='post')

translated = decode_sequence(seq)
print("الترجمة:", translated)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
الترجمة: الفتيات يلعبن في الملعب.


In [16]:
# مثال على جملة
test_sentence = "A group of people gathered around in the dark"
seq = eng_tokenizer.texts_to_sequences([test_sentence]) # Changed input_tokenizer to eng_tokenizer
seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_encoder_seq_length, padding='post')

translated = decode_sequence(seq)
print("الترجمة:", translated)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
الترجمة: مجموعة من الناس ينتظرون في الليل.


In [17]:
# مثال على جملة
test_sentence = "A man drives a jeep over rough rocks"
seq = eng_tokenizer.texts_to_sequences([test_sentence]) # Changed input_tokenizer to eng_tokenizer
seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_encoder_seq_length, padding='post')

translated = decode_sequence(seq)
print("الترجمة:", translated)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
الترجمة: رجل يقود دراجة فوق منحدر صخري.


In [19]:
# مثال على جملة
test_sentence = "A fat dog sitting in a boat"
seq = eng_tokenizer.texts_to_sequences([test_sentence]) # Changed input_tokenizer to eng_tokenizer
seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_encoder_seq_length, padding='post')

translated = decode_sequence(seq)
print("الترجمة:", translated)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
الترجمة: كلب يجلس في الخارج في الثلج.


In [20]:
# مثال على جملة
test_sentence = "Three boys diving into a lake"
seq = eng_tokenizer.texts_to_sequences([test_sentence]) # Changed input_tokenizer to eng_tokenizer
seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_encoder_seq_length, padding='post')

translated = decode_sequence(seq)
print("الترجمة:", translated)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
الترجمة: ثلاثة أولاد يغوصون في بحيرة.
