In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


In [2]:
data = open("/content/drive/MyDrive/Dataset.txt", encoding="utf-8").read().split("\n")[:10000]
eng_texts, ar_texts = [], []
for line in data:
    parts = line.split("\t")
    if len(parts) == 2:
        eng_texts.append(parts[0])
        ar_texts.append("start " + parts[1] + " end")


In [3]:
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(eng_texts)
eng_sequences = eng_tokenizer.texts_to_sequences(eng_texts)
eng_word_index = eng_tokenizer.word_index

ar_tokenizer = Tokenizer()
ar_tokenizer.fit_on_texts(ar_texts)
ar_sequences = ar_tokenizer.texts_to_sequences(ar_texts)
ar_word_index = ar_tokenizer.word_index

max_encoder_seq_length = max(len(s) for s in eng_sequences)
max_decoder_seq_length = max(len(s) for s in ar_sequences)
encoder_input_data = pad_sequences(eng_sequences, maxlen=max_encoder_seq_length, padding='post')
decoder_input_data = pad_sequences(ar_sequences, maxlen=max_decoder_seq_length, padding='post')
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]


In [4]:
latent_dim = 512
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(len(eng_word_index) + 1, latent_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(len(ar_word_index) + 1, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(len(ar_word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


In [6]:
from tensorflow.keras.optimizers import Adam

model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    [encoder_input_data, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),
    batch_size=64,
    epochs=30,
    validation_split=0.2,
    callbacks=[es]
)

model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    [encoder_input_data, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),
    batch_size=64,
    epochs=30,
    validation_split=0.2,
    callbacks=[es]
)

Epoch 1/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 124ms/step - accuracy: 0.6919 - loss: 3.9845 - val_accuracy: 0.6472 - val_loss: 3.0771
Epoch 2/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 118ms/step - accuracy: 0.7766 - loss: 1.8104 - val_accuracy: 0.6545 - val_loss: 3.0376
Epoch 3/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 116ms/step - accuracy: 0.7804 - loss: 1.7361 - val_accuracy: 0.6571 - val_loss: 3.0285
Epoch 4/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 116ms/step - accuracy: 0.7819 - loss: 1.6825 - val_accuracy: 0.6579 - val_loss: 3.0346
Epoch 5/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 116ms/step - accuracy: 0.7836 - loss: 1.6270 - val_accuracy: 0.6586 - val_loss: 3.0491
Epoch 1/30
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 118ms/step - accuracy: 0.7838 - loss: 1.6932 - val_accuracy: 0.6586 - val_loss: 3.0135
Epoch 2/30

In [None]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_layer = Embedding(len(ar_word_index) + 1, latent_dim)
decoder_emb_infer = decoder_embedding_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    decoder_emb_infer, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)


In [8]:
reverse_eng_index = dict((i, w) for w, i in eng_word_index.items())
reverse_ar_index = dict((i, w) for w, i in ar_word_index.items())


In [None]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    start_token = ar_tokenizer.word_index.get('start', 1)
    target_seq = np.array([[start_token]])

    decoded_sentence = []
    stop_condition = False

    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_ar_index.get(sampled_token_index, '')

        if sampled_word == '' or sampled_word == 'end':
            break

        decoded_sentence.append(sampled_word)

        target_seq = np.array([[sampled_token_index]])
        states_value = [h, c]

    return ' '.join(decoded_sentence)


In [10]:
for seq_index in range(5):
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("English:", eng_texts[seq_index])
    print("Predicted Arabic:", decoded_sentence)
    print("-" * 60)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
English: Hi.
Predicted Arabic: أنا
------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
English: Run!
Predicted Arabic: أنا
------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
English: Help!
Predicted Arabic: أنا
------------------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[