In [74]:
#Importing library
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras import layers
from keras.utils import *
from keras.initializers import *
import tensorflow as tf
# import time, random

In [75]:
#Vectorize the data.
num_samples = 10000

input_texts = []
validation_inputs = []
target_texts = []
validation_target = []
input_chars = set()
target_chars = set()

with open(r'C:\Users\ahmed\Downloads\seq2seqTranslation_arabic\ara.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

for line in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text = line.split('\t')
    input_texts.append(input_text)
    target_text = target_text.lower()
    target_texts.append(target_text)

for line in lines[  min(num_samples, len(lines) - 1):]:
    input_text, target_text = line.split('\t')
    validation_inputs.append(input_text)
    target_text = target_text.lower()
    validation_target.append(target_text)


In [76]:
# Unicode normalization

import re
import unicodedata

def normalize_unicode(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence_eng(s):
    s = normalize_unicode(s)
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    s = re.sub(r'[^a-z A-Z 0-9\s]+', "", s)
    s = s.strip()
    return s

def preprocess_sentence_arabic(s):
    s = normalize_unicode(s)
    
    # Arabic-specific punctuation
    arabic_punctuation = r'[،؟؛]'  # Original Arabic punctuation marks
    
    # Common punctuation (including the dot, comma, exclamation, etc.)
    common_punctuation = r'[.،؟؛!,]'  # Add the dot '.' here and other common punctuation marks
    
    # Diacritics (Tashkeel)
    tashkeel = r'[\u064B-\u0652]'  # Arabic diacritical marks
    
    # Combine patterns to remove tashkeel and punctuation
    combined_pattern = tashkeel + '|' + common_punctuation
    
    # Remove tashkeel and punctuation
    s = re.sub(combined_pattern, '', s)
    
    # Strip extra spaces
    s = s.strip()
    
    return s


def tag_target_sentences(sentences):
    tagged_sentences = map(lambda s: (' ').join(['<sos>', s, '<eos>']), sentences)
    return list(tagged_sentences)

def generate_decoder_inputs_targets(sentences, tokenizer):
    seqs = tokenizer.texts_to_sequences(sentences)
    decoder_inputs = [s[:-1] for s in seqs] # Drop the last token in the sentence.
    decoder_targets = [s[1:] for s in seqs] # Drop the first token in the sentence.
    return decoder_inputs, decoder_targets

In [77]:
#preprocess the data
from tensorflow.keras.preprocessing.text import Tokenizer

English_Tokenizer = Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')


def preprocess_encoder_inputs(input_texts, tokenizer):
    train_preprocessed_input = [preprocess_sentence_eng(s) for s in input_texts]
    train_tagged_preprocessed_input = tag_target_sentences(train_preprocessed_input)

    tokenizer.fit_on_texts(train_tagged_preprocessed_input)

    input_vocab_size = len(English_Tokenizer.index_word)+1
    train_encoder_inputs = English_Tokenizer.texts_to_sequences(train_tagged_preprocessed_input)
    max_encoding_len = len(max(train_encoder_inputs, key=len))

    padded_train_encoder_inputs = pad_sequences(train_encoder_inputs, max_encoding_len, padding='post', truncating='post')
    return padded_train_encoder_inputs,tokenizer, input_vocab_size, max_encoding_len

def preprocess_decoder(target_text, tokenizer):
    train_preprocessed_target = [preprocess_sentence_arabic(s) for s in target_texts]
    train_tagged_preprocessed_target = tag_target_sentences(train_preprocessed_target)

    tokenizer = Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(train_tagged_preprocessed_target)

    train_decoder_inputs, train_decoder_targets = generate_decoder_inputs_targets(train_tagged_preprocessed_target,
tokenizer)
    
    max_decoding_len = len(max(train_decoder_inputs, key=len))

    padded_train_decoder_inputs = pad_sequences(train_decoder_inputs, max_decoding_len, padding='post', truncating='post')
    padded_train_decoder_targets = pad_sequences(train_decoder_targets, max_decoding_len, padding='post', truncating='post')

    target_vocab_size = len(tokenizer.word_index) + 1
    return padded_train_decoder_inputs,tokenizer, padded_train_decoder_targets , target_vocab_size, max_decoding_len


In [78]:
#preprocess the training data
English_Tokenizer = Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
arabic_tokenizer = Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')

padded_train_encoder_inputs,English_Tokenizer, input_vocab_size, max_encoding_len = preprocess_encoder_inputs(input_texts, English_Tokenizer)
padded_train_decoder_inputs,arabic_tokenizer, padded_train_decoder_targets , target_vocab_size, max_decoding_len = preprocess_decoder(target_text, arabic_tokenizer)


In [72]:
len(arabic_tokenizer.word_index)

8637

In [73]:
#saving tokenizer as pickle 
import pickle

with open('English_Tokenizer.pickle','wb') as handle:
    pickle.dump(English_Tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

with open('arabic_tokenizer.pickle','wb') as handle:
    pickle.dump(arabic_tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

In [33]:
#preprocess the val data
val_padded_encoder_inputs,_, val_input_vocab_size, val_max_encoding_len = preprocess_encoder_inputs(input_texts, English_Tokenizer)
val_padded_decoder_inputs,_ , val_padded_train_decoder_targets , val_target_vocab_size, val_max_decoding_len = preprocess_decoder(target_text, arabic_tokenizer)

In [12]:
embedding_dim = 128
hidden_dim = 256
default_dropout=0.2
batch_size = 32
epochs = 30

In [68]:
# Encoder model with Masking

encoder_inputs = tf.keras.Input(shape=(None,))
encoder_masking = layers.Masking(mask_value=0.0)(encoder_inputs)
encoder_embedding = layers.Embedding(input_dim=input_vocab_size, output_dim=embedding_dim)(encoder_masking)
encoder_lstm = layers.LSTM(hidden_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

encoder_states = [state_h, state_c]

# Decoder model with Masking
decoder_inputs = tf.keras.Input(shape=(None,))
decoder_masking = layers.Masking(mask_value=0.0)(decoder_inputs)
decoder_embedding = layers.Embedding(input_dim=target_vocab_size, output_dim=embedding_dim)(decoder_masking)
decoder_lstm = layers.LSTM(hidden_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = layers.Dense(target_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Full model
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_22 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_23 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 masking_17 (Masking)           (None, None)         0           ['input_22[0][0]']               
                                                                                                  
 masking_18 (Masking)           (None, None)         0           ['input_23[0][0]']               
                                                                                            

In [42]:
# Train the model
history = model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs],
                    np.expand_dims(padded_train_decoder_targets, -1),
                    batch_size=batch_size,
                    epochs=11,
                    validation_data=([val_padded_encoder_inputs, val_padded_decoder_inputs],
                                    np.expand_dims(val_padded_train_decoder_targets, -1)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [18]:
model.save("eng_to_arabic_96_acc_97_val.keras")

In [43]:
#bulding the encoder and decoder 
encoder_model = tf.keras.Model(encoder_inputs, encoder_states)

# Decoder inference model
decoder_state_input_h = tf.keras.Input(shape=(hidden_dim,))
decoder_state_input_c = tf.keras.Input(shape=(hidden_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_lstm_outputs)

decoder_model = tf.keras.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)


In [47]:
encoder_model.save("eng_to_arabic_encoder_98_acc_99_val_v2.keras")
decoder_model.save("eng_to_arabic_decoder_98_acc_99_val_v2.keras")



In [80]:
max_decoding_len

15

In [45]:
def translate_sentence(sentence, english_tokenizer, arabic_tokenizer, encoder_model, decoder_model, max_encoding_len, max_decoding_len):
    # Tokenize the input English sentence using english_tokenizer
    input_seq = english_tokenizer.texts_to_sequences([sentence])
    
    # Pad the tokenized input to max_encoding_len
    input_seq = pad_sequences(input_seq, maxlen=max_encoding_len, padding='post')

    # Encode the input as state vectors using the encoder model
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate an empty target sequence of length 1
    target_seq = np.zeros((1, 1))  # [[0]]

    # Populate the first character of the target sequence with the start token ('<sos>')
    target_seq[0, 0] = arabic_tokenizer.word_index['<sos>']

    # Sampling loop to generate the French sentence
    stop_condition = False
    translated_sentence = ''
    
    while not stop_condition:
        # Predict the next token and hidden states (h, c) from the decoder model
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)

        # Sample the token with the highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = arabic_tokenizer.index_word.get(sampled_token_index, '<unk>')

        # Append the sampled word to the translated sentence
        translated_sentence += ' ' + sampled_word

        # Stop if we encounter the end token ('<eos>') or exceed max_decoding_len
        if sampled_word == '<eos>' or len(translated_sentence.split()) > max_decoding_len:
            stop_condition = True
        else:
            # Update the target sequence with the sampled token
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

            # Update the states (h, c) for the next prediction
            states_value = [h, c]

    # Return the translated sentence without '<eos>' token
    translated_sentence = translated_sentence.replace('<eos>', '').strip()
    return translated_sentence


In [55]:
translate_sentence("I want to know your opinion",English_Tokenizer, arabic_tokenizer, encoder_model, decoder_model,max_encoding_len, max_encoding_len  )

'اريد ان اعرف رايك'