In [None]:
# Mount Google Drive inside the Colab runtime at /drive; the training CSV is
# read from a path under this mount point in a later cell.
from google.colab import drive
drive.mount('/drive')


Mounted at /drive


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense

In [None]:
# Parallel corpus used for training; later cells read its "Arabic" and
# "English" columns.
TRAIN_CSV_PATH = "/drive/My Drive/Job Tasks/kemet/translation_train.csv"
data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:


# Prepare the data
# Drop pairs with a missing side so both tokenizers only ever receive strings.
sentence_pairs = data[["Arabic", "English"]].dropna().astype(str)
arabic_sentences = sentence_pairs["Arabic"].to_list()  # List of Arabic sentences
english_sentences = sentence_pairs["English"].to_list()  # List of English sentences

# Wrap every target sentence in explicit start/end markers *before* tokenizing.
# The decoder can only learn when to begin and when to stop if these markers
# actually occur in the training sequences; registering them in the vocabulary
# afterwards (as was done previously) leaves them untrained.
START_TOKEN = '<start>'
END_TOKEN = '<end>'
english_targets = [f"{START_TOKEN} {sentence} {END_TOKEN}" for sentence in english_sentences]

# Tokenize the sentences
arabic_tokenizer = tf.keras.preprocessing.text.Tokenizer()
arabic_tokenizer.fit_on_texts(arabic_sentences)
arabic_sequences = arabic_tokenizer.texts_to_sequences(arabic_sentences)

# The default Tokenizer filters strip '<' and '>', which would turn the markers
# into the ordinary words "start"/"end". Use the default filter set minus those
# two characters so '<start>' and '<end>' survive as distinct vocabulary entries.
english_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
)
english_tokenizer.fit_on_texts(english_targets)
english_sequences = english_tokenizer.texts_to_sequences(english_targets)

# Pad the sequences
# Encoder inputs keep the default pre-padding so the final LSTM state is
# produced right after the last real token.
arabic_sequences = tf.keras.preprocessing.sequence.pad_sequences(arabic_sequences)
# Decoder sequences are post-padded so position 0 is always '<start>' and the
# one-step shift below lines inputs up with their next-token targets.
english_sequences = tf.keras.preprocessing.sequence.pad_sequences(english_sequences, padding='post')

# Define the maximum sequence length (padded width of the encoder inputs)
MAX_SEQUENCE_LENGTH = int(arabic_sequences.shape[1])

# Prepare the input and output data (teacher forcing: the target is the
# decoder input shifted left by one position)
encoder_input_data = arabic_sequences
decoder_input_data = english_sequences[:, :-1]
decoder_output_data = english_sequences[:, 1:]

# Vocabulary sizes; +1 accounts for index 0, which is reserved for padding.
arabic_vocab_size = len(arabic_tokenizer.word_index) + 1
english_vocab_size = len(english_tokenizer.word_index) + 1

# Define the model architecture
encoder_inputs = Input(shape=(None,))
encoder_embedding = tf.keras.layers.Embedding(arabic_vocab_size, 256)(encoder_inputs)
encoder_lstm = LSTM(256, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# Only the final hidden/cell state is handed to the decoder.
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedding = tf.keras.layers.Embedding(english_vocab_size, 256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Create the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile and train the model
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
model.fit([encoder_input_data, decoder_input_data], decoder_output_data, batch_size=64, epochs=5, validation_split=0.2)

# Save the model
model.save('translation_model.h5')


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
import tensorflow as tf
import numpy as np
import json


# Load the trained model
model = tf.keras.models.load_model('translation_model.h5')

# Define the maximum sequence length
# The encoder has no masking, so inference inputs should be padded to the same
# width the encoder saw during training. Reuse that width when the training
# arrays are still present in this kernel instead of overwriting it with an
# unrelated constant.
try:
    MAX_SEQUENCE_LENGTH = int(encoder_input_data.shape[1])
except NameError:
    # Training arrays are not available in this kernel; fall back to the
    # fixed width this cell used before.
    MAX_SEQUENCE_LENGTH = 256

# Translate a sentence
def translate(sentence, max_output_length=256):
    """Greedily translate one Arabic sentence into English with the trained model.

    Relies on the notebook-level objects ``model``, ``arabic_tokenizer``,
    ``english_tokenizer`` and ``MAX_SEQUENCE_LENGTH``.

    Args:
        sentence: Arabic source text.
        max_output_length: Upper bound on the number of generated words
            (defaults to the previous hard-coded limit of 256).

    Returns:
        The generated English words joined by single spaces. The ``'<end>'``
        marker is not included. The string is empty if the very first
        prediction is the padding index or ``'<end>'``.

    Raises:
        ValueError: If ``model`` does not contain exactly two Embedding layers,
            two LSTM layers and one Dense layer.
        KeyError: If ``'<start>'`` is missing from the English vocabulary.
    """
    # Preprocess the input sentence
    sequence = arabic_tokenizer.texts_to_sequences([sentence])
    sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)

    # Recover the trained layers by type rather than by raw position.
    # model.layers is ordered from the inputs towards the output, so the
    # encoder-side layer of each kind precedes the decoder-side one.
    encoder_embedding, decoder_embedding = [
        layer for layer in model.layers if isinstance(layer, tf.keras.layers.Embedding)
    ]
    encoder_lstm, decoder_lstm = [
        layer for layer in model.layers if isinstance(layer, tf.keras.layers.LSTM)
    ]
    (decoder_dense,) = [
        layer for layer in model.layers if isinstance(layer, tf.keras.layers.Dense)
    ]

    # Encode the source sentence; only the final LSTM states are needed.
    _, state_h, state_c = encoder_lstm(encoder_embedding(tf.convert_to_tensor(sequence)))
    states_value = [state_h, state_c]

    # The decoder consumes token ids of shape (batch=1, time=1); they must go
    # through the decoder embedding before reaching the LSTM.
    target_seq = np.array([[english_tokenizer.word_index['<start>']]], dtype='int32')
    translated_words = []

    for _ in range(max_output_length):
        decoder_output, state_h, state_c = decoder_lstm(
            decoder_embedding(tf.convert_to_tensor(target_seq)),
            initial_state=states_value,
        )
        token_probabilities = decoder_dense(decoder_output)
        sampled_token_index = int(np.argmax(token_probabilities[0, -1, :]))

        # Index 0 is padding and has no word; treat it (or any unknown id)
        # and the explicit end marker as the end of the translation.
        sampled_word = english_tokenizer.index_word.get(sampled_token_index)
        if sampled_word is None or sampled_word == '<end>':
            break
        translated_words.append(sampled_word)

        # Feed the prediction and the updated states into the next step so
        # decoding actually advances instead of restarting from the encoder state.
        target_seq = np.array([[sampled_token_index]], dtype='int32')
        states_value = [state_h, state_c]

    return ' '.join(translated_words)

# Demo: translate one sample sentence and show source and result side by side.
arabic_text = "مرحبًا، كيف حالك؟"
english_translation = translate(arabic_text)
for label, text in (("Arabic:", arabic_text), ("English:", english_translation)):
    print(label, text)


Arabic: مرحبًا، كيف حالك؟
English: 


# My comment

There is something wrong with how I handle the encoder–decoder inference step. I have not been able to figure it out yet, but it got me interested, so I will probably keep working on it until I do.
Also, training takes an enormous amount of time despite using Colab Pro, so I didn't have many chances to try different approaches.
Of course, we could always take the fine-tuning approach instead; I thought about it too, but didn't have enough time to try it.

Thanks for this enjoyable task. I hope to hear from you soon!
