In [3]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os

In [None]:
data_path = '/kaggle/input/umc005'

# Function to load a text corpus from a specified file path
def load_corpus(file_path, set_path):
    set_path = os.path.join(data_path, set_path)
    file_path = os.path.join(set_path, file_path)
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

# Load English and Urdu training, validation, and test data from Quran and Bible datasets
train_en = load_corpus('train.en', 'quran') + load_corpus('train.en', 'bible') 
train_ur = load_corpus('train.ur', 'quran') + load_corpus('train.ur', 'bible')
dev_en = load_corpus('dev.en', 'quran') + load_corpus('dev.en', 'bible')
dev_ur = load_corpus('dev.ur', 'quran') + load_corpus('dev.ur', 'bible')
test_en = load_corpus('test.en', 'quran') + load_corpus('test.en', 'bible')
test_ur = load_corpus('test.ur', 'quran') + load_corpus('test.ur', 'bible')
en_corpus = train_en + test_en + dev_en
ur_corpus = train_ur + test_ur + dev_ur

# Check that the dataset pairs are properly aligned
assert len(train_en) == len(train_ur), "Training data misaligned!"
assert len(dev_en) == len(dev_ur), "Validation data misaligned!"
assert len(test_en) == len(test_ur), "Test data misaligned!"
print("Train Dataset Size:", len(train_en))
print("Test Dataset Size:", len(test_en))
print("Dev Dataset Size:", len(dev_en))

Train Dataset Size: 13400
Test Dataset Size: 457
Dev Dataset Size: 514


In [None]:
# Maximum vocabulary sizes and sequence lengths for English and Urdu
max_input_length = 150
max_output_length = 181

src_vocab_size = 11115
tgt_vocab_size = 11015

embedding_dim = 256
hidden_units = 512

# Cleaning Urdu text: Remove non-alphanumeric characters and add START/END tokens
def clean_urdu(text):
    # Remove non-alphanumeric and special characters, keeping Urdu script
    text = tf.strings.regex_replace(text, r'[^\p{L}\s]', '')  # Keep letters and spaces
    text = tf.strings.regex_replace(text, r'\s+', ' ')  # Normalize whitespace
    text = tf.strings.strip(text)  # Remove leading/trailing spaces
    text = tf.strings.join(['START', text, 'END'], separator=" ")
    return text

# Cleaning English text: Lowercase and remove non-alphabetic characters
def clean_english(text):
    text = tf.strings.lower(text)  # Convert to lowercase
    text = tf.strings.regex_replace(text, r'[^a-zA-Z\s]', '')  # Remove non-alphabetic characters
    text = tf.strings.regex_replace(text, r'\s+', ' ')  # Normalize whitespace
    text = tf.strings.strip(text)  # Remove leading/trailing spaces
    return text

# Corpus cleaning function based on language
def clean_corpus(corpus, lang='en'):
    if lang == 'en': return [clean_english(t) for t in corpus]
    elif lang == 'ur': return [clean_urdu(t) for t in corpus]

# Save tokenizer vocabulary to a file
def save_tokenizer(vectorizer, filename, verbose=None):
    with open(filename, 'w') as file:
        file.write("\n".join(vectorizer.get_vocabulary()))
    file.close()
    if verbose: print(f"Vocabulary Saved to file {filename}")

# Load or create a tokenizer using TextVectorization
def load_tokenizer(lang_ds, max_vocab, seq_len, standardize, pretrained=None):
    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_vocab,
        output_mode='int',
        output_sequence_length=seq_len,
        standardize=standardize,
    )
    vectorizer.adapt(lang_ds)        
    return vectorizer
    
# Initialize and save English and Urdu tokenizers
tokenizer_en = load_tokenizer(en_corpus, src_vocab_size, max_input_length, clean_english)
tokenizer_ur = load_tokenizer(ur_corpus, tgt_vocab_size, max_output_length, clean_urdu)
save_tokenizer(tokenizer_en, 'vocab_en', True)
save_tokenizer(tokenizer_ur, 'vocab_ur', True)

# Print vocabulary sizes for English and Urdu
en_vocab_size = tokenizer_en.vocabulary_size()
ur_vocab_size = tokenizer_ur.vocabulary_size()
print(f"\nEnglish Vocabulary Size: {en_vocab_size}")
print(f"Urdu Vocabulary Size: {ur_vocab_size}")

Vocabulary Saved to file vocab_en
Vocabulary Saved to file vocab_ur

English Vocabulary Size: 11115
Urdu Vocabulary Size: 11010


In [None]:
# Create dataset for training and validation
def create_dataset(en_ds, ur_ds, batch_size):
    # Convert lists to TensorFlow datasets
    en_dataset = tf.data.Dataset.from_tensor_slices(en_ds)
    ur_dataset = tf.data.Dataset.from_tensor_slices(ur_ds)

    # Tokenize English and Urdu datasets
    en_dataset = en_dataset.map(tokenizer_en)
    ur_dataset = ur_dataset.map(tokenizer_ur)

    # Prepare decoder input and target from Urdu tokenized data
    ur_input_dataset = ur_dataset.map(lambda x: x[:-1])
    ur_target_dataset = ur_dataset.map(lambda x: x[1:])
    # Zip English and Urdu targets, Batch the dataset and enable prefetching
    dataset = tf.data.Dataset.zip(((en_dataset, ur_input_dataset), ur_target_dataset))
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

batch_size = 64
train_ds = create_dataset(train_en, train_ur, batch_size)
val_ds = create_dataset(dev_en, dev_ur, batch_size)

# Randomly sample a batch and display tokenized examples
idx = np.random.randint(batch_size)
for (en, ur), ur_labels in train_ds.take(1):
    print("English Batch:", en[idx])
    print("\nUrdu Input Batch (Decoder Input):", ur[idx])
    print("\nUrdu Target Batch (Decoder Target):", ur_labels[idx])

English Batch: tf.Tensor(
[  11   27 1439 1337    3  476   37   11   22   15  413    5    2  277
  262    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0], shape=(150,), dtype=int64)

Urdu Input Batch (Decoder Input): tf.Tensor(
[   3   27 1896 2618    2 1037   16   63   13  144 1259   10   57   24
 6107   33    4    0    0    0    0    0    0    0    0    0    0  

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from tensorflow.keras.optimizers import Adam

def LSTM_model(pretrained_weights=None):
    """
    Builds a sequence-to-sequence LSTM model for machine translation.
    The model includes an encoder-decoder architecture with LSTM cells and embeddings.
    """
    # Set random seed for reproducibility
    tf.random.set_seed(42)
    
    # Encoder part
    encoder_input = tf.keras.layers.Input(shape=(max_input_length,))  # Input shape for the encoder
    embedding = tf.keras.layers.Embedding(src_vocab_size, embedding_dim)(encoder_input)  # Embedding layer for encoder
    encoder_lstm, state_h, state_c = tf.keras.layers.LSTM(hidden_units, return_state=True)(embedding)
    encoder_states = [state_h, state_c]  # The encoder state is passed to the decoder

    # Decoder part
    decoder_input = tf.keras.layers.Input(shape=(max_output_length-1,))  # Input shape for the decoder (one less due to teacher forcing)
    decoder_embedding = tf.keras.layers.Embedding(en_vocab_size, embedding_dim)(decoder_input)  # Embedding layer for decoder
    decoder_lstm = tf.keras.layers.LSTM(hidden_units, return_sequences=True, return_state=True)  # Decoder LSTM
    decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)  # Decoder with initial state from encoder
    decoder_dense = tf.keras.layers.Dense(ur_vocab_size, activation='softmax')  # Output layer for predicting the next token
    decoder_output = decoder_dense(decoder_output)  # The decoder output is passed through a dense layer to predict token probabilities

    # Define and Build the model with the specified input shape
    model = tf.keras.models.Model([encoder_input, decoder_input], decoder_output)
    model.build(input_shape=(None, max_input_length))
    
    # If pre-trained weights are provided, load them into the model
    if pretrained_weights: 
        model.load_weights(pretrained_weights)
    
    return model


batch_size = 32  # Define the batch size for training
# Create and compile the model
model = LSTM_model()  # Instantiate the model
model.compile(loss='sparse_categorical_crossentropy',  # Loss function (for multi-class classification)
              optimizer=Adam(learning_rate=1e-3),  # Adam optimizer with learning rate 1e-3
              metrics=['accuracy'])  # Track accuracy during training
model.summary()  # Print model summary to check the architecture

In [20]:
callbacks = [
    ModelCheckpoint(filepath='best_modelmt.keras', save_best_only=True, monitor='val_loss', mode='min'),
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    CSVLogger('losses.csv')
]

In [21]:
history = model.fit(train_ds,
                    callbacks=callbacks,
                    validation_data=val_ds,
                    epochs=5)

Epoch 1/5
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2039s[0m 10s/step - accuracy: 0.7676 - loss: 2.3915 - val_accuracy: 0.8731 - val_loss: 0.8191
Epoch 2/5
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2045s[0m 10s/step - accuracy: 0.8097 - loss: 1.1821 - val_accuracy: 0.8804 - val_loss: 0.7421
Epoch 4/5
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2039s[0m 10s/step - accuracy: 0.8145 - loss: 1.1191 - val_accuracy: 0.8825 - val_loss: 0.7264
Epoch 5/5
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2055s[0m 10s/step - accuracy: 0.8185 - loss: 1.0761 - val_accuracy: 0.8841 - val_loss: 0.7145


In [22]:
model.save('lstm_model.keras')

In [32]:
import numpy as np
import tensorflow as tf

def translate_lstm_model(sentence, model, tokenizer_en, tokenizer_ur, max_input_length=150, max_output_length=5):
    # Step 1: Clean and tokenize the input sentence
    input_tensor = tf.constant([sentence])  # Wrap the sentence as a batch of size 1
    input_indices = tokenizer_en(input_tensor).numpy()[0]  # Convert to token indices
    input_indices = np.pad(input_indices, (0, max_input_length - len(input_indices)), 'constant')  # Pad to max length

    # Step 2: Prepare the encoder input
    encoder_input = tf.constant([input_indices])  # Batch of size 1, shape (1, max_input_length)
    
    # Step 3: Get the initial encoder states
    encoder_embedding = model.get_layer('embedding_4')(encoder_input)
    encoder_lstm = model.get_layer('lstm_4')  # Make sure to use the correct layer name
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]
    
    # Step 4: Initialize decoder input with the START token
    start_token = tokenizer_ur(["START"]).numpy()[0][0]  # Convert "START" token to index
    decoder_input = np.array([start_token])
    
    # Step 5: Initialize the decoded sentence list
    translated_tokens = []

    # Step 6: Decode word by word until the "END" token is predicted or max length is reached
    for _ in range(max_output_length):
        # Step 6.1: Predict the next token with the current decoder input and encoder states
        decoder_input_tensor = tf.constant([decoder_input])  # Convert to tensor of shape (1, 1)
        
        decoder_embedding = model.get_layer('embedding_5')(decoder_input_tensor)  # Decoder embedding layer
        decoder_lstm = model.get_layer('lstm_5')  # Decoder LSTM
        decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
        
        # Step 6.2: Get the predicted word (index of max probability)
        predicted_index = np.argmax(decoder_output[0, -1, :].numpy())  # Get index of most probable word
        
        if predicted_index == tokenizer_ur(["END"]).numpy()[0][0]:
            break  # Stop if END token is predicted
        
        translated_tokens.append(predicted_index)  # Append predicted token to the output
        
        # Step 6.3: Update decoder input to the predicted token for the next step
        decoder_input = np.array([predicted_index])

    # Step 7: Convert the token indices back to words
    translated_sentence = ' '.join([tokenizer_ur.get_vocabulary()[idx] for idx in translated_tokens])
    
    return translated_sentence

In [35]:
sentence = "Ho how are you ?"  # Example input in English
translated_sentence = translate_lstm_model(
    sentence, model, tokenizer_en, tokenizer_ur
)
print("Translated Sentence:", translated_sentence)

Translated Sentence: جائے ہاں جائے ہاں جائے
