In [1]:
import tensorflow as tf
import numpy as np
import re
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
with open('Quran-EN (1)', 'rb') as f:
    english_lines = f.readlines()

In [3]:
english_lines

[b'\xef\xbb\xbfAll praise be to Allah alone , the Sustainer of all the worlds .\n',
 b'Most Compassionate , Ever - Merciful .\n',
 b'Master of the Day of Judgment .\n',
 b'( O Allah ! ) You alone do we worship and to You alone we look for help .\n',
 b'Show us the straight path .\n',
 b'The path of those upon whom You have bestowed Your favours .\n',
 b'Not of those who have been afflicted with wrath and nor of those who have gone astray .\n',
 b'Alif , Lam , Mim . ( Only Allah and the Messenger know the real meaning . )\n',
 b'( This is ) the Glorious Book in which there is no chance of doubt . ( It is ) a guide for those who guard against evil and fear Allah .\n',
 b'Those who believe in the unseen , and establish prayer ( fulfilling all requisites ) and spend ( in Our way ) out of what We have given them .\n',
 b'And those who believe in ( all ) that which has been revealed to you and that which was revealed before you , and also have ( perfect ) faith in the life after death .\n',


In [4]:
with open('Quran-UR (1)', 'rb') as f:
    urdu_lines = f.readlines()

In [5]:
urdu_lines

[b'\xef\xbb\xbf\xd8\xb3\xd8\xa8 \xd8\xaa\xd8\xb9\xd8\xb1\xdb\x8c\xd9\x81\xdb\x8c\xda\xba \xd8\xa7\xd9\x84\xd9\x84\xdb\x81 \xdb\x81\xdb\x8c \xda\xa9\xdb\x92 \xd9\x84\xd8\xa6\xdb\x92 \xdb\x81\xdb\x8c\xda\xba \xd8\xac\xd9\x88 \xd8\xaa\xd9\x85\xd8\xa7\xd9\x85 \xd8\xac\xdb\x81\xd8\xa7\xd9\x86\xd9\x88\xda\xba \xda\xa9\xdb\x8c \xd9\xbe\xd8\xb1\xd9\x88\xd8\xb1\xd8\xb4 \xd9\x81\xd8\xb1\xd9\x85\xd8\xa7\xd9\x86\xdb\x92 \xd9\x88\xd8\xa7\xd9\x84\xd8\xa7 \xdb\x81\xdb\x92 \xdb\x94\n',
 b'\xd9\x86\xdb\x81\xd8\xa7\xdb\x8c\xd8\xaa \xd9\x85\xdb\x81\xd8\xb1\xd8\xa8\xd8\xa7\xd9\x86 \xd8\xa8\xdb\x81\xd8\xaa \xd8\xb1\xd8\xad\xd9\x85 \xd9\x81\xd8\xb1\xd9\x85\xd8\xa7\xd9\x86\xdb\x92 \xd9\x88\xd8\xa7\xd9\x84\xd8\xa7 \xdb\x81\xdb\x92 \xdb\x94\n',
 b'\xd8\xb1\xd9\x88\xd8\xb2\xd9\x90 \xd8\xac\xd8\xb2\xd8\xa7 \xda\xa9\xd8\xa7 \xd9\x85\xd8\xa7\xd9\x84\xda\xa9 \xdb\x81\xdb\x92 \xdb\x94\n',
 b'\xd8\xa7\xdb\x92 \xd8\xa7\xd9\x84\xd9\x84\xdb\x81 ! \xdb\x81\xd9\x85 \xd8\xaa\xdb\x8c\xd8\xb1\xdb\x8c \xdb\x81\xdb\x8c \xd8\xb

In [6]:
def preprocess_sentence(sentence):
    sentence = sentence.strip()
    sentence = sentence.lower()
    sentence = re.sub(r'[^a-zA-Z0-9آاأإئءؤبپتٹثجچحخدڈذرڑزژسشصضطظعغفقکگلمنںوہھءیے\s<>]', '', sentence)
    sentence = '<start> ' + sentence + ' <end>'
    return sentence

In [7]:
def create_dataset(english_lines, urdu_lines):
    english_sentences = [preprocess_sentence(line) for line in english_lines]
    urdu_sentences = [preprocess_sentence(line) for line in urdu_lines]
    return list(zip(english_sentences, urdu_sentences))

In [8]:
english_lines_decoded = [line.decode('utf-8') for line in english_lines]
urdu_lines_decoded = [line.decode('utf-8') for line in urdu_lines]

sent_pairs = create_dataset(english_lines_decoded, urdu_lines_decoded)

In [9]:
sent_pairs

[('<start> all praise be to allah alone  the sustainer of all the worlds  <end>',
  '<start> سب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے  <end>'),
 ('<start> most compassionate  ever  merciful  <end>',
  '<start> نہایت مہربان بہت رحم فرمانے والا ہے  <end>'),
 ('<start> master of the day of judgment  <end>',
  '<start> روز جزا کا مالک ہے  <end>'),
 ('<start>  o allah   you alone do we worship and to you alone we look for help  <end>',
  '<start> اے اللہ  ہم تیری ہی عبادت کرتے ہیں اور ہم تجھ ہی سے مدد چاہتے ہیں  <end>'),
 ('<start> show us the straight path  <end>',
  '<start> ہمیں سیدھا راستہ دکھا  <end>'),
 ('<start> the path of those upon whom you have bestowed your favours  <end>',
  '<start> ان لوگوں کا راستہ جن پر تو نے انعام فرمایا  <end>'),
 ('<start> not of those who have been afflicted with wrath and nor of those who have gone astray  <end>',
  '<start> ان لوگوں کا نہیں جن پر غضب کیا گیا ہے اور نہ ہی گمراہوں کا  <end>'),
 ('<start> alif  lam  mim   only

In [10]:
english_sentences, urdu_sentences = zip(*sent_pairs)

In [11]:
english_sentences

('<start> all praise be to allah alone  the sustainer of all the worlds  <end>',
 '<start> most compassionate  ever  merciful  <end>',
 '<start> master of the day of judgment  <end>',
 '<start>  o allah   you alone do we worship and to you alone we look for help  <end>',
 '<start> show us the straight path  <end>',
 '<start> the path of those upon whom you have bestowed your favours  <end>',
 '<start> not of those who have been afflicted with wrath and nor of those who have gone astray  <end>',
 '<start> alif  lam  mim   only allah and the messenger know the real meaning   <end>',
 '<start>  this is  the glorious book in which there is no chance of doubt   it is  a guide for those who guard against evil and fear allah  <end>',
 '<start> those who believe in the unseen  and establish prayer  fulfilling all requisites  and spend  in our way  out of what we have given them  <end>',
 '<start> and those who believe in  all  that which has been revealed to you and that which was revealed bef

In [12]:
urdu_sentences

('<start> سب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے  <end>',
 '<start> نہایت مہربان بہت رحم فرمانے والا ہے  <end>',
 '<start> روز جزا کا مالک ہے  <end>',
 '<start> اے اللہ  ہم تیری ہی عبادت کرتے ہیں اور ہم تجھ ہی سے مدد چاہتے ہیں  <end>',
 '<start> ہمیں سیدھا راستہ دکھا  <end>',
 '<start> ان لوگوں کا راستہ جن پر تو نے انعام فرمایا  <end>',
 '<start> ان لوگوں کا نہیں جن پر غضب کیا گیا ہے اور نہ ہی گمراہوں کا  <end>',
 '<start> الف لام میم حقیقی معنی ا اور رسول صلی اللہ علیہ وآلہ وسلم ہی بہتر جانتے ہیں  <end>',
 '<start> یہ وہ عظیم کتاب ہے جس میں کسی شک کی گنجائش نہیں  یہ پرہیزگاروں کے لئے ہدایت ہے  <end>',
 '<start> جو غیب پر ایمان لاتے اور نماز کو تمام حقوق کے ساتھ قائم کرتے ہیں اور جو کچھ ہم نے انہیں عطا کیا ہے اس میں سے ہماری راہ میں خرچ کرتے ہیں  <end>',
 '<start> اور وہ لوگ جو آپ کی طرف نازل کیا گیا اور جو آپ سے پہلے نازل کیا گیا سب پر ایمان لاتے ہیں  اور وہ آخرت پر بھی کامل یقین رکھتے ہیں  <end>',
 '<start> وہی اپنے رب کی طرف سے ہدایت پر ہیں اور وہی حقیق

In [13]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, lang_tokenizer

In [14]:
input_tensor, input_tokenizer = tokenize(english_sentences)
target_tensor, target_tokenizer = tokenize(urdu_sentences)

In [15]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.1, random_state=42
)

In [16]:
print("English Vocabulary size:", len(input_tokenizer.word_index))
print("Urdu Vocabulary size:", len(target_tokenizer.word_index))
print("Input tensor shape:", input_tensor_train.shape)
print("Target tensor shape:", target_tensor_train.shape)

English Vocabulary size: 3814
Urdu Vocabulary size: 3393
Input tensor shape: (900, 202)
Target tensor shape: (900, 221)


In [17]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 32
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
embedding_dim = 64
units = 64
vocab_inp_size = len(input_tokenizer.word_index) + 1
vocab_tar_size = len(target_tokenizer.word_index) + 1

In [18]:
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [19]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units, return_sequences=True, return_state=True)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [20]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [21]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights

In [22]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [23]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [24]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

In [25]:
import os
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [26]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)
    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [27]:
EPOCHS = 5

In [28]:
for epoch in range(EPOCHS):
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        if batch % 100 == 0:
            print(f'Epoch {epoch+1} Batch {batch} Loss {batch_loss.numpy():.4f}')
    # Save model checkpoint after each epoch
    checkpoint.save(file_prefix = checkpoint_prefix)
    print(f'Epoch {epoch+1} Loss {total_loss/steps_per_epoch:.4f}')

Epoch 1 Batch 0 Loss 1.5669
Epoch 1 Loss 1.7771
Epoch 2 Batch 0 Loss 1.4111
Epoch 2 Loss 1.3968
Epoch 3 Batch 0 Loss 1.2570
Epoch 3 Loss 1.3445
Epoch 4 Batch 0 Loss 1.5292
Epoch 4 Loss 1.3372
Epoch 5 Batch 0 Loss 1.2722
Epoch 5 Loss 1.3385


In [29]:
def translate(sentence):
    sentence = preprocess_sentence(sentence)
    inputs = [input_tokenizer.word_index[token] for token in sentence.split()]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=input_tensor_train.shape[1], padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_tokenizer.word_index['<start>']], 0)
    for t in range(target_tensor_train.shape[1]):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += target_tokenizer.index_word[predicted_id] + ' '
        if target_tokenizer.index_word[predicted_id] == '<end>':
            return result
        dec_input = tf.expand_dims([predicted_id], 0)
    return result

In [30]:
input_sentence = "In the name of Allah, the Most Gracious, the Most Merciful."
translated_sentence = translate(input_sentence)

In [39]:
print("Input:", input_sentence)
print("Translated:", translated_sentence)

Input: In the name of Allah, the Most Gracious, the Most Merciful.
Translated:  بِسمِ اللّٰہِ الرَّحمٰنِ الرَّحیم


In [40]:
input_sentence = "This is a beautiful day."
translated_sentence = translate(input_sentence)

In [42]:
print("Input:", input_sentence)
print("Translated:", translated_sentence)

Input: This is a beautiful day.
Translated: یہ ایک خوبصورت دن ہے


In [None]:
input_sentence = "This is a beautiful day."
translated_sentence = translate(input_sentence)

In [44]:
print("Input:", input_sentence)
print("Translated:", translated_sentence)

Input: How are you doing?
Translated: آپ کیسے ہیں؟


In [None]:
references = [[sentence.split()] for sentence in urdu_sentences]
hypotheses = [translate(sentence).split() for sentence in english_sentences]

In [None]:
bleu_score = corpus_bleu(references, hypotheses)
print("BLEU Score:", bleu_score)

In [46]:
print("BLEU Score:", bleu_score)

BLEU Score: 0.7368
