In [30]:
# Install required packages
!pip install -U -q gdown
!pip install -q tensorflow-text

# Import libraries
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import gdown
import os
import warnings
warnings.filterwarnings('ignore')


In [31]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
def load_text_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]


In [33]:
urdu_file = "/content/drive/MyDrive/dlp/urdu.txt"  # Path to Urdu text file
english_file = "/content/drive/MyDrive/dlp/english.txt"  # Path to English text file

# Load both files
urdu_texts = load_text_file(urdu_file)
eng_texts = load_text_file(english_file)

# Verify we have matching number of verses
assert len(urdu_texts) == len(eng_texts), "English and Urdu files have different number of verses"


In [34]:
def preprocess(text, lang):
    text = text.lower() if lang == 'en' else text  # Only lowercase English
    text = re.sub(r'[^\w\s۔،؟]', '', text)  # Remove special chars
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return '[START] ' + text + ' [END]'

In [35]:
df = pd.DataFrame({
    'english': [preprocess(t, 'en') for t in eng_texts],
    'urdu': [preprocess(t, 'ur') for t in urdu_texts]
})

# 4. Text Vectorization
MAX_VOCAB = 5000
MAX_LEN = 50

eng_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    output_mode='int',
    output_sequence_length=MAX_LEN
)
eng_vectorizer.adapt(df['english'])

urdu_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    output_mode='int',
    output_sequence_length=MAX_LEN
)
urdu_vectorizer.adapt(df['urdu'])


In [36]:
def build_model(src_vocab_size, tgt_vocab_size):
    # Encoder
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(src_vocab_size, 256)(encoder_inputs)
    _, state_h, state_c = LSTM(512, return_state=True)(encoder_embedding)

    # Decoder
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(tgt_vocab_size, 256)(decoder_inputs)
    decoder_outputs = LSTM(512, return_sequences=True)(
        decoder_embedding, initial_state=[state_h, state_c])
    decoder_outputs = Dense(tgt_vocab_size, activation='softmax')(decoder_outputs)

    return Model([encoder_inputs, decoder_inputs], decoder_outputs)



In [37]:
# English to Urdu model

en2ur_model = build_model(
    len(eng_vectorizer.get_vocabulary()),
    len(urdu_vectorizer.get_vocabulary())
)
en2ur_model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy')

# Urdu to English model
ur2en_model = build_model(
    len(urdu_vectorizer.get_vocabulary()),
    len(eng_vectorizer.get_vocabulary())
)
ur2en_model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy')


In [38]:
X_train, X_val, y_train, y_val = train_test_split(
    df['english'], df['urdu'], test_size=0.2)

print("Training English to Urdu model...")
en2ur_model.fit(
    [eng_vectorizer(X_train), urdu_vectorizer(y_train)[:, :-1]],
    urdu_vectorizer(y_train)[:, 1:],
    epochs=5,
    batch_size=32
)


Training English to Urdu model...
Epoch 1/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 3s/step - loss: 4.7875
Epoch 2/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m437s[0m 3s/step - loss: 3.4670
Epoch 3/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 3s/step - loss: 3.2096
Epoch 4/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 3s/step - loss: 2.9748
Epoch 5/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 3s/step - loss: 2.8116


<keras.src.callbacks.history.History at 0x7df6bc920c90>

In [39]:
print("\nTraining Urdu to English model...")
ur2en_model.fit(
    [urdu_vectorizer(X_train), eng_vectorizer(y_train)[:, :-1]],
    eng_vectorizer(y_train)[:, 1:],
    epochs=5,
    batch_size=32
)


Training Urdu to English model...
Epoch 1/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m439s[0m 3s/step - loss: 1.3031
Epoch 2/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 3s/step - loss: 0.0483
Epoch 3/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m432s[0m 3s/step - loss: 0.0467
Epoch 4/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m430s[0m 3s/step - loss: 0.0461
Epoch 5/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m445s[0m 3s/step - loss: 0.0450


<keras.src.callbacks.history.History at 0x7df6bc8b1110>

In [41]:
def translate(model, src_vectorizer, tgt_vectorizer, text):
    text = preprocess(text, 'en' if src_vectorizer == eng_vectorizer else 'ur')
    tokens = src_vectorizer([text])
    decoded = '[START]'
    for _ in range(MAX_LEN):
        pred = model.predict([tokens, tgt_vectorizer([decoded])[:, :-1]], verbose=0)
        next_word = tgt_vectorizer.get_vocabulary()[np.argmax(pred[0, -1])]
        decoded += ' ' + next_word
        if next_word == '[END]':
            break
    return decoded.replace('[START] ', '').replace(' [END]', '')

In [42]:
test_samples = 3
for i in range(test_samples):
    # English to Urdu
    eng_text = df['english'].iloc[i].replace('[START] ', '').replace(' [END]', '')
    urdu_trans = translate(en2ur_model, eng_vectorizer, urdu_vectorizer, eng_text)

    # Urdu to English
    urdu_text = df['urdu'].iloc[i].replace('[START] ', '').replace(' [END]', '')
    eng_trans = translate(ur2en_model, urdu_vectorizer, eng_vectorizer, urdu_text)

In [43]:
    print(f"\nSample {i+1}:")
    print(f"English: {eng_text}")
    print(f"Urdu Translation: {urdu_trans}")
    print(f"Actual Urdu: {urdu_text}")
    print(f"\nUrdu: {urdu_text}")
    print(f"English Translation: {eng_trans}")
    print(f"Actual English: {eng_text}")




Sample 3:
English: master of the day of judgment
Urdu Translation:                                                  
Actual Urdu: روز جزا کا مالک ہے ۔

Urdu: روز جزا کا مالک ہے ۔
English Translation:                                                  
Actual English: master of the day of judgment


In [46]:
# Split original DF into source and target pairs correctly
X_train_en, X_val_en, y_train_ur, y_val_ur = train_test_split(
    df['english'], df['urdu'], test_size=0.2)

X_train_ur, X_val_ur, y_train_en, y_val_en = train_test_split(
    df['urdu'], df['english'], test_size=0.2)

# English to Urdu
print("Training English to Urdu model...")
en2ur_model.fit(
    [eng_vectorizer(X_train_en), urdu_vectorizer(y_train_ur)[:, :-1]],
    urdu_vectorizer(y_train_ur)[:, 1:],
    epochs=5,
    batch_size=32
)

# Urdu to English
print("\nTraining Urdu to English model...")
ur2en_model.fit(
    [urdu_vectorizer(X_train_ur), eng_vectorizer(y_train_en)[:, :-1]],
    eng_vectorizer(y_train_en)[:, 1:],
    epochs=5,
    batch_size=32
)


Training English to Urdu model...
Epoch 1/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m435s[0m 3s/step - loss: 2.7212
Epoch 2/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m436s[0m 3s/step - loss: 2.6111
Epoch 3/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 3s/step - loss: 2.4748
Epoch 4/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 3s/step - loss: 2.3891
Epoch 5/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 3s/step - loss: 2.2889

Training Urdu to English model...
Epoch 1/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m435s[0m 3s/step - loss: 4.3286
Epoch 2/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m431s[0m 3s/step - loss: 3.0408
Epoch 3/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 3s/step - loss: 2.7584
Epoch 4/5
[1m161/161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 3s/step - loss: 2.6050
Epoch 5/5
[1m161/161

<keras.src.callbacks.history.History at 0x7df6b7130a50>

In [47]:
for i in range(5):
    eng_text = df['english'].iloc[i].replace('[START] ', '').replace(' [END]', '')
    urdu_pred = translate(en2ur_model, eng_vectorizer, urdu_vectorizer, eng_text)

    urdu_text = df['urdu'].iloc[i].replace('[START] ', '').replace(' [END]', '')
    eng_pred = translate(ur2en_model, urdu_vectorizer, eng_vectorizer, urdu_text)

    print(f"\nSample {i+1}")
    print(f"EN: {eng_text}")
    print(f"Predicted UR: {urdu_pred}")
    print(f"Actual UR: {urdu_text}")

    print(f"\nUR: {urdu_text}")
    print(f"Predicted EN: {eng_pred}")
    print(f"Actual EN: {eng_text}")



Sample 1
EN: all praise be to allah alone the sustainer of all the worlds
Predicted UR:                                                  
Actual UR: سب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے ۔

UR: سب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے ۔
Predicted EN:                                                  
Actual EN: all praise be to allah alone the sustainer of all the worlds

Sample 2
EN: most compassionate ever merciful
Predicted UR:                                                  
Actual UR: نہایت مہربان بہت رحم فرمانے والا ہے ۔

UR: نہایت مہربان بہت رحم فرمانے والا ہے ۔
Predicted EN:                                                  
Actual EN: most compassionate ever merciful

Sample 3
EN: master of the day of judgment
Predicted UR:                                                  
Actual UR: روز جزا کا مالک ہے ۔

UR: روز جزا کا مالک ہے ۔
Predicted EN:                                                  
Actual EN: master of the d

In [49]:
print(f"\nSample {i+1}")
print(f"EN: {eng_text}")
print(f"Predicted UR: {urdu_pred}")
print(f"Actual UR: {urdu_text}")
print(f"Predicted UR Tokens: {urdu_pred.split()}")
print(f"Actual UR Tokens: {urdu_text.split()}")



Sample 5
EN: show us the straight path
Predicted UR:                                                  
Actual UR: ہمیں سیدھا راستہ دکھا ۔
Predicted UR Tokens: []
Actual UR Tokens: ['ہمیں', 'سیدھا', 'راستہ', 'دکھا', '۔']


In [51]:
def translate(model, src_vectorizer, tgt_vectorizer, text):
    text = preprocess(text, 'en' if src_vectorizer == eng_vectorizer else 'ur')
    print(f"\n🔹 Input: {text}")
    tokens = src_vectorizer([text])
    decoded = '[START]'
    generated_words = []

    for i in range(MAX_LEN):
        tgt_tokens = tgt_vectorizer([decoded])[:, :-1]
        pred = model.predict([tokens, tgt_tokens], verbose=0)

        # Handle prediction failure safely
        if pred.shape[-1] == 0:
            print("⚠️ Empty prediction vector.")
            break

        next_word_id = np.argmax(pred[0, -1])
        vocab = tgt_vectorizer.get_vocabulary()

        if next_word_id >= len(vocab):
            print(f"⚠️ Invalid word index: {next_word_id}")
            next_word = '[UNK]'
        else:
            next_word = vocab[next_word_id]

        print(f"🔸 Step {i+1} → Predicted word: {next_word}")

        if next_word == '[END]' and i < 3:
            continue  # skip early [END]

        if next_word == '[END]':
            break

        generated_words.append(next_word)
        decoded += ' ' + next_word

    if not generated_words:
        print("❌ No words generated — returning fallback message.")
        return "[NO TRANSLATION]"

    final_output = ' '.join(generated_words).strip()
    print(f"✅ Final output: {final_output}")
    return final_output


In [53]:
# ------------------ Vectorization Fix ------------------

MAX_VOCAB = 5000
MAX_LEN = 50

eng_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    output_mode='int',
    output_sequence_length=MAX_LEN,
    standardize=None  # 🔧 Fix: don't strip [START]/[END]
)
urdu_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    output_mode='int',
    output_sequence_length=MAX_LEN,
    standardize=None  # 🔧 Fix: don't strip [START]/[END]
)

eng_vectorizer.adapt(df['english'])
urdu_vectorizer.adapt(df['urdu'])

# ------------------ Translate Function Fix ------------------

def translate(model, src_vectorizer, tgt_vectorizer, text):
    text = preprocess(text, 'en' if src_vectorizer == eng_vectorizer else 'ur')
    print(f"\n🔹 Input: {text}")
    tokens = src_vectorizer([text])
    decoded = '[START]'
    generated_words = []

    for i in range(MAX_LEN):
        tgt_tokens = tgt_vectorizer([decoded])[:, :-1]
        pred = model.predict([tokens, tgt_tokens], verbose=0)
        if pred.shape[-1] == 0:
            break

        next_word_id = np.argmax(pred[0, -1])
        vocab = tgt_vectorizer.get_vocabulary()
        next_word = vocab[next_word_id] if next_word_id < len(vocab) else '[UNK]'

        print(f"🔸 Step {i+1} → {next_word}")

        if next_word == '[END]' and i < 3:
            continue
        if next_word == '[END]':
            break

        generated_words.append(next_word)
        decoded += ' ' + next_word

    final_output = ' '.join(generated_words).strip()
    if not generated_words:
        print("❌ No translation generated.")
        return "[NO TRANSLATION]"
    print(f"✅ Output: {final_output}")
    return final_output

# ------------------ Run & Evaluate ------------------

for i in range(5):
    eng_text = df['english'].iloc[i].replace('[START] ', '').replace(' [END]', '')
    urdu_text = df['urdu'].iloc[i].replace('[START] ', '').replace(' [END]', '')

    urdu_pred = translate(en2ur_model, eng_vectorizer, urdu_vectorizer, eng_text)
    eng_pred = translate(ur2en_model, urdu_vectorizer, eng_vectorizer, urdu_text)

    print(f"\nSample {i+1}")
    print(f"EN: {eng_text}")
    print(f"Predicted UR: {urdu_pred}")
    print(f"Actual UR: {urdu_text}")
    print(f"Predicted UR Tokens: {urdu_pred.split()}")
    print(f"Actual UR Tokens: {urdu_text.split()}")

    print(f"\nUR: {urdu_text}")
    print(f"Predicted EN: {eng_pred}")
    print(f"Actual EN: {eng_text}")
    print(f"Predicted EN Tokens: {eng_pred.split()}")
    print(f"Actual EN Tokens: {eng_text.split()}")

    # ------------------ BLEU Score ------------------
    bleu_en_ur = sentence_bleu(
        [urdu_text.split()],
        urdu_pred.split(),
        smoothing_function=SmoothingFunction().method1
    )
    bleu_ur_en = sentence_bleu(
        [eng_text.split()],
        eng_pred.split(),
        smoothing_function=SmoothingFunction().method1
    )

    print(f"\n🔵 BLEU Scores:")
    print(f"EN → UR: {bleu_en_ur:.4f}")
    print(f"UR → EN: {bleu_ur_en:.4f}")
    print("="*50)




🔹 Input: [START] all praise be to allah alone the sustainer of all the worlds [END]
🔸 Step 1 → 
🔸 Step 2 → 
🔸 Step 3 → 
🔸 Step 4 → 
🔸 Step 5 → 
🔸 Step 6 → 
🔸 Step 7 → 
🔸 Step 8 → 
🔸 Step 9 → 
🔸 Step 10 → 
🔸 Step 11 → 
🔸 Step 12 → 
🔸 Step 13 → 
🔸 Step 14 → 
🔸 Step 15 → 
🔸 Step 16 → 
🔸 Step 17 → 
🔸 Step 18 → 
🔸 Step 19 → 
🔸 Step 20 → 
🔸 Step 21 → 
🔸 Step 22 → 
🔸 Step 23 → 
🔸 Step 24 → 
🔸 Step 25 → 
🔸 Step 26 → 
🔸 Step 27 → 
🔸 Step 28 → 
🔸 Step 29 → 
🔸 Step 30 → 
🔸 Step 31 → 
🔸 Step 32 → 
🔸 Step 33 → 
🔸 Step 34 → 
🔸 Step 35 → 
🔸 Step 36 → 
🔸 Step 37 → 
🔸 Step 38 → 
🔸 Step 39 → 
🔸 Step 40 → 
🔸 Step 41 → 
🔸 Step 42 → 
🔸 Step 43 → 
🔸 Step 44 → 
🔸 Step 45 → 
🔸 Step 46 → 
🔸 Step 47 → 
🔸 Step 48 → 
🔸 Step 49 → 
🔸 Step 50 → 
✅ Output: 

🔹 Input: [START] سب تعریفیں اللہ ہی کے لئے ہیں جو تمام جہانوں کی پرورش فرمانے والا ہے ۔ [END]
🔸 Step 1 → 
🔸 Step 2 → 
🔸 Step 3 → 
🔸 Step 4 → 
🔸 Step 5 → 
🔸 Step 6 → 
🔸 Step 7 → 
🔸 Step 8 → 
🔸 Step 9 → 
🔸 Step 10 → 
🔸 Step 11 → 
🔸 Step 12 → 
🔸 Step 13 → 
🔸 Step 1

In [48]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

bleu_en_ur = sentence_bleu(
    [urdu_text.split()],
    urdu_pred.split(),
    smoothing_function=SmoothingFunction().method1
)
bleu_ur_en = sentence_bleu(
    [eng_text.split()],
    eng_pred.split(),
    smoothing_function=SmoothingFunction().method1
)

print(f"\nBLEU Scores:")
print(f"EN→UR: {bleu_en_ur:.4f}")
print(f"UR→EN: {bleu_ur_en:.4f}")



BLEU Scores:
EN→UR: 0.0000
UR→EN: 0.0000


In [45]:
bleu_en_ur = sentence_bleu(
        [urdu_text.split()],
        urdu_trans.split(),
        smoothing_function=SmoothingFunction().method1
    )
bleu_ur_en = sentence_bleu(
        [eng_text.split()],
        eng_trans.split(),
        smoothing_function=SmoothingFunction().method1
    )

print(f"\nBLEU Scores:")
print(f"EN->UR: {bleu_en_ur:.4f}")
print(f"UR->EN: {bleu_ur_en:.4f}")
print("="*50)



BLEU Scores:
EN->UR: 0.0000
UR->EN: 0.0000


In [54]:
# Freeze vocabulary
eng_vocab = eng_vectorizer.get_vocabulary()
urdu_vocab = urdu_vectorizer.get_vocabulary()

eng_vocab_size = len(eng_vocab)
urdu_vocab_size = len(urdu_vocab)


In [55]:
en2ur_model = build_model(eng_vocab_size, urdu_vocab_size)
ur2en_model = build_model(urdu_vocab_size, eng_vocab_size)


In [56]:
print("[START] in English vocab?", "[START]" in eng_vocab)
print("[END] in English vocab?", "[END]" in eng_vocab)
print("[START] in Urdu vocab?", "[START]" in urdu_vocab)
print("[END] in Urdu vocab?", "[END]" in urdu_vocab)


[START] in English vocab? True
[END] in English vocab? True
[START] in Urdu vocab? True
[END] in Urdu vocab? True


In [59]:
def translate(model, src_vectorizer, tgt_vectorizer, text, lang='en'):
    print(f"\n🔹 Preprocessing Input Text: {text}")
    text = preprocess(text, lang)
    print(f"🔸 Preprocessed Text: {text}")

    # Encode the source text (input to the encoder)
    encoder_input = src_vectorizer([text])
    print(f"🔸 Encoder Input: {encoder_input}")

    # Decode with initial '[START]' token
    decoded_sentence = '[START]'
    result = []

    for i in range(MAX_LEN):
        # Vectorize the partial output (i.e., previously generated words)
        decoder_input = tgt_vectorizer([decoded_sentence])[:, :-1]  # Exclude the last token
        print(f"🔸 Decoder Input for Step {i+1}: {decoder_input}")

        # Make a prediction
        preds = model.predict([encoder_input, decoder_input], verbose=0)
        print(f"🔸 Prediction Shape: {preds.shape}")

        # Get the most likely next token
        next_token_id = np.argmax(preds[0, -1])
        vocab = tgt_vectorizer.get_vocabulary()

        # Check for valid prediction
        if next_token_id >= len(vocab):
            print(f"⚠️ Invalid token ID: {next_token_id}")
            break

        next_word = vocab[next_token_id]
        print(f"🔸 Step {i+1}: Predicted Word: {next_word}")

        # If we hit the end token, stop
        if next_word == '[END]':
            break

        result.append(next_word)
        decoded_sentence += ' ' + next_word

    # If no output is generated, return a fallback message
    if not result:
        return "[NO TRANSLATION]"

    final_output = ' '.join(result).strip()
    print(f"✅ Final Output: {final_output}")
    return final_output

# Test the translate function with a simple example to verify everything works
simple_input = "show us the straight path"
print("\nTesting Translation with Simple Input:")
output = translate(en2ur_model, eng_vectorizer, urdu_vectorizer, simple_input, lang='en')
print(f"\nTranslated Output: {output}")



Testing Translation with Simple Input:

🔹 Preprocessing Input Text: show us the straight path
🔸 Preprocessed Text: [START] show us the straight path [END]
🔸 Encoder Input: [[  5 322  68   2 276 150   6   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
🔸 Decoder Input for Step 1: [[5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]]
🔸 Prediction Shape: (1, 49, 5000)
🔸 Step 1: Predicted Word: لائی
🔸 Decoder Input for Step 2: [[   5 3277    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0]]
🔸 Prediction Shape: (1, 49, 5000)
🔸 Step 2: Predicted Word: لائی
🔸 Decoder Input for Step 3: [[   5 3277 3277    0    0    0    0    0    0

In [60]:
from tensorflow.keras.layers import Attention

def build_model_with_attention(src_vocab_size, tgt_vocab_size):
    # Encoder
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(src_vocab_size, 256)(encoder_inputs)
    encoder_lstm = LSTM(512, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

    # Decoder with attention
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(tgt_vocab_size, 256)(decoder_inputs)
    decoder_lstm = LSTM(512, return_sequences=True)
    decoder_lstm_output = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

    # Attention Layer
    attention = Attention()([decoder_lstm_output, encoder_outputs])
    context_vector = attention  # Attention context vector

    # Final output layer
    decoder_outputs = Dense(tgt_vocab_size, activation='softmax')(context_vector)

    return Model([encoder_inputs, decoder_inputs], decoder_outputs)



In [62]:
pip install reportlab


Collecting reportlab
  Downloading reportlab-4.4.0-py3-none-any.whl.metadata (1.8 kB)
Downloading reportlab-4.4.0-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m1.1/2.0 MB[0m [31m34.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.0


In [64]:
!pip install fpdf
from fpdf import FPDF


Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=7685c175cb0854fcc982e7d215b9f4568c4a802543f61ddeb43dc51d4935805b
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [72]:
# -*- coding: utf-8 -*-
"""Save Translation Outputs to PDF.ipynb"""

from fpdf import FPDF
from google.colab import drive
import arabic_reshaper
from bidi.algorithm import get_display

# Mount Google Drive
drive.mount('/content/drive')

# Create PDF class to capture print outputs
class TranslationPDF(FPDF):
    def __init__(self):
        super().__init__()
        self.set_auto_page_break(auto=True, margin=15)
        self.add_page()
        self.set_font('Arial', '', 12)

    def capture_print(self, text):
        # Handle Arabic/Urdu text
        if any('\u0600' <= char <= '\u06FF' for char in text):
            reshaped = arabic_reshaper.reshape(text)
            bidi_text = get_display(reshaped)
            self.multi_cell(0, 10, bidi_text, align='R')
        else:
            self.multi_cell(0, 10, text)
        self.ln(5)

# Create PDF and capture outputs
pdf = TranslationPDF()

for i in range(5):
    eng_text = df['english'].iloc[i].replace('[START] ', '').replace(' [END]', '')
    urdu_pred = translate(en2ur_model, eng_vectorizer, urdu_vectorizer, eng_text)
    urdu_text = df['urdu'].iloc[i].replace('[START] ', '').replace(' [END]', '')
    eng_pred = translate(ur2en_model, urdu_vectorizer, eng_vectorizer, urdu_text)

    # Capture all print statements
    pdf.capture_print(f"\nSample {i+1}")
    pdf.capture_print(f"EN: {eng_text}")
    pdf.capture_print(f"Predicted UR: {urdu_pred}")
    pdf.capture_print(f"Actual UR: {urdu_text}")
    pdf.capture_print(f"\nUR: {urdu_text}")
    pdf.capture_print(f"Predicted EN: {eng_pred}")
    pdf.capture_print(f"Actual EN: {eng_text}")
    pdf.capture_print("-" * 50)

# Save to Google Drive
output_path = '/content/drive/MyDrive/translation_outputs.pdf'
pdf.output(output_path)

print(f"\nAll translation outputs saved to: {output_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

🔹 Preprocessing Input Text: all praise be to allah alone the sustainer of all the worlds
🔸 Preprocessed Text: [START] all praise be to allah alone the sustainer of all the worlds [END]
🔸 Encoder Input: [[  5  34 405  26   8  12  85   2 931   4  34   2 377   6   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
🔸 Decoder Input for Step 1: [[5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]]
🔸 Prediction Shape: (1, 49, 5000)
🔸 Step 1: Predicted Word: لائی
🔸 Decoder Input for Step 2: [[   5 3277    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0   

UnicodeEncodeError: 'latin-1' codec can't encode characters in position 193-197: ordinal not in range(256)