# Urdu -> English -> French

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import pickle

In [None]:
# At the beginning of your notebook (after imports)
def load_all_models():
    """Load every saved artifact used by the Urdu -> English -> French pipeline.

    Reads the Urdu->English encoder/decoder models, the English->French model
    and the four pickled tokenizers from disk.

    Returns:
        dict: with keys 'urdu_encoder', 'urdu_decoder', 'urdu_tokenizer',
        'english_tokenizer', 'eng_fr_model', 'en_tokenizer', 'fr_tokenizer'.

    Raises:
        Whatever ``load_model``, ``open`` or ``pickle.load`` raise when a file
        is missing or unreadable.
    """
    # `load_model` is not imported by the notebook's top import cell; import it
    # here so the function does not raise NameError on a fresh kernel.
    from tensorflow.keras.models import load_model

    def _read_pickle(path):
        # SECURITY: pickle.load can execute arbitrary code; only open tokenizer
        # files that were written by this notebook.
        with open(path, 'rb') as f:
            return pickle.load(f)

    # Urdu-English models
    encoder_model = load_model('encoder_model.keras')
    decoder_model = load_model('decoder_model.keras')
    tokenizer_urdu = _read_pickle('tokenizer_urdu.pkl')
    tokenizer_english = _read_pickle('tokenizer_english.pkl')

    # English-French models
    eng_fr_model = load_model('/content/eng_fr_transformer.keras')
    tokenizer_en = _read_pickle('tokenizer_en.pkl')
    tokenizer_fr = _read_pickle('tokenizer_fr.pkl')

    return {
        'urdu_encoder': encoder_model,
        'urdu_decoder': decoder_model,
        'urdu_tokenizer': tokenizer_urdu,
        'english_tokenizer': tokenizer_english,
        'eng_fr_model': eng_fr_model,
        'en_tokenizer': tokenizer_en,
        'fr_tokenizer': tokenizer_fr
    }


In [None]:
# Load the Urdu/English parallel corpus from an Excel workbook (update path)
df = pd.read_excel('/content/english_urdu_dataset (1).xlsx', engine='openpyxl')
# NOTE(review): astype(str) turns missing cells into the literal string 'nan';
# consider dropping incomplete rows first - confirm against the dataset.
urdu_texts = df['Urdu'].astype(str).tolist()
english_texts = df['English'].astype(str).tolist()

# urdu_texts = urdu_texts[:30000]
# english_texts = english_texts[:30000]

# Preprocessing
MAX_LENGTH = 20  # Max whitespace-separated tokens per sentence; longer pairs are dropped
urdu_texts = [text.lower() for text in urdu_texts]
english_texts = [text.lower() for text in english_texts]
# Keep only the pairs where BOTH sides have at most MAX_LENGTH tokens
filtered_pairs = [(u, e) for u, e in zip(urdu_texts, english_texts)
                 if len(u.split()) <= MAX_LENGTH and len(e.split()) <= MAX_LENGTH]
# Unzip back into two parallel tuples (raises ValueError if no pair survived the filter)
urdu_texts, english_texts = zip(*filtered_pairs)

In [None]:
# Urdu tokenizer
# filters='' disables punctuation stripping, so punctuation stays attached to
# the words; words not seen during fitting are mapped to '<OOV>'.
tokenizer_urdu = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_urdu.fit_on_texts(urdu_texts)
urdu_vocab_size = len(tokenizer_urdu.word_index) + 1  # +1: word indices start at 1, 0 is used for padding

# English tokenizer (same settings as the Urdu one)
tokenizer_english = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_english.fit_on_texts(english_texts)
english_vocab_size = len(tokenizer_english.word_index) + 1

# Save tokenizers so inference code can reuse the exact same vocabularies
with open('tokenizer_urdu.pkl', 'wb') as f:
    pickle.dump(tokenizer_urdu, f)
with open('tokenizer_english.pkl', 'wb') as f:
    pickle.dump(tokenizer_english, f)

In [None]:
# Convert each sentence to a list of integer word ids
urdu_sequences = tokenizer_urdu.texts_to_sequences(urdu_texts)
english_sequences = tokenizer_english.texts_to_sequences(english_texts)

# Padding: zeros are appended (padding='post') up to the longest sequence
urdu_padded = pad_sequences(urdu_sequences, padding='post')
english_padded = pad_sequences(english_sequences, padding='post')

# For teacher forcing: the decoder sees columns 0..T-2 and must predict
# columns 1..T-1 (the same sequence shifted left by one position).
# NOTE(review): column 0 is never a target, and no start marker is prepended
# to the English text, so the first English word is never predicted.
decoder_input_data = english_padded[:, :-1]
decoder_target_data = english_padded[:, 1:]

In [None]:
# Hyperparameters
EMBEDDING_DIM = 512
LSTM_UNITS = 256

# Encoder: embedding -> bidirectional LSTM; only the final states are kept
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(urdu_vocab_size, EMBEDDING_DIM)(encoder_inputs)
encoder_lstm = Bidirectional(LSTM(LSTM_UNITS, return_state=True))
# The sequence output is discarded; forward/backward h and c states are used
_, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

# Decoder: initialised with the concatenated encoder states
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(english_vocab_size, EMBEDDING_DIM)(decoder_inputs)
decoder_lstm = LSTM(LSTM_UNITS*2, return_sequences=True, return_state=True)  # *2 to match the concatenated forward+backward encoder states
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Per-timestep softmax over the English vocabulary
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile: integer targets, hence the sparse categorical cross-entropy
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(0.001), loss='sparse_categorical_crossentropy')
model.summary()

In [None]:
# Callbacks
# Both callbacks monitor the TRAINING loss: fit() below has no validation data.
checkpoint = ModelCheckpoint(
    'seq2seq_best.keras',
    monitor='loss',
    save_best_only=True,
    mode='min'
)
# Multiply the learning rate by 0.2 after 3 epochs without loss improvement
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3)

# Train with teacher forcing: inputs are (Urdu ids, shifted English ids)
history = model.fit(
    [urdu_padded, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),  # trailing axis added to the integer targets
    batch_size=32,
    epochs=30,
    callbacks=[checkpoint, reduce_lr]
)

Epoch 1/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 58ms/step - loss: 2.1462 - learning_rate: 0.0010
Epoch 2/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 62ms/step - loss: 1.2697 - learning_rate: 0.0010
Epoch 3/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 61ms/step - loss: 0.8699 - learning_rate: 0.0010
Epoch 4/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 62ms/step - loss: 0.5784 - learning_rate: 0.0010
Epoch 5/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 62ms/step - loss: 0.3705 - learning_rate: 0.0010
Epoch 6/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 87ms/step - loss: 0.2388 - learning_rate: 0.0010
Epoch 7/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 62ms/step - loss: 0.1552 - learning_rate: 0.0010
Epoch 8/30
[1m1585/1585[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 62ms/step -

In [None]:
# Encoder inference: maps an Urdu id sequence to the [h, c] decoder initial states
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder inference: the previous step's states are fed in as explicit inputs
decoder_state_input_h = Input(shape=(LSTM_UNITS*2,))
decoder_state_input_c = Input(shape=(LSTM_UNITS*2,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuses the trained decoder_lstm / decoder_dense layers (shared weights).
# NOTE(review): this rebinds decoder_outputs, state_h and state_c, which were
# created by the model-building cell; re-run that cell before re-running this one.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Inputs: [token ids, h, c]; outputs: [word probabilities, new h, new c]
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

# Save both inference models for later use
encoder_model.save('encoder_model.keras')
decoder_model.save('decoder_model.keras')

In [None]:
def translate_urdu_to_english(input_urdu):
    """Greedily translate one Urdu sentence to English.

    Uses the notebook globals ``tokenizer_urdu``, ``tokenizer_english``,
    ``encoder_model``, ``decoder_model``, ``urdu_padded`` and
    ``english_padded``.

    Args:
        input_urdu: Urdu sentence as a plain string.

    Returns:
        str: decoded English words joined by single spaces; '' when the
        decoder stops on its first step.
    """
    # Same preprocessing as training: lower-case, map to ids, post-pad to the
    # width of the training matrix.
    input_seq = tokenizer_urdu.texts_to_sequences([input_urdu.lower()])
    input_seq = pad_sequences(input_seq, maxlen=urdu_padded.shape[1], padding='post')

    # Encode input into the decoder's initial [h, c] states
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Seed the decoder with the id of the first vocabulary entry. Passing a
    # default to next() makes an empty vocabulary fall back to id 1 instead of
    # raising StopIteration.
    word_index = tokenizer_english.word_index
    first_word = next(iter(word_index), None)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_index.get(first_word, 1)

    # Decode step-by-step, always taking the most probable word
    decoded_sentence = []
    for _ in range(english_padded.shape[1]):  # Max output length
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer_english.index_word.get(sampled_token_index, '')

        # Id 0 is padding and has no vocabulary entry: treat it as end of sentence
        if not sampled_word or sampled_token_index == 0:
            break

        decoded_sentence.append(sampled_word)

        # Feed the chosen word and the updated states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)

# Smoke test on a sample Urdu sentence; the translation is printed below
print(translate_urdu_to_english("تم کیا کھا رہے ہو"))

are you eaten


In [None]:
!pip install transformers torch sentencepiece

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Mount Google Drive under /content/drive (Colab only) so the datasets and
# saved models referenced by later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load your English-French dataset (update path)
# NOTE: this rebinds `df` and `english_texts` from the Urdu-English section.
df = pd.read_csv("/content/eng_-french.csv")  # Replace with your file
english_texts = df["English words/sentences"].astype(str).tolist()
french_texts = df["French words/sentences"].astype(str).tolist()

# Show the first pair as a sanity check
print(f"Sample English: {english_texts[0]}")
print(f"Sample French: {french_texts[0]}")

Sample English: Hi.
Sample French: Salut!


In [None]:
import re

def clean_text(text):
    """Lower-case `text` and keep only a-z, French accented letters and whitespace.

    Every other character (digits, punctuation, apostrophes, hyphens) is
    deleted without inserting a space; leading/trailing whitespace is trimmed.
    """
    disallowed = re.compile(r"[^a-zéèêëàâùûüçôîïÿœæ\s]")  # Keep French accents
    return disallowed.sub("", text.lower()).strip()

# Normalise both sides with clean_text (lower-case, letters and whitespace only)
english_texts = [clean_text(text) for text in english_texts]
french_texts = [clean_text(text) for text in french_texts]

# Add <start> and <end> tokens to French (for decoder input)
french_texts = ["<start> " + text + " <end>" for text in french_texts]

In [None]:
# Trim dataset to top frequent words (reduce vocab size)
MAX_VOCAB_SIZE = 10000  # Reduced from ~30k to 10k
MAX_LENGTH = 20  # Max whitespace-separated tokens per sentence (French count includes <start>/<end>)
SAMPLE_SIZE = 50000  # Adjust based on your dataset size

# Filter long sentences: keep a pair only if BOTH sides fit in MAX_LENGTH tokens
filtered_pairs = [(en, fr) for en, fr in zip(english_texts, french_texts)
                 if len(en.split()) <= MAX_LENGTH and len(fr.split()) <= MAX_LENGTH]

# Take a subset if needed (the FIRST SAMPLE_SIZE pairs, not a random sample)
if len(filtered_pairs) > SAMPLE_SIZE:
    filtered_pairs = filtered_pairs[:SAMPLE_SIZE]

# Unzip back into two parallel tuples
english_texts, french_texts = zip(*filtered_pairs)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# English tokenizer with reduced vocab
# filters='' keeps tokens such as '<start>' intact; unseen words map to '<OOV>'
tokenizer_en = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    filters='',
    oov_token='<OOV>'
)
tokenizer_en.fit_on_texts(english_texts)

# French tokenizer with reduced vocab
tokenizer_fr = Tokenizer(
    num_words=MAX_VOCAB_SIZE,
    filters='',
    oov_token='<OOV>'
)
tokenizer_fr.fit_on_texts(french_texts)

# Update vocab sizes: capped at MAX_VOCAB_SIZE, +1 for the padding index 0
en_vocab_size = min(MAX_VOCAB_SIZE, len(tokenizer_en.word_index)) + 1
fr_vocab_size = min(MAX_VOCAB_SIZE, len(tokenizer_fr.word_index)) + 1

print(f"Trimmed English vocab: {en_vocab_size}")
print(f"Trimmed French vocab: {fr_vocab_size}")

Trimmed English vocab: 5952
Trimmed French vocab: 10001


In [None]:
# Convert texts to integer id sequences
en_sequences = tokenizer_en.texts_to_sequences(english_texts)
fr_sequences = tokenizer_fr.texts_to_sequences(french_texts)

# Pad every sequence with trailing zeros to exactly MAX_LENGTH ids
en_padded = pad_sequences(en_sequences, maxlen=MAX_LENGTH, padding='post')
fr_padded = pad_sequences(fr_sequences, maxlen=MAX_LENGTH, padding='post')

print(f"English padded shape: {en_padded.shape}")
print(f"French padded shape: {fr_padded.shape}")

English padded shape: (50000, 20)
French padded shape: (50000, 20)


In [None]:
# Verify shapes
print(f"English padded shape: {en_padded.shape}")
print(f"French padded shape: {fr_padded.shape}")

# Create decoder input (remove last token) and target (remove first token)
# so that the target at each position is the input shifted left by one.
decoder_input_data = fr_padded[:, :-1]  # shape: (samples, max_length-1)
decoder_target_data = fr_padded[:, 1:]   # shape: (samples, max_length-1)

# Pad decoder sequences to match max_length (one trailing zero is appended)
decoder_input_data = pad_sequences(decoder_input_data, maxlen=MAX_LENGTH, padding='post')
decoder_target_data = pad_sequences(decoder_target_data, maxlen=MAX_LENGTH, padding='post')

print(f"\nAfter padding:")
print(f"Decoder input shape: {decoder_input_data.shape}")
print(f"Decoder target shape: {decoder_target_data.shape}")

English padded shape: (50000, 20)
French padded shape: (50000, 20)

After padding:
Decoder input shape: (50000, 20)
Decoder target shape: (50000, 20)


In [None]:
import pickle
import tensorflow as tf

def save_models():
    """Write the global `model` and the English/French tokenizers to the working directory."""
    # Save English→French Transformer
    model.save('eng_fr_transformer.keras')

    # Save tokenizers, one pickle file each
    tokenizer_files = (
        ('tokenizer_en.pkl', tokenizer_en),
        ('tokenizer_fr.pkl', tokenizer_fr),
    )
    for file_name, tokenizer in tokenizer_files:
        with open(file_name, 'wb') as out_file:
            pickle.dump(tokenizer, out_file)

def load_models():
    """Load the English→French model and both tokenizers from disk.

    Returns:
        tuple: (model, English tokenizer, French tokenizer).
    """
    def _unpickle(file_name):
        # Only unpickle files written by this notebook (pickle can run code)
        with open(file_name, 'rb') as in_file:
            return pickle.load(in_file)

    # Load English→French Transformer
    restored_model = tf.keras.models.load_model('/content/eng_fr_transformer.keras')

    # Load tokenizers
    restored_en = _unpickle('tokenizer_en.pkl')
    restored_fr = _unpickle('tokenizer_fr.pkl')

    return restored_model, restored_en, restored_fr

In [None]:
# Print the array shapes next to the model's input/output shapes so that a
# mismatch is visible before training starts.
print("Shapes verification:")
print(f"English padded: {en_padded.shape} (should be [samples, sequence_length])")
print(f"French decoder input: {decoder_input_data.shape} (should match English)")
print(f"French decoder target: {decoder_target_data.shape}")
print(f"Model input shapes: {[i.shape for i in model.inputs]}")
print(f"Model output shape: {model.output.shape}")

Shapes verification:
English padded: (50000, 20) (should be [samples, sequence_length])
French decoder input: (50000, 20) (should match English)
French decoder target: (50000, 20)
Model input shapes: [(None, None), (None, None)]
Model output shape: (None, None, 17735)


In [None]:
# Train with explicit validation split
# NOTE(review): no cell in this notebook builds an English->French `model`;
# the only visible definition of `model` is the Urdu->English network, so this
# cell appears to depend on kernel state from a missing cell - confirm.
history = model.fit(
    x=[en_padded, decoder_input_data],
    y=np.expand_dims(decoder_target_data, -1),  # Add extra dimension for sparse_categorical_crossentropy
    batch_size=16,
    epochs=25,
    validation_split=0.2,  # 20% of the samples are held out for validation
    callbacks=[
        # Keep only the weights with the lowest validation loss
        tf.keras.callbacks.ModelCheckpoint(
            'best_eng_fr.keras',
            save_best_only=True,
            monitor='val_loss'
        ),
    ]
)

Epoch 1/25
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 48ms/step - loss: 0.0609 - val_loss: 1.1271
Epoch 2/25
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 46ms/step - loss: 0.0589 - val_loss: 1.1214
Epoch 3/25
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 44ms/step - loss: 0.0583 - val_loss: 1.1286
Epoch 4/25
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 44ms/step - loss: 0.0580 - val_loss: 1.1392
Epoch 5/25
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 42ms/step - loss: 0.0573 - val_loss: 1.1388
Epoch 6/25
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 42ms/step - loss: 0.0571 - val_loss: 1.1377
Epoch 7/25
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 44ms/step - loss: 0.0567 - val_loss: 1.1513
Epoch 8/25
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 44ms/step - loss: 0.0566 - val_loss: 1.1414


In [None]:
# Persist the trained English->French model and its two tokenizers
save_models()

In [None]:
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load Urdu to English model components
def load_urdu_english_models():
    """Load the Urdu->English encoder, decoder and tokenizers from Google Drive.

    Returns:
        tuple: (encoder_model, decoder_model, tokenizer_urdu, tokenizer_english),
        or four ``None`` values when anything fails to load (the error is printed).
    """
    def _unpickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    try:
        # Models first, then tokenizers
        enc = load_model('/content/drive/MyDrive/DLP Models/encoder_model (1).keras')
        dec = load_model('/content/drive/MyDrive/DLP Models/decoder_model (1).keras')
        urdu_tok = _unpickle('/content/drive/MyDrive/DLP Models/tokenizer_urdu (1).pkl')
        english_tok = _unpickle('/content/drive/MyDrive/DLP Models/tokenizer_english (1).pkl')
    except Exception as e:
        # Best effort: report the problem and let the caller test for None
        print(f"Error loading Urdu-English models: {e}")
        return None, None, None, None

    return enc, dec, urdu_tok, english_tok

# Load English to French model components
def load_english_french_models():
    """Load the English->French model and its tokenizers from Google Drive.

    Returns:
        tuple: (model, tokenizer_en, tokenizer_fr), or three ``None`` values
        when anything fails to load (the error is printed).
    """
    def _unpickle(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    try:
        # Model first, then tokenizers
        translator = load_model('/content/drive/MyDrive/DLP Models/eng_fr_transformer.keras')
        english_tok = _unpickle('/content/drive/MyDrive/DLP Models/tokenizer_en.pkl')
        french_tok = _unpickle('/content/drive/MyDrive/DLP Models/tokenizer_fr.pkl')
    except Exception as e:
        # Best effort: report the problem and let the caller test for None
        print(f"Error loading English-French models: {e}")
        return None, None, None

    return translator, english_tok, french_tok

# Urdu to English translation function
def translate_urdu_to_english(input_urdu, encoder_model, decoder_model, tokenizer_urdu, tokenizer_english, max_length=20):
    """Translate Urdu text to English using the loaded models.

    Args:
        input_urdu: Urdu sentence as a plain string.
        encoder_model: inference encoder returning the decoder's [h, c] states.
        decoder_model: inference decoder taking [token ids, h, c] and
            returning [word probabilities, h, c].
        tokenizer_urdu: fitted tokenizer for the Urdu side.
        tokenizer_english: fitted tokenizer for the English side.
        max_length: encoder padding length and maximum number of decoded words.

    Returns:
        str: decoded English words joined by single spaces.
    """
    # Preprocess input the same way as the training data
    input_seq = tokenizer_urdu.texts_to_sequences([input_urdu.lower()])
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding='post')

    # Encode input
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Sequence markers are looked up as '<start>' / '<end>'. The bare words
    # 'start' and 'end' are ordinary English vocabulary: using them seeded the
    # decoder with the word "start" and cut translations at the word "end".
    # When the vocabulary has no markers, fall back to id 1 as the seed and to
    # the padding id 0 as the stop signal.
    word_index = tokenizer_english.word_index
    start_id = word_index.get('<start>', 1)
    end_id = word_index.get('<end>', 0)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_id

    # Decode step-by-step, always taking the most probable word
    decoded_sentence = []
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tokenizer_english.index_word.get(sampled_token_index, '')

        # Stop on padding / unknown id (no vocabulary entry) or the end marker
        if not sampled_word or sampled_token_index == end_id:
            break

        decoded_sentence.append(sampled_word)

        # Feed the chosen word and the updated states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)

# English to French translation function
def translate_english_to_french(english_text, model, tokenizer_en, tokenizer_fr, max_length=20):
    """Translate English text to French using the loaded model.

    Args:
        english_text: English sentence as a plain string.
        model: model taking [encoder ids, decoder ids] and returning
            per-position word probabilities.
        tokenizer_en: fitted tokenizer for the English side.
        tokenizer_fr: fitted tokenizer for the French side; its vocabulary
            must contain '<start>' and '<end>'.
        max_length: encoder padding length and maximum number of decoded words.

    Returns:
        str: decoded French words joined by single spaces.

    Raises:
        KeyError: if '<start>' or '<end>' is missing from the French vocabulary.
    """
    # Preprocess input
    # NOTE(review): the training cells clean the text with clean_text(), while
    # only lower-casing is applied here - confirm that inputs are already clean.
    input_seq = tokenizer_en.texts_to_sequences([english_text.lower()])
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding='post')

    start_id = tokenizer_fr.word_index['<start>']
    end_id = tokenizer_fr.word_index['<end>']

    # Initialize decoder with <start>
    decoder_input = tf.expand_dims([start_id], 0)

    # Generate translation: re-run the model on the growing prefix and read
    # the prediction for the last position.
    output = []
    for _ in range(max_length):
        predictions = model.predict([input_seq, decoder_input], verbose=0)
        # Plain int keeps decoder_input at a single integer dtype across steps
        predicted_id = int(tf.argmax(predictions[0, -1, :]).numpy())

        if predicted_id == end_id:
            break

        word = tokenizer_fr.index_word.get(predicted_id)
        if word is None:
            # Id 0 is padding and has no vocabulary entry; indexing
            # index_word directly raised KeyError here.
            break

        output.append(word)
        decoder_input = tf.concat([decoder_input, tf.expand_dims([predicted_id], 0)], axis=-1)

    return ' '.join(output)

# Full pipeline function
def full_pipeline(urdu_text):
    """Translate Urdu to French via English.

    Args:
        urdu_text: Urdu sentence as a plain string.

    Returns:
        dict: {'urdu', 'english', 'french'} on success, or {'error': message}
        when one of the two model bundles could not be loaded.
    """
    # Successfully loaded bundles are kept in a module-level cache so repeated
    # calls do not reload the models from Drive every time.
    cache = globals().setdefault('_UR_FR_PIPELINE_CACHE', {})
    urdu_english = cache.get('urdu_english') or load_urdu_english_models()
    english_french = cache.get('english_french') or load_english_french_models()

    # The loaders signal failure with None placeholders; test by identity
    if any(part is None for part in urdu_english):
        return {"error": "Urdu-English models failed to load"}
    cache['urdu_english'] = urdu_english
    if any(part is None for part in english_french):
        return {"error": "English-French models failed to load"}
    cache['english_french'] = english_french

    encoder_model, decoder_model, tokenizer_urdu, tokenizer_english = urdu_english
    eng_fr_model, tokenizer_en, tokenizer_fr = english_french

    # Step 1: Urdu to English
    english_text = translate_urdu_to_english(
        urdu_text,
        encoder_model,
        decoder_model,
        tokenizer_urdu,
        tokenizer_english
    )

    # Step 2: English to French
    french_text = translate_english_to_french(
        english_text,
        eng_fr_model,
        tokenizer_en,
        tokenizer_fr
    )

    return {
        'urdu': urdu_text,
        'english': english_text,
        'french': french_text
    }

# Example usage
if __name__ == "__main__":
    # Test the pipeline
    result = full_pipeline("تم کیا کھا رہے ہو")  # "What are you eating?"

    if 'error' in result:
        # full_pipeline returns {'error': ...} when loading fails; formatting
        # the translation keys would raise KeyError in that case.
        print(result['error'])
    else:
        print(f"""
    Urdu: {result['urdu']}
    English: {result['english']}
    French: {result['french']}
    """)


    Urdu: تم کیا کھا رہے ہو
    English: eating
    French: vous êtes en train de manger
    


# French -> English -> Urdu

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, MultiHeadAttention, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam
import pickle
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import re

In [None]:
# Load the English/French CSV from Drive; here French is the source language
# and English the target.
df = pd.read_csv("/content/drive/MyDrive/DLP Models/eng_-french.csv")  # Replace with your dataset
french_texts = df["French words/sentences"].astype(str).tolist()
english_texts = df["English words/sentences"].astype(str).tolist()

# Clean and preprocess
def clean_text(text):
    """Lower-case `text` and delete everything except a-z, French accented letters and whitespace.

    Removed characters are not replaced by a space; the result is stripped of
    leading and trailing whitespace.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-zéèêëàâùûüçôîïÿœæ\s]", "", lowered)
    return letters_only.strip()

# Clean both sides; the English (target) side is wrapped in <start>/<end> markers
french_texts = [clean_text(text) for text in french_texts]
english_texts = ["<start> " + clean_text(text) + " <end>" for text in english_texts]

# Filter by length (English token count includes the two markers)
MAX_LENGTH = 20
filtered_pairs = [(fr, en) for fr, en in zip(french_texts, english_texts)
                 if len(fr.split()) <= MAX_LENGTH and len(en.split()) <= MAX_LENGTH]
# Keep the first 50000 surviving pairs and unzip them into parallel tuples
french_texts, english_texts = zip(*filtered_pairs[:50000])

In [None]:
# French tokenizer (filters='' leaves tokens untouched; unseen words -> '<OOV>')
tokenizer_fr = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_fr.fit_on_texts(french_texts)
fr_vocab_size = len(tokenizer_fr.word_index) + 1  # +1 for the padding index 0

# English tokenizer; filters='' keeps '<start>' and '<end>' as single tokens
tokenizer_en = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_en.fit_on_texts(english_texts)
en_vocab_size = len(tokenizer_en.word_index) + 1

# Save tokenizers under *_reverse names, separate from the forward pipeline's files
with open('tokenizer_fr_reverse.pkl', 'wb') as f:
    pickle.dump(tokenizer_fr, f)
with open('tokenizer_en_reverse.pkl', 'wb') as f:
    pickle.dump(tokenizer_en, f)

In [None]:
# Hyperparameters
EMBED_DIM = 256
NUM_HEADS = 8
FF_DIM = 512
DROPOUT_RATE = 0.1

# Encoder: token embedding followed by 4 self-attention + feed-forward blocks
# NOTE(review): no positional encoding is added to either embedding, so the
# attention layers receive no word-order information - confirm this is intended.
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(fr_vocab_size, EMBED_DIM)(encoder_inputs)
enc_emb = Dropout(DROPOUT_RATE)(enc_emb)

# Transformer Encoder Layers
for _ in range(4):
    attn_output = MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)(enc_emb, enc_emb)
    attn_output = Dropout(DROPOUT_RATE)(attn_output)
    enc_emb = LayerNormalization(epsilon=1e-6)(enc_emb + attn_output)  # residual + norm

    ffn_output = Dense(FF_DIM, activation='relu')(enc_emb)
    ffn_output = Dense(EMBED_DIM)(ffn_output)
    ffn_output = Dropout(DROPOUT_RATE)(ffn_output)
    enc_emb = LayerNormalization(epsilon=1e-6)(enc_emb + ffn_output)  # residual + norm

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(en_vocab_size, EMBED_DIM)(decoder_inputs)
dec_emb = Dropout(DROPOUT_RATE)(dec_emb)

# Transformer Decoder Layers
for _ in range(4):
    # Self-attention must be causal: without the mask every position can see
    # the later target tokens during teacher-forced training, which does not
    # match token-by-token decoding at inference time.
    attn1 = MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)(
        dec_emb, dec_emb, use_causal_mask=True)
    attn1 = Dropout(DROPOUT_RATE)(attn1)
    dec_emb = LayerNormalization(epsilon=1e-6)(dec_emb + attn1)

    # Cross-attention: decoder positions query the final encoder representation
    attn2 = MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)(dec_emb, enc_emb)
    attn2 = Dropout(DROPOUT_RATE)(attn2)
    dec_emb = LayerNormalization(epsilon=1e-6)(dec_emb + attn2)

    ffn_output = Dense(FF_DIM, activation='relu')(dec_emb)
    ffn_output = Dense(EMBED_DIM)(ffn_output)
    ffn_output = Dropout(DROPOUT_RATE)(ffn_output)
    dec_emb = LayerNormalization(epsilon=1e-6)(dec_emb + ffn_output)

# Final output: per-position softmax over the English vocabulary
decoder_outputs = Dense(en_vocab_size, activation='softmax')(dec_emb)

# Create and compile model (integer targets -> sparse categorical cross-entropy)
fr_en_transformer = Model([encoder_inputs, decoder_inputs], decoder_outputs)
fr_en_transformer.compile(optimizer=Adam(0.0001), loss='sparse_categorical_crossentropy')
fr_en_transformer.summary()

In [None]:
# Prepare sequences: map every sentence to integer word ids
fr_sequences = tokenizer_fr.texts_to_sequences(french_texts)
en_sequences = tokenizer_en.texts_to_sequences(english_texts)

# Padding: trailing zeros up to the longest sequence on each side
fr_padded = pad_sequences(fr_sequences, padding='post')
en_padded = pad_sequences(en_sequences, padding='post')

# Create decoder inputs and targets (target = input shifted left by one)
decoder_input_data = en_padded[:, :-1]
decoder_target_data = en_padded[:, 1:]

# Callbacks: keep the best weights, lower the LR on plateaus, stop early
checkpoint = ModelCheckpoint('fr_en_transformer_best.keras',
                           monitor='val_loss',
                           save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3)
early_stop = EarlyStopping(monitor='val_loss', patience=5)

# Train
history = fr_en_transformer.fit(
    [fr_padded, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),
    batch_size=64,
    epochs=50,
    validation_split=0.2,
    callbacks=[checkpoint, reduce_lr, early_stop]
)

# Save final model
# NOTE: if fit() above is interrupted, this line never runs and
# 'fr_en_transformer.keras' is not written; later load_model calls then fail.
fr_en_transformer.save('fr_en_transformer.keras')

Epoch 1/50
[1m113/625[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m1:32[0m 180ms/step - loss: 6.1191

KeyboardInterrupt: 

In [None]:
# Load Urdu-English dataset (English is the source language in this direction)
df = pd.read_excel('/content/drive/MyDrive/DLP Models/english_urdu_dataset (1).xlsx')
# NOTE(review): astype(str) turns missing cells into the literal string 'nan' - confirm
english_texts = df['English'].astype(str).tolist()
urdu_texts = df['Urdu'].astype(str).tolist()

# Preprocessing: lower-case, then drop pairs where either side exceeds MAX_LENGTH tokens
# NOTE(review): no start/end markers are added to the Urdu (target) side here.
english_texts = [text.lower() for text in english_texts]
urdu_texts = [text.lower() for text in urdu_texts]
filtered_pairs = [(en, ur) for en, ur in zip(english_texts, urdu_texts)
                 if len(en.split()) <= MAX_LENGTH and len(ur.split()) <= MAX_LENGTH]
english_texts, urdu_texts = zip(*filtered_pairs)

# Tokenizers (the "2" suffix keeps these apart from the French->English tokenizer_en)
tokenizer_en2 = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_en2.fit_on_texts(english_texts)
en2_vocab_size = len(tokenizer_en2.word_index) + 1  # +1 for the padding index 0

tokenizer_ur = Tokenizer(filters='', oov_token='<OOV>')
tokenizer_ur.fit_on_texts(urdu_texts)
ur_vocab_size = len(tokenizer_ur.word_index) + 1

# Save tokenizers
with open('tokenizer_en2_reverse.pkl', 'wb') as f:
    pickle.dump(tokenizer_en2, f)
with open('tokenizer_ur_reverse.pkl', 'wb') as f:
    pickle.dump(tokenizer_ur, f)

In [None]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Dropout, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Hyperparameters
EMBEDDING_DIM = 256
LSTM_UNITS = 256
DROPOUT_RATE = 0.2

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(en2_vocab_size, EMBEDDING_DIM)(encoder_inputs)
encoder_embedding = Dropout(DROPOUT_RATE)(encoder_embedding)

# Return sequences for attention
encoder_lstm = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, return_state=True, dropout=DROPOUT_RATE))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)

# Concatenate forward/backward states so they match the decoder width (LSTM_UNITS*2)
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

# The bidirectional output already has LSTM_UNITS*2 features per timestep, so
# the former Lambda reshape to [batch, time, LSTM_UNITS*2] changed nothing.
# It is dropped because Lambda layers have (de)serialization limitations that
# make the saved .keras models harder to reload. The name is kept because
# later cells reference it.
encoder_outputs_reshaped = encoder_outputs

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(ur_vocab_size, EMBEDDING_DIM)(decoder_inputs)
decoder_embedding = Dropout(DROPOUT_RATE)(decoder_embedding)

decoder_lstm = LSTM(LSTM_UNITS*2, return_sequences=True, return_state=True, dropout=DROPOUT_RATE)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Attention layer: decoder outputs are the queries, encoder outputs the values
attention = tf.keras.layers.Attention()([decoder_outputs, encoder_outputs_reshaped])
decoder_concat = Concatenate()([decoder_outputs, attention])

# Per-timestep softmax over the Urdu vocabulary
decoder_dense = Dense(ur_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat)

en_ur_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
en_ur_model.compile(optimizer=Adam(0.001), loss='sparse_categorical_crossentropy')
en_ur_model.summary()

In [None]:
# Prepare sequences: map every sentence to integer word ids
en_sequences = tokenizer_en2.texts_to_sequences(english_texts)
ur_sequences = tokenizer_ur.texts_to_sequences(urdu_texts)

# Padding: both sides are padded to the longest sequence found on either side
max_length = max(
    max(len(seq) for seq in en_sequences),
    max(len(seq) for seq in ur_sequences)
)
en_padded = pad_sequences(en_sequences, maxlen=max_length, padding='post')
ur_padded = pad_sequences(ur_sequences, maxlen=max_length, padding='post')

# Create decoder inputs and targets (target = input shifted left by one)
decoder_input_data = ur_padded[:, :-1]
decoder_target_data = ur_padded[:, 1:]

# Callbacks: keep the best weights, lower the LR on plateaus, stop early
checkpoint = ModelCheckpoint('en_ur_lstm_best.keras',
                           monitor='val_loss',
                           save_best_only=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3)
early_stop = EarlyStopping(monitor='val_loss', patience=5)

# Train
history = en_ur_model.fit(
    [en_padded, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),
    batch_size=64,
    epochs=50,
    validation_split=0.2,
    callbacks=[checkpoint, reduce_lr, early_stop]
)


Epoch 1/50
[1m 80/634[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:12[0m 132ms/step - loss: 4.5041

KeyboardInterrupt: 

In [None]:
# Save final model
en_ur_model.save('en_ur_lstm.keras')

# Create inference encoder model
# Outputs, in order: encoder sequence outputs, state_h, state_c
encoder_model = Model(encoder_inputs, [encoder_outputs_reshaped, state_h, state_c])

# Create inference decoder model: previous states are explicit inputs
decoder_state_input_h = Input(shape=(LSTM_UNITS*2,))
decoder_state_input_c = Input(shape=(LSTM_UNITS*2,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# NOTE(review): this rebinds state_h/state_c (used for encoder_model above) and
# decoder_outputs; re-run the model-building cell before re-running this one.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

# Attention in inference: encoder outputs are supplied through their own input
encoder_outputs_input = Input(shape=(None, LSTM_UNITS*2))
attention = tf.keras.layers.Attention()([decoder_outputs, encoder_outputs_input])
decoder_concat = Concatenate()([decoder_outputs, attention])
decoder_outputs = decoder_dense(decoder_concat)

# Inputs, in order: token ids, encoder outputs, h, c
# Outputs, in order: word probabilities, new h, new c
decoder_model = Model(
    [decoder_inputs, encoder_outputs_input] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

encoder_model.save('en_ur_encoder.keras')
decoder_model.save('en_ur_decoder.keras')

In [None]:
def load_all_models():
    """Load every artifact used by the French -> English -> Urdu pipeline.

    The result is cached at module level after the first successful load, so
    the translate helpers that call this on every invocation do not reload
    the models from disk each time.

    Returns:
        tuple: (fr_en_transformer, en_ur_model, encoder_model, decoder_model,
        tokenizer_fr, tokenizer_en, tokenizer_en2, tokenizer_ur).

    Raises:
        Whatever ``load_model``, ``open`` or ``pickle.load`` raise when a file
        is missing or unreadable; nothing is cached in that case.
    """
    # This section's import cell does not bring in `load_model`
    from tensorflow.keras.models import load_model

    cache = globals().setdefault('_FR_UR_MODEL_CACHE', {})
    if 'bundle' in cache:
        return cache['bundle']

    def _read_pickle(path):
        # SECURITY: pickle.load can execute arbitrary code; only open tokenizer
        # files that were written by this notebook.
        with open(path, 'rb') as f:
            return pickle.load(f)

    # Load French-English Transformer
    fr_en_transformer = load_model('fr_en_transformer.keras')

    # Load English-Urdu models
    en_ur_model = load_model('en_ur_lstm.keras')
    encoder_model = load_model('en_ur_encoder.keras')
    decoder_model = load_model('en_ur_decoder.keras')

    # Load tokenizers
    tokenizer_fr = _read_pickle('tokenizer_fr_reverse.pkl')
    tokenizer_en = _read_pickle('tokenizer_en_reverse.pkl')
    tokenizer_en2 = _read_pickle('tokenizer_en2_reverse.pkl')
    tokenizer_ur = _read_pickle('tokenizer_ur_reverse.pkl')

    cache['bundle'] = (fr_en_transformer, en_ur_model, encoder_model, decoder_model,
                       tokenizer_fr, tokenizer_en, tokenizer_en2, tokenizer_ur)
    return cache['bundle']

In [None]:
def french_to_english_translate(input_text):
    """Greedily translate one French sentence to English.

    Args:
        input_text: French sentence as a plain string.

    Returns:
        str: decoded English words joined by single spaces.

    Raises:
        KeyError: if '<start>' or '<end>' is missing from the English vocabulary.
    """
    # Load models
    (fr_en_transformer, _, _, _,
     tokenizer_fr, tokenizer_en, _, _) = load_all_models()

    # Preprocess input with the same clean_text() used on the training data.
    # Lower-casing alone leaves punctuation attached ("aujourd'hui?"), and such
    # tokens do not exist in the cleaned training vocabulary.
    input_seq = tokenizer_fr.texts_to_sequences([clean_text(input_text)])
    input_seq = pad_sequences(input_seq, maxlen=MAX_LENGTH, padding='post')

    start_id = tokenizer_en.word_index['<start>']
    end_id = tokenizer_en.word_index['<end>']

    # Initialize decoder
    decoder_input = tf.expand_dims([start_id], 0)

    # Generate translation: re-run the model on the growing prefix and read
    # the prediction for the last position.
    output = []
    for _ in range(MAX_LENGTH):
        predictions = fr_en_transformer.predict([input_seq, decoder_input], verbose=0)
        # Plain int keeps decoder_input at a single integer dtype across steps
        predicted_id = int(tf.argmax(predictions[0, -1, :]).numpy())

        if predicted_id == end_id:
            break

        word = tokenizer_en.index_word.get(predicted_id)
        if word is None:
            # Id 0 is padding and has no vocabulary entry; indexing
            # index_word directly raised KeyError here.
            break

        output.append(word)
        decoder_input = tf.concat([decoder_input, tf.expand_dims([predicted_id], 0)], axis=-1)

    return ' '.join(output)

def english_to_urdu_translate(english_text):
    """Greedily translate one English sentence to Urdu with the attention LSTM.

    Args:
        english_text: English sentence as a plain string.

    Returns:
        str: decoded Urdu words joined by single spaces.
    """
    # Load models
    (_, _, encoder_model, decoder_model,
     _, _, tokenizer_en2, tokenizer_ur) = load_all_models()

    # Preprocess input
    input_seq = tokenizer_en2.texts_to_sequences([english_text.lower()])
    input_seq = pad_sequences(input_seq, maxlen=MAX_LENGTH, padding='post')

    # Encode input. The inference encoder returns three arrays: the
    # per-timestep outputs (needed by attention at EVERY step) and h / c.
    encoder_outputs, h, c = encoder_model.predict(input_seq, verbose=0)

    # Sequence markers are looked up as '<start>' / '<end>' (tokenizers use
    # filters=''); without them fall back to id 1 as the seed and to the
    # padding id 0 as the stop signal.
    word_index = tokenizer_ur.word_index
    start_id = word_index.get('<start>', 1)
    end_id = word_index.get('<end>', 0)

    # Initialize decoder
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_id

    # Generate translation
    decoded_sentence = []
    for _ in range(MAX_LENGTH):
        # The decoder takes four inputs: token ids, encoder outputs, h, c.
        # Previously the encoder outputs were dropped after the first step, so
        # the second step passed only three inputs.
        output_tokens, h, c = decoder_model.predict(
            [target_seq, encoder_outputs, h, c], verbose=0)

        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = tokenizer_ur.index_word.get(sampled_token_index, '')

        # Stop on padding / unknown id (no vocabulary entry) or the end marker
        if not sampled_word or sampled_token_index == end_id:
            break

        decoded_sentence.append(sampled_word)
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ' '.join(decoded_sentence)

In [None]:
def french_to_urdu_pipeline(french_text):
    """Translate French to Urdu by pivoting through English.

    Returns a dict with the original French text, the intermediate English
    translation and the final Urdu translation.
    """
    # French -> English first, then English -> Urdu on that intermediate result
    pivot_english = french_to_english_translate(french_text)
    final_urdu = english_to_urdu_translate(pivot_english)

    translations = {'french': french_text}
    translations['english'] = pivot_english
    translations['urdu'] = final_urdu
    return translations

# Example usage
# Run the French -> English -> Urdu pipeline on one sentence and print all
# three stages. Requires the saved models and tokenizers to exist on disk.
result = french_to_urdu_pipeline("Comment allez-vous aujourd'hui?")
print(f"""
French: {result['french']}
English: {result['english']}
Urdu: {result['urdu']}
""")

ValueError: File not found: filepath=fr_en_transformer.keras. Please ensure the file is an accessible `.keras` zip file.