In [3]:
!pip install tensorflow tensorflow-text datasets --quiet


In [4]:
import re
import numpy as np
import tensorflow as tf
from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
from datasets import load_dataset

# Load parallel datasets via Hugging Face "opus100"
PAIRS = ['en-hi','en-ja','en-fr','en-es']  
raw_datasets = {pair: load_dataset('opus100', pair, split='train') for pair in PAIRS}

# Clean & tag sentences
def clean_and_tag(sentences, lang_code):
    # Example function to clean and tag sentences (you can modify it based on your needs)
    return [f"{lang_code}: {sentence.strip()}" for sentence in sentences]

# Tag the datasets
for pair in PAIRS:
    lang1, lang2 = pair.split('-')

    # Use map to clean and tag sentences for both source and target
    raw_datasets[pair] = raw_datasets[pair].map(lambda example: {
        'source': clean_and_tag(example['translation'][lang1], lang1),
        'target': clean_and_tag(example['translation'][lang2], lang2)
    })

# Display a sample to check
print(raw_datasets['en-hi'][0])  # You can change this index for other pairs


{'translation': {'en': "Yeah, that's not exactly...", 'ru': 'Да, но не совсем...'}, 'source': ['en: Y', 'en: e', 'en: a', 'en: h', 'en: ,', 'en: ', 'en: t', 'en: h', 'en: a', 'en: t', "en: '", 'en: s', 'en: ', 'en: n', 'en: o', 'en: t', 'en: ', 'en: e', 'en: x', 'en: a', 'en: c', 'en: t', 'en: l', 'en: y', 'en: .', 'en: .', 'en: .'], 'target': ['ru: Д', 'ru: а', 'ru: ,', 'ru: ', 'ru: н', 'ru: о', 'ru: ', 'ru: н', 'ru: е', 'ru: ', 'ru: с', 'ru: о', 'ru: в', 'ru: с', 'ru: е', 'ru: м', 'ru: .', 'ru: .', 'ru: .']}


In [None]:
# Process the full dataset instead of a subset
src_texts = []
tgt_texts = []

# Process the full dataset
for pair in PAIRS:
    dataset = raw_datasets[pair]
    
    # Use the entire dataset instead of subsetting
    src_texts.extend([ex['source'][0] for ex in dataset])
    tgt_texts.extend([ex['target'][0] for ex in dataset])

print(f"Collected {len(src_texts)} source texts and {len(tgt_texts)} target texts.")

Collected 250000 source texts and 250000 target texts.


In [12]:
# Combine all texts for tokenizer fitting
all_texts = src_texts + tgt_texts

# Create tokenizer
tokenizer = Tokenizer(filters='', oov_token='<unk>')
tokenizer.fit_on_texts(all_texts)
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Tokenize and pad
src_seqs = tokenizer.texts_to_sequences(src_texts)
tgt_seqs = tokenizer.texts_to_sequences(tgt_texts)

src_padded = pad_sequences(src_seqs, padding='post')
tgt_padded = pad_sequences(tgt_seqs, padding='post')

print("Sample tokenized source sequence:", src_padded[0])
print("Sample tokenized target sequence:", tgt_padded[0])


Sample tokenized source sequence: [2 9]
Sample tokenized target sequence: [3 9]


In [13]:
# Define model architecture
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Set parameters
VOCAB_SIZE = len(tokenizer.word_index) + 1  # Add 1 for padding token
EMBEDDING_DIM = 256  # You can adjust this depending on your model size
LATENT_DIM = 512  # LSTM latent dimensionality, can be adjusted

# Define encoder input
encoder_input = Input(shape=(None,))

# Embedding layer for source sequences
encoder_emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(encoder_input)

# Encoder LSTM
encoder_lstm = LSTM(LATENT_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_emb)

# Encoder states (these will be used in the decoder)
encoder_states = [state_h, state_c]

# Define decoder input
decoder_input = Input(shape=(None,))

# Embedding layer for target sequences
decoder_emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(decoder_input)

# Decoder LSTM, using encoder states as initial states
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)

# Dense layer to predict the vocabulary at each timestep
decoder_dense = Dense(VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the full model
model = Model([encoder_input, decoder_input], decoder_outputs)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()


In [14]:
# Define target data for training (shift the target sequences by 1 timestep for the decoder)
# Shift target sequences to create the decoder target
tgt_input = tgt_padded[:, :-1]
tgt_output = tgt_padded[:, 1:]

print("Sample decoder input sequence:", tgt_input[0])
print("Sample decoder output sequence:", tgt_output[0])


Sample decoder input sequence: [3]
Sample decoder output sequence: [9]


In [15]:
# Train the model
model.fit([src_padded, tgt_input], np.expand_dims(tgt_output, -1),
          epochs=10,  # You can adjust the number of epochs
          batch_size=64,  # You can adjust the batch size
          validation_split=0.2)  # You can also adjust the validation split


Epoch 1/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1150s[0m 366ms/step - accuracy: 0.2267 - loss: 3.1558 - val_accuracy: 0.0630 - val_loss: 10.6498
Epoch 2/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1172s[0m 375ms/step - accuracy: 0.2956 - loss: 2.6740 - val_accuracy: 0.0646 - val_loss: 11.6569
Epoch 3/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1138s[0m 364ms/step - accuracy: 0.2970 - loss: 2.6599 - val_accuracy: 0.0634 - val_loss: 11.9495
Epoch 4/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1159s[0m 363ms/step - accuracy: 0.2980 - loss: 2.6482 - val_accuracy: 0.0613 - val_loss: 11.7063
Epoch 5/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1267s[0m 405ms/step - accuracy: 0.2985 - loss: 2.6370 - val_accuracy: 0.0632 - val_loss: 11.1014
Epoch 6/10
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1288s[0m 408ms/step - accuracy: 0.2985 - loss: 2.6364 - val_accuracy: 0.0632 

KeyboardInterrupt: 

In [16]:
# Save the trained model
model.save("multilingual_translation_model.h5")
print("Model saved successfully!")




Model saved successfully!


In [29]:
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

# Load the model without compiling
model = load_model("multilingual_translation_model.h5", compile=False)

# Recompile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [30]:
# Encoder model
encoder_model = Model(encoder_input, encoder_states)

# Decoder setup
decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the previously defined embedding layer for the decoder
decoder_emb2 = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(decoder_input)

# Decoder LSTM with the encoder states as initial states
decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_emb2, initial_state=decoder_states_inputs)

# Decoder states (h and c) for the next time step
decoder_states2 = [state_h2, state_c2]

# Dense layer to predict the output token at each time step
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Define the decoder model
decoder_model = Model([decoder_input] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

# Model summary
decoder_model.summary()


In [31]:
def decode_sequence(input_seq):
    # Encode input
    states_value = encoder_model.predict(input_seq)

    # Create empty target sequence with just the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index.get('en', 1)  # Ensure proper start token

    stop_condition = False
    decoded_sentence = ""

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word.get(sampled_token_index, '')

        # Debug: Print output tokens and sampled token
        print(f"Sampled token index: {sampled_token_index}")
        print(f"Sampled word: {sampled_word}")

        if sampled_word == '<pad>' or len(decoded_sentence.split()) > 30:
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

            # Update target sequence and states
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index
            states_value = [h, c]

    return decoded_sentence.strip()


In [32]:
def translate(sentence, lang_code='en'):
    # Preprocess the sentence by adding the language code
    tagged = f"{lang_code}: {sentence.strip()}"

    # Convert sentence to sequence
    seq = tokenizer.texts_to_sequences([tagged])

    # Pad the sequence (ensure maxlen is consistent with training data)
    padded = pad_sequences(seq, maxlen=src_padded.shape[1], padding='post')

    # Decode the padded sequence to get the translation
    translation = decode_sequence(padded)

    print(f"Input ({lang_code}):", sentence)
    print("Translated:", translation)

# Example calls for translation
translate("How are you?", lang_code='hi')  # English to Hindi
translate("I love you", lang_code='ja')    # English to Japanese
translate("Where is the market?", lang_code='fr')  # English to French
translate("Good morning", lang_code='es')  # English to Spanish


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 329ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 256ms/step
Sampled token index: 15
Sampled word: c
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Sampled token index: 14
Sampled word: l
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Sampled token index: 14
Sampled word: l
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Sampled token index: 14
Sampled word: l
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Sampled token index: 14
Sampled word: l
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Sampled token index: 14
Sampled word: l
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Sampled token index: 14
Sampled word: l
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Sampled token index: 14
Sampled word: l
[1m1/1[0m [32m━━━━━━━━━━━━━