<a href="https://colab.research.google.com/github/AniruddhMukherjee/MultiModel_Translation_Project/blob/main/translation(separate_models).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import csv
import re
import unicodedata

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

In [16]:
def _drop_unless_combining_mark(match):
    """Regex callback: keep a single matched character only if it is a combining mark.

    Args:
        match: A match object whose group(0) is one character.

    Returns:
        The character itself when its Unicode category starts with 'M'
        (Mn/Mc/Me), otherwise the empty string.
    """
    ch = match.group(0)
    return ch if unicodedata.category(ch).startswith('M') else ''


def clean_text(text):
    """Lower-case a sentence, strip punctuation/symbols, and trim outer whitespace.

    Python's ``\\w`` does not match Unicode combining marks, so a plain
    ``re.sub(r'[^\\w\\s]', '', text)`` also deletes the vowel signs and viramas
    of Devanagari and Bengali script and mangles those words. Every character
    that is neither a word character nor whitespace is therefore inspected
    individually and kept when it is a combining mark.

    Args:
        text: The sentence to normalise (a ``str``).

    Returns:
        The cleaned sentence. Whitespace inside the sentence is left as is, so
        removing a symbol between two spaces leaves both spaces in place.
    """
    text = text.lower()
    # The character class has no quantifier, so the callback sees one character at a time.
    text = re.sub(r'[^\w\s]', _drop_unless_combining_mark, text)
    return text.strip()

In [17]:
def load_and_preprocess_dataset(filepath, max_len):
    """Read a tab-separated parallel corpus and normalise both text columns.

    The file is read without a header; its first three columns are named
    ``source``, ``target`` and ``metadata``. Both text columns are passed
    through ``clean_text`` and each target sentence is wrapped as
    ``'<START> ' + sentence + ' <END>'``.

    Args:
        filepath: Path of the tab-separated file.
        max_len: Not used by this function; kept so existing callers that pass
            it keep working.

    Returns:
        pandas.DataFrame with columns ``source``, ``target`` and ``metadata``
        and a fresh 0..n-1 index.
    """
    data = pd.read_csv(
        filepath,
        sep='\t',
        header=None,
        names=["source", "target", "metadata"],
        dtype=str,
        # The corpus is plain tab-separated text, not quoted CSV: a sentence that
        # begins with '"' must not be treated as the start of a quoted field.
        quoting=csv.QUOTE_NONE,
        # Keep sentences such as "None" or "null" as text instead of NaN.
        keep_default_na=False,
    )
    # clean_text calls .lower(), which fails on a non-string missing value,
    # so rows that still lack a source or a target are dropped here.
    data = data.dropna(subset=["source", "target"]).reset_index(drop=True)

    data["source"] = data["source"].apply(clean_text)
    data["target"] = data["target"].apply(clean_text).apply(lambda x: '<START> ' + x + ' <END>')
    return data

In [18]:
def tokenize_and_pad(data, max_len):
    """Fit a word-level tokenizer on `data` and turn it into fixed-length id sequences.

    Args:
        data: Iterable of sentences; used both to fit the vocabulary and as
            the texts to encode.
        max_len: Length every sequence is padded or truncated to. Padding and
            truncation both happen at the end of the sequence.

    Returns:
        Tuple ``(tokenizer, vocab_size, padded)``: the fitted ``Tokenizer``,
        ``len(tokenizer.word_index) + 1``, and the padded sequences.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(data)

    padded = pad_sequences(
        fitted.texts_to_sequences(data),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )

    # One slot more than the number of known words.
    n_ids = len(fitted.word_index) + 1
    return fitted, n_ids, padded

In [20]:
# Corpus file for each target language (read later as tab-separated text).
datasets = dict(
    german="/content/deu.txt",
    hindi="/content/hin.txt",
    bengali="/content/ben.txt",
    marathi="/content/mar.txt",
)

# Length that every token sequence is padded or truncated to.
max_len = 40

# language name -> preprocessed DataFrame; filled by the loading loop.
processed_data = dict()

In [21]:
# Read and clean every corpus listed in `datasets`, keyed by language name.
for lang in datasets:
    filepath = datasets[lang]
    print(f"Processing dataset for {lang.capitalize()}...")
    data = load_and_preprocess_dataset(filepath, max_len)
    processed_data[lang] = data

# Per-language containers that the tokenization step fills in:
# fitted tokenizers, vocabulary sizes, and padded id sequences.
tokenizers, vocab_sizes, padded_sequences = {}, {}, {}

Processing dataset for German...
Processing dataset for Hindi...
Processing dataset for Bengali...
Processing dataset for Marathi...


In [22]:
for lang, data in processed_data.items():
    print(f"Tokenizing and padding for {lang.capitalize()}...")

    # Source and target columns each get their own tokenizer and vocabulary.
    src_tokenizer, src_vocab_size, src_pad_seq = tokenize_and_pad(data["source"], max_len)
    tgt_tokenizer, tgt_vocab_size, tgt_pad_seq = tokenize_and_pad(data["target"], max_len)

    # Store the three results side by side under the same "source"/"target" keys.
    tokenizers[lang] = dict(source=src_tokenizer, target=tgt_tokenizer)
    vocab_sizes[lang] = dict(source=src_vocab_size, target=tgt_vocab_size)
    padded_sequences[lang] = dict(source=src_pad_seq, target=tgt_pad_seq)


Tokenizing and padding for German...
Tokenizing and padding for Hindi...
Tokenizing and padding for Bengali...
Tokenizing and padding for Marathi...


In [23]:
# Define the attention layer
class AttentionLayer(tf.keras.layers.Layer):
    """Additive attention of decoder outputs over encoder outputs.

    The layer is called on a two-element list ``[encoder_out_seq, decoder_out_seq]``.
    For every decoder timestep it computes
    ``softmax(tanh(encoder_out_seq . W_a + decoder_step . U_a) . V_a)`` as the
    attention weights over the encoder timesteps, and then the weighted sum of
    the encoder outputs as the context vector.

    Assumes both inputs are rank-3 tensors laid out as (batch, time, features);
    ``build`` indexes dimension 2 of each input shape -- TODO confirm layout
    against the callers.
    """
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the three trainable weight matrices.

        Args:
            input_shape: Pair of shapes; ``input_shape[0]`` belongs to the
                encoder sequence and ``input_shape[1]`` to the decoder sequence
                (same order as the inputs unpacked in ``call``).
        """
        # W_a projects the encoder features onto themselves: (enc_dim, enc_dim).
        self.W_a = self.add_weight(name='W_a',
                                   shape=(input_shape[0][2], input_shape[0][2]),
                                   initializer='uniform',
                                   trainable=True)
        # U_a maps decoder features into the encoder feature size: (dec_dim, enc_dim).
        self.U_a = self.add_weight(name='U_a',
                                   shape=(input_shape[1][2], input_shape[0][2]),
                                   initializer='uniform',
                                   trainable=True)
        # V_a reduces each combined feature vector to one score: (enc_dim, 1).
        self.V_a = self.add_weight(name='V_a',
                                   shape=(input_shape[0][2], 1),
                                   initializer='uniform',
                                   trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        """Compute context vectors and attention weights.

        Args:
            inputs: ``[encoder_out_seq, decoder_out_seq]``.

        Returns:
            Tuple ``(c_outputs, e_outputs)``: the outputs collected from the
            context pass and from the energy pass respectively.
        """
        encoder_out_seq, decoder_out_seq = inputs

        def energy_step(inputs, states):
            """Attention weights over the encoder timesteps for one step of the decoder sequence."""
            # The encoder sequence is passed to K.rnn via `constants`; it is read
            # here as the last entry of `states`
            # (presumably K.rnn appends constants after the states -- verify).
            encoder_full_seq = states[-1]
            W_a_dot_s = tf.keras.backend.dot(encoder_full_seq, self.W_a)
            # Extra axis at position 1 so the decoder term broadcasts across the encoder timesteps.
            U_a_dot_h = tf.keras.backend.expand_dims(tf.keras.backend.dot(inputs, self.U_a), 1)
            # Score each encoder timestep, drop the trailing size-1 axis, normalise with softmax.
            e_i = tf.keras.backend.softmax(
                tf.keras.backend.squeeze(tf.keras.backend.dot(tf.keras.backend.tanh(W_a_dot_s + U_a_dot_h), self.V_a),
                                         axis=-1))
            return e_i, [e_i]

        def context_step(inputs, states):
            """Weighted sum of the encoder outputs for one set of attention weights."""
            encoder_full_seq = states[-1]
            # `inputs` holds one weight per encoder timestep; weight and sum over axis 1.
            c_i = tf.keras.backend.sum(encoder_full_seq * tf.keras.backend.expand_dims(inputs, -1), axis=1)
            return c_i, [c_i]

        # Initial states handed to K.rnn. Neither step function reads them
        # (both only use states[-1]); the sums just produce tensors with the
        # encoder's time axis (fake_state_c) or feature axis (fake_state_e) removed.
        fake_state_c = tf.keras.backend.sum(encoder_out_seq, axis=1)
        fake_state_e = tf.keras.backend.sum(encoder_out_seq, axis=2)
        # First pass: iterate over the decoder sequence to get the attention weights.
        _, e_outputs, _ = tf.keras.backend.rnn(energy_step, decoder_out_seq, [fake_state_e], constants=[encoder_out_seq])
        # Second pass: iterate over those weights to get the context vectors.
        _, c_outputs, _ = tf.keras.backend.rnn(context_step, e_outputs, [fake_state_c], constants=[encoder_out_seq])

        return c_outputs, e_outputs

In [24]:
def build_translation_model(src_vocab_size, tgt_vocab_size, embedding_dim=100, latent_dim=300,
                            dropout=0.3, recurrent_dropout=0.2, src_seq_len=None):
    """Build an (uncompiled) LSTM encoder-decoder with attention.

    Architecture, in order:
      * encoder: Embedding -> LSTM returning the full output sequence and its
        final hidden/cell states;
      * decoder: Embedding -> LSTM initialised with the encoder's final states;
      * AttentionLayer over [encoder outputs, decoder outputs], whose first
        output is concatenated with the decoder outputs on the last axis;
      * a time-distributed softmax Dense layer over the target vocabulary.

    Args:
        src_vocab_size: Input dimension of the encoder embedding.
        tgt_vocab_size: Input dimension of the decoder embedding and number of
            output classes.
        embedding_dim: Size of both embedding layers (default 100).
        latent_dim: Number of units in both LSTMs (default 300).
        dropout: Input dropout rate of both LSTMs (default 0.3).
        recurrent_dropout: Recurrent dropout rate of both LSTMs (default 0.2).
        src_seq_len: Fixed length of the encoder input. ``None`` (default)
            uses the notebook-level ``max_len``, as before.

    Returns:
        A ``Model`` taking ``[encoder_inputs, decoder_inputs]`` and producing
        per-timestep probabilities over the target vocabulary.
    """
    if src_seq_len is None:
        # Backward-compatible default: fall back to the global padding length.
        src_seq_len = max_len

    # --- Encoder ---
    encoder_inputs = Input(shape=(src_seq_len,))
    encoder_emb = Embedding(src_vocab_size, embedding_dim, trainable=True)(encoder_inputs)

    encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,
                        dropout=dropout, recurrent_dropout=recurrent_dropout)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_emb)
    encoder_states = [state_h, state_c]

    # --- Decoder (variable-length input, seeded with the encoder's final states) ---
    decoder_inputs = Input(shape=(None,))
    decoder_emb = Embedding(tgt_vocab_size, embedding_dim, trainable=True)(decoder_inputs)
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,
                        dropout=dropout, recurrent_dropout=recurrent_dropout)
    decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)

    # --- Attention: only the first output is used; the second one is discarded ---
    attn_layer = AttentionLayer()
    attn_outputs, _ = attn_layer([encoder_outputs, decoder_outputs])
    decoder_concat_outputs = Concatenate(axis=-1)([decoder_outputs, attn_outputs])

    # --- Per-timestep distribution over the target vocabulary ---
    decoder_dense = TimeDistributed(Dense(tgt_vocab_size, activation='softmax'))
    decoder_outputs = decoder_dense(decoder_concat_outputs)

    return Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [25]:
# Train one model per language pair and keep every trained model.
# `model` is rebound on each iteration, so without this registry only the
# model of the last language would still be reachable after the loop.
trained_models = {}

for lang in processed_data:
    print(f"Building model for English to {lang.capitalize()}...")
    src_vocab_size = vocab_sizes[lang]["source"]
    tgt_vocab_size = vocab_sizes[lang]["target"]
    model = build_translation_model(src_vocab_size, tgt_vocab_size)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    src_seq = padded_sequences[lang]["source"]
    tgt_seq = padded_sequences[lang]["target"]

    # Train the model with teacher forcing: the decoder is fed the target
    # without its last position and is asked to predict the target shifted
    # one position to the left.
    model.fit([src_seq, tgt_seq[:, :-1]],
              tgt_seq[:, 1:],
              batch_size=64, epochs=1, validation_split=0.1)

    trained_models[lang] = model

Building model for English to German...
[1m3908/3908[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1949s[0m 495ms/step - loss: 1.0537 - val_loss: 1.3198
Building model for English to Hindi...
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 283ms/step - loss: 3.3413 - val_loss: 2.0375
Building model for English to Bengali...
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 264ms/step - loss: 2.4354 - val_loss: 1.5538
Building model for English to Marathi...
[1m661/661[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 306ms/step - loss: 1.2462 - val_loss: 1.3568


In [26]:
# Save each language's model under its own file name.
#
# The training loop rebinds `model` on every iteration, so by the time this
# cell runs `model` is the model of the LAST language in `processed_data`.
# Writing that one object under every language's file name would produce
# mislabeled files, so:
#   * if a `trained_models` dict (language -> model) exists, save from it;
#   * otherwise save only the file that really corresponds to `model`.
models_to_save = globals().get("trained_models")
if models_to_save is None:
    languages = list(processed_data)
    models_to_save = {languages[-1]: model} if languages else {}
    not_available = languages[:-1]
    if not_available:
        print("No trained model is available for: " + ", ".join(not_available)
              + " (only the last trained model is still in memory).")

for lang, lang_model in models_to_save.items():
    model_name = f'english_to_{lang}_translator.h5'
    lang_model.save(model_name)
    print(f"Model saved as {model_name}")



Model saved as english_to_german_translator.h5




Model saved as english_to_hindi_translator.h5




Model saved as english_to_bengali_translator.h5
Model saved as english_to_marathi_translator.h5
