<a href="https://colab.research.google.com/github/AniruddhMukherjee/MultiModel_Translation_Project/blob/main/Single_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticates this session with Kaggle before any dataset is downloaded.
# NOTE(review): login() presumably asks for credentials interactively, which
# would block an unattended "Run All" - confirm against the kagglehub docs.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'aniruddhmukherjee/translation-dataset' dataset and keep the value
# returned by kagglehub in a variable for later cells.
# NOTE(review): the return value is presumably the local directory that holds
# the dataset files - confirm against the kagglehub docs.
aniruddhmukherjee_translation_dataset_path = kagglehub.dataset_download('aniruddhmukherjee/translation-dataset')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount, in os.walk order,
# and print one full path per line.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/translation-dataset/hin.txt
/kaggle/input/translation-dataset/mar.txt
/kaggle/input/translation-dataset/deu.txt
/kaggle/input/translation-dataset/ben.txt


In [None]:
import csv
import os
import re
import unicodedata

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

In [None]:
# Function to clean text
def clean_text(text):
    r"""Lower-case ``text``, remove punctuation/symbols and trim whitespace.

    Python's ``\w`` only matches characters for which ``str.isalnum()`` is
    true (plus ``_``), and that excludes Unicode combining marks. Devanagari
    and Bengali vowel signs, the virama, the anusvara, etc. are combining
    marks, so a plain ``[^\w\s]`` substitution silently deletes them and
    corrupts Hindi, Marathi and Bengali sentences. Combining marks are
    therefore kept explicitly; every other non-word, non-space character is
    still removed.

    Args:
        text: The sentence to normalise (``str``).

    Returns:
        The cleaned sentence as a ``str``.
    """
    def _keep_only_marks(match):
        char = match.group(0)
        # Categories Mn/Mc/Me are combining marks that belong to the
        # preceding letter; anything else that reaches this callback is
        # punctuation, a symbol or another non-word character.
        return char if unicodedata.category(char).startswith('M') else ''

    text = text.lower()
    text = re.sub(r'[^\w\s]', _keep_only_marks, text)
    return text.strip()

In [None]:
# Load and preprocess datasets
def load_and_preprocess_datasets(filepaths):
    """Load tab-separated parallel corpora and merge them into one frame.

    Every file is read as ``source<TAB>target<TAB>metadata`` with no header
    row. Both sentences are normalised with ``clean_text`` and the target is
    wrapped as ``<lang> <START> ... <END>``, where ``lang`` is the key the
    file was registered under in ``filepaths``.

    Args:
        filepaths: Mapping of language name -> path of that language's file.

    Returns:
        A ``DataFrame`` with the columns ``source`` and ``target`` holding the
        rows of all files, in the iteration order of ``filepaths``. An empty
        mapping yields an empty frame with those two columns.
    """
    columns = ["source", "target"]
    frames = []
    for lang, filepath in filepaths.items():
        data = pd.read_csv(
            filepath,
            sep='\t',
            header=None,
            names=["source", "target", "metadata"],
            # Sentences are free text: never let a digit-only line be parsed
            # as a number.
            dtype=str,
            # Quote characters are ordinary sentence content here. With the
            # default quoting an unbalanced '"' makes the parser swallow the
            # following tabs and newlines and merge several rows into one.
            quoting=csv.QUOTE_NONE,
            # Only a truly empty field counts as missing; sentences such as
            # "NA" or "null" must stay strings.
            keep_default_na=False,
            na_values=[""],
        )
        # A row without both sides of the pair cannot be used for training
        # and would make clean_text fail on a float NaN.
        data = data.dropna(subset=columns)
        source = data["source"].apply(clean_text)
        target = data["target"].apply(clean_text).apply(lambda x: f'<{lang}> <START> ' + x + ' <END>')
        frames.append(pd.DataFrame({"source": source, "target": target}))

    if not frames:
        # pd.concat raises on an empty list; return a well-formed empty frame.
        return pd.DataFrame(columns=columns)

    combined_df = pd.concat(frames, ignore_index=True)
    return combined_df

In [None]:
# Tokenize and pad sequences
def tokenize_and_pad(data, max_len, filters='\t\n'):
    """Fit a word-level tokenizer on ``data`` and return padded id sequences.

    The Keras ``Tokenizer`` default ``filters`` string contains ``<`` and
    ``>``, so sentinel tokens such as ``<START>``, ``<END>`` or a language
    tag like ``<hindi>`` would be reduced to the ordinary words ``start``,
    ``end`` and ``hindi`` and share an id with real occurrences of those
    words. Only tab and newline are filtered by default here, which keeps the
    sentinels distinct (they are still lower-cased by the tokenizer, e.g.
    ``<start>``).

    Args:
        data: Iterable of sentences (e.g. a pandas Series of ``str``).
        max_len: Length every sequence is padded or truncated to. Padding and
            truncation both happen at the end of the sequence.
        filters: Characters the tokenizer removes before splitting on spaces.
            Pass the Keras default filter string to restore its stock
            behaviour.

    Returns:
        A tuple ``(tokenizer, vocab_size, padded)`` where ``vocab_size`` is
        ``len(tokenizer.word_index) + 1`` (id 0 is the padding value) and
        ``padded`` is the array returned by ``pad_sequences``.
    """
    tokenizer = Tokenizer(filters=filters)
    tokenizer.fit_on_texts(data)
    # +1 because word ids start at 1; 0 is used for padding below.
    vocab_size = len(tokenizer.word_index) + 1
    sequences = tokenizer.texts_to_sequences(data)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    return tokenizer, vocab_size, padded

# Directory that holds the dataset files. Prefer the location reported by
# kagglehub.dataset_download in the data-import cell, so the notebook also
# works where the files are not mounted under /kaggle/input; fall back to the
# Kaggle mount when that cell has been deleted.
try:
    DATA_DIR = aniruddhmukherjee_translation_dataset_path
except NameError:
    DATA_DIR = "/kaggle/input/translation-dataset"

# Filepaths for datasets (language tag -> file)
filepaths = {
    "hindi": os.path.join(DATA_DIR, "hin.txt"),
    "bengali": os.path.join(DATA_DIR, "ben.txt"),
    "marathi": os.path.join(DATA_DIR, "mar.txt"),
    "german": os.path.join(DATA_DIR, "deu.txt")
}

# Load and preprocess all datasets
# max_len is the padded length of both source and target sequences; the model
# builder reads it as well.
max_len = 40
print("Loading and preprocessing datasets...")
combined_df = load_and_preprocess_datasets(filepaths)

# Tokenize and pad sequences (separate vocabularies for source and target)
print("Tokenizing and padding sequences...")
source_tokenizer, source_vocab_size, source_padded = tokenize_and_pad(combined_df["source"], max_len)
target_tokenizer, target_vocab_size, target_padded = tokenize_and_pad(combined_df["target"], max_len)


Loading and preprocessing datasets...
Tokenizing and padding sequences...


In [None]:
# Define the attention layer
class AttentionLayer(tf.keras.layers.Layer):
    """Additive attention of decoder outputs over encoder outputs.

    The layer is called on ``[encoder_out_seq, decoder_out_seq]``. For each
    decoder time step it scores every encoder time step with
    ``V_a^T tanh(W_a * s + U_a * h)``, normalises the scores with a softmax
    and returns the attention-weighted sum of the encoder outputs.

    NOTE(review): both inputs are presumably ``(batch, time, features)``
    tensors - ``build`` reads ``input_shape[i][2]`` as the feature size.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the three trainable projections.

        ``input_shape`` is ``[encoder_shape, decoder_shape]``; the last entry
        of each shape is used as that input's feature size.
        """
        # Projects encoder features (encoder_dim -> encoder_dim).
        self.W_a = self.add_weight(name='W_a',
                                   shape=(input_shape[0][2], input_shape[0][2]),
                                   initializer='uniform',
                                   trainable=True)
        # Projects decoder features into the encoder feature space.
        self.U_a = self.add_weight(name='U_a',
                                   shape=(input_shape[1][2], input_shape[0][2]),
                                   initializer='uniform',
                                   trainable=True)
        # Collapses the combined features to one score per encoder step.
        self.V_a = self.add_weight(name='V_a',
                                   shape=(input_shape[0][2], 1),
                                   initializer='uniform',
                                   trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs):
        """Return ``(context_vectors, attention_weights)``.

        Args:
            inputs: ``[encoder_out_seq, decoder_out_seq]``.

        Returns:
            ``c_outputs``: one context vector per decoder step.
            ``e_outputs``: one softmax distribution over encoder steps per
            decoder step.
        """
        encoder_out_seq, decoder_out_seq = inputs
        K = tf.keras.backend

        # W_a * s does not depend on the decoder step, so it is computed once
        # here instead of once per iteration of the rnn loop below.
        encoder_projection = K.dot(encoder_out_seq, self.W_a)

        def energy_step(inputs, states):
            # The constants handed to K.rnn arrive after the regular states,
            # so the last entry is the pre-projected encoder sequence.
            projected_encoder = states[-1]
            # Decoder state for this step, with an axis added so that it
            # broadcasts over the encoder time axis.
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)
            scores = K.squeeze(K.dot(K.tanh(projected_encoder + U_a_dot_h), self.V_a), axis=-1)
            e_i = K.softmax(scores)
            return e_i, [e_i]

        def context_step(inputs, states):
            encoder_full_seq = states[-1]
            # Weight every encoder step by its attention value and sum over
            # the encoder time axis.
            c_i = K.sum(encoder_full_seq * K.expand_dims(inputs, -1), axis=1)
            return c_i, [c_i]

        # K.rnn needs initial states; the step functions never read them, they
        # only have to match the shape of what each step returns.
        fake_state_c = K.sum(encoder_out_seq, axis=1)
        fake_state_e = K.sum(encoder_out_seq, axis=2)
        _, e_outputs, _ = K.rnn(energy_step, decoder_out_seq, [fake_state_e], constants=[encoder_projection])
        _, c_outputs, _ = K.rnn(context_step, e_outputs, [fake_state_c], constants=[encoder_out_seq])

        return c_outputs, e_outputs

In [None]:
# Build the multi-language model
def build_multi_language_model(source_vocab_size, target_vocab_size,
                               embedding_dim=100, latent_dim=300,
                               dropout=0.3, recurrent_dropout=0.2,
                               source_len=None):
    """Build the encoder-decoder LSTM translation model with attention.

    The encoder embeds and encodes the source sequence; its final states
    initialise the decoder LSTM. ``AttentionLayer`` produces one context
    vector per decoder step, which is concatenated with the decoder output
    and fed to a time-distributed softmax over the target vocabulary.

    Args:
        source_vocab_size: Size of the source vocabulary (embedding rows).
        target_vocab_size: Size of the target vocabulary (embedding rows and
            width of the softmax output).
        embedding_dim: Width of both embedding layers.
        latent_dim: Number of units of both LSTMs.
        dropout: Input dropout rate of both LSTMs.
        recurrent_dropout: Recurrent dropout rate of both LSTMs.
            NOTE(review): per the Keras LSTM documentation a non-zero value
            appears to rule out the fused cuDNN kernel; 0.0 may train much
            faster on GPU - confirm for the installed version.
        source_len: Fixed length of the encoder input. Defaults to the
            module-level ``max_len`` when omitted.

    Returns:
        An uncompiled ``Model`` taking ``[encoder_inputs, decoder_inputs]``
        and returning per-step probabilities over the target vocabulary.
    """
    if source_len is None:
        # Same value the original hard-wired through the global.
        source_len = max_len

    # Encoder
    encoder_inputs = Input(shape=(source_len,))
    encoder_emb = Embedding(source_vocab_size, embedding_dim, trainable=True)(encoder_inputs)
    encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,
                        dropout=dropout, recurrent_dropout=recurrent_dropout)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_emb)
    encoder_states = [state_h, state_c]

    # Decoder, started from the encoder's final hidden and cell state
    decoder_inputs = Input(shape=(None,))
    decoder_emb = Embedding(target_vocab_size, embedding_dim, trainable=True)(decoder_inputs)
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,
                        dropout=dropout, recurrent_dropout=recurrent_dropout)
    decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)

    # Attention: only the context vectors are used, the weights are discarded
    attn_outputs, _ = AttentionLayer()([encoder_outputs, decoder_outputs])
    decoder_concat_outputs = Concatenate(axis=-1)([decoder_outputs, attn_outputs])

    # Per-step distribution over the target vocabulary
    decoder_dense = TimeDistributed(Dense(target_vocab_size, activation='softmax'))
    decoder_probs = decoder_dense(decoder_concat_outputs)

    model = Model([encoder_inputs, decoder_inputs], decoder_probs)
    return model

In [None]:
# Build and train the model
print("Building the multi-language model...")
model = build_multi_language_model(source_vocab_size, target_vocab_size)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Keras carves the validation_split fraction off the END of the arrays before
# it shuffles anything. The rows are concatenated file by file, so without
# this permutation the validation set would consist solely of the tail of the
# last file and the model would never be validated on the other languages.
# The seed is fixed so that the split is reproducible.
shuffle_rng = np.random.default_rng(42)
shuffled_idx = shuffle_rng.permutation(len(source_padded))
source_shuffled = source_padded[shuffled_idx]
target_shuffled = target_padded[shuffled_idx]

print("Training the model...")
# Teacher forcing: the decoder sees the target without its last token and is
# trained to predict the target shifted left by one position.
model.fit([source_shuffled, target_shuffled[:, :-1]], target_shuffled[:, 1:],
          batch_size=64, epochs=5, validation_split=0.1)

# Save the model
model.save('multi_language_translator.h5')
print("Model saved as 'multi_language_translator.h5'")

Building the multi-language model...
Training the model...
Epoch 1/5
[1m4704/4704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1845s[0m 390ms/step - loss: 1.0394 - val_loss: 1.2638
Epoch 2/5
[1m4704/4704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1837s[0m 391ms/step - loss: 0.4197 - val_loss: 1.0114
Epoch 3/5
[1m4704/4704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1836s[0m 390ms/step - loss: 0.2758 - val_loss: 0.9387
Epoch 4/5
[1m4704/4704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1835s[0m 390ms/step - loss: 0.2191 - val_loss: 0.9147
Epoch 5/5
[1m4704/4704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1819s[0m 387ms/step - loss: 0.1881 - val_loss: 0.9086
Model saved as 'multi_language_translator.h5'


#