<a href="https://colab.research.google.com/github/Elakkiya1802/ML-tasks/blob/main/Task9(_English_to_Tamil).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# English to Tamil Translation (Beginner NLP Project)

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. Dataset (25 sentences)

# Each entry keeps an English sentence next to its Tamil translation so the
# two corpora stay aligned by construction.
sentence_pairs = [
    ("hello", "வணக்கம்"),
    ("how are you", "நீங்கள் எப்படி இருக்கிறீர்கள்"),
    ("i am fine", "நான் நன்றாக இருக்கிறேன்"),
    ("what is your name", "உங்கள் பெயர் என்ன"),
    ("my name is ram", "என் பெயர் ராம்"),
    ("good morning", "காலை வணக்கம்"),
    ("good night", "இனிய இரவு"),
    ("thank you", "நன்றி"),
    ("welcome", "வரவேற்கிறேன்"),
    ("i love india", "எனக்கு இந்தியா பிடிக்கும்"),
    ("i am a student", "நான் ஒரு மாணவன்"),
    ("how old are you", "உங்கள் வயது என்ன"),
    ("i am happy", "நான் மகிழ்ச்சியாக இருக்கிறேன்"),
    ("where are you", "நீங்கள் எங்கே இருக்கிறீர்கள்"),
    ("i am from india", "நான் இந்தியாவிலிருந்து வந்தவன்"),
    ("do you like coffee", "உங்களுக்கு காபி பிடிக்குமா"),
    ("yes i like coffee", "ஆம் எனக்கு காபி பிடிக்கும்"),
    ("no i do not like tea", "இல்லை எனக்கு டீ பிடிக்காது"),
    ("what are you doing", "நீங்கள் என்ன செய்கிறீர்கள்"),
    ("i am learning nlp", "நான் NLP கற்றுக்கொள்கிறேன்"),
    ("deep learning is powerful", "டீப் லெர்னிங் சக்திவாய்ந்தது"),
    ("machine learning is useful", "மெஷின் லெர்னிங் பயனுள்ளது"),
    ("i like tamil", "எனக்கு தமிழ் பிடிக்கும்"),
    ("tamil is my language", "தமிழ் என் மொழி"),
    ("good bye", "பிரியாவிடை"),
]

# Source-side sentences, in dataset order.
english_sentences = [english for english, _ in sentence_pairs]

# Target-side sentences, each wrapped in the "start" / "end" marker words
# that the decoder uses to begin and stop a translation.
tamil_sentences = [f"start {tamil} end" for _, tamil in sentence_pairs]

# 2. Tokenization

# English side: fit a vocabulary, then map every sentence to word indices.
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_seq = eng_tokenizer.texts_to_sequences(english_sentences)
max_eng_len = max(map(len, eng_seq))

# Tamil side: same steps on the "start ... end" wrapped targets.
tam_tokenizer = Tokenizer()
tam_tokenizer.fit_on_texts(tamil_sentences)
tam_seq = tam_tokenizer.texts_to_sequences(tamil_sentences)
max_tam_len = max(map(len, tam_seq))

# Right-pad every sequence to the longest one on its side.
encoder_input = pad_sequences(eng_seq, maxlen=max_eng_len, padding='post')
decoder_input = pad_sequences(tam_seq, maxlen=max_tam_len, padding='post')

# Teacher-forcing targets: the decoder input shifted one step to the left,
# with a zero filling the vacated last column.
decoder_output = np.pad(decoder_input[:, 1:], ((0, 0), (0, 1)), mode='constant')

# 3. Model Parameters

embedding_dim = 64   # size of each word-embedding vector
latent_dim = 128     # size of the LSTM hidden and cell states

# One extra slot on top of the tokenizer vocabulary for index 0, which the
# padded sequences use as filler.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
tam_vocab_size = 1 + len(tam_tokenizer.word_index)

# 4. Encoder
encoder_inputs = Input(shape=(None,))
enc_embed = Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units=latent_dim, return_state=True)
# The LSTM's own output is discarded; only the final hidden and cell states
# are kept, to seed the decoder.
_, *encoder_states = encoder_lstm(enc_embed)
state_h, state_c = encoder_states

# 5. Decoder

decoder_inputs = Input(shape=(None,))
dec_embed = Embedding(input_dim=tam_vocab_size, output_dim=embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_dense = Dense(units=tam_vocab_size, activation='softmax')

# The decoder starts from the encoder's final states; its own final states are
# not needed during training, only the per-step outputs.
decoder_hidden_seq, _, _ = decoder_lstm(dec_embed, initial_state=encoder_states)
# Project every time step onto a probability distribution over Tamil words.
decoder_outputs = decoder_dense(decoder_hidden_seq)

# 6. Model Compile & Train

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

training_inputs = [encoder_input, decoder_input]
# The integer targets get a trailing axis of size 1 before being handed to
# the sparse categorical loss.
training_targets = decoder_output[:, :, np.newaxis]

model.fit(training_inputs, training_targets, batch_size=4, epochs=300)

# 7. Inference Models

# Encoder at inference time: sentence in, final [hidden, cell] states out.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# The stand-alone decoder receives its previous states as explicit inputs so
# it can be driven one step at a time.
decoder_states_inputs = [Input(shape=(latent_dim,)) for _ in range(2)]
decoder_state_input_h, decoder_state_input_c = decoder_states_inputs

# Reuse the trained embedding, LSTM and dense layers.
dec_outputs, *dec_states = decoder_lstm(dec_embed, initial_state=decoder_states_inputs)
state_h, state_c = dec_states
dec_outputs = decoder_dense(dec_outputs)

decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[dec_outputs, *dec_states],
)
# 8. Translation Function

# Index -> word lookup used to turn predicted indices back into Tamil words.
# It only contains the tokenizer's vocabulary, so the padding index 0 has no
# entry.
reverse_tam_index = {i: w for w, i in tam_tokenizer.word_index.items()}


def translate(sentence):
    """Translate an English sentence to Tamil with greedy decoding.

    The sentence is tokenized and right-padded to ``max_eng_len``, encoded
    into the encoder's final states, and then decoded one word at a time:
    at every step the most probable word is chosen and fed back as the next
    decoder input, together with the updated LSTM states.

    Decoding stops when the "end" marker is predicted, when an index without
    a vocabulary entry (the padding index 0) is predicted, or after
    ``max_tam_len`` steps, whichever comes first.

    Args:
        sentence: English input text.

    Returns:
        The predicted Tamil words joined by single spaces (an empty string if
        nothing was produced before decoding stopped).
    """
    seq = eng_tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_eng_len, padding='post')
    # verbose=0: keep Keras from printing a progress bar for every call.
    states = encoder_model.predict(seq, verbose=0)

    # The decoder is fed a single token per step, starting with "start".
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tam_tokenizer.word_index["start"]

    words = []
    for _ in range(max_tam_len):
        output, h, c = decoder_model.predict([target_seq, *states], verbose=0)
        # Greedy choice: highest-probability word at the last time step.
        word_index = int(np.argmax(output[0, -1, :]))
        word = reverse_tam_index.get(word_index)

        # A missing entry means the padding index was predicted; treat it
        # like "end" instead of emitting an empty word and decoding on.
        if word is None or word == "end":
            break

        words.append(word)
        # Feed the prediction and the new states into the next step.
        target_seq[0, 0] = word_index
        states = [h, c]

    return " ".join(words)

# 9. Test

# Translate a couple of sentences taken from the training data and print each
# one next to the model's output.
for sample in ("good morning", "i am learning nlp"):
    print("English:", sample)
    print("Tamil:", translate(sample))


Epoch 1/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - loss: 3.8539
Epoch 2/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 3.7011
Epoch 3/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 2.9394
Epoch 4/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 2.4225
Epoch 5/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 2.2482
Epoch 6/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 2.1488
Epoch 7/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 2.1473
Epoch 8/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 1.9291
Epoch 9/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 1.9947
Epoch 10/300
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 1.9824
Epoch 11/