In [31]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [32]:
# English-Hindi parallel sentences
english_sentences = [
    "hello", "how are you", "my name is ayush", "i love you", "what is your name",
    "where are you going", "good morning", "good night", "thank you", "i am fine",
    "please sit down", "do you speak hindi", "i don't understand", "i am hungry",
    "where is the bathroom", "see you later", "i am tired", "open the door",
    "close the window", "can you help me
    ", "i am a student", "this is my book",
    "what time is it", "where do you live", "i live in mumbai"
]

hindi_sentences = [
    "<start> नमस्ते <end>",
    "<start> आप कैसे हैं <end>",
    "<start> मेरा नाम आयुष है <end>",
    "<start> मैं तुमसे प्यार करता हूँ <end>",
    "<start> आपका नाम क्या है <end>",
    "<start> आप कहाँ जा रहे हैं <end>",
    "<start> शुभ प्रभात <end>",
    "<start> शुभ रात्रि <end>",
    "<start> धन्यवाद <end>",
    "<start> मैं ठीक हूँ <end>",
    "<start> कृपया बैठिए <end>",
    "<start> क्या आप हिंदी बोलते हैं <end>",
    "<start> मैं नहीं समझा <end>",
    "<start> मुझे भूख लगी है <end>",
    "<start> बाथरूम कहाँ है <end>",
    "<start> फिर मिलेंगे <end>",
    "<start> मैं थक गया हूँ <end>",
    "<start> दरवाज़ा खोलो <end>",
    "<start> खिड़की बंद करो <end>",
    "<start> क्या आप मेरी मदद कर सकते हैं <end>",
    "<start> मैं एक छात्र हूँ <end>",
    "<start> यह मेरी किताब है <end>",
    "<start> समय क्या हुआ है <end>",
    "<start> आप कहाँ रहते हैं <end>",
    "<start> मैं मुंबई में रहता हूँ <end>"
]

In [33]:
# Tokenize input (English)
inp_tokenizer = Tokenizer()
inp_tokenizer.fit_on_texts(english_sentences)
input_seq = inp_tokenizer.texts_to_sequences(english_sentences)
max_input_len = max(len(seq) for seq in input_seq)
input_seq = pad_sequences(input_seq, maxlen=max_input_len, padding='post')

# Tokenize target (Hindi)
targ_tokenizer = Tokenizer(filters='')
targ_tokenizer.fit_on_texts(hindi_sentences)
target_seq = targ_tokenizer.texts_to_sequences(hindi_sentences)
max_target_len = max(len(seq) for seq in target_seq)
target_seq = pad_sequences(target_seq, maxlen=max_target_len, padding='post')

# Prepare decoder input and output
decoder_input_seq = target_seq[:, :-1]
decoder_output_seq = target_seq[:, 1:]


In [34]:
vocab_inp_size = len(inp_tokenizer.word_index) + 1
vocab_tar_size = len(targ_tokenizer.word_index) + 1
embedding_dim = 64
lstm_units = 256

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(vocab_inp_size, embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(lstm_units, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(vocab_tar_size, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_tar_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define and compile model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


In [35]:
model.fit(
    [input_seq, decoder_input_seq],
    np.expand_dims(decoder_output_seq, -1),
    batch_size=2,
    epochs=300,
    verbose=1
)


Epoch 1/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 17ms/step - loss: 3.9858
Epoch 2/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 2.4637
Epoch 3/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 2.2163
Epoch 4/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 2.1560
Epoch 5/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 1.9460
Epoch 6/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 1.7228
Epoch 7/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 1.9371
Epoch 8/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 1.8279
Epoch 9/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 1.7290
Epoch 10/300
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - lo

<keras.src.callbacks.history.History at 0x1c1c9ce9850>

In [36]:
# Encoder model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_inputs_single = Input(shape=(1,))
dec_emb2 = dec_emb_layer(decoder_inputs_single)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)


In [37]:
reverse_target_word_index = {i: word for word, i in targ_tokenizer.word_index.items()}
start_token = targ_tokenizer.word_index['<start>']
end_token = targ_tokenizer.word_index['<end>']

def translate(sentence):
    seq = inp_tokenizer.texts_to_sequences([sentence.lower()])
    seq = pad_sequences(seq, maxlen=max_input_len, padding='post')
    states_value = encoder_model.predict(seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = start_token

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index.get(sampled_token_index, '')

        if sampled_word == '<end>' or len(decoded_sentence.split()) > max_target_len:
            stop_condition = True
        else:
            decoded_sentence += sampled_word + ' '

        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()


In [40]:
print(translate("good night"))
print(translate("how are you"))
print(translate("my name is chutiya"))
print(translate("can you help me"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
शुभ रात्रि
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
आप कैसे हैं
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[