In [40]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
# Load the parallel English-Marathi corpus.
# Rows missing either side are dropped *before* the string cast: str() on a NaN cell
# yields the literal text "nan", which would otherwise be tokenised and trained on as
# if it were a real sentence.
data = pd.read_csv("engtomar.csv")
sentence_pairs = data.dropna(subset=["English", "Marathi"])

# Cast to str so non-text cells (e.g. numbers) become plain strings for the tokenizers;
# both lists come from the same filtered frame, so index i is the same pair in each.
english_sentences = sentence_pairs["English"].astype(str).tolist()
marathi_sentences = sentence_pairs["Marathi"].astype(str).tolist()

In [41]:
# One word-level tokenizer per language, each fitted on its own side of the corpus.
tokenizer_eng = Tokenizer()
tokenizer_ma = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
tokenizer_ma.fit_on_texts(marathi_sentences)

# Sentences -> lists of integer word ids.
eng_seq = tokenizer_eng.texts_to_sequences(english_sentences)
ma_seq = tokenizer_ma.texts_to_sequences(marathi_sentences)

# Vocabulary sizes: one more than the number of known words, leaving room for id 0,
# which is the value pad_sequences fills with below.
vocab_size_eng = 1 + len(tokenizer_eng.word_index)
vocab_size_ma = 1 + len(tokenizer_ma.word_index)

# Padding: both languages share a single length (the longest sequence on either side),
# and zeros are appended after the real tokens.
max_length = max(map(len, eng_seq + ma_seq))
eng_seq_padded, ma_seq_padded = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (eng_seq, ma_seq)
)

In [42]:
# Layer sizes: width of each token embedding and of the LSTM state vectors
# (the same `units` value is used for the encoder and the decoder LSTM).
embedding_dim = 256
units = 512

# Encoder
# Embeds the English token ids and runs them through an LSTM. With return_state=True
# the layer also returns its final hidden and cell states; only those two states are
# passed on (encoder_outputs is not used again in this cell).
encoder_inputs = Input(shape=(max_length,))
enc_emb = Embedding(input_dim=vocab_size_eng, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
# Second input of the same fixed length, embedded with a table sized for the Marathi
# vocabulary. The decoder LSTM starts from the encoder's final states and, because of
# return_sequences=True, emits one vector per time step; the Dense softmax turns each
# of those into a distribution over vocab_size_ma ids.
# NOTE(review): what is actually fed into decoder_inputs is decided by the fit/predict
# calls, not here - confirm it matches the Marathi-sized embedding table.
decoder_inputs = Input(shape=(max_length,))
dec_emb_layer = Embedding(input_dim=vocab_size_ma, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_ma, activation='softmax')
output = decoder_dense(decoder_outputs)

# Model
# Two inputs (encoder ids, decoder ids) -> per-position softmax over Marathi ids.
# The sparse categorical loss takes integer target ids, so targets are not one-hot encoded.
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])


In [43]:
# Hold out 20% of the sentence pairs for validation. random_state pins the shuffle so the
# same pairs end up in the validation set on every run; without it the split (and hence
# the reported validation metrics) changes each time the cell is executed.
X_train, X_val, y_train, y_val = train_test_split(
    eng_seq_padded, ma_seq_padded, test_size=0.2, random_state=42
)

# NOTE(review): the padded English ids are passed as BOTH model inputs, so the decoder is
# never shown the Marathi target and English ids are looked up in the Marathi-sized
# embedding table. translate_sentence feeds the model the same way at inference time, so
# the two call sites must be changed together if this is moved to shifted-target
# (teacher-forcing) decoder inputs.
model.fit(
    [X_train, X_train],
    y_train,
    validation_data=([X_val, X_val], y_val),
    epochs=50,
    batch_size=64,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x295912550>

In [44]:
# Persist the trained model to disk under the name below.
# NOTE(review): the two tokenizers are not saved in the visible cells; they would be
# needed alongside this file to use the model outside the current kernel - confirm.
model.save(filepath="translation_model.h5")


  saving_api.save_model(


In [None]:
def translate_sentence(sentence):
    """Translate one English sentence into Marathi with the trained model.

    The sentence is tokenised with ``tokenizer_eng``, post-padded to ``max_length`` and
    passed to ``model`` as both the encoder and the decoder input (the same way the
    model is fed during training). The most probable Marathi id at each position is
    mapped back to a word through ``tokenizer_ma.index_word``.

    Args:
        sentence: English text to translate.

    Returns:
        The predicted Marathi words joined by single spaces. Positions whose predicted
        id has no word (the padding id) are skipped, so the result carries no padding
        whitespace. Returns ``''`` when the tokenizer produces no ids for the input.
    """
    seq = tokenizer_eng.texts_to_sequences([sentence])
    if not seq[0]:
        # Nothing to translate (empty input, or no word produced an id): an all-padding
        # input would only make the model emit an arbitrary sentence.
        return ''

    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    # verbose=0 keeps Keras from printing a progress bar for every single sentence.
    probabilities = model.predict([padded, padded], verbose=0)
    token_ids = np.argmax(probabilities, axis=-1)[0]

    # Keep only ids that map to a real word; previously each padding position was
    # rendered as ' ' and joined, leaving a long run of trailing spaces.
    index_to_word = tokenizer_ma.index_word
    words = [index_to_word[int(i)] for i in token_ids if int(i) in index_to_word]
    return ' '.join(words)
# Interactive demo: read English sentences and print their translations.
# The loop ends on a blank line, on end-of-input, or when the kernel is interrupted, so
# the cell can finish cleanly instead of running until it is killed with a traceback.
while True:
    try:
        input_sentence = input("English sentence (blank line to quit): ")
    except (EOFError, KeyboardInterrupt):
        break
    if not input_sentence.strip():
        break
    translated_sentence = translate_sentence(input_sentence)
    print(f"Input: {input_sentence}")
    print(f"Translated: {translated_sentence}")

How are you
Input: How are you
Translated: तू कसा आहेस                              
I have lot of work
Input: I have lot of work
Translated: मला आज काम आहे                            
Let me go
Input: Let me go
Translated: मला जाऊ द्या                              
