In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Read the parallel corpus CSV into a DataFrame.
corpus_csv = '/content/Hindi_English_Truncated_Corpus.csv'
data = pd.read_csv(corpus_csv)

In [2]:
# Show the column labels of the loaded DataFrame.
print(data.columns)

Index(['source', 'english_sentence', 'hindi_sentence'], dtype='object')


In [3]:
# Extract the parallel sentence columns ('english_sentence' / 'hindi_sentence')
# as plain Python lists.
# Rows missing either side are dropped first: a NaN cell would otherwise be
# stringified by preprocess() into the literal token "nan" and paired with a
# real sentence on the other side. Dropping whole rows keeps both lists aligned.
sentence_pairs = data.dropna(subset=["english_sentence", "hindi_sentence"])
English_sent = sentence_pairs["english_sentence"].tolist()
Hindi_sent = sentence_pairs["hindi_sentence"].tolist()



In [4]:
# Preprocessing
def preprocess(sentences):
    """Tokenize a collection of sentences and pad them to a common length.

    Args:
        sentences: Iterable of sentences. Every element is passed through
            ``str()`` first, so non-string entries are tokenized by their
            string form.

    Returns:
        A ``(padded_sequences, tokenizer)`` pair: the integer-encoded
        sentences post-padded to the length of the longest encoded sentence,
        and the Keras ``Tokenizer`` that was fitted on them.

    Raises:
        ValueError: If ``sentences`` is empty (there is no longest sequence).
    """
    texts = list(map(str, sentences))

    tokenizer = keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    encoded = tokenizer.texts_to_sequences(texts)

    # Pad every sequence on the right up to the longest one in this batch.
    longest = max(map(len, encoded))
    padded_sequences = keras.preprocessing.sequence.pad_sequences(
        encoded, maxlen=longest, padding='post'
    )
    return padded_sequences, tokenizer

# Tokenize and pad each side of the corpus; keep the fitted tokenizers so the
# vocabulary sizes can be derived from them.
X, eng_tokenizer = preprocess(English_sent)
y, hin_tokenizer = preprocess(Hindi_sent)

# Drop the final time step of every padded Hindi sequence and append a
# trailing singleton axis, giving shape (num_sentences, max_len - 1, 1).
# NOTE(review): this cell produces a single Hindi array; no one-step offset
# between decoder input and decoder target is created here — confirm that the
# training cell builds the shifted pair (and start/end markers) it needs.
y = y[:, :-1, np.newaxis]

In [5]:
# Model hyperparameters: embedding width and LSTM hidden size.
embedding_dim, units = 256, 512

# Vocabulary sizes are one larger than the number of words each tokenizer
# learned. Presumably the extra slot covers index 0, which is not assigned to
# any word — confirm against the Tokenizer documentation.
vocab_size_eng = 1 + len(eng_tokenizer.word_index)
vocab_size_hin = 1 + len(hin_tokenizer.word_index)

In [6]:
# Encoder: embeds the source token ids and keeps only the final LSTM state
# (hidden + cell), which seeds the decoder.
# mask_zero=True marks token id 0 (the value used for post-padding) as padding
# so the LSTM skips those steps. Without it, the returned state is computed
# after running over every trailing pad position of the padded sentence.
encoder_inputs = layers.Input(shape=(None,))
encoder_embedding = layers.Embedding(vocab_size_eng, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: embeds the target-side token ids, runs an LSTM initialised with the
# encoder state, and projects every time step onto the Hindi vocabulary.
# The mask from the embedding propagates through the LSTM and Dense layers so
# padded time steps are excluded from the loss and metrics.
decoder_inputs = layers.Input(shape=(None,))
decoder_embedding = layers.Embedding(vocab_size_hin, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = layers.LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = layers.Dense(vocab_size_hin, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [7]:
# Assemble the training graph: (encoder tokens, decoder tokens) -> per-step
# softmax over the Hindi vocabulary.
model = keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

# Targets are integer token ids, hence the sparse cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
