# Mount Google Drive (for Google Colab):

In [None]:
# Mount Google Drive if running in Google Colab.
# Outside Colab the `google.colab` package is not installed, so the import is
# guarded: the notebook then skips mounting instead of failing on this cell.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available; skipping Google Drive mount.')
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Libraries:

In [None]:
# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
import pandas as pd
import re
import string
from string import digits
import numpy as np

# Read and Preprocess Data:

In [None]:
# Read the dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP_LAB/Hindi_English_Truncated_Corpus.csv')

# Keep only sentence pairs where both sides are at most 30 characters long.
# str(x) lets the length check handle non-string cells (e.g. missing values).
is_short = (data['english_sentence'].apply(lambda x: len(str(x)) <= 30)
            & data['hindi_sentence'].apply(lambda x: len(str(x)) <= 30))

# Take an explicit copy of the filtered rows: assigning columns on a filtered
# slice is what triggers pandas' SettingWithCopyWarning.
data = data[is_short].copy()

# Convert sentences to lowercase. Both columns go through str(x) so that a
# non-string cell cannot raise AttributeError on .lower().
data['english_sentence'] = data['english_sentence'].apply(lambda x: str(x).lower())
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: str(x).lower())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['english_sentence'] = data['english_sentence'].apply(lambda x: str(x).lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['hindi_sentence'] = data['hindi_sentence'].apply(lambda x: x.lower())


# Data Cleaning - Remove Quotes, Special Characters, Digits, and Extra Spaces

In [None]:
# Strip apostrophes from both languages
for column in ('english_sentence', 'hindi_sentence'):
    data[column] = data[column].apply(lambda text: text.replace("'", ''))

# ASCII punctuation characters that are dropped from every sentence
to_exclude = set(string.punctuation)


def _drop_punctuation(text):
    """Return `text` without any character contained in `to_exclude`."""
    return ''.join(filter(lambda ch: ch not in to_exclude, text))


# Remove special characters from both languages
for column in ('english_sentence', 'hindi_sentence'):
    data[column] = data[column].apply(_drop_punctuation)

# Delete ASCII digits via a translation table.
# NOTE(review): only the Hindi column is processed here; the English column
# keeps its digits -- confirm this is intended.
remove_digits = str.maketrans('', '', digits)
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda text: text.translate(remove_digits))


# More Data Cleaning - Removing Devanagari Digits and Extra Spaces

In [None]:
# Drop the Devanagari digit characters from the Hindi sentences
data['hindi_sentence'] = data['hindi_sentence'].apply(lambda text: re.sub("[२३०८१५७९४६]", "", text))


def _tidy_spaces(text):
    """Trim surrounding whitespace, then squeeze each run of spaces into one."""
    return re.sub(" +", " ", text.strip())


# Normalise whitespace in both languages
for column in ('english_sentence', 'hindi_sentence'):
    data[column] = data[column].apply(_tidy_spaces)


# Extract Input and Target Sentences, Tokenization

In [None]:
# Extract input and target sentences and build the word vocabularies.


def _mark_target(sentence):
    """Wrap a Hindi sentence in space-separated 'START_' / '_END' marker tokens.

    The markers must be separated from the sentence by spaces: everything
    downstream tokenises with str.split(), so 'START_' + hin + '_END' would
    glue the markers onto the first and last words instead of producing
    standalone tokens. Sentences that are already marked are returned
    unchanged, so re-running this cell does not add the markers twice.
    """
    if sentence.startswith('START_ '):
        return sentence
    return 'START_ ' + sentence + ' _END'


# Store the marked targets back on the DataFrame so that every later consumer
# of data.hindi_sentence (train/test split, batch generator) sees the same
# START_/_END-delimited sentences that the vocabulary is built from.
data['hindi_sentence'] = data['hindi_sentence'].apply(_mark_target)

input_text = data['english_sentence'].tolist()
target_text = data['hindi_sentence'].tolist()

# Despite the "characters" in their names, these sets hold whitespace-separated
# words. The target vocabulary is built from the marked sentences, so it also
# contains the 'START_' and '_END' tokens looked up during decoding.
input_characters = set()
target_characters = set()
for eng, hin in zip(input_text, target_text):
    input_characters.update(eng.split())
    target_characters.update(hin.split())


# Dataset Statistics and Token Index Dictionaries

In [None]:
# Create token index dictionaries
input_char = sorted(input_characters)
target_char = sorted(target_characters)

# Token ids start at 1 (0 is left free for padding), so the largest id equals
# the vocabulary size. An embedding table indexed by these ids therefore needs
# vocabulary size + 1 rows -- for the encoder as well as the decoder.
num_encoder_tokens = len(input_characters) + 1  # Add 1 for padding token
num_decoder_tokens = len(target_characters) + 1  # Add 1 for padding token

# Sequences are indexed by word position, so their lengths are measured in
# whitespace-separated tokens rather than characters. default=0 keeps an empty
# dataset from raising ValueError.
max_encoder_seq_length = max((len(txt.split()) for txt in input_text), default=0)
max_decoder_seq_length = max((len(txt.split()) for txt in target_text), default=0)

input_token_index = {word: i + 1 for i, word in enumerate(input_char)}
target_token_index = {word: i + 1 for i, word in enumerate(target_char)}

reverse_input_char_index = {i: word for word, i in input_token_index.items()}
reverse_target_char_index = {i: word for word, i in target_token_index.items()}

# Print dataset statistics
print("Number of samples:", len(input_text))
print("Number of unique input tokens:", len(input_characters))
print("Number of unique output tokens:", len(target_characters))
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)


Number of samples: 18416
Number of unique input tokens: 9729
Number of unique output tokens: 8665
Max sequence length for inputs: 30
Max sequence length for outputs: 40


# Save and Load Token Index Dictionaries

In [None]:
# Save token index dictionaries to files using pickle
import pickle

_token_index_files = (
    ('eng_input_token_index.pickle', input_token_index),
    ('hin_target_token_index.pickle', target_token_index),
    ('eng_reverse_input_char_index.pickle', reverse_input_char_index),
    ('hin_reverse_target_char_index.pickle', reverse_target_char_index),
)

# Write each dictionary inside a `with` block so the file is flushed and closed
# before it is read back below.
for _path, _mapping in _token_index_files:
    with open(_path, 'wb') as fp:
        pickle.dump(_mapping, fp, protocol=pickle.HIGHEST_PROTOCOL)

# Load token index dictionaries from files using pickle.
# These files were written by this notebook just above; pickle.load must not be
# pointed at files from an untrusted source.
with open('eng_input_token_index.pickle', 'rb') as fp:
    input_token_index = pickle.load(fp)
with open('hin_target_token_index.pickle', 'rb') as fp:
    target_token_index = pickle.load(fp)
with open('eng_reverse_input_char_index.pickle', 'rb') as fp:
    reverse_input_char_index = pickle.load(fp)
with open('hin_reverse_target_char_index.pickle', 'rb') as fp:
    reverse_target_char_index = pickle.load(fp)

# Split Data into Train and Test Sets

In [None]:
# Hold out 10% of the sentence pairs as a test set; the fixed random_state
# keeps the split identical between runs.
from sklearn.model_selection import train_test_split

X = data['english_sentence']
y = data['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.1)


# Define a Generator Function for Training Batches

In [None]:
# Define a generator function to generate training batches
def generate_batch(X, y, batch_size):
    """Yield padded batches for the encoder-decoder model, looping forever.

    Args:
        X: sliceable sequence of source sentences (whitespace-separated words).
        y: sliceable sequence of target sentences, aligned with ``X``.
        batch_size: number of rows in every yielded array.

    Yields:
        ``([encoder_input_data, decoder_input_data], decoder_target_data)``:
        * encoder_input_data, shape (batch_size, max_encoder_seq_length):
          source word ids, zero-padded.
        * decoder_input_data, shape (batch_size, max_decoder_seq_length):
          ids of every target word except the last one, zero-padded.
        * decoder_target_data, shape (batch_size, max_decoder_seq_length,
          num_decoder_tokens): one-hot encoding of every target word except
          the first, i.e. the decoder input shifted one step ahead.

        A final slice shorter than ``batch_size`` leaves its remaining rows
        all zero.

    Raises:
        KeyError: if a word is missing from ``input_token_index`` /
            ``target_token_index``.
    """
    while True:
        for start in range(0, len(X), batch_size):
            stop = start + batch_size
            encoder_input_data = np.zeros((batch_size, max_encoder_seq_length), dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_decoder_seq_length), dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_decoder_seq_length, num_decoder_tokens), dtype='float32')
            for row, (source_sentence, target_sentence) in enumerate(zip(X[start:stop], y[start:stop])):
                for t, word in enumerate(source_sentence.split()):
                    encoder_input_data[row, t] = input_token_index[word]  # encoder input seq

                # Tokenise the target once instead of re-splitting it on every step.
                target_words = target_sentence.split()
                last_position = len(target_words) - 1
                for t, word in enumerate(target_words):
                    token_id = target_token_index[word]
                    if t < last_position:
                        decoder_input_data[row, t] = token_id  # decoder input seq
                    if t > 0:
                        # The target at step t - 1 is the word the decoder reads at step t.
                        decoder_target_data[row, t - 1, token_id] = 1
            yield ([encoder_input_data, decoder_input_data], decoder_target_data)


# Define Model Parameters and Encoder Layers

In [None]:
# Define model parameters
# latent_dim is used both as the embedding size and as the LSTM state size.
latent_dim = 50

# Decoder vocabulary size: one entry per target token plus one extra slot
# (token ids are assigned starting from 1).
num_decoder_tokens = len(target_characters) + 1

# Define the encoder
# Variable-length sequences of source token ids.
encoder_inputs = Input(shape=(None,))
# mask_zero=True makes id 0 act as padding.
# NOTE(review): token ids start at 1, so the embedding's input_dim has to be
# at least (largest id + 1) -- confirm num_encoder_tokens accounts for that.
enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)
# return_state=True exposes the final hidden and cell states.
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# Only the final states are handed to the decoder as its initial state.
encoder_states = [state_h, state_c]


# Define Decoder Layers and Model

In [None]:
# Define the decoder
# Variable-length sequences of target token ids.
decoder_inputs = Input(shape=(None,))
# The embedding, LSTM and dense layers are kept in named variables so the
# same layer objects can be called again on other inputs.
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
# return_sequences=True yields an output per time step; the returned states
# are discarded here.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final hidden/cell states.
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
# Softmax over the target vocabulary at every time step.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
# Inputs: [source ids, target ids]; output: per-step distribution over target tokens.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


# Compile Model with Custom Learning Rate Optimizer

In [None]:
# Adam optimizer with an explicit learning rate (change the value here to tune it)
custom_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# One-hot targets are scored with categorical cross-entropy; accuracy is tracked as 'acc'.
model.compile(
    loss='categorical_crossentropy',
    optimizer=custom_optimizer,
    metrics=['acc'],
)


# Print Model Summary

In [None]:
# Print the layer-by-layer model summary (output shapes and parameter counts)
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, None, 50)             486450    ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, None, 50)             433300    ['input_2[0][0]']             
                                                                                              

# Define Training Parameters

In [None]:
# Training configuration
batch_size = 128  # sentence pairs per batch; adjust as needed
epochs = 45  # number of training epochs; adjust as needed

# Dataset sizes, used to derive the number of steps per epoch
train_samples = len(X_train)
val_samples = len(X_test)


# Train the Model

In [None]:
# Train with Model.fit, which accepts Python generators directly.
# Model.fit_generator is deprecated and has been removed from newer
# TensorFlow/Keras releases.
# The generators loop forever, so the step counts define where an epoch and a
# validation pass end.
history = model.fit(
    generate_batch(X_train, y_train, batch_size=batch_size),
    steps_per_epoch=train_samples // batch_size,
    epochs=epochs,
    validation_data=generate_batch(X_test, y_test, batch_size=batch_size),
    validation_steps=val_samples // batch_size,
)


  model.fit_generator(


Epoch 1/45
 16/129 [==>...........................] - ETA: 3:39 - loss: 9.0055 - acc: 0.0476

InvalidArgumentError: ignored


# Save Model Weights

In [None]:
# Save model weights (weights only, not the architecture) to an HDF5 file
# in the current working directory
model.save_weights('nmt_eng_hin_translation.h5')


# Define Encoder Model

In [None]:
# Define the encoder model
# Maps a source id sequence to the encoder LSTM's final [hidden, cell] states,
# reusing the encoder layers defined for training.
encoder_model = Model(encoder_inputs, encoder_states)

# Define Decoder Model and Decoding Function

In [None]:
# Define the decoder model
# The decoder's previous hidden/cell states are fed in as explicit inputs so
# the model can be called repeatedly, carrying the states from call to call.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the same embedding, LSTM and dense layer objects as the training model.
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Inputs: [target ids, h, c]; outputs: [token probabilities, new h, new c].
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

# Define a function to decode sequences
def decode_sequence(input_seq):
    """Greedily decode one encoded source sequence into a target sentence.

    The encoder states seed the decoder, which is then run one token at a
    time starting from 'START_'. At each step the most probable token is
    taken and fed back in as the next decoder input.

    Args:
        input_seq: encoder input for a single sentence, as accepted by
            ``encoder_model.predict``.

    Returns:
        The decoded words joined by single spaces (without the start/end
        markers).

    Raises:
        KeyError: if 'START_' is missing from ``target_token_index``.
    """
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']
    decoded_words = []

    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Ids without a vocabulary entry (e.g. the padding id 0) map to None
        # and end decoding instead of raising KeyError.
        sampled_char = reverse_target_char_index.get(sampled_token_index)

        if (sampled_char is None
                or sampled_char == '_END'
                or len(decoded_words) > max_decoder_seq_length):
            break

        decoded_words.append(sampled_char)
        # Feed the sampled token and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)


# Generate Validation Data and Decode Sequences

In [None]:
# Generate validation samples one at a time
val_gen = generate_batch(X_test, y_test, batch_size=1)
k = -1

# Advance to the next validation sample and decode it.
# k must grow by exactly one per next(val_gen) call so that the sentences
# printed below are the ones belonging to the sequence that was decoded.
k += 1
(input_seq, actual_output), _ = next(val_gen)
decoded_sentence = decode_sequence(input_seq)

# Print results
print('Input English sentence:', X_test[k:k+1].values[0])
print('Actual Hindi Translation:', y_test[k:k+1].values[0])
print('Predicted Hindi Translation:', decoded_sentence)