<a href="https://colab.research.google.com/github/CaptaiN785/DL-Notebooks/blob/main/Enc-Dec-language-translation/en_hi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset link: <a href = "https://www.kaggle.com/code/aiswaryaramachandran/english-to-hindi-neural-machine-translation/data" >Click here </a>

In [2]:
import zipfile
file = zipfile.ZipFile("Hindi_English_Truncated_Corpus.csv.zip")
file.extractall()
file.close()

In [3]:
# Importing some basic library
import pandas as pd
import numpy as np
from string import punctuation, digits
import re
from sklearn.model_selection import train_test_split


from tensorflow.keras import layers, models

In [4]:
## Reading the dataframe
df = pd.read_csv("Hindi_English_Truncated_Corpus.csv")
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [5]:
df['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [6]:
df.shape

(127607, 3)

In [7]:
df.rename({'english_sentence':'eng', 'hindi_sentence':'hin'}, axis=1, inplace = True)
df.head(2)

Unnamed: 0,source,eng,hin
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...


In [8]:
## Checking the null values

df.isna().sum()

source    0
eng       2
hin       0
dtype: int64

In [9]:
df.dropna(axis = 0, inplace = True)
df.isna().sum()

source    0
eng       0
hin       0
dtype: int64

In [10]:
## Cheking the duplicates
print(df.duplicated().sum())
# there are 2778 duplicated rows
df.drop_duplicates(inplace = True, keep = 'first')
print(df.duplicated().sum())

2778
0


In [11]:
## Dropping the source values as of no use 
df.drop('source', axis = 1, inplace = True)
df.columns

Index(['eng', 'hin'], dtype='object')

## Text preprocessing

In [12]:
## Lowering the words

df['eng'] = df['eng'].apply(lambda x : x.lower())
df['hin'] = df['hin'].apply(lambda x : x.lower())

In [13]:
punc = set(punctuation)

df['eng'] = df['eng'].apply(lambda x : ''.join(ch for ch in x if ch not in punc))
df['hin'] = df['hin'].apply(lambda x : ''.join(ch for ch in x if ch not in punc))

In [14]:
hindi_digits = '२३०८१५७९४६'
digit = hindi_digits+digits
dig = set(digit)
df['eng'] = df['eng'].apply(lambda x : ''.join(ch for ch in x if ch not in dig))
df['hin'] = df['hin'].apply(lambda x : ''.join(ch for ch in x if ch not in dig))

In [15]:
df.head()

Unnamed: 0,eng,hin
0,politicians do not have permission to do what ...,राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करन...
1,id like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी
2,this percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that theyre bad at not ...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,the ending portion of these vedas is called up...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [16]:
## Remove the all extra spaces from sentences

df['eng'] = df['eng'].apply(lambda x : re.sub(r'\s+', ' ', x.strip()))
df['hin'] = df['hin'].apply(lambda x : re.sub(r'\s+', ' ', x.strip()))

## Adding extra start and end tokens to decoder sentence

In [17]:
## start as <START>
## end as <END>

df['hin'] = df['hin'].apply(lambda x : "<START> " + x + " <END>")
df['hin'].iloc[0]

'<START> राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करने कि अनुमति नहीं है <END>'

In [18]:
## Generating the total unique words
eng_words = set()

for sentence in df['eng']:
    for word in sentence.split():
        eng_words.add(word)

hin_words = set()
for sentence in df['hin']:
    for word in sentence.split():
        hin_words.add(word)

print("English words len: ", len(eng_words))
print("Hindi words len: ", len(hin_words))

English words len:  72686
Hindi words len:  81977


In [19]:
## Storing the sentece length for each sentence

df['eng_len'] = df['eng'].apply(lambda x : len(x.split()))
df['hin_len'] = df['hin'].apply(lambda x : len(x.split()))

In [20]:
df.head()

Unnamed: 0,eng,hin,eng_len,hin_len
0,politicians do not have permission to do what ...,<START> राजनीतिज्ञों के पास जो कार्य करना चाहि...,12,15
1,id like to tell you about one such child,<START> मई आपको ऐसे ही एक बच्चे के बारे में बत...,9,13
2,this percentage is even greater than the perce...,<START> यह प्रतिशत भारत में हिन्दुओं प्रतिशत स...,10,11
3,what we really mean is that theyre bad at not ...,<START> हम ये नहीं कहना चाहते कि वो ध्यान नहीं...,12,13
4,the ending portion of these vedas is called up...,<START> इन्हीं वेदों का अंतिम भाग उपनिषद कहलात...,9,10


In [21]:
## Checking the value counts of legth of each sentence
## Taking only sentence length <= 30 from both english and hindi

df = df[df['eng_len'] <= 30]
df = df[df['hin_len'] <= 30]

In [22]:
df['eng_len'].max(), df['hin_len'].max()

(30, 30)

In [23]:
## Saving the dataframe now
df.to_csv("data.csv", index = False, header = True)

In [24]:
## Defined maximum sentence length
max_eng_len = 30
max_hin_len = 30

In [25]:
## Storing the word in ascending form 
sort_eng_words = sorted(list(eng_words))
sort_hin_words = sorted(list(hin_words))

num_eng_words = len(eng_words)
num_hin_words = len(hin_words)
num_eng_words, num_hin_words

(72686, 81977)

In [26]:
## Decoder sequence is zero padded so increase the length of num. of words in hindi words
num_hin_words += 1

## Tokenizing the words

In [27]:
eng_word_tokens = dict([(word, i) for i, word in enumerate(sort_eng_words)])
hin_word_tokens = dict([(word, i) for i, word in enumerate(sort_hin_words)])

In [28]:
## Storing the index to word
eng_index_words = dict([(i, word) for i, word in enumerate(sort_eng_words)])
hin_index_words = dict([(i, word) for i, word in enumerate(sort_hin_words)])

## Splitting the data into train-test-split

In [29]:
X = df['eng']
y = df['hin']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [30]:
X_train.shape, X_test.shape

((84156,), (21040,))

In [31]:
## Generating the batch of data
def generate_batch(X, y, batch_size = 128):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input = np.zeros((batch_size, max_eng_len), dtype = 'float32')
            decoder_input = np.zeros((batch_size, max_hin_len), dtype = 'float32')
            decoder_output = np.zeros((batch_size, max_hin_len, num_hin_words), dtype = 'float32')

            for i, (input_text, output_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                
                for t, word in enumerate(input_text.split()):
                    encoder_input[i, t] = eng_word_tokens[word]
                
                for t, word in enumerate(output_text.split()):
                    ## Here is the matter of timestamp
                    # ie. starts include in decoder input but on in decoder output
                    # and end goes into decoder output but not in decoder input

                    ## Adding word in decoder input where end is not included
                    if t < len(output_text.split()) - 1:
                        decoder_input[i, t] = hin_word_tokens[word]
                    
                    ## Now adding word in decoder output
                    ## Here start is not included in it.
                    if t > 0:
                        decoder_output[i, t-1, hin_word_tokens[word]] = 1

            yield([encoder_input, decoder_input], decoder_output)


In [42]:
## Encoder achitecture

embedding_dim = 300

encoder_input = layers.Input(shape=(None, ))
encoder_embedding = layers.Embedding(num_eng_words, embedding_dim, mask_zero = True)(encoder_input)
## Using LSTM layer
encoder_lstm = layers.LSTM(embedding_dim, return_state=True)

encoder_output, state_h, state_c = encoder_lstm(encoder_embedding)

## Here only state_h and state_c is required
encoder_states = [state_h, state_c]

In [45]:
## Decoder architecture

decoder_input = layers.Input(shape=(None, ))
decoder_embedding = layers.Embedding(num_hin_words, embedding_dim, mask_zero = True)

decoder_emb_output = decoder_embedding(decoder_input)

## Adding decoder LSTM
decoder_lstm = layers.LSTM(embedding_dim, return_state = True, return_sequences=True)
decoder_output, _, _ = decoder_lstm(decoder_emb_output, initial_state = encoder_states)

decoder_dense = layers.Dense(num_hin_words, activation = 'softmax')
decoder_output = decoder_dense(decoder_output)

In [39]:
model = models.Model([encoder_input, decoder_input], decoder_output)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 300)    21805800    ['input_1[0][0]']                
                                                                                                  
 embedding_2 (Embedding)        (None, None, 300)    24593400    ['input_3[0][0]']                
                                                                                            

In [40]:
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy')

In [36]:
train_sample = len(X_train)
test_sample = len(X_test)
batch_size = 128
epochs = 100

In [None]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size),
                    steps_per_epoch = train_sample//batch_size,
                    validation_data = generate_batch(X_test, y_test, batch_size),
                    validation_steps = test_sample//batch_size,
                    epochs = epochs
                    )

In [47]:
## Encoder model
encoder_model = models.Model(encoder_input, encoder_states)

## Setting up decoder
## Decoder input
decoder_state_input_h = layers.Input(shape = (embedding_dim, ))
decoder_state_input_c = layers.Input(shape = (embedding_dim, ))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]


## Decoder output

decoder_embedding_2 = decoder_embedding(decoder_input)

decoder_output2, decoder_state_h2, decoder_state_c2 = decoder_lstm(decoder_embedding_2, initial_state = decoder_states_inputs)
decoder_state2 = [decoder_state_h2, decoder_state_c2]
decoder_output2 = decoder_dense(decoder_output2)

decoder_model = models.Model([decoder_input ] + decoder_states_inputs, 
                             [decoder_output2] + decoder_state2
                             )


In [48]:
def decode_sequence(input_seq):
    # Encode the sequence as state vector
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_word_tokens['<START>']

    ## Running loop for batch of sequence
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:

        output_token, h, c = decoder_model.predict([target_seq] + states_value)

        sample_token_index = np.argmax(output_token[0, -1, :])
        sample_char = hin_index_words[sample_token_index]
        decoded_sentence += " "+ sample_char

        if sample_char == '<END>' or len(decoded_sentence) > 60:
            stop_condition = True
        
        # Update the target sequence 
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sample_token_index

        # Update states
        states_value = [h, c]
    return decoded_sentence