Seq2Seq architecture is a type of moany-to-many sequence modeling and is commonly used for a variety of tasks such 
as Text-Summarization, ChatBot Development, Conversational Modeling, and Neural Machine Translation, etc,

In [2]:
# import all required libraries 

# keras
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# others
import os,sys
import numpy as np
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [3]:
# set values for different parameters

BATCH_SIZE = 64
EPOCHS = 20
LSTM_NODES = 256
NUM_SENTENCES = 20000
MAX_SENTENCE_LENGTH = 50
MAX_NUM_WORDS = 20000
EMBEDDING_SIZE = 100

the data set- Translate English sentences to thier French language 
fra-eng.zip-> fra.text file, on each line the text file contians an English sentence and its French translation

model contains more than 170,000 records but we will only use the first 20,000 reocrds to tain our model

# Seq2Seq Model

Seq2Seq architecture is an encoder-decoder architecture which consists of two LSTM networks- the encoder LSTM and the decoder LSTM

The input to the encoder LSTM is the sentence in the original language.
The input to the decoder LSTM is the sentence in the translated language with a start-of-sentence token

The ouput is the actual target sentence with an end-of-sentence token

# Data Preprocessing

In [4]:
# we need to generate two copies of the trenslated sentence- one with 
# start-of-sentence token and the other with the end-of-sentence token

input_sentences = []
output_sentences1 = []
output_sentences_inputs1 = []

count = 0

for line in open(r'/home/bukya/Downloads/fra-eng/fra.txt', encoding="utf-8"):
    count += 1
    
    
    if count > NUM_SENTENCES:
        break
        
    if '\t' not in line:
        continue
    lines = line.rstrip().split('\t')
    input_sentence, output = lines[0],lines[1]

    output_sentence = output + ' <eos>'
    output_sentences_input = '<sos> ' + output
    
    input_sentences.append(input_sentence)
    output_sentences1.append(output_sentence)
    output_sentences_inputs1.append(output_sentences_input)

Each line is split into two substrings at the position where the tab occurs. The left substring (the English sentence) is inserted into the input_sentences[] list. The substring to the right of the tab is the corresponding translated French sentence. The <eos> token, which marks the end-of-sentence is prefixed to the translated sentence, and the resultant sentence is appended to the output_sentences[] list. Similarly, the <sos> token, which stands for "start of sentence", is concatenated at the start of the translated sentence and the result is added to the output_sentences_inputs[]

In [5]:
output_sentences_inputs1

['<sos> Va !',
 '<sos> Salut !',
 '<sos> Salut.',
 '<sos> Cours\u202f!',
 '<sos> Courez\u202f!',
 '<sos> Qui ?',
 '<sos> Ça alors\u202f!',
 '<sos> Au feu !',
 "<sos> À l'aide\u202f!",
 '<sos> Saute.',
 '<sos> Ça suffit\u202f!',
 '<sos> Stop\u202f!',
 '<sos> Arrête-toi !',
 '<sos> Attends !',
 '<sos> Attendez !',
 '<sos> Poursuis.',
 '<sos> Continuez.',
 '<sos> Poursuivez.',
 '<sos> Bonjour !',
 '<sos> Salut !',
 '<sos> Je comprends.',
 "<sos> J'essaye.",
 "<sos> J'ai gagné !",
 "<sos> Je l'ai emporté !",
 '<sos> J’ai gagné.',
 '<sos> Oh non !',
 '<sos> Attaque !',
 '<sos> Attaquez !',
 '<sos> Santé !',
 '<sos> À votre santé !',
 '<sos> Merci !',
 '<sos> Tchin-tchin !',
 '<sos> Lève-toi.',
 '<sos> Va, maintenant.',
 '<sos> Allez-y maintenant.',
 '<sos> Vas-y maintenant.',
 "<sos> J'ai pigé !",
 '<sos> Compris !',
 '<sos> Pigé\u202f?',
 '<sos> Compris\u202f?',
 "<sos> T'as capté\u202f?",
 '<sos> Monte.',
 '<sos> Montez.',
 '<sos> Serre-moi dans tes bras !',
 '<sos> Serrez-moi dans vos br

In [6]:
# \u202f and \u2009 remove 
import re

def clean_text(text):
            text = re.sub(r"\u202f", " ", text)
            text = re.sub(r"\u2009", " ", text)

            return text

In [7]:
output_sentences= []
output_sentences_inputs = []

for i in range(20000):
    output_sentences.append(clean_text(output_sentences1[i]))
    output_sentences_inputs.append(clean_text(output_sentences_inputs1[i]))

In [8]:
output_sentences_inputs

['<sos> Va !',
 '<sos> Salut !',
 '<sos> Salut.',
 '<sos> Cours !',
 '<sos> Courez !',
 '<sos> Qui ?',
 '<sos> Ça alors !',
 '<sos> Au feu !',
 "<sos> À l'aide !",
 '<sos> Saute.',
 '<sos> Ça suffit !',
 '<sos> Stop !',
 '<sos> Arrête-toi !',
 '<sos> Attends !',
 '<sos> Attendez !',
 '<sos> Poursuis.',
 '<sos> Continuez.',
 '<sos> Poursuivez.',
 '<sos> Bonjour !',
 '<sos> Salut !',
 '<sos> Je comprends.',
 "<sos> J'essaye.",
 "<sos> J'ai gagné !",
 "<sos> Je l'ai emporté !",
 '<sos> J’ai gagné.',
 '<sos> Oh non !',
 '<sos> Attaque !',
 '<sos> Attaquez !',
 '<sos> Santé !',
 '<sos> À votre santé !',
 '<sos> Merci !',
 '<sos> Tchin-tchin !',
 '<sos> Lève-toi.',
 '<sos> Va, maintenant.',
 '<sos> Allez-y maintenant.',
 '<sos> Vas-y maintenant.',
 "<sos> J'ai pigé !",
 '<sos> Compris !',
 '<sos> Pigé ?',
 '<sos> Compris ?',
 "<sos> T'as capté ?",
 '<sos> Monte.',
 '<sos> Montez.',
 '<sos> Serre-moi dans tes bras !',
 '<sos> Serrez-moi dans vos bras !',
 '<sos> Je suis tombée.',
 '<sos> Je s

In [9]:
input_sentences[172], output_sentences[172], output_sentences_inputs[172]

("I'm ill.", 'Je suis malade. <eos>', '<sos> Je suis malade.')

In [10]:
input_sentences[3], output_sentences[3], output_sentences_inputs[3]

('Run!', 'Cours ! <eos>', '<sos> Cours !')

# Tokenization and padding

applying padding to the sentences that are longer or shorter than a certain length, which in case of inputs will be the length of the longest input sentence. And for the output this will be the length of the longest sentence in the output.

Tokenizer-->> It divides a sentence into the corresponding list of word
, Then it converts the words to integers

In [11]:
# This is extremely important since deep learning and machine learning algorithms work with numbers

# tokenize the input data

input_tokenizer = Tokenizer(num_words = MAX_NUM_WORDS)
input_tokenizer.fit_on_texts(input_sentences)

input_integer_seq = input_tokenizer.texts_to_sequences(input_sentences)

word2idx_inputs = input_tokenizer.word_index
print('Total unique words in the input: %s ' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)

print("length of longest sentence in the input:", max_input_len)

Total unique words in the input: 3517 
length of longest sentence in the input: 6


In [12]:
# similarly for out put data

output_tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
output_tokenizer.fit_on_texts(output_sentences + output_sentences_inputs)
output_integer_seq = output_tokenizer.texts_to_sequences(output_sentences)
output_input_integer_seq = output_tokenizer.texts_to_sequences(output_sentences_inputs)

word2idx_outputs = output_tokenizer.word_index
unique_words_output = len(word2idx_outputs)
print("Total unique words in the output:", unique_words_output)

num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output:", max_out_len)

Total unique words in the output: 9198
Length of longest sentence in the output: 13


Next, we need to pad the input. The reason behind padding the input and the output is that text sentences can be of varying length, however LSTM (the algorithm that we are going to train our model) expects input instances with the same length. Therefore, we need to convert our sentences into fixed-length vectors. One way to do this is via padding.

In [13]:
encoder_input_sequences = pad_sequences(input_integer_seq, maxlen=max_input_len)

print("encoder_input_sequences.shape:", encoder_input_sequences.shape)
print(" example one encoder_input_sequences[172]:", encoder_input_sequences[172])

encoder_input_sequences.shape: (20000, 6)
 example one encoder_input_sequences[172]: [  0   0   0   0   6 536]


In [14]:
# verify the integers

print(word2idx_inputs["i'm"])
print(word2idx_inputs["ill"])

6
536


In [15]:
#In the same way, the decoder outputs and the decoder inputs are padded 

decoder_input_sequences = pad_sequences(output_input_integer_seq, maxlen=max_out_len, padding='post')
print("decoder_input_sequences.shape:", decoder_input_sequences.shape)
print("decoder_input_sequences[172]:", decoder_input_sequences[172])

# zeros are appended at the end of the sentences 

decoder_input_sequences.shape: (20000, 13)
decoder_input_sequences[172]: [  2   3   6 191   0   0   0   0   0   0   0   0   0]


It is further important to mention that in the case of the decoder, the post-padding is applied, which means that zeros are appended at the end of the sentence. In the encoder, zeros were padded at the beginning. The reason behind this approach is that encoder output is based on the words occurring at the end of the sentence, therefore the original words were kept at the end of the sentence and zeros were padded at the beginning. On the other hand, in the case of the decoder, the processing starts from the beginning of a sentence, and therefore post-padding is performed on the decoder inputs and outputs.

# Word Embeddings 

Since we are using deep learning model and deep learning models work with numbers, therefore we need to convert our words into their corresponding numeric vector representations, 

But we already converted our words into integers, so what's the difference between integer representation and word embeddings?

TWO differences

1. With integer representation a word is represented only with a single integer, but with in word embedding we represent a word with a vector, with vector representation a word is represented by a vector of 50,100,200 or whatever dimensions we like.
Hence, word embeddings capture a lot more inforamtion about words.

2. the single integer representation doesn't capture the relationships between different  words. but in word embeddings retain relationships between the words 

we can use custom word embeddings or pretrained word embeddings

In this example we will use Glove word embeddings for English input sentences 
and custom word embeddings for the translated Frnech sentences in the output

In [16]:
# create word embedding for the inputs first
# load Golve vectors and then create a dictionary where words are the keys 
# and the corresponding vectors are values 

from numpy import array, asarray, zeros

embeddings_dictionary = dict()

glove_file = open(r'/home/bukya/snap/firefox/common/Downloads/glove.6B/glove.6B.100d.txt', 
                 encoding = "utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [17]:
embeddings_dictionary['the'] # Glove word embedding for, key ['the'] and matrix is value

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [18]:
# create a matrix where the row number will represent the integer value for the word 
# and the columns will correspond to the dimensions of the word 
# this matrix will contain the word embeddings for the words in our input sentences 

num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embeddings_matrix = zeros((num_words, EMBEDDING_SIZE))

for word, index in word2idx_inputs.items():
    embedding_vector = embeddings_dictionary.get(word)
    
    if embedding_vector is not None:
        embeddings_matrix[index] = embedding_vector

In [19]:
embeddings_dictionary['ill'] # Glove word embedding dictionary for the 'ill' word

array([ 0.12648  ,  0.1366   ,  0.22192  , -0.025204 , -0.7197   ,
        0.66147  ,  0.48509  ,  0.057223 ,  0.13829  , -0.26375  ,
       -0.23647  ,  0.74349  ,  0.46737  , -0.462    ,  0.20031  ,
       -0.26302  ,  0.093948 , -0.61756  , -0.28213  ,  0.1353   ,
        0.28213  ,  0.21813  ,  0.16418  ,  0.22547  , -0.98945  ,
        0.29624  , -0.62476  , -0.29535  ,  0.21534  ,  0.92274  ,
        0.38388  ,  0.55744  , -0.14628  , -0.15674  , -0.51941  ,
        0.25629  , -0.0079678,  0.12998  , -0.029192 ,  0.20868  ,
       -0.55127  ,  0.075353 ,  0.44746  , -0.71046  ,  0.75562  ,
        0.010378 ,  0.095229 ,  0.16673  ,  0.22073  , -0.46562  ,
       -0.10199  , -0.80386  ,  0.45162  ,  0.45183  ,  0.19869  ,
       -1.6571   ,  0.7584   , -0.40298  ,  0.82426  , -0.386    ,
        0.0039546,  0.61318  ,  0.02701  , -0.3308   , -0.095652 ,
       -0.082164 ,  0.7858   ,  0.13394  , -0.32715  , -0.31371  ,
       -0.20247  , -0.73001  , -0.49343  ,  0.56445  ,  0.6103

In [20]:
embeddings_matrix[536] # integer representation for ill is 536 and the word embedding is output

array([ 0.12648   ,  0.1366    ,  0.22192   , -0.025204  , -0.71969998,
        0.66147   ,  0.48508999,  0.057223  ,  0.13829   , -0.26374999,
       -0.23647   ,  0.74348998,  0.46737   , -0.46200001,  0.20031001,
       -0.26302001,  0.093948  , -0.61756003, -0.28213   ,  0.1353    ,
        0.28213   ,  0.21813001,  0.16418   ,  0.22547001, -0.98944998,
        0.29624   , -0.62475997, -0.29534999,  0.21534   ,  0.92273998,
        0.38387999,  0.55743998, -0.14628001, -0.15673999, -0.51941001,
        0.25628999, -0.0079678 ,  0.12998   , -0.029192  ,  0.20868   ,
       -0.55127001,  0.075353  ,  0.44746   , -0.71046001,  0.75562   ,
        0.010378  ,  0.095229  ,  0.16673   ,  0.22073001, -0.46562001,
       -0.10199   , -0.80386001,  0.45162001,  0.45183   ,  0.19869   ,
       -1.65709996,  0.75840002, -0.40298   ,  0.82426   , -0.38600001,
        0.0039546 ,  0.61317998,  0.02701   , -0.3308    , -0.095652  ,
       -0.082164  ,  0.78579998,  0.13394   , -0.32714999, -0.31

In [21]:
# word embedding matrix will be used to create the embedding layer for our LSTM model


# Embedding layer
embedding_layer = Embedding(num_words, EMBEDDING_SIZE, weights=[embeddings_matrix], input_length=max_input_len)

In [22]:

# final shape of the output will be (number of inputs, length of the output sentence, 
# number words in the output)

# create a empty output array

decoder_targets_one_hot = np.zeros((len(input_sentences), max_out_len, num_words_output), dtype = 'uint8')

In [23]:
decoder_targets_one_hot.shape

(20000, 13, 9199)

to make predictions, the final layer of the model will be a dense layer, therfore we need the outputs in the form of one-hot encoded vectors, since we will be using SOFTMAX activation function at the dense layer. To create a such one-hot encoded output, the next step is to assign 1 to the column number that corresponds to the integer representation of the word.

For instance, '<sos> Je suis malade.' is [  2   3   6 191   0   0   0   0   0   0   0   0 0]
In the decoder_targets_one_hot output array, in the 2nd column of the 1st row 1 will be inserted, similarly at the 3rd index of the second row another 1 will be inserted and so on.

In [24]:
# decoder output sequences
decoder_output_sequences = pad_sequences(output_integer_seq, maxlen=max_out_len, padding='post')

In [25]:
for i ,d in enumerate(decoder_output_sequences):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

next we need to create the encoder and decoders the input to the encoder will be the sentence in English, and the output will be the hidden stata and cell of thr LSTM

In [26]:
# Encoder LSTM

encoder_inputs_placeholder = Input(shape=(max_input_len,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(LSTM_NODES, return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h, c]

In [27]:
# next step is to define the decoder, the decoder will have tow inputs-
# the hidden state and cell state from the encoder and 
# the input sequence, which actually will be the output sentence with an <sos> token appended at the beginning 

# Decoder LSTM

decoder_inputs_placeholder = Input(shape=(max_out_len,))

decoder_embedding = Embedding(num_words_output, LSTM_NODES)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

decoder_lstm = LSTM(LSTM_NODES, return_sequences=True, return_state=True)

decoder_outputs, _, _ = decoder_lstm(decoder_inputs_x, initial_state=encoder_states)

In [28]:
# finally the output from the decoder LSTM is passed through a dense layer to predict decoder outputs 

# Dense layer

decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [29]:
# compile the model

model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder], decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [31]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 13)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 6, 100)       351800      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 13, 256)      2354944     input_2[0][0]                    
____________________________________________________________________________________________

From the output, we have two types of input - input_1 and input_2

input_1 is the input placeholder for the encoder, which is embedde dand passed through lstm_1, which basically is th encoder LSTM, 
There are three outputs from the lstm_1 layer: the ouput, the hidden layer and the cell state, However only the cell state and the hidden state are passed to the decoder

lstm_2 layer is the decoder LSTM, input_2 contains the output sentences with <sos> token appended at the start. the input_2 is also passed through the an embedding layer and is used as input to the decoder LSTM, lstm_2
    
Finally, the output from the decoder LSTM is passed through the dense layer to make predictions 

In [32]:
# train our model using fit() method 

r = model.fit([encoder_input_sequences, decoder_input_sequences],
             decoder_targets_one_hot,
             batch_size=BATCH_SIZE,
             epochs=EPOCHS,
             validation_split=0.1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 18000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


model trained on 18,000 records and tested on the remaining 2,000 records 
After 20 epochs, training accuracy 90.71% and validation accuracy 79.35%
which shows that model is Overfitting,

To reduce overfitting, add dropout or more records 

add more records to reduce the overfitting

# Modifying the model for Predictions 

An example of what happens during training is as follows. Suppose we have a sentence i'm ill. The sentence is translated as follows:

i'm ill(i/p English sentence) -------> je suis malade (o/p French entence )

// Inputs on the left of Encoder/Decoder, outputs on the right.

Step 1:
I'm ill -> Encoder -> enc(h1,c1)

enc(h1,c1) + <sos> -> Decoder -> je + dec(h1,c1)

step 2:

dec(h1,c1) + je -> Decoder -> suis + dec(h2,c2)

step 3:

dec(h2,c2) + suis -> Decoder -> malade. + dec(h3,c3)

step 3:

dec(h3,c3) + malade. -> Decoder -> <eos> + dec(h4,c4)

However, during predictions the next word will be predicted on the basis of the previous word, which in turn is also predicted in the previous time-step

An example of what happens during prediction is as follows. We will again translate the sentence i'm ill:

// Inputs on the left of Encoder/Decoder, outputs on the right.

Step 1:

I'm ill -> Encoder -> enc(h1,c1)

enc(h1,c1) + <sos> -> Decoder -> y1(je) + dec(h1,c1)

step 2:

dec(h1,c1) + y1 -> Decoder -> y2(suis) + dec(h2,c2)

step 3:

dec(h2,c2) + y2 -> Decoder -> y3(malade.) + dec(h3,c3)

step 3:

dec(h3,c3) + y3 -> Decoder -> y4(<eos>) + dec(h4,c4)

In [34]:
# Let's modify our model to implement this logic

# encoder model remains same

encoder_model = Model(encoder_inputs_placeholder, encoder_states)

In [35]:
# now ate each step we need the decoder hidden state and cell states
# we will modify our model to accept the hidden and cell states 

decoder_state_input_h = Input(shape=(LSTM_NODES,))
decoder_state_input_c = Input(shape=(LSTM_NODES,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [36]:
# at each time step, there will be only one single word in the decoder input

# we need to modify the decoder embedding layer 

decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

In [37]:
# placeholder for decoder outputs

decoder_outputs, h, c = decoder_lstm(decoder_inputs_single_x, initial_state=decoder_states_inputs)

In [38]:
# to make the predictions, the decoder output is passed through the dense layer

decoder_states = [h, c]
decoder_ouputs = decoder_dense(decoder_outputs)

In [39]:
# final step to define the updated decoder model

decoder_model = Model([decoder_inputs_single] + decoder_states_inputs,
                     [decoder_outputs] + decoder_states)

In [41]:
decoder_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         multiple             2354944     input_5[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 256)          0                                            
____________________________________________________________________________________________

lstm_2 is the modified deocder LSTM

# Making Predictions

In [42]:
# In the tokenization steps, we converted words to integers 
# we will create new dictionaries for both inputs and outputs where 
# the keys will be the integers and the correspomding values will be words

idx2word_input = {v:k for k, v in word2idx_inputs.items()}
idx2word_target = {v:k for k, v in word2idx_outputs.items()}

In [47]:
# we need to create a method to translate sentences 
# the method will acept input-padded sequence English sentence and will 
# return translated French sentence 

def translate_sentence(input_seq):
    states_value = encoder_model.predict(input_seq) # pass the input sequence to encoder model
    
    target_seq = np.zeros((1,1)) # 1 x 1 matrix of all zeros
    target_seq[0, 0] = word2idx_outputs['<sos>'] # target_seq variable contains the   
                                                 # first word to the decoder model which is <sos>
    
    eos = word2idx_outputs['<eos>'] # eos variable initialized, which stores the integer value 
                                    # for the <eos> token
    
    output_sentence = [] # list contain the predicted translation
    
    for _ in range(max_out_len): # no. of execution cycles is length of the longest sentence in output
        
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value) 
        
        # predicts the output and hidden, cell states. using hidden, cell states of encoder and input token <sos>
        
        idx = np.argmax(output_tokens[0, 0, :]) # index of the predicted word stored
        
        if eos == idx: # stop the iteration if <eos> is predicted
            break
            
        if idx > 0: # predicted value is greater than zero 
            
            word = idx2word_target[idx] # corresponding word is retrieved from the idx2word dictionary
            output_sentence.append(word) # append the predicted word
            
        target_seq[0, 0] = idx # index of the predicted word is stored 
        states_value = [h, c] # state_value is updated with the new hidden and cell state of the deocder
        
        # in the next loop cycle, the updated hidden and cell states, 
        # along with the index of the previously predicted word, are used to make new predictions
        # loop continues untill the maximum output sequence length is achieved or the <eos> token is encountered
        
        
    return ' '.join(output_sentence)
    
    # list are concatenated using a space and the resulting string is returned to the calling function

# Testing the Model

In [73]:
# choose a sentence randomly from the input_sentences list
# retrieve the corresponding padded sequence for sentence 
# and we pass it to the translate_sentence()

i = np.random.choice(len(input_sentences))
input_seq = encoder_input_sequences[i:i+1]
translation = translate_sentence(input_seq)

print("Input:", input_sentences[i])
print("Response:", translation)

Input: You seem busy.
Response: me fait. j'en a a fut faire. faire. faire. faire. faire. faire.
