# Sequence to Sequence Model for Translation (ENG to PT)

### Using an LSTM encoder-decoder (autoencoder-style architecture)
### Word-level dictionary
### Accuracy 

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd

In [10]:
# Read the sentence pairs (read_table defaults to tab-separated input), label the
# two columns, and keep only the first 20,000 rows.
lines = pd.read_table('eng2por.txt', names=['eng', 'pt']).iloc[:20000]
lines.head()

Unnamed: 0,eng,pt
0,Go.,Vai.
1,Go.,Vá.
2,Hi.,Oi.
3,Run!,Corre!
4,Run!,Corra!


In [11]:
# Collect the English sentences and fit a word-level tokenizer on them.
eng_lines = [line for line in lines.eng]

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(eng_lines)
tokenized_eng_lines = tokenizer.texts_to_sequences(eng_lines)

# The longest tokenized sentence sets the padded width of the encoder input.
length_list = [len(token_seq) for token_seq in tokenized_eng_lines]
max_input_length = np.array(length_list).max()
print('English max length is {}'.format(max_input_length))

# Right-pad every sequence with zeros up to max_input_length.
padded_eng_lines = preprocessing.sequence.pad_sequences(
    tokenized_eng_lines,
    maxlen=max_input_length,
    padding='post',
)
encoder_input_data = np.array(padded_eng_lines)
print('Encoder input data shape -> {}'.format(encoder_input_data.shape))

# Vocabulary size is the number of known words plus one extra slot for the
# zero value used as padding.
eng_word_dict = tokenizer.word_index
num_eng_tokens = len(eng_word_dict) + 1
print('Number of English tokens = {}'.format(num_eng_tokens))

English max length is 6
Encoder input data shape -> (20000, 6)
Number of English tokens = 3315


In [12]:
# Wrap every Portuguese sentence in start/end markers. The inference cell looks
# these markers up in the vocabulary as 'start' and 'end'.
port_lines = ['<START> ' + line + ' <END>' for line in lines.pt]

# NOTE: this rebinds `tokenizer`; from here on it refers to the Portuguese one.
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(port_lines)
tokenized_port_lines = tokenizer.texts_to_sequences(port_lines)

# The longest tokenized sentence (markers included) sets the decoder width.
length_list = [len(token_seq) for token_seq in tokenized_port_lines]
max_output_length = np.array(length_list).max()
print('Portuguese max length is {}'.format(max_output_length))

# Right-pad every sequence with zeros up to max_output_length.
padded_port_lines = preprocessing.sequence.pad_sequences(
    tokenized_port_lines,
    maxlen=max_output_length,
    padding='post',
)
decoder_input_data = np.array(padded_port_lines)
print('Decoder input data shape -> {}'.format(decoder_input_data.shape))

# Vocabulary size is the number of known words plus one extra slot for the
# zero value used as padding.
port_word_dict = tokenizer.word_index
num_port_tokens = len(port_word_dict) + 1
print('Number of Portuguese tokens = {}'.format(num_port_tokens))

Portuguese max length is 10
Decoder input data shape -> (20000, 10)
Number of Portuguese tokens = 5488


In [13]:
# Decoder targets are the decoder inputs shifted one step ahead: dropping the
# first token of each sequence makes position t of the target hold the token
# that follows position t of the decoder input.
decoder_target_data = [token_seq[1:] for token_seq in tokenized_port_lines]

# Right-pad the shifted sequences with zeros, then one-hot encode every token id.
padded_port_lines = preprocessing.sequence.pad_sequences(decoder_target_data, maxlen=max_output_length, padding='post')
onehot_port_lines = utils.to_categorical(padded_port_lines, num_port_tokens)

# to_categorical already returns a NumPy array, so np.asarray reuses it as-is.
# np.array() would allocate a second full copy while `onehot_port_lines` is
# still alive; with the recorded shape (20000, 10, 5488) that doubles a
# multi-gigabyte tensor for no benefit.
decoder_target_data = np.asarray(onehot_port_lines)
print('Decoder target data shape -> {}'.format(decoder_target_data.shape))

Decoder target data shape -> (20000, 10, 5488)


In [15]:
# --- Encoder: token ids -> embeddings -> LSTM; only the final states are kept ---
encoder_inputs = layers.Input(shape=(None,))
encoder_embedding = layers.Embedding(
    input_dim=num_eng_tokens,
    output_dim=256,
    mask_zero=True,
)(encoder_inputs)
encoder_outputs, state_h, state_c = layers.LSTM(
    units=256,
    return_state=True,
    recurrent_dropout=0.2,
    dropout=0.2,
)(encoder_embedding)
encoder_states = [state_h, state_c]

# --- Decoder: starts from the encoder states and emits one vector per step ---
decoder_inputs = layers.Input(shape=(None,))
decoder_embedding = layers.Embedding(
    input_dim=num_port_tokens,
    output_dim=256,
    mask_zero=True,
)(decoder_inputs)
decoder_lstm = layers.LSTM(
    units=256,
    return_state=True,
    return_sequences=True,
    recurrent_dropout=0.2,
    dropout=0.2,
)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Softmax over the Portuguese vocabulary at every decoder time step.
decoder_dense = layers.Dense(units=num_port_tokens, activation=activations.softmax)
output = decoder_dense(decoder_outputs)

model = models.Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
)

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    848640      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, None, 256)    1404928     input_4[0][0]                    
____________________________________________________________________________________________

In [17]:
# Train on (English ids, Portuguese ids) -> one-hot Portuguese targets.
# Saving is left disabled: model.save('model.h5')
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=32,
    epochs=25,
)

In [None]:
### Inference part
# Encoder model: English token ids -> final LSTM states [h, c].
encoder_model = models.Model(inputs=encoder_inputs, outputs=encoder_states)

# Extra inputs through which the decoder's previous states are fed back in.
decoder_state_input_h = layers.Input(shape=(256,))
decoder_state_input_c = layers.Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder layers, now seeded from the explicit state inputs.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding,
    initial_state=decoder_states_inputs,
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Decoder model: (token ids, h, c) -> (vocabulary distribution, new h, new c).
decoder_model = models.Model(
    inputs=[decoder_inputs] + decoder_states_inputs,
    outputs=[decoder_outputs] + decoder_states,
)

In [None]:
def str_to_tokens( sentence : str ):
    """Convert an English sentence into a padded row of encoder token ids.

    The sentence is split with Keras' ``text_to_word_sequence``, which with its
    default arguments lowercases the text and strips punctuation. The English
    vocabulary was built by a default ``Tokenizer()``, so typed input such as
    "Hi." now resolves to the stored word "hi" instead of failing the lookup.

    Words missing from ``eng_word_dict`` are skipped rather than raising
    ``KeyError``.

    Args:
        sentence: Raw English text typed by the user.

    Returns:
        Array of shape ``(1, max_input_length)`` holding the token ids,
        zero-padded on the right.
    """
    words = preprocessing.text.text_to_word_sequence( sentence )
    tokens_list = [ eng_word_dict[ word ] for word in words if word in eng_word_dict ]
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=max_input_length , padding='post')

In [None]:
enc_model, dec_model = encoder_model, decoder_model

# Reverse vocabulary (token id -> word), built once so each decoding step is a
# dictionary lookup instead of a scan over the whole vocabulary.
port_index_to_word = { index : word for word , index in port_word_dict.items() }

# Interactive translation: prompt for a sentence, decode greedily, print it.
# A blank line leaves the loop early.
for prompt_number in range( encoder_input_data.shape[0] ):
    sentence = input( 'Enter eng sentence : ' )
    if not sentence.strip():
        break

    # Encode the sentence; the encoder's final [h, c] seed the decoder.
    states_values = enc_model.predict( str_to_tokens( sentence ) )

    # Decoding starts from the start-of-sentence marker.
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[ 0 , 0 ] = port_word_dict['start']
    decoded_words = list()

    # Hard cap on the number of steps so decoding always terminates, even if
    # the model never emits the end marker.
    for step in range( max_output_length ):
        dec_outputs , h , c = dec_model.predict( [ empty_target_seq ] + states_values )
        sampled_word_index = int( np.argmax( dec_outputs[0, -1, :] ) )
        sampled_word = port_index_to_word.get( sampled_word_index )

        # Stop on the end marker, or on an id with no word (0 is padding).
        if sampled_word is None or sampled_word == 'end':
            break
        decoded_words.append( sampled_word )

        # Feed the predicted token and the updated states into the next step.
        empty_target_seq = np.zeros( ( 1 , 1 ) )
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ]

    # Join with spaces so the translation reads as separate words.
    decoded_translation = ' '.join( decoded_words )
    print( decoded_translation )