In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers , activations , models , preprocessing , utils
import pandas as pd

tf.logging.set_verbosity( tf.logging.ERROR ) # Just to remove warnings!

print( tf.VERSION )


1.15.0


In [0]:
with open('file3.txt', 'w') as file3:
    with open('train.from', 'r') as file1:
        with open('train.to', 'r') as file2:
            for line1, line2 in zip(file1, file2):
                print(line1.strip() +'\t'+ line2.strip(), file=file3)

In [6]:
lines = pd.read_table( 'file3.txt' , names=[ 'user1' , 'user2' ] )
lines = lines.iloc[:] 
lines.head(20)


Unnamed: 0,user1,user2
0,HI,hello
1,how are you?,Good
2,good morning,good morning
3,good night,good night
4,where are you now,Home
5,what are you doing,"Nothing ,just some college stuff"
6,"who would win in a fight, lebron james, or a g...",Depends if those sandles are fitted with Nike ...
7,The power of a charismatic lead. Plus the sho...,"Agreed, Ioan Gruffudd intro's and outro's for ..."
8,"I don't know about you, but I don't typically ...",I remember when I was in high school I had a h...
9,I'm saying the people screaming racist on a mo...,"Oh, I thought you were saying it the other way..."


In [8]:
user1_lines = list()
for line in lines.user1:
    user1_lines.append( line ) 

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( user1_lines ) 
tokenized_user1_lines = tokenizer.texts_to_sequences( user1_lines ) 

length_list = list()
for token_seq in tokenized_user1_lines:
    length_list.append( len( token_seq ))
max_input_length = np.array( length_list ).max()
print( 'user1 max length is {}'.format( max_input_length ))

padded_user1_lines = preprocessing.sequence.pad_sequences( tokenized_user1_lines , maxlen=max_input_length , padding='post' )
encoder_input_data = np.array( padded_user1_lines )
print( 'Encoder input data shape -> {}'.format( encoder_input_data.shape ))

user1_word_dict = tokenizer.word_index
num_user1_tokens = len( user1_word_dict )+1
print( 'Number of user1 tokens = {}'.format( num_user1_tokens))

user1 max length is 62
Encoder input data shape -> (1500, 62)
Number of user1 tokens = 5224


In [9]:
user2_lines = list()
for line in lines.user2:
    user2_lines.append( '<START> ' + line + ' <END>' )  

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( user2_lines ) 
tokenized_user2_lines = tokenizer.texts_to_sequences( user2_lines ) 

length_list = list()
for token_seq in tokenized_user2_lines:
    length_list.append( len( token_seq ))
max_output_length = np.array( length_list ).max()
print( 'user2 max length is {}'.format( max_output_length ))

padded_user2_lines = preprocessing.sequence.pad_sequences( tokenized_user2_lines , maxlen=max_output_length, padding='post' )
decoder_input_data = np.array( padded_user2_lines )
print( 'Decoder input data shape -> {}'.format( decoder_input_data.shape ))

user2_word_dict = tokenizer.word_index
num_user2_tokens = len( user2_word_dict )+1
print( 'Number of user2 tokens = {}'.format( num_user2_tokens))

user2 max length is 64
Decoder input data shape -> (1500, 64)
Number of user2 tokens = 5026


In [10]:
decoder_target_data = list()
for token_seq in tokenized_user2_lines:
    decoder_target_data.append( token_seq[ 1 : ] ) 
    
padded_user2_lines = preprocessing.sequence.pad_sequences( decoder_target_data , maxlen=max_output_length, padding='post' )
onehot_user2_lines = utils.to_categorical( padded_user2_lines , num_user2_tokens )
decoder_target_data = np.array( onehot_user2_lines )
print( 'Decoder target data shape -> {}'.format( decoder_target_data.shape ))

Decoder target data shape -> (1500, 64, 5026)


In [11]:
from keras.callbacks import ModelCheckpoint


encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( num_user1_tokens, 256 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 128 , return_state=True  )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( num_user2_tokens, 256 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 128 , return_state=True , return_sequences=True)
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( num_user2_tokens , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='categorical_crossentropy')
filepath = "model.h5"
checkpoint1 = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint1]
model.summary()

Using TensorFlow backend.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 256)    1337344     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    1286656     input_2[0][0]                    
______________________________________________________________________________________________

In [12]:
model.fit([encoder_input_data , decoder_input_data], decoder_target_data, batch_size=250, epochs=50 ) 
model.save( 'model.h5' ) 

Train on 1500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100

In [0]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 128 ,))
    decoder_state_input_c = tf.keras.layers.Input(shape=( 128 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model

In [0]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        tokens_list.append( user1_word_dict[ word ] ) 
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=max_input_length , padding='post')

In [17]:
enc_model , dec_model = make_inference_models()

for epoch in range( encoder_input_data.shape[0] ):
    states_values = enc_model.predict( str_to_tokens( input( 'Enter user1 sentence : ' ) ) )
    # states_values = enc_model.predict( encoder_input_data[ epoch ] )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = user2_word_dict['start']
    stop_condition = False
    decoded_translation = ''
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in user2_word_dict.items() :
            if sampled_word_index == index :
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > max_output_length:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 

    print( decoded_translation )

Enter user1 sentence : hi
 end


KeyboardInterrupt: ignored