In [1]:

import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras import layers , activations , models , preprocessing , utils

from gensim.models import Word2Vec


import re
import os
import yaml

In [2]:
# Importing casual conversation dataset

dir_path = 'Conversation'
files_list = os.listdir(dir_path + os.sep)

questions = list()
answers = list()
for filepath in files_list:
    stream = open( dir_path + os.sep + filepath , 'rb')
    docs = yaml.safe_load(stream)
    conversations = docs['conversations']
    for con in conversations:
        if len( con ) > 2 :
            questions.append(con[0])
            replies = con[ 1 : ]
            ans = ''
            for rep in replies:
                ans += ' ' + rep
            answers.append( ans )
        elif len( con )> 1:
            questions.append(con[0])
            answers.append(con[1])
            

In [3]:
answers_with_tags = list()
for i in range( len( answers ) ):
    if type( answers[i] ) == str:
        answers_with_tags.append( answers[i] )
    else:
        questions.pop( i )

answers = list()
for i in range( len( answers_with_tags ) ) :
    answers.append( '<START> ' + answers_with_tags[i] + ' <END>' )

tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts( questions + answers )
vocab_size = len( tokenizer.word_index )+1
print( 'VOCAB SIZE : {}'.format( vocab_size ))


VOCAB SIZE : 1894


In [4]:
vocab = []
for word in tokenizer.word_index:
    vocab.append( word )

def tokenize( sentences ):
    tokens_list = []
    vocabulary = []
    for sentence in sentences:
        sentence = sentence.lower()
        sentence = re.sub( '[^a-zA-Z]', ' ', sentence )
        tokens = sentence.split()
        vocabulary += tokens
        tokens_list.append( tokens )
    return tokens_list , vocabulary

p = tokenize( questions + answers )
model = Word2Vec( p[ 0 ] ) 

embedding_matrix = np.zeros( ( vocab_size , 100 ) )
for i in range( len( tokenizer.word_index ) ):
    if i in model.wv.vocab.keys():
        embedding_matrix[ i ] = model[ vocab[i] ]

# encoder_input_data
tokenized_questions = tokenizer.texts_to_sequences( questions )
maxlen_questions = max( [ len(x) for x in tokenized_questions ] )
padded_questions = preprocessing.sequence.pad_sequences( tokenized_questions , maxlen=maxlen_questions , padding='post' )
encoder_input_data = np.array( padded_questions )
print( encoder_input_data.shape , maxlen_questions )

# decoder_input_data
tokenized_answers = tokenizer.texts_to_sequences( answers )
maxlen_answers = max( [ len(x) for x in tokenized_answers ] )
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
decoder_input_data = np.array( padded_answers )
print( decoder_input_data.shape , maxlen_answers )

# decoder_output_data
tokenized_answers = tokenizer.texts_to_sequences( answers )
for i in range(len(tokenized_answers)) :
    tokenized_answers[i] = tokenized_answers[i][1:]
padded_answers = preprocessing.sequence.pad_sequences( tokenized_answers , maxlen=maxlen_answers , padding='post' )
onehot_answers = utils.to_categorical( padded_answers , vocab_size )
decoder_output_data = np.array( onehot_answers )
print( decoder_output_data.shape )


(564, 22) 22
(564, 74) 74
(564, 74, 1894)


In [5]:
# my tests


from keras.layers import Input, Embedding, LSTM, Bidirectional, Dense
from keras.models import Model

encoder_inputs = Input(shape=(max_seq_len, ))
encoder_embedding = Embedding( vocab_size, 256 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = LSTM( 256 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = Input(shape=(max_seq_len ,  ))
decoder_embedding = Embedding( vocab_size, 256 , mask_zero=True) (decoder_inputs)
decoder_lstm = LSTM( 256 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( vocab_size , activation='softmax') 
output = decoder_dense ( decoder_outputs )

model = Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.Adam(0.002), metrics=['acc'], loss='categorical_crossentropy')

model.summary()


Using TensorFlow backend.


NameError: name 'max_seq_len' is not defined

In [45]:
encoder_inputs = tf.keras.layers.Input(shape=( None , ))
encoder_embedding = tf.keras.layers.Embedding( vocab_size, 256 , mask_zero=True ) (encoder_inputs)
encoder_outputs , state_h , state_c = tf.keras.layers.LSTM( 256 , return_state=True )( encoder_embedding )
encoder_states = [ state_h , state_c ]

decoder_inputs = tf.keras.layers.Input(shape=( None ,  ))
decoder_embedding = tf.keras.layers.Embedding( vocab_size, 256 , mask_zero=True) (decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM( 256 , return_state=True , return_sequences=True )
decoder_outputs , _ , _ = decoder_lstm ( decoder_embedding , initial_state=encoder_states )
decoder_dense = tf.keras.layers.Dense( vocab_size , activation=tf.keras.activations.softmax ) 
output = decoder_dense ( decoder_outputs )

model = tf.keras.models.Model([encoder_inputs, decoder_inputs], output )
model.compile(optimizer=tf.keras.optimizers.Adam(0.002), metrics=['acc'], loss='categorical_crossentropy')

model.summary()

Model: "model_12"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, None, 256)    484864      input_17[0][0]                   
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, None, 256)    484864      input_18[0][0]                   
___________________________________________________________________________________________

In [None]:
model.fit([encoder_input_data , decoder_input_data], decoder_output_data, batch_size=256,
          epochs=200, shuffle=True, validation_split=0.2) 

model.save( 'model_conversation.h5' ) 

Train on 451 samples, validate on 113 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200

In [36]:
def make_inference_models():
    
    encoder_model = tf.keras.models.Model(encoder_inputs, encoder_states)
    
    decoder_state_input_h = tf.keras.layers.Input(shape=( 256 ,)) # max sequence length
    decoder_state_input_c = tf.keras.layers.Input(shape=( 256 ,))
    
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding , initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = tf.keras.models.Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    
    return encoder_model , decoder_model


In [37]:
def str_to_tokens( sentence : str ):
    words = sentence.lower().split()
    tokens_list = list()
    for word in words:
        word = word.strip("',.:;?$")
        if word in tokenizer.word_index:
            tokens_list.append( tokenizer.word_index[ word ] )
    return preprocessing.sequence.pad_sequences( [tokens_list] , maxlen=maxlen_questions , padding='post')

In [38]:

enc_model , dec_model = make_inference_models()

for _ in range(10):
    inp = input( 'Enter question : ' )
    
    if inp.lower() == 'bye' or inp.lower() == 'end':
        print('Hope to see you soon!')
        break
        
    states_values = enc_model.predict( str_to_tokens( inp ) )
    empty_target_seq = np.zeros( ( 1 , 1 ) )
    empty_target_seq[0, 0] = tokenizer.word_index['start']
    stop_condition = False
    decoded_translation = ''
   
    while not stop_condition :
        dec_outputs , h , c = dec_model.predict([ empty_target_seq ] + states_values )
        sampled_word_index = np.argmax( dec_outputs[0, -1, :] )
        sampled_word = None
        for word , index in tokenizer.word_index.items() :
            #print(word,index)
            if sampled_word_index == index :
                # print(word, '!')
                decoded_translation += ' {}'.format( word )
                sampled_word = word
        
        if sampled_word == 'end' or len(decoded_translation.split()) > maxlen_answers:
            stop_condition = True
            
        empty_target_seq = np.zeros( ( 1 , 1 ) )  
        empty_target_seq[ 0 , 0 ] = sampled_word_index
        states_values = [ h , c ] 
    
    print( decoded_translation )


Enter question : hi
i !
end !
 i end
Enter question : can you answer a question
end !
 end
Enter question : bye
Hope to see you soon!


In [26]:
a = str_to_tokens('are you alive yet')

In [27]:
print(a)

[[ 11   3 832  88   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]]


In [28]:
states_values = enc_model.predict( a )

In [29]:
states_values

[array([[-0.9250318 ,  0.6752312 ,  0.95101345, -0.9964426 , -0.9895105 ,
          0.9663247 , -0.994699  ,  0.9922562 , -0.9918174 ,  0.95332503,
          0.99505883, -0.99545527,  0.9957602 ,  0.9963229 ,  0.91156596,
         -0.9748242 ,  0.3386685 , -0.99504113, -0.9936898 , -0.97976625,
          0.9547825 , -0.9956732 , -0.992852  , -0.28045473,  0.9968604 ,
         -0.9937114 , -0.01042464, -0.90232706, -0.9943071 ,  0.9945505 ,
         -0.99429774, -0.9768969 ,  0.6647353 ,  0.06206416, -0.97602874,
         -0.99444723,  0.9945752 , -0.9961743 , -0.80914587,  0.9963387 ,
          0.99650925,  0.9887442 , -0.98992264, -0.8152563 ,  0.21929361,
          0.85932785, -0.96494603,  0.00226489, -0.98585206,  0.02420429,
         -0.9926739 , -0.67397463, -0.95304936, -0.9910338 ,  0.9617914 ,
          0.8042295 , -0.72745275,  0.96770704, -0.01225162, -0.9968767 ,
         -0.66481733, -0.95352626,  0.09174662, -0.96543753,  0.9945669 ,
          0.00345756, -0.94600946,  0.

In [30]:
sampled_word_index

911

In [31]:
dec_outputs

array([[[2.6642572e-05, 3.1620646e-05, 1.9246917e-03, ...,
         3.3558518e-04, 3.7239853e-04, 4.1406759e-04]]], dtype=float32)

In [None]:
c