In [0]:
##necessary imports :--
import tensorflow as tf
import numpy as np
import collections
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
from google.colab import files
files.upload()
##Loading from data

In [0]:
files.upload()
##Loading to data

In [0]:
##Hyperparameters:--
layer_size = 128
num_layers = 2
learning_rate = 0.001
batch_size = 32
epoch = 100
max_document_length = 50
embedding_dim = 128

In [0]:
##defining some of the helper functions to create the word-embeddings
import collections
import re

def build_dataset(words, n_words):
  '''This function creates the required data format for passing in the RNN'''
  
  count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
  count.extend(collections.Counter(words).most_common(n_words-1))
  
  ##initializing empty dictionary
  dictionary = dict()
  for word,_ in count:
    dictionary[word] = len(dictionary)
    
  data = list()
  unk_count = 0
  for word in words:
    index = dictionary.get(word,0)
    if index == 0:
      unk_count += 1
    data.append(index)
  count[0][1] = unk_count
  
  
  reversed_dictionary = dict(zip(dictionary.values(),dictionary.keys()))
  
  return data,count,dictionary,reversed_dictionary


def clean_string(string):
  string = re.sub('[^A-Za-z0-9 ]+', '', string)
  string = string.split(' ')
  string = filter(None, string)
  string = [y.strip() for y in string]
  string = ' '.join(string)
  return string.lower()


In [0]:
##Loading the dialouge data
with open('from.txt','r') as f:
  text_from = f.read().split('\n')
with open('to.txt','r') as f1:
  text_to = f1.read().split('\n')

In [0]:
##getting the sizes:--
concat_from = ' '.join(text_from).split()
size_from = len(list(set(concat_from)))

concat_to = ' '.join(text_to).split()
size_to = len(list(set(concat_to)))

In [0]:
##getting the integer representations:--
data_from, count_from, dictionary_from, rev_dictionary_from = build_dataset(concat_from,size_from)
data_to, count_to, dictionary_to, rev_dictionary_to = build_dataset(concat_to, size_to)

In [0]:
GO = dictionary_from['GO']
PAD = dictionary_from['PAD']
EOS = dictionary_from['EOS']
UNK = dictionary_from['UNK']

In [0]:
##making the pipeline for the chatbot:--


class ChatBot101(object):
  
  def __init__(self,layer_size,num_layers,embedding_dim,from_dict_size,to_dict_size,learning_rate,batch_size):
    
    def cells(reuse = False):
      return tf.nn.rnn_cell.LSTMCell(layer_size,initializer = tf.orthogonal_initializer(),reuse = reuse)
    self.X = tf.placeholder(tf.int32,[None,None],name = 'input')
    self.Y = tf.placeholder(tf.int32,[None,None],name = 'output')
    self.X_seq_len = tf.placeholder(tf.int32,[None],name = 'sequence_length_of_x')
    self.Y_seq_len = tf.placeholder(tf.int32,[None],name = 'sequence_lenght_of_y')
    
    with tf.variable_scope("encoder_embeddings"):
      encoder_embeddings = tf.Variable(tf.random_uniform([from_dict_size,embedding_dim],-1,1))
      encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings,self.X)
      after_encoder = tf.strided_slice(self.X,[0,0],[batch_size,-1],[1,1])
      
    with tf.variable_scope("decodeer_embeddings"):
      decoder_input = tf.concat([tf.fill([batch_size,1],GO),after_encoder],1)
      decoder_embeddings = tf.Variable(tf.random_uniform([to_dict_size,embedding_dim],-1,1))
      decoder_embedded = tf.nn.embedding_lookup(encoder_embeddings,decoder_input)
      
    with tf.variable_scope("encoder"):
      rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
      _,last_state = tf.nn.dynamic_rnn(rnn_cells,encoder_embedded,dtype = tf.float32)
      
    with tf.variable_scope("decoder"):
      rnn_cells_dec = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
      outputs,_ = tf.nn.dynamic_rnn(rnn_cells_dec,decoder_embedded,initial_state = last_state,dtype = tf.float32)

    with tf.variable_scope("logits"):
      self.logits = tf.layers.dense(outputs,to_dict_size)
      masks = tf.sequence_mask(self.Y_seq_len,tf.reduce_max(self.Y_seq_len),dtype = tf.float32)
      
    with tf.variable_scope("cost"):
      self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.logits,
                                                  targets = self.Y,
                                                  weights = masks)
      
    with tf.variable_scope("optimizer"):
      opti = tf.train.AdamOptimizer(learning_rate = learning_rate)#.minimize(self.cost)
      ##using gradient clipping
      gradients = opti.compute_gradients(self.cost)
      clipped_grad = [(tf.clip_by_value(grad,-1.,1.),var) for grad,var in gradients if grad is not None]
      self.optimizer = opti.apply_gradients(clipped_grad)
      

In [0]:
##initializatio of the model
import os
tf.reset_default_graph()
sess = tf.InteractiveSession()

model = ChatBot101(layer_size,num_layers,embedding_dim,size_from+4,size_to+4,learning_rate,batch_size)
sess.run(tf.global_variables_initializer())

saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
checkpoint_dir = os.path.abspath(os.path.join('./', "checkpoints_chatbot"))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")



In [0]:
##other helper functions:--

##getting the integer representation of the words:
def str_idx(corpus, dic):
  X = []
  for i in corpus:
    ints = []
    for k in i.split():
      try:
        ints.append(dic[k])
      except Exception as e:
        print(e)
        ints.append(2)
    X.append(ints)
  return X

def pad_sentence_batch(sentence_batch, pad_int):
  padded_seqs = []
  seq_lens = []
  max_sentence_len = 50
  for sentence in sentence_batch:
    padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
    seq_lens.append(50)
  return padded_seqs, seq_lens

def check_accuracy(logits, Y):
  acc = 0
  for i in range(logits.shape[0]):
    internal_acc = 0
    for k in range(len(Y[i])):
      if Y[i][k] == logits[i][k]:
        internal_acc += 1
    acc += (internal_acc / len(Y[i]))
  return acc / logits.shape[0]

X = str_idx(text_from, dictionary_from)
Y = str_idx(text_to, dictionary_to)



'lucky'
'sad'


In [0]:
for i in range(epoch):
  total_loss, total_accuracy = 0, 0
  for k in range(0, (len(text_from) // batch_size) * batch_size, batch_size):
    batch_x, seq_x = pad_sentence_batch(X[k: k+batch_size], PAD)
    batch_y, seq_y = pad_sentence_batch(Y[k: k+batch_size], PAD)
    predicted, loss, _ = sess.run([tf.argmax(model.logits,2), model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y,
                                                model.X_seq_len:seq_x,
                                                model.Y_seq_len:seq_y})
        
    total_loss += loss
    total_accuracy += check_accuracy(predicted,batch_y)
#        print 'output:', [rev_dictionary_to[i] for i in predicted[0]]
#        print 'input:', [rev_dictionary_to[i] for i in batch_x[0]]
        
  total_loss /= (len(text_from) // batch_size)
  total_accuracy /= (len(text_from) // batch_size)
  print('epoch: %d, avg loss: %f, avg accuracy: %f'%(i+1, total_loss, total_accuracy))
  path = saver.save(sess, checkpoint_prefix, global_step=i+1)

epoch: 1, avg loss: 0.188996, avg accuracy: 0.973062
epoch: 2, avg loss: 0.187445, avg accuracy: 0.973062
epoch: 3, avg loss: 0.184409, avg accuracy: 0.973062
epoch: 4, avg loss: 0.181867, avg accuracy: 0.973062
epoch: 5, avg loss: 0.179319, avg accuracy: 0.973062
epoch: 6, avg loss: 0.177860, avg accuracy: 0.973062
epoch: 7, avg loss: 0.174889, avg accuracy: 0.973000
epoch: 8, avg loss: 0.172275, avg accuracy: 0.973000
epoch: 9, avg loss: 0.169824, avg accuracy: 0.973062
epoch: 10, avg loss: 0.168768, avg accuracy: 0.973016
epoch: 11, avg loss: 0.169614, avg accuracy: 0.973172
epoch: 12, avg loss: 0.169735, avg accuracy: 0.973250
epoch: 13, avg loss: 0.165564, avg accuracy: 0.973203
epoch: 14, avg loss: 0.163200, avg accuracy: 0.973266
epoch: 15, avg loss: 0.161247, avg accuracy: 0.973312
epoch: 16, avg loss: 0.159699, avg accuracy: 0.973328
epoch: 17, avg loss: 0.158477, avg accuracy: 0.973344
epoch: 18, avg loss: 0.157592, avg accuracy: 0.973344
epoch: 19, avg loss: 0.157293, avg ac

In [0]:
!ls

adc.json	     data.zip  imdb_master.csv	sample_data
checkpoints_chatbot  from.txt  kaggle.json	to.txt


In [0]:
def predict(sentence):
  X_in = []
  for word in sentence.split():
    try:
      X_in.append(dictionary_from[word])
    except:
      X_in.append(PAD)
      pass
        
  test, seq_x = pad_sentence_batch([X_in], PAD)
  input_batch = np.zeros([batch_size,seq_x[0]])
  input_batch[0] =test[0] 
        
  log = sess.run(tf.argmax(model.logits,2), 
                                      feed_dict={
                                              model.X:input_batch,
                                              model.X_seq_len:seq_x,
                                              model.Y_seq_len:seq_x
                                              }
                                      )
    
  result=' '.join(rev_dictionary_to[i] for i in log[0])
  return result
    
checkpoint_file = tf.train.latest_checkpoint(os.path.join('./', 'checkpoints_chatbot'))
saver = tf.train.import_meta_graph("checkpoints_chatbot/model-1.meta")
saver.restore(sess, checkpoint_file)
    
print(predict('how are you ?') )

In [0]:
!ls checkpoints_chatbot

checkpoint  model-1.data-00000-of-00001  model-1.index	model-1.meta
