<a href="https://colab.research.google.com/github/AndrewFatula/Seq2seq_model_using_Google_Cloud_TPU/blob/master/chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import string
import os
import sys
import time
from nltk.tokenize import TweetTokenizer
import tensorflow as tf
from collections import Counter
from matplotlib import pyplot as plt
from nltk.translate import bleu_score
from keras import regularizers
from copy import deepcopy as dc
from sklearn.metrics.pairwise import cosine_similarity as cs
print(tf.__version__)

Importing all needed packages.

In [0]:
!pip install --upgrade auth
!pip uninstall grpcio
!pip uninstall tensorflow
!pip install grpcio==1.24.3
!pip install tensorflow==2.0.0

The default version of TensorFlow in Colab is 1.15, so in order to gain all the benefits of TensorFlow 2.0 the current version needs to be uninstalled before the 2.0 version is installed. In order to use a Google Cloud TPU v1 with TensorFlow 2.0, packages such as `grpcio` and `auth` should also be reinstalled in the same way, at the versions specified in the code above, before TensorFlow 2.0 is installed.

Once the given packages have been reinstalled, the runtime should be restarted so that the updates take effect.

In [0]:
# Install PyDrive quietly; it provides the GoogleAuth / GoogleDrive wrappers imported below.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user first, then hand the application-default
# credentials to PyDrive so that `drive` can be used to fetch files by id.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

Importing all needed tools and authenticating with a Google account in order to import data into Colab.

In [0]:
# Fetch every data file from Google Drive: (Drive file id, local file name).
for file_id, local_name in (
    ('1Z7odQLtZ7RmWERaRDJHDvbX_3VJ2ptvu', 'movie_lines.txt'),            # all phrases in movies
    ('1TdbFyBvMGV_N8iszqTIAb7V1Vx_eSdMr', 'movie_conversations.txt'),    # all conversations in movies
    ('1YTQeB3x_HTeEA5sjrleJD3notEKaPBnM', 'movie_titles_metadata.txt'),  # all titles of movies
    # GloVe 50-dimensional pretrained vectors for 400000 English words.
    # NOTE(review): the name `glove.6B` denotes the Wikipedia + Gigaword release,
    # not the Twitter one - confirm against the uploaded file.
    ('16Lxkrsd8HV9j9IrueYXQwY0TMbSttx5P', 'glove.6B.50d.txt'),
):
  downloaded = drive.CreateFile({'id': file_id})
  downloaded.GetContentFile(local_name)

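Loading all the data files needed for constructing the training dataset, including the GloVe 50-dimensional pretrained vector representations for 400,000 English words (the file name `glove.6B.50d.txt` corresponds to the GloVe release trained on Wikipedia and Gigaword rather than the Twitter release), which are needed when training the seq2seq model to check phrase similarity.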

In [0]:



# Colab exposes the TPU endpoint through the COLAB_TPU_ADDR environment variable.
# Fail fast with the explanatory message when it is missing: previously the message
# was only printed and the cell then crashed with a NameError on `tpu_address`.
if 'COLAB_TPU_ADDR' not in os.environ:
  raise EnvironmentError('ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!')

tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print ('TPU address iis', tpu_address)


# Connect to the TPU cluster, initialise it and build the distribution strategy
# under whose scope the models below are created.
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu=tpu_address)
tf.config.experimental_connect_to_cluster(cluster_resolver)
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)

print ('Number of devices: {}'.format(tpu_strategy.num_replicas_in_sync))


Setting up the TPU v1 hardware acceleration system.
Defining the TPU cluster_resolver and the TPU distribution strategy for training seq2seq.

In [0]:

# Field delimiter used when splitting lines of the movie corpus files.
SEPARATOR = "+++$+++"
# Default per-phrase token limit (default of get_phrase_pairs' `max_tokins`,
# also used by convert_phrases2).
MAX_TOKENS = 10

# Path of the pretrained embeddings file passed to read_embeddings().
emb_file = 'glove.6B.50d.txt'

# One shared, case-folding tokenizer: tokenize() is called once per corpus line,
# so building a new TweetTokenizer on every call is wasted work.
_TWEET_TOKENIZER = TweetTokenizer(preserve_case=False)

def tokenize(str_):
	'''Split `str_` into a list of lower-cased tokens using NLTK's TweetTokenizer.'''
	return _TWEET_TOKENIZER.tokenize(str_)


def remove_all(str, substrings):
  '''Return `str` with every occurrence of each entry of `substrings` removed.

  Substrings are processed in the given order; for each one, occurrences are
  cut out (first occurrence first) until none is left, so occurrences that
  only appear after an earlier removal are removed as well.
  Empty substrings are skipped (they would otherwise match forever).

  Note: the parameter name `str` shadows the builtin inside this function;
  it is kept unchanged for compatibility with existing callers.
  '''
  for substr in substrings:
    if not substr:
      continue
    length = len(substr)
    # Look for the substring itself (the previous code searched for the whole
    # string, which always yields index 0 and chopped off the string's prefix).
    index = str.find(substr)
    while index != -1:
      str = str[:index] + str[index+length:]
      index = str.find(substr)
  return str

def load_movies(genres):

	'''Select movies from <movie_titles_metadata.txt> by genre.

		 Every line is split on SEPARATOR; the last field is parsed as a bracketed,
		 comma-separated genre list. A line is selected when at least one entry of
		 `genres` appears in that list.

		 Returns a list with the first field of every selected line
		 (presumably the movie identifier shared with the other corpus files - confirm).'''

	selected = []
	with open("movie_titles_metadata.txt", 'rb') as metadata_file:

		for raw_line in metadata_file:
			text = str(raw_line, encoding='utf-8', errors='ignore')
			fields = [field.strip() for field in text.split(SEPARATOR)]
			line_genres = [entry.strip(" '") for entry in fields[-1].strip("[]").split(",")]

			# one matching genre is enough; each movie is added at most once per line
			if any(genre in line_genres for genre in genres):
				selected.append(fields[0])

	return selected


def read_phrases(movies = []):

	''' Load the phrases stored in <movie_lines.txt>.

			Each line is split on SEPARATOR. The last field (the phrase text) is stripped
			of <b>/<u>/<i> markup, tokenized, and stored under the line's first field.

			movies: collection of movie ids to keep, compared with the third field of
			        each line. When empty (the default), phrases of all movies are read.

			Returns a dict mapping the first field of each kept line to its token list. '''

	# Membership is tested once per corpus line, so use a set for O(1) lookups.
	wanted_movies = set(movies) if movies else set()
	markup_tags = ["<b>","</b>","<u>","</u>","<i>","</i>"]
	phrases = {}

	with open('movie_lines.txt', 'rb') as lf:

		for line in lf:
			line = str(line, encoding='utf-8', errors='ignore').replace("<u>","").replace("</u>","")

			arr_line = list(map( lambda x: x.strip(), line.split(SEPARATOR) ))
			# phrases are loaded in a dictionary keyed by the first field of the line
			if not wanted_movies or arr_line[2] in wanted_movies:
				phrases[arr_line[0]] = tokenize(remove_all(arr_line[-1], markup_tags))

	return phrases
	


def read_dialogues(phrases, movies):

	''' Build dialogues from <movie_conversations.txt>.

			Each line is split on SEPARATOR; its last field is parsed as a bracketed,
			comma-separated list of phrase keys, and each key is looked up in `phrases`.

			phrases: dict of tokenized phrases (as returned by read_phrases)
			movies:  movie ids to keep, compared with the third field of each line;
			         when empty, every conversation is kept.

			Returns a list of dialogues, each a list of tokenized phrases.
			Raises KeyError when a referenced phrase key is missing from `phrases`. '''

	dialogues = []
	with open("movie_conversations.txt", 'rb') as conversations_file:

		for raw_line in conversations_file:
			text = str(raw_line, encoding='utf-8', errors='ignore')
			fields = [field.strip() for field in text.split(SEPARATOR)]
			phrase_keys = [entry.strip("' ") for entry in fields[-1].strip("[]").split(",")]

			# with a non-empty filter, skip conversations of movies that are not requested
			if movies and fields[2] not in movies:
				continue

			dialogues.append([phrases[phrase_key] for phrase_key in phrase_keys])

	return dialogues


def get_phrase_pairs(genres = None, max_tokins = MAX_TOKENS, n_pairs = None):

	''' Construct the phrase_pairs dataset, where each instance is a (phrase, response) tuple
			made of two consecutive phrases of the same conversation.

			genres:     genres passed to load_movies; None reads the phrases of all movies
			max_tokins: a pair is kept only if both phrases have at most this many tokens;
			            None disables the length filter
			n_pairs:    when given and smaller than the number of pairs, the pairs are
			            shuffled (np.random.shuffle) and only the first n_pairs are returned

			Returns (phrase_pairs, conversations). '''

	# when genres is not specified, phrases are read from all the available movies
	if genres is None:
		movies = []
		all_phrases = read_phrases()
	else:
		movies = load_movies(genres)
		all_phrases = read_phrases(movies)

	# before constructing the phrase pairs we need the conversations built from the phrases read
	conversations = read_dialogues(all_phrases, movies)
	phrase_pairs = []

	for conv in conversations:
		# walk over every (previous phrase, current phrase) couple of the conversation
		for prev_phrase, phrase in zip(conv, conv[1:]):
			if max_tokins is None or (len(phrase) <= max_tokins and len(prev_phrase) <= max_tokins):
				phrase_pairs.append((prev_phrase, phrase))

	if n_pairs is None or n_pairs >= len(phrase_pairs):
		return phrase_pairs, conversations

	np.random.shuffle(phrase_pairs)
	return phrase_pairs[0:n_pairs], conversations

The functions above are used to construct the phrase_pairs dataset from the movies.

In [0]:

# Minimum corpus frequency a word needs to enter the vocabulary (see get_word_dict2).
MIN_TOKEN_FREQ = 10
# Size of the trainable word embeddings of the Seq2Seq model (passed as emb_size).
EMBEDDING_SIZE = 64
# Number of LSTM units of the Seq2Seq model (passed as hidden_size).
HIDDEN_SIZE = 1024
# Global batch size used when averaging the per-example losses.
BATCH_SIZE = 512



def read_embeddings(filepath):

	'''Read pretrained word vectors from a text file with one "<word> <v1> <v2> ..." entry per line.

		 Two artificial entries are put in front of the file content:
		 "BEGIN" (index 0, fifty zeros) and "END" (index 1, fifty ones).

		 Returns (representations, words_dict): a numpy array with one row per word and
		 a dict mapping each word to its row index.
		 NOTE(review): the last entry read from the file is left out of both results -
		 the reason is not visible here, confirm it is intended.'''

	words = ["BEGIN", "END"]
	representations = [[0]*50, [1]*50]

	with open(filepath) as fp:
		for line in fp:
			fields = line.split(" ")
			words.append(fields[0])
			representations.append([float(value) for value in fields[1:]])

	# every word except the last one; a repeated word keeps its last position
	words_dict = {word: position for position, word in enumerate(words[:-1])}

	return np.array(representations[:-1]), words_dict


def word_corrector(word):

	''' Expand common English contractions and shortcuts of a single (lower-cased) token.

			A contraction is returned as the marker string "+$+<first>@<second>"
			(e.g. "don't" -> "+$+do@not", "it's" -> "+$+it@is", "john's" -> "+$+john@'s"),
			which callers split on "@" after dropping the "+$+" prefix.
			Any other token is returned as a plain word, after normalising dot sequences
			to "..." and the shortcuts "u" / "ur" to "you" / "your".

			The checks are order-dependent: the first matching rule wins. '''

	dots = ['. ..', '. . .', '..', '. ...','...  ...', '.  ...', '. .']
	# words after which "'s" stands for "is" rather than a possessive
	is_hosts = ["that", "what", "there", "who", "where", "how", "it", "she", "he"]

	if word[-3:] == "'ll":
		return "+$+" + word[:-3] + "@" + "will"

	if word[-2:] == "'d":
		return "+$+" + word[:-2] + "@" + "would"

	# a bare "'s" (empty stem) is not a contraction and falls through to the plain-word case
	if word[-2:] == "'s" and word[:-2]:
		if word[:-2] in is_hosts:
			return "+$+" + word[:-2] + "@" + "is"
		return "+$+" + word[:-2] + "@" + "'s"

	if word[-2:] == "'t":
		if word == "can't":
			return "+$+" + "can" + "@" + "not"
		# drop the trailing "n't" (three characters)
		return "+$+" + word[:-3] + "@" + "not"

	if word[-3:] == "'re":
		return "+$+" + word[:-3] + "@" + "are"

	if word == "i'm":
		return "+$+" + "i" + "@" + "am"

	if word[-3:] == "'ve":
		return "+$+" + word[:-3] + "@" + "have"

	if word[:2] == "y'":
		return "+$+" + "you" + "@" + word[2:]

	# the previous test compared a 2-character slice with "dont" and could never match
	if word == "dont":
		return "+$+" + "do" + "@" + "not"

	if word in dots:
		word = "..."
	if word == "u":
		word = "you"
	if word == "ur":
		word = "your"
	return word


####### 
def get_word_dict2(dialogues, emb_dict):

	''' Construct the token dictionary of the dataset.

			A word gets a token when it occurs at least MIN_TOKEN_FREQ times in `dialogues`
			and (after word_corrector expansion) is available in `emb_dict`, the word -> row
			index dictionary of the pretrained embeddings. Tokens 0 and 1 are reserved for
			"BEGIN" and "END".

			For a contraction both halves are registered. Previously the second half was
			skipped whenever the first half was new, and "'" / "s" were only registered
			when both were missing.

			Returns (word_dict, emb_indices): word -> token id, and for every token id
			(in order) the row index of its vector in the pretrained embeddings.
			Raises KeyError if a possessive is found but "'" or "s" is missing from emb_dict. '''

	freq_count = Counter()
	for dial in dialogues:
		for phrase in dial:
			freq_count.update(phrase)

	word_dict = {"BEGIN":0, "END":1}
	emb_indices = [0,1]

	def _register(token, required = False):
		# Give `token` the next free id unless it already has one; tokens without a
		# pretrained vector are skipped, or raise KeyError when `required` is set.
		if token in word_dict:
			return
		if required or token in emb_dict:
			emb_index = emb_dict[token]
			word_dict[token] = len(word_dict)
			emb_indices.append(emb_index)

	for word, count in freq_count.items():
		if count < MIN_TOKEN_FREQ:
			continue

		correct_word = word_corrector(word)

		if correct_word[:3] == "+$+":
			corrected1, corrected2 = correct_word[3:].split("@")

			# a possessive is later encoded as the stem followed by "'" and "s"
			if corrected2 == "'s":
				_register("'", required = True)
				_register("s", required = True)

			_register(corrected1)
			_register(corrected2)
		else:
			_register(correct_word)

	return word_dict, emb_indices



def _encode_phrase(words, word_dict, skip_underline_markup):

	''' Convert the words of one phrase to token ids.

			Contractions reported by word_corrector become two tokens, possessives become
			<stem> "'" "s". A word that cannot be encoded contributes word_dict["BEGIN"],
			which the caller uses as the "unknown word" marker.
			With skip_underline_markup set, words containing "<u>" or "</u>" are dropped. '''

	unknown = word_dict["BEGIN"]
	tokens = []

	for word in words:
		correct_word = word_corrector(word)

		if skip_underline_markup and ("<u>" in correct_word or "</u>" in correct_word):
			continue

		if "+$+" in correct_word:
			corrected1, corrected2 = correct_word[3:].split("@")

			if corrected2 == "'s":
				if corrected1 in word_dict.keys():
					tokens.extend([word_dict[corrected1], word_dict["'"], word_dict["s"]])
				else:
					tokens.append(unknown)
			elif corrected1 in word_dict.keys() and corrected2 in word_dict.keys():
				tokens.extend([word_dict[corrected1], word_dict[corrected2]])
			else:
				tokens.append(unknown)

		elif correct_word not in word_dict.keys():
			tokens.append(unknown)
		elif correct_word == "'s":
			tokens.extend([word_dict["'"], word_dict["s"]])
		else:
			tokens.append(word_dict[correct_word])

	return tokens


def convert_phrases2(phrase_pairs, word_dict):

	''' Convert the phrase pairs to token sequences based on the token dictionary.

			Only pairs whose first phrase has at most MAX_TOKENS words and whose second phrase
			has fewer than 8 words are considered. Common shortcuts are expanded to their full
			forms, every sequence is wrapped in the BEGIN (0) and END (1) tokens, and a pair is
			dropped when any of its words has no token in `word_dict`.

			Returns (converted_x, converted_y, sizes_x, sizes_y, not_available): the token
			sequences of both sides, their lengths as int32 arrays, and a list that is
			always empty in this implementation. '''

	not_available = []
	converted_x = []
	converted_y = []

	for pair in phrase_pairs:
		if not (len(pair[0]) < MAX_TOKENS+1 and len(pair[1]) < 8):
			continue

		# only the first phrase of a pair is filtered for leftover <u> markup
		phrase1 = [0] + _encode_phrase(pair[0], word_dict, True) + [1]
		phrase2 = [0] + _encode_phrase(pair[1], word_dict, False) + [1]

		# the BEGIN token after position 0 marks a word that could not be encoded
		unknown = word_dict["BEGIN"]
		if unknown in phrase1[1:] or unknown in phrase2[1:]:
			continue

		converted_x.append(phrase1)
		converted_y.append(phrase2)

	sizes_x = np.array([len(phrase) for phrase in converted_x])
	sizes_y = np.array([len(phrase) for phrase in converted_y])

	return converted_x, converted_y, sizes_x.astype(np.int32), sizes_y.astype(np.int32), not_available

def make_addition(phrases):

	''' Append a fixed set of hand-written greeting exchanges to `phrases` (in place)
			and return the same list. Frequent exchanges are listed several times on purpose.
			Each entry below is "<prompt tokens>", "<reply tokens>", tokens separated by spaces. '''

	greeting_pairs = (
		("hi !", "hello"),
		("hi", "hello !"),
		("hi", "hello !"),
		("hello !", "hi !"),
		("hi .", "hello , nice to meet you"),
		("hi .", "hi good to meet you !"),
		("hi !", "hello"),
		("hi", "hello !"),
		("hi", "hello !"),
		("hello !", "hi !"),
		("hi !", "hello"),
		("hi", "hello !"),
		("hi", "hello !"),
		("hello !", "hi !"),
		("hi .", "hello , nice to meet you"),
		("hi .", "hi good to meet you !"),
		("hi !", "hello"),
		("hi", "hello !"),
		("hi", "hello !"),
		("hello !", "hi !"),
		("hello", "it is my pleasure"),
		("hi .", "hello , nice to meet you"),
		("hi !", "hi good to meet you !"),
		("hi , how are you ?", "i am fine , thank you !"),
		("hello , how are you ?", "i am great , thank you !"),
		("hi ! how are you ?", "not bad , thank you !"),
		("hello ! how are you ?", "i am fine thank you ."),
		("hi , how are you ?", "i am good , thank you !"),
		("hello , how are you ?", "i am good , thank you !"),
		("hello ! how are you ?", "good enough , thank you !"),
	)

	# split() builds fresh token lists for every appended pair
	for prompt, reply in greeting_pairs:
		phrases.append((prompt.split(), reply.split()))

	return phrases


def calc_bleu_score(out, target):
	'''Return the smoothed (method1) BLEU score of `out`, using unigram and bigram weights of 0.5 each.

		 `target` is passed to sentence_bleu as its references argument and `out` as the hypothesis.
		 NOTE(review): sentence_bleu expects a list of reference token lists - confirm that
		 callers wrap a single reference in a list.'''
	# fixed typo: the class is SmoothingFunction (the old name raised AttributeError)
	sf = bleu_score.SmoothingFunction()
	return bleu_score.sentence_bleu(target, out, smoothing_function=sf.method1, weights=(0.5, 0.5))


def get_words_from_toukens(sentence, token_dict):
	'''Look up every token of `sentence` in `token_dict` and return the results as a list, in order.'''
	words = []
	for token in sentence:
		words.append(token_dict[token])
	return words





	


The functions above are used to prepare and preprocess the phrase_pairs dataset for training.

In [0]:


print("constructing vectorspace for pharse pairs dataset...")

# pretrained vectors, phrase pairs (plus the hand-written greetings) and the token dictionary
all_representations, emb_token_dict = read_embeddings(emb_file)
phrase_pairs, dialogues = get_phrase_pairs(genres = ["comedy", "thriller", "drama", "crime", "sci-fi", "western", "fantasy", "mystery", "animation" ,"history" ,"war"])
phrase_pairs = make_addition(phrase_pairs)
word_dict, transfer_indices = get_word_dict2(dialogues, emb_token_dict)

# one pretrained vector per token, in token-id order
embeddings = np.array([all_representations[index] for index in transfer_indices])

phrase_number = len(phrase_pairs)

# token id -> word
inverse_word_dict = {token: word for word, token in word_dict.items()}

print("vectorspace is constructed")

The code above constructs the vector space for the words that are present in the phrase-pairs dataset, based on the GloVe 50-d embeddings.

In [0]:
print("preparing and preprocces dataset...")

converted_x, converted_y, true_lengths_x, true_lengths_y, not_available = convert_phrases2(phrase_pairs, word_dict)

length = len(true_lengths_x)
width_x = int(max(true_lengths_x))
width_y = int(max(true_lengths_y))

# zero-padded token matrices, plus a 0/1 mask marking the real positions of each y sequence
sparse_x = np.zeros((length, width_x))
sparse_y = np.zeros((length, width_y))
semi_hot_y = np.zeros((length, width_y))

for i in range(length):
  len_x, len_y = true_lengths_x[i], true_lengths_y[i]
  sparse_x[i, :len_x] = converted_x[i]
  sparse_y[i, :len_y] = converted_y[i]
  semi_hot_y[i, :len_y] = 1

sparse_x = tf.convert_to_tensor(sparse_x, dtype = tf.int32)
sparse_y = tf.convert_to_tensor(sparse_y, dtype = tf.int32)
semi_hot_y = tf.convert_to_tensor(semi_hot_y, dtype = tf.int32)

# one-hot vector marking the last real position of every x sequence
lengths_x_onehot = [tf.one_hot(true_lengths_x[i]-1, width_x) for i in range(length)]

dict_size = len(word_dict)

print("data is ready")

In [0]:
# quick summary of the prepared dataset
print("original dict_size: {}".format(dict_size))
print("number of phrase_pairs: {}".format(length))


Because TPU v1 can only execute a static computational graph, the recurrent layers have to be unrolled in order to train on the TPU, and because they are unrolled, all the training data must have a fixed shape.

In order to train the seq2seq model with variable-size input sequences, the lengths of the training instances are encoded as one_hot vectors for the x_sequence and as semi_hot vectors for the y_sequence (semi_hot means a vector with ones in the first sequence-length positions and zeros in the rest).

These vectors are multiplied with the network output during training, so the non-zero gradients for each sequence cover only that sequence's actual length, and the rest of the gradients are zero.


In [0]:
with tpu_strategy.scope():



  class Seq2Seq(tf.keras.Model):
    ''' Simple seq2seq model derived from tf.keras.Model, with one embedding layer,
        two recurrent (LSTM) layers - encoder and decoder - and two dense layers.

        The vocabulary size of every layer is taken from the dictionary passed to
        the constructor (self.dict_size), not from a notebook-level global.
    '''

    def __init__(self, hidden_size, emb_size, emb_dict):
      ''' hidden_size: number of units of the encoder / decoder LSTMs
          emb_size:    size of the trainable word embeddings
          emb_dict:    word -> token id dictionary
      '''
      super(Seq2Seq, self).__init__()

      self.word_dict = emb_dict
      self.dict_size = len(self.word_dict)
      self.hidden_size = hidden_size
      self.emb_size = emb_size

      # token id -> word
      self.inverse_word_dict = {}
      for key in self.word_dict.keys():
        self.inverse_word_dict[self.word_dict[key]] = key

      self.emb_layer = tf.keras.layers.Embedding(self.dict_size, emb_size)

      # Both LSTMs are unrolled. tf.keras regularizers are used so that no
      # standalone-Keras objects are mixed into tf.keras layers.
      self.decoder = tf.keras.layers.LSTM(units = hidden_size, kernel_regularizer=tf.keras.regularizers.l2(0.001), recurrent_regularizer=tf.keras.regularizers.l2(0.001), bias_regularizer=tf.keras.regularizers.l2(0.001),
                                          dropout = 0, recurrent_dropout = 0, return_state = True, return_sequences = True, unroll = True)
      self.encoder = tf.keras.layers.LSTM(units = hidden_size, kernel_regularizer=tf.keras.regularizers.l2(0.001), recurrent_regularizer=tf.keras.regularizers.l2(0.001), bias_regularizer=tf.keras.regularizers.l2(0.001),
                                          dropout = 0, recurrent_dropout = 0, return_state = True, unroll = True)

      # maps decoder outputs to a probability distribution over the dictionary
      self.interpreter = tf.keras.Sequential([tf.keras.layers.Dense(int(hidden_size*2), activation = "relu"), tf.keras.layers.Dense(self.dict_size, activation = "softmax")])

      # Frozen 2-row lookup table: index 0 -> vector of zeros, index 1 -> vector of ones.
      # It expands a 0/1 value per time step into a mask over the hidden units.
      hidden_representations = np.concatenate((np.zeros((1,hidden_size)), np.ones((1,hidden_size))), axis = 0)

      self.emb_hidden = tf.keras.Sequential()
      emb_hidden_layer = tf.keras.layers.Embedding(2, hidden_size, weights = [tf.constant(hidden_representations)])
      self.emb_hidden.add(emb_hidden_layer)
      self.emb_hidden.trainable = False

      # tokens that are not accepted as the first word of a generated answer
      self.end_words = [".", "!","..." , "-", ",", "?"]


    def get_words(self, tokens):

      ''' Return the words of the output tokens; runs of the same consecutive token
          are collapsed into a single word. '''

      words = []
      prev_token = None
      for token in tokens:
        if token != prev_token:
          words.append(self.inverse_word_dict[token])
          prev_token = token

      return words



    def encode_sequence(self, x_batch, one_hot, seq_len):

      ''' Encode a batch of embedded input sequences.

          The encoder is fed one time step at a time, carrying its state along; the states
          of the time steps selected by `one_hot` (expanded through self.emb_hidden) are
          summed up, so that for each sequence the state at its marked position is returned
          as [hidden_h, hidden_c]. '''

      batch_size = np.shape(x_batch)[0]
      hidden_total_h = tf.zeros([batch_size, self.hidden_size])
      hidden_state_h = tf.zeros([batch_size, self.hidden_size])
      hidden_total_c = tf.zeros([batch_size, self.hidden_size])
      hidden_state_c = tf.zeros([batch_size, self.hidden_size])
      hidden_state = [hidden_state_h, hidden_state_c]
      one_hot_x = self.emb_hidden(one_hot)
      for i in range(seq_len):
        _, hidden_state_h, hidden_state_c = self.encoder(x_batch[:,i:i+1], hidden_state, training = True)
        hidden_state = [hidden_state_h, hidden_state_c]
        hidden_total_h += one_hot_x[:,i,:]*hidden_state_h
        hidden_total_c += one_hot_x[:,i,:]*hidden_state_c
      return [hidden_total_h, hidden_total_c]


    def decode_sequence(self, hidden, y_batch, semi_hot_y):

      ''' Decode with teacher forcing, starting from the encoded hidden state of the x sequence.

          Positions where `semi_hot_y` is 0 are replaced by a one-hot distribution on token id 0.
      '''

      output, _, _ = self.decoder(y_batch, hidden, training = True)
      return self.interpreter(output)*semi_hot_y[:,:,None] + (1-semi_hot_y)[:,:,None]*tf.one_hot(0, self.dict_size, dtype = tf.float32)[None,:]



    def decode_chain_sequence(self, hidden, semi_hot_y, seq_len):

      ''' Decode without teacher forcing: starting from the encoded hidden state and an
          all-zero embedding, each step feeds the embedding of the previously predicted
          (argmax) word back into the decoder, for `seq_len` steps.

          Positions where `semi_hot_y` is 0 are replaced by a one-hot distribution on token id 0.
      '''

      total_output = []
      batch_size = np.shape(hidden[0])[0]
      current_emb =  tf.zeros((batch_size, 1, self.emb_size), dtype = tf.float32)

      for i in range(seq_len):
        output, hidden_h, hidden_c = self.decoder(current_emb, hidden, training = True)
        hidden = [hidden_h, hidden_c]
        current_distribution = self.interpreter(output)*semi_hot_y[:,i:i+1,None]+(1-semi_hot_y[:,i:i+1,None])*tf.one_hot(0, self.dict_size, dtype = tf.float32)[None,:]
        total_output.append(current_distribution)
        current_word = tf.argmax(current_distribution, axis=-1)
        current_emb = self.emb_layer(current_word)

      return tf.concat(total_output, axis = 1)




    def call(self, input_x, input_y, one_hot_x, semi_hot_y):

      ''' Forward pass for training with teacher forcing. '''

      seq_len = np.shape(input_x)[1]
      input_x = self.emb_layer(input_x)
      input_y = self.emb_layer(input_y)
      hidden = self.encode_sequence(input_x, one_hot_x, seq_len)
      predictions = self.decode_sequence(hidden, input_y, semi_hot_y)
      return predictions


    def call_2(self, input_x, one_hot_x, semi_hot_y, seq_len_y):

      ''' Forward pass for training without teacher forcing. '''

      seq_len_x = np.shape(input_x)[1]
      input_x = self.emb_layer(input_x)
      hidden = self.encode_sequence(input_x, one_hot_x, seq_len_x)
      predictions = self.decode_chain_sequence(hidden, semi_hot_y, seq_len_y)
      return predictions








    def decode_chain_sequence_test(self, hidden, r = 0):

      ''' Generate the output token sequence (at most 15 tokens) for one encoded input phrase.

          When the very first predicted word is one of self.end_words or "END", generation is
          restarted recursively from the updated hidden state with r+1; from r > 2 on, random
          noise scaled by r is added to the initial embedding.
          NOTE(review): the number of restarts is not bounded.
      '''

      current_emb = tf.zeros((1,1,self.emb_size), dtype = tf.float32)
      total_output = []

      if r > 2:
        current_emb += tf.convert_to_tensor(np.random.rand(1, self.emb_size), dtype = tf.float32)*0.1*r/3

      for _ in range(15):
        output, hidden_h, hidden_c = self.decoder(current_emb, hidden, training = False)
        hidden = [hidden_h, hidden_c]
        current_distribution = self.interpreter(output)
        current_word = tf.argmax(current_distribution, axis =-1)

        if len(total_output)<1 and self.inverse_word_dict[current_word.numpy()[0,0]] in self.end_words:
          return self.decode_chain_sequence_test(hidden, r+1)

        if self.inverse_word_dict[current_word.numpy()[0,0]] == "END":
          if len(total_output) < 1:
            return self.decode_chain_sequence_test(hidden, r+1)
          else:
            return total_output

        total_output.append(current_word.numpy()[0,0])
        current_emb = self.emb_layer(current_word)

      return total_output


    def encode_sequence_test(self, x):

      ''' Encode one embedded input sequence with the trained encoder and
          return its final state as [hidden_h, hidden_c]. '''

      _ , hidden_h, hidden_c = self.encoder(x[None, :, :], training = False)
      return [hidden_h, hidden_c]




    def predict(self, phrase):

      ''' Return the words generated by the trained model in response to `phrase`
          (a list of words). Raises KeyError for a word that is not in the dictionary. '''

      phrase_tokens = [self.word_dict[word] for word in phrase]
      inputs = []

      for word in phrase_tokens:
        inputs.append(self.emb_layer(word))

      inputs = tf.convert_to_tensor(inputs, dtype = tf.float32)
      hidden = self.encode_sequence_test(inputs)
      output = self.decode_chain_sequence_test(hidden)
      output_words = self.get_words(output)

      return output_words


      



In [0]:


with tpu_strategy.scope():
	
	# Frozen embedding model holding the pretrained GloVe 50-dimensional word vectors;
	# it is only used to compare phrases inside the loss, never trained.
	dict_size, emb_size = np.shape(embeddings)
	emb_layer = tf.keras.layers.Embedding(dict_size, emb_size, weights = [tf.constant(embeddings)])
	emb_layer.trainable = False
	emb_model = tf.keras.Sequential()
	emb_model.add(emb_layer)

	# the seq2seq model itself
	seq2seq = Seq2Seq(HIDDEN_SIZE, EMBEDDING_SIZE, word_dict)

	# step counters and loss histories of the two training modes
	step1 = step2 = 0
	losses1, mean_losses1 = [], []
	losses2, mean_losses2 = [], []

	LEARNING_RATE = 0.002 #0.00001
	# per-example losses; averaging over the global batch is done in compute_loss1
	loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
 

	def compute_loss1(labels, predictions):

		''' Cross-entropy loss for the distributed strategy: the per-example sparse categorical
				cross-entropy, averaged with respect to the global batch size BATCH_SIZE. '''

		return tf.nn.compute_average_loss(loss_object(labels, predictions), global_batch_size=BATCH_SIZE)
	

	def compute_loss2(labels, predictions_probabilities):

		''' Log-probability loss for correcting the output probability distribution
				(distributed strategy).

				For every sequence, the greedy (argmax) prediction and a randomly sampled prediction
				are both compared with the labels in the frozen embedding space of emb_model
				(sum of squared differences). The difference between the two distances scales the
				negative log of the predicted probabilities; gradients only flow through that
				logarithm. '''

		frozen_probabilities = tf.stop_gradient(predictions_probabilities)
		greedy_tokens = tf.argmax(frozen_probabilities, axis = -1)

		# one categorical sample per time step, drawn from the (smoothed) predicted distribution
		sampled_tokens = tf.concat(
			[tf.random.categorical(tf.math.log(frozen_probabilities[:,step,:] + 0.0001),1)
			 for step in range(np.shape(labels)[1])],
			axis = 1)

		greedy_vectors = emb_model(greedy_tokens)
		sampled_vectors = emb_model(sampled_tokens)
		label_vectors = emb_model(labels)

		sampled_distance = tf.reduce_sum((label_vectors - sampled_vectors)**2, axis = (1,-1))
		greedy_distance = tf.reduce_sum((label_vectors - greedy_vectors)**2, axis = (1,-1))
		distance_gap = greedy_distance - sampled_distance

		weighted_neg_log_probabilities = -tf.math.log(predictions_probabilities+0.0001)*distance_gap[:,None,None]

		return tf.nn.compute_average_loss(weighted_neg_log_probabilities, global_batch_size=BATCH_SIZE)



	def crossentropy_training_loop( train_dist_dataset, teacher_prob, step1, step2, losses1, mean_losses1, mean_pre_loss):

		''' One pass over the dataset with the cross-entropy objective.

				For every batch, teacher-forced training (distributed_train_step1) is chosen with
				probability teacher_prob, otherwise chained training (distributed_train_step2).
				Each loss is stored in the history of its mode, and after more than 50 steps of a
				mode the mean of its last 50 losses is recorded as well.

				Note: the histories of the chained mode (losses2, mean_losses2) are not parameters;
				they are taken from the enclosing scope. The step counters are local, so the
				caller's counters are not updated. '''

		for batch in train_dist_dataset:

			if np.random.rand() < teacher_prob:
				step1 += 1
				steps_done, train_fn, history, running_means = step1, distributed_train_step1, losses1, mean_losses1
			else:
				step2 += 1
				steps_done, train_fn, history, running_means = step2, distributed_train_step2, losses2, mean_losses2

			loss = train_fn(batch)
			history.append(loss)
			if steps_done > 50:
				running_means.append(np.mean(history[-50:]))

			mean_pre_loss.append(loss)
	 

	def distribution_correction_loop( train_dist_dataset, teacher_prob, step1, step2, losses1, mean_losses1, mean_pre_loss):

		''' One pass over the dataset for the output probability distribution correction.

				For every batch, distributed_train_step4 is chosen with probability teacher_prob,
				otherwise distributed_train_step3. Each loss is stored in the history of its mode,
				and after more than 50 steps of a mode the mean of its last 50 losses is recorded.

				Note: losses2 and mean_losses2 are not parameters; they are taken from the
				enclosing scope. The step counters are local, so the caller's counters are
				not updated. '''

		for batch in train_dist_dataset:

			if np.random.rand() < teacher_prob:
				step1 += 1
				steps_done, train_fn, history, running_means = step1, distributed_train_step4, losses1, mean_losses1
			else:
				step2 += 1
				steps_done, train_fn, history, running_means = step2, distributed_train_step3, losses2, mean_losses2

			loss = train_fn(batch)
			history.append(loss)
			if steps_done > 50:
				running_means.append(np.mean(history[-50:]))

			mean_pre_loss.append(loss)




	def _run_distributed_step(step_fn, dataset_inputs):

		''' Run step_fn on every TPU replica and return the sum of the per-replica losses. '''

		per_replica_losses = tpu_strategy.experimental_run_v2(step_fn, args=(dataset_inputs,))
		return tpu_strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

	def distributed_train_step1(dataset_inputs):

		''' Apply the distributed TPU strategy to train_step1. '''

		# train_step1 is looked up at call time, so the most recently defined version is used
		return _run_distributed_step(train_step1, dataset_inputs)

	def distributed_train_step2(dataset_inputs):

		''' Apply the distributed TPU strategy to train_step2. '''

		return _run_distributed_step(train_step2, dataset_inputs)

	def distributed_train_step3(dataset_inputs):

		''' Apply the distributed TPU strategy to train_step3. '''

		return _run_distributed_step(train_step3, dataset_inputs)

	def distributed_train_step4(dataset_inputs):

		''' Apply the distributed TPU strategy to train_step4. '''

		return _run_distributed_step(train_step4, dataset_inputs)


	for _ in range(7):

		''' main training_loop'''

		optimizer1 = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1 = 0.9)
		optimizer2 = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1 = 0.9)
		optimizer3 = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE*0.005, beta_1 = 0.9)
		optimizer4 = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE*0.005, beta_1 = 0.9)
		LEARNING_RATE /= 2	
		

		@tf.function
		def train_step1(inputs):

			''' Training step of the teacher-forced cross-entropy method.
					The @tf.function decorator makes the step run as a static computation graph.

					inputs: (input_x, input_y, one_hot_x, semi_hot_y). The decoder is fed input_y
					without its last token and is trained to predict input_y without its first token.
			'''

			input_x, input_y, one_hot_x = inputs[0], inputs[1], inputs[2]
			semi_hot_y = tf.cast(inputs[3], tf.float32)

			with tf.GradientTape() as tape:
				predictions = seq2seq(input_x, input_y[:,:-1], one_hot_x, semi_hot_y[:,1:])
				loss = compute_loss1(input_y[:,1:], predictions)

			trainable = seq2seq.trainable_variables
			gradients = tape.gradient(loss, trainable)
			optimizer1.apply_gradients(zip(gradients, trainable))
			return loss
		
		@tf.function
		def train_step2(inputs):
			"""One optimisation step using `seq2seq.call_2` instead of the default call.

			Unlike `train_step1`, the target sequence itself is not passed to the
			model; `call_2` only receives the number of target positions minus one.
			Decorated with @tf.function so the step is traced into a graph.

			Args:
				inputs: indexable batch; elements 0-3 are read as the model input,
					the target sequence, a third model input, and a fourth tensor
					that is cast to float32 before use.

			Returns:
				The value of `compute_loss1` for this batch.
			"""
			source_ids = inputs[0]
			target_ids = inputs[1]
			source_one_hot = inputs[2]
			target_semi_hot = tf.cast(inputs[3], tf.float32)

			with tf.GradientTape() as tape:
				predictions = seq2seq.call_2(
					source_ids,
					source_one_hot,
					target_semi_hot[:, 1:],
					np.shape(target_ids)[1] - 1,
				)
				loss = compute_loss1(target_ids[:, 1:], predictions)

			model_variables = seq2seq.trainable_variables
			gradients = tape.gradient(loss, model_variables)
			optimizer2.apply_gradients(zip(gradients, model_variables))
			return loss
		
		@tf.function
		def train_step3(inputs):
			"""One optimisation step on `compute_loss2` using `seq2seq.call_2`.

			Gradients are taken from `compute_loss2`, but the value returned for
			logging is `compute_loss1` on the same predictions. Decorated with
			@tf.function so the step is traced into a graph.

			Args:
				inputs: indexable batch; elements 0-3 are read as the model input,
					the target sequence, a third model input, and a fourth tensor
					that is cast to float32 before use.

			Returns:
				The value of `compute_loss1` for this batch (not the optimised loss).
			"""
			source_ids = inputs[0]
			target_ids = inputs[1]
			source_one_hot = inputs[2]
			target_semi_hot = tf.cast(inputs[3], tf.float32)

			with tf.GradientTape() as tape:
				predictions = seq2seq.call_2(
					source_ids,
					source_one_hot,
					target_semi_hot[:, 1:],
					np.shape(target_ids)[1] - 1,
				)
				expected = target_ids[:, 1:]
				loss = compute_loss2(expected, predictions)
				# Only reported to the caller; never differentiated.
				printed_loss = compute_loss1(expected, predictions)

			model_variables = seq2seq.trainable_variables
			gradients = tape.gradient(loss, model_variables)
			optimizer3.apply_gradients(zip(gradients, model_variables))
			return printed_loss
		


		@tf.function
		def train_step4(inputs):
			"""One optimisation step on `compute_loss2` with the target fed to the model.

			The forward pass is the same call as in `train_step1`; gradients come
			from `compute_loss2`, while the returned value is `compute_loss1` on the
			same predictions. Decorated with @tf.function so the step is traced
			into a graph.

			Args:
				inputs: indexable batch; elements 0-3 are read as the model input,
					the target sequence, a third model input, and a fourth tensor
					that is cast to float32 before use.

			Returns:
				The value of `compute_loss1` for this batch (not the optimised loss).
			"""
			source_ids = inputs[0]
			target_ids = inputs[1]
			source_one_hot = inputs[2]
			target_semi_hot = tf.cast(inputs[3], tf.float32)

			with tf.GradientTape() as tape:
				predictions = seq2seq(
					source_ids,
					target_ids[:, :-1],
					source_one_hot,
					target_semi_hot[:, 1:],
				)
				expected = target_ids[:, 1:]
				loss = compute_loss2(expected, predictions)
				# Only reported to the caller; never differentiated.
				print_loss = compute_loss1(expected, predictions)

			model_variables = seq2seq.trainable_variables
			gradients = tape.gradient(loss, model_variables)
			optimizer4.apply_gradients(zip(gradients, model_variables))
			return print_loss

		# Epoch schedule for the current cycle `_`: cycles 0-5 run
		# crossentropy_training_loop, the remaining cycle runs
		# distribution_correction_loop.
		if _ < 6:
			print("crossentropy training...")

			# Cycles 0-1: 80 epochs with teacher_prob starting at 1.
			# Cycles 2-5: teacher_prob 0.5 and a shrinking budget of 24 - 2*cycle epochs.
			teacher_prob = 1
			n_epochs = 80

			if _ > 1:
				teacher_prob = 0.5
				n_epochs = 24 - _*2

			for epoch in range(n_epochs):
				# Elapsed time comes from a monotonic clock so it stays correct when an
				# epoch crosses midnight or the wall clock is adjusted (hour/minute/second
				# arithmetic on local time would go negative in that case).
				epoch_start = time.monotonic()

				# Handed to the training loop and averaged below; presumably the loop
				# appends the per-step losses to it.
				mean_pre_loss = []
				dataset = tf.data.Dataset.from_tensor_slices((sparse_x, sparse_y, lengths_x_onehot, semi_hot_y)).shuffle(150000).batch(BATCH_SIZE, drop_remainder = True)
				train_dist_dataset = tpu_strategy.experimental_distribute_dataset(dataset)

				# Every 20th epoch replace teacher_prob by its complement (1 <-> 0, while
				# 0.5 stays 0.5). In cycle 0 this only starts once epoch > 35.
				if (epoch > 35 or _ > 0) and (epoch+1) % 20 == 0:
					teacher_prob = 1 - teacher_prob

				crossentropy_training_loop(train_dist_dataset, teacher_prob, step1, step2, losses1, mean_losses1, mean_pre_loss)

				# Whole seconds, matching the integer value that was reported before.
				epoch_time = int(time.monotonic() - epoch_start)

				print("Cycle: ", _ , ", Epoch: ", epoch,", And current loss is:", np.mean(mean_pre_loss), " while teacher_prob is: ", teacher_prob, ", Epoch_time: ", epoch_time)

		else:

			print("distribution correction...")

			n_epochs = 300

			teacher_prob = 0.5

			for epoch in range(n_epochs):
				# Monotonic clock for the same reason as in the cross-entropy branch.
				epoch_start = time.monotonic()

				mean_pre_loss = []
				dataset = tf.data.Dataset.from_tensor_slices((sparse_x, sparse_y, lengths_x_onehot, semi_hot_y)).shuffle(150000).batch(BATCH_SIZE, drop_remainder = True)
				train_dist_dataset = tpu_strategy.experimental_distribute_dataset(dataset)

				distribution_correction_loop( train_dist_dataset, teacher_prob, step1, step2, losses1, mean_losses1, mean_pre_loss)

				epoch_time = int(time.monotonic() - epoch_start)

				print("Cycle: ", _ , ", Epoch: ", epoch,", And current loss is:", np.mean(mean_pre_loss), " while teacher_prob is: ", teacher_prob, ", Epoch_time: ", epoch_time)
				
		



In [0]:


# Report completion, then draw each loss history as its own figure
# (plot -> show -> close, so the two curves never share axes).
print("seq2seq is trained")
# mean_losses1 is the list handed to both training loops in the training cell.
plt.plot(mean_losses1)
plt.show()
plt.close()
# NOTE(review): mean_losses2 is not passed to either training loop in the
# training cell -- confirm it is populated elsewhere, otherwise this figure
# is empty (or the name is undefined on a fresh kernel).
plt.plot(mean_losses2)
plt.show()
plt.close()


# Sample prompts fed to the trained chatbot. Tokens are separated by single
# spaces, punctuation included, because talk_to_me splits phrases on " ".
my_questions = [
    "are you a human ?",
    "what is important to you ?",
    "hello ! how are you ?",
    "hello !",
    "what is your business here ?",
    "tell me something about humans",
    "what i should do now ?",
    "are you good guy ?",
    "who are you ?",
    "do you aim to harm humanity ?",
    "what is on your mind ?",
    "can you be my friend ?",
    "are you a machine ?",
    "how can i help you ?",
    "what can you say about yourself ?",
    "will artificial intelligence rule the world ?",
    "are you smart enough ?",
    "it is bad weather today",
    "how can you explain your existence ?",
]
                



def talk_to_me(test_phrases):
  """Print the chatbot's reply to every phrase in `test_phrases`.

  Each phrase is stripped of surrounding spaces and split on single spaces.
  The same token list is used for the vocabulary check and for prediction, so
  a phrase with leading or trailing spaces is no longer rejected because of an
  empty-string token. If a token is missing from `word_dict`, the reply is a
  fixed apology naming the first unknown token; otherwise the tokens are
  passed to `seq2seq.predict`.

  Args:
    test_phrases: sequence of phrase strings with space-separated tokens.

  Returns:
    None. For each phrase, prints the phrase, the reply (tokens each followed
    by a space) and a blank separator.
  """
  test_answers = []

  # Prediction is pinned to the CPU device.
  with tf.device("cpu:0"):
    for phrase in test_phrases:
      # Tokenise once so the vocabulary check sees exactly what the model sees.
      phrase_words = phrase.strip(" ").split(" ")
      unknown_word = next((word for word in phrase_words if word not in word_dict), None)

      if unknown_word is not None:
        test_answers.append(["sorry" , ",", "i", "dont", "know", "what", "'" + unknown_word + "'", "means."])
      else:
        test_answers.append(seq2seq.predict(phrase_words))

  for phrase, answer in zip(test_phrases, test_answers):
    print(phrase)
    # Every token is followed by a space, including the last one.
    print("".join(word + " " for word in answer))
    print("\n")



In [0]:
# Print the model's reply to every prompt in my_questions.
talk_to_me(my_questions)

In [0]:
# Single ad-hoc prompt; talk_to_me expects a list, hence the one-element list.
talk_to_me(["are you a artificial mind ?"])