In [4]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import pickle as pkl
import unicodedata
import spacy
from spacy_langdetect import LanguageDetector
import time
import os
import sys
import re

In [None]:
human_lines = []
robot_lines = []

In [None]:
data_path_human = "/content/drive/MyDrive/seminar/seminar/rDany/human_text.txt"
data_path_robot = "/content/drive/MyDrive/seminar/seminar/rDany/robot_text.txt"


with open(data_path_human, "r") as f:
    human_lines += f.read().split("\n")
    
with open(data_path_robot, "r") as f:
    robot_lines += f.read().split("\n")
print(human_lines[1])
print(robot_lines[1])

In [None]:
with open("/content/drive/MyDrive/seminar/seminar/input_docs.pkl", "rb") as handle:
  human_lines += pkl.load(handle)
with open("/content/drive/MyDrive/seminar/seminar/target_docs.pkl", "rb") as handle:
  robot_lines += pkl.load(handle)

In [None]:

# function to remove accented characters
def remove_accented_chars(text):
    new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return new_text

for i, line in tqdm(enumerate(human_lines)):
  human_lines[i] = remove_accented_chars(line)

for i, line in tqdm(enumerate(robot_lines)):
  robot_lines[i] = remove_accented_chars(line) 

In [None]:
english_human_lines = []
english_robot_lines = []



In [None]:
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

for i in tqdm(range(len(human_lines))):
  human_doc = nlp(human_lines[i])
  robot_doc = nlp(robot_lines[i])

  if human_doc._.language['language'] == 'en' and robot_doc._.language['language'] == 'en':
    english_human_lines.append(human_lines[i])
    english_robot_lines.append(robot_lines[i])
print(len(english_human_lines), len(english_robot_lines))
human_lines = english_human_lines
robot_lines = english_robot_lines
del english_human_lines
del english_robot_lines

In [None]:
#with open("/content/drive/MyDrive/seminar/english_human.pkl", "wb") as handle:
#  pkl.dump(human_lines, handle)

#with open("/content/drive/MyDrive/seminar/english_robot.pkl", "wb") as handle:
#  pkl.dump(robot_lines, handle)

In [None]:
!ls /content/drive/MyDrive/seminar
!mv /content/drive/MyDrive/seminar/english_human\ \(1\).pkl /content/drive/MyDrive/seminar/english_human.pkl
!mv /content/drive/MyDrive/seminar/english_robot\ \(1\).pkl /content/drive/MyDrive/seminar/english_robot.pkl

In [5]:
with open("../from_drive/english_human.pkl", "rb") as handle:
    human_lines = pkl.load(handle)

with open("../from_drive/english_robot.pkl", "rb") as handle:
    robot_lines = pkl.load(handle)

In [6]:
human_lines[3002], robot_lines[3002]

('What the hell is that sound?', 'Vengeance.')

In [7]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


test = "Hey I'm Yann, how're you and how's it going ? That's interesting: I'd love to hear more about it."
print(decontracted(test))

Hey I am Yann, how are you and how is it going ? That is interesting: I would love to hear more about it.


In [8]:

for i, line in tqdm(enumerate(human_lines)):
    human_lines[i] = decontracted(line)

for i, line in tqdm(enumerate(robot_lines)):
    robot_lines[i] = decontracted(line)

64496it [00:02, 26887.23it/s]
64496it [00:02, 30573.31it/s]


# Spell Check

In [9]:
human_lines = [re.sub(r"\[\w+\]",'hi',line) for line in human_lines]
human_lines = [" ".join(re.findall(r"\w+",line)) for line in human_lines]
robot_lines = [re.sub(r"\[\w+\]",'',line) for line in robot_lines]
robot_lines = [" ".join(re.findall(r"\w+",line)) for line in robot_lines]
# grouping lines by response pair
pairs = list(zip(human_lines,robot_lines))
#random.shuffle(pairs)
len(pairs)

64496

In [10]:
input_docs = []
target_docs = []
input_tokens = set()
target_tokens = set()

for line in pairs:
    input_doc, target_doc = line[0], line[1]
    # Appending each input sentence to input_docs
    input_docs.append(input_doc)
    # Splitting words from punctuation  
    target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
    # Redefine target_doc below and append it to target_docs
    target_doc = '<START> ' + target_doc + ' <END>'
    target_docs.append(target_doc)
  
    # Now we split up each sentence into words and add each unique word to our vocabulary set
    for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
        if token not in input_tokens:
            input_tokens.add(token)
    for token in target_doc.split():
        if token not in target_tokens:
            target_tokens.add(token)
input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
num_tokens = len(set(input_tokens + target_tokens)) + 2 # [UNK]
pairs = list(zip(input_docs, target_docs))

In [11]:
pairs[3002]

('What the hell is that sound', '<START> Vengeance <END>')

In [12]:
vocab_size = 30000 + 1
units = 1024
embedding_dim = 100

In [13]:


tokenizer = Tokenizer(filters='', oov_token="<unk>")
tokenizer.fit_on_texts(input_docs + target_docs)

tokenizer.num_words = vocab_size
input_docs_tokenized = tokenizer.texts_to_sequences(input_docs)
target_docs_tokenized = tokenizer.texts_to_sequences(target_docs)

In [14]:
final_in_docs_tokenized = []
final_tar_docs_tokenized = []

for i in range(len(input_docs_tokenized)):
  if len(input_docs_tokenized[i]) <= 15 and len(target_docs_tokenized[i]) <= 15:
    final_in_docs_tokenized.append(input_docs_tokenized[i])
    final_tar_docs_tokenized.append(target_docs_tokenized[i])
len(final_in_docs_tokenized), len(final_tar_docs_tokenized)

(20727, 20727)

In [None]:
input_docs_tokenized[0]

In [15]:
max_len = 0
for r in final_tar_docs_tokenized:
    if len(r) > max_len:
        max_len = len(r)
  
max_len

15

In [16]:

X = final_in_docs_tokenized
Y = final_tar_docs_tokenized

X = tf.keras.preprocessing.sequence.pad_sequences(X, padding='pre')
Y = tf.keras.preprocessing.sequence.pad_sequences(Y, padding='pre')

del final_in_docs_tokenized
del final_tar_docs_tokenized
del input_docs
del target_docs
del input_docs_tokenized
del target_docs_tokenized
#del human_lines
#del robot_lines

In [17]:
X.shape, Y.shape

((20727, 15), (20727, 15))

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

In [19]:
print(X_train.shape, X_test.shape)

(18654, 15) (2073, 15)


In [20]:
BUFFER_SIZE = len(X_train)
BATCH_SIZE = 64
steps_per_epoch = len(X_train)//BATCH_SIZE

In [21]:
with open("../from_drive/enwiki_20180420_100d.txt", "r") as f:
    dict_w2v = {}
    problems = []
    
    for line in tqdm(f):
        
        tokens = line.split()

        try:
          word = tokens[0]
          vector = np.array(tokens[1:], dtype=np.float32)
        except:
          pass
        
        if vector.shape[0] == embedding_dim:
            dict_w2v[word] = vector
        else:
            problems.append({word: vector})

4530031it [02:45, 27315.19it/s]


In [22]:
len(problems)

45

In [23]:
tokenizer.word_index[list(tokenizer.word_index.keys())[0]]
len(dict_w2v)

4529821

In [24]:


hits = 0
misses = 0
embedding_matrix = np.zeros((vocab_size, embedding_dim))
tokens = list(tokenizer.word_index.keys())[:vocab_size-1]

for token in tqdm(tokens):
    
    embedding = dict_w2v.get(token)
    
    if embedding is not None:
        embedding_matrix[tokenizer.word_index[token]] = embedding
        hits += 1
    else:
        misses += 1
embedding_matrix[tokenizer.word_index["<unk>"]] = np.random.rand(embedding_dim)
print(hits, misses)

100%|██████████| 30000/30000 [00:00<00:00, 70045.16it/s] 

29064 936





In [25]:
import sys
print(sys.getsizeof(dict_w2v))
del dict_w2v

167772256


In [26]:
print(f"Hits: {hits}")
print(f"Missed: {misses}")
len(tokenizer.word_index)

Hits: 29064
Missed: 936


46958

In [27]:
len(tokenizer.word_index)

46958

In [28]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, 
                                                                                            drop_remainder=True)

In [29]:
def max_len(sentence):
    return max(len(s) for s in sentence)

max_length_input = max_len(X_train)
max_length_output = max_len(Y_train)

In [30]:
for example in dataset.take(1):
    example_x, example_y = example
    
print(example_x.shape) 
print(example_y.shape) 

(64, 15)
(64, 15)


In [31]:
type(example_x)

tensorflow.python.framework.ops.EagerTensor

In [32]:
class EncoderAttention(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dims, hidden_units):
        super().__init__()
        self.hidden_units = hidden_units
        self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dims, tf.keras.initializers.Constant(embedding_matrix),
                trainable=True)
        self.lstm_layer = tf.keras.layers.LSTM(hidden_units, return_sequences=True, 
                                                     return_state=True ) # We need the lstm outputs 
                                                                         # to calculate attention!
    
    def initialize_hidden_state(self): 
        return [tf.zeros((BATCH_SIZE, self.hidden_units)), 
                tf.zeros((BATCH_SIZE, self.hidden_units))] 
                                                               
    def call(self, inputs, hidden_state):
        embedding = self.embedding_layer(inputs)
        output, h_state, c_state = self.lstm_layer(embedding, initial_state = hidden_state)
        return output, h_state, c_state


encoder = EncoderAttention(vocab_size, embedding_dim, units)

In [33]:
# Test  the encoder
sample_initial_state = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_x, sample_initial_state)
print(sample_output.shape)
print(sample_h.shape)

(64, 15, 1024)
(64, 1024)


In [34]:

class DecoderAttention(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super().__init__()
        
        
        self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim, tf.keras.initializers.Constant(embedding_matrix),
                trainable=True)

        self.lstm_cell = tf.keras.layers.LSTMCell(hidden_units)

        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        self.attention_mechanism = tfa.seq2seq.LuongAttention(hidden_units, memory_sequence_length=BATCH_SIZE*[len(X_train[0])]) #N

        self.attention_cell = tfa.seq2seq.AttentionWrapper(cell=self.lstm_cell, # N
                                      attention_mechanism=self.attention_mechanism, 
                                      attention_layer_size=hidden_units)

        self.output_layer = tf.keras.layers.Dense(vocab_size)
        self.decoder = tfa.seq2seq.BasicDecoder(self.attention_cell, # N
                                                sampler=self.sampler, 
                                                output_layer=self.output_layer)

    def build_initial_state(self, batch_size, encoder_state): #N
        decoder_initial_state = self.attention_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state


    def call(self, inputs, initial_state):
        embedding = self.embedding_layer(inputs)
        outputs, _, _ = self.decoder(embedding, initial_state=initial_state, sequence_length=BATCH_SIZE*[len(Y_train[0])-1])
        return outputs

decoder = DecoderAttention(vocab_size, embedding_dim, units)

In [35]:
# Test the decoder
sample_y = tf.random.uniform((BATCH_SIZE, len(X_train)))
decoder.attention_mechanism.setup_memory(sample_output) # Attention needs the last output of the Encoder
                                                        # as starting point
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c]) # N


sample_decoder_output = decoder(example_y, initial_state)

print(sample_decoder_output.rnn_output.shape)

(64, 14, 30001)


In [36]:
sample_output.shape

TensorShape([64, 15, 1024])

In [37]:
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)  # mask and loss have to have the same Tensor type
    loss = mask * loss
    loss = tf.reduce_mean(loss) # you need one loss scalar number for the mini batch
    return loss 

In [42]:
print(tf.config.list_physical_devices("GPU"))
import time

[]


In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    encoder_hidden = encoder.initialize_hidden_state() # Every epoch we use a zero Tensor matrix
    epoch_loss = 0

    for (batch, (input, target)) in enumerate(dataset.take(steps_per_epoch)):
        with tf.GradientTape() as tape:
            # Pass the input through the encoder 
            encoder_output, encoder_h, encoder_c = encoder(input, encoder_hidden)
            decoder_input = target[ : , :-1 ] # Ignore <end> token
            real = target[ : , 1: ]         # ignore <start> token
            # The encoder output, encoder hidden state and the decoder input
            # is passed to the decoder
            decoder.attention_mechanism.setup_memory(encoder_output) # N
            decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [encoder_h, encoder_c]) # N
            decoder_output = decoder(decoder_input, decoder_initial_state) 
            logits = decoder_output.rnn_output
            batch_loss = loss_function(real, logits)

        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        epoch_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      epoch_loss / steps_per_epoch))
    print('Time {:.4f} sec\n'.format(time.time() - start))

In [43]:
encoder_model_save_path = "../chatbot/model_v1/model_2_v3/encoder/weights.ckpt"
decoder_model_save_path = "../chatbot/model_v1/model_2_v3/decoder/weights.ckpt"

In [None]:
with open("/content/drive/MyDrive/seminar/seminar/models/full_models/model_2_v3/tokenizer.pkl", "wb") as handle:
  pkl.dump(tokenizer, handle, protocol=pkl.HIGHEST_PROTOCOL)

with open("/content/drive/MyDrive/seminar/seminar/models/full_models/model_2_v3/encoder_embedding_layer.pkl", "wb") as handle:
  pkl.dump(tf.convert_to_tensor(encoder.embedding_layer.variables).numpy(), handle, protocol=pkl.HIGHEST_PROTOCOL)

with open("/content/drive/MyDrive/seminar/seminar/models/full_models/model_2_v3/decoder_embedding_layer.pkl", "wb") as handle:
  pkl.dump(tf.convert_to_tensor(decoder.embedding_layer.variables).numpy(), handle, protocol=pkl.HIGHEST_PROTOCOL)

with open("/content/drive/MyDrive/seminar/seminar/models/full_models/model_2_v3/hyper_params.yaml", "w") as handle:
  hyper_params = """
    epochs: 70
    batch-size: 64
    optimizer: adam
    vocab-size: 30000
  """
  handle.write(hyper_params)

encoder.save_weights(encoder_model_save_path)
decoder.save_weights(decoder_model_save_path)

In [None]:
tf.compat.v1.enable_eager_execution()
tf.convert_to_tensor(encoder.embedding_layer.variables).numpy()

In [None]:
#encoder.save_weights(encoder_model_save_path)
#decoder.save_weights(decoder_model_save_path)

In [None]:

#encoder.save(encoder_model_save_path, save_format="tf")
#decoder.save(decoder_model_save_path, save_format="tf")

In [44]:
#encoder = tf.keras.models.load_model(encoder_model_save_path)
#decoder = tf.keras.models.load_model(decoder_checkpoint_path)
encoder.load_weights(encoder_model_save_path)
decoder.load_weights(decoder_model_save_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x163685b80>

In [45]:
import unicodedata
def preprocess_sentence(w):
    w = w.lower().strip()
    # This next line is confusing!
    # We normalize unicode data, umlauts will be converted to normal letters
    #w = w.replace("ß", "ss")
    #w = ''.join(c for c in unicodedata.normalize('NFD', w) if unicodedata.category(c) != 'Mn')

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"\[\w+\]",'', w)
    w = " ".join(re.findall(r"\w+",w))
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!]+", " ", w)
    w = w.strip()
    w = decontracted(w)

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [48]:
def reply(sentence, preprocess=True):

    if preprocess:
        sentence = preprocess_sentence(sentence)
        sentence_tokens = tokenizer.texts_to_sequences([sentence])
        input = tf.keras.preprocessing.sequence.pad_sequences(sentence_tokens, maxlen=max_length_input, padding='post')
    else:
        input = sentence
    input = tf.convert_to_tensor(input)

    encoder_hidden = [tf.zeros((1, units)), tf.zeros((1, units))]
    encoder_output, encoder_h, encoder_c = encoder(input, encoder_hidden)
    start_token = tf.convert_to_tensor([tokenizer.word_index['<start>']])
    end_token = tokenizer.word_index['<end>']

    # This time we use the greedy sampler because we want the word with the highest probability!
    # We are not generating new text, where a probability sampling would be better
    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Instantiate a BasicDecoder object
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.attention_cell, # N
                                                sampler=greedy_sampler, output_layer=decoder.output_layer)
    # Setup Memory in decoder stack
    decoder.attention_mechanism.setup_memory(encoder_output) # N

    # set decoder_initial_state
    decoder_initial_state = decoder.build_initial_state(batch_size=1, encoder_state=[encoder_h, encoder_c]) # N

    ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
    ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
    ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function

    decoder_embedding_matrix = decoder.embedding_layer.variables[0]

    outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_token, end_token= end_token, initial_state=decoder_initial_state)

    result_sequence  = outputs.sample_id.numpy()
    return tokenizer.sequences_to_texts(result_sequence)[0]
reply("Hi")

'oh that here look at this that ai not fun at that <end>'

In [49]:
tfa.__version__

'0.12.0'

In [50]:
reply("How about we go hiking?")

'no hey hey that is one long from home as long as living <end>'

In [None]:
decoder.embedding_layer.variables

In [None]:
encoder.load_weights(encoder_checkpoint_path)
decoder.load_weights(decoder_checkpoint_path)