In [4]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [5]:
! wget https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/mono/en.txt.gz

--2020-06-24 14:34:53--  https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/mono/en.txt.gz
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3663376519 (3.4G) [application/gzip]
Saving to: ‘en.txt.gz’


2020-06-24 14:35:59 (53.2 MB/s) - ‘en.txt.gz’ saved [3663376519/3663376519]



In [6]:
!gunzip -k en.txt.gz
!mkdir lines
!split -a 3 -l 100000 en.txt lines/lines-

In [7]:
import re

def _should_skip(line, min_length, max_length):
    """Whether a line should be skipped depending on the length."""
    return len(line) < min_length or len(line) > max_length


def create_example(previous_lines, line, file_id):
    """Creates examples with multi-line context
    The examples will include:
        file_id: the name of the file where these lines were obtained.
        response: the current line text
        context: the previous line text
        context/0: 2 lines before
        context/1: 3 lines before, etc.
    """
    example = {
        'file_id': file_id,
        'context': previous_lines[-1],
        'response': line,
    }
    example['file_id'] = file_id
    example['context'] = previous_lines[-1]

    extra_contexts = previous_lines[:-1]
    example.update({
        'context/{}'.format(i): context
        for i, context in enumerate(extra_contexts[::-1])
    })

    return example


def _preprocess_line(line):
    line = line.decode("utf-8")

    # Remove the first word if it is followed by colon (speaker names)
    # NOTE: this wont work if the speaker's name has more than one word
    line = re.sub('(?:^|(?:[.!?]\\s))(\\w+):', "", line)

    # Remove anything between brackets (corresponds to acoustic events).
    line = re.sub("[\\[(](.*?)[\\])]", "", line)

    # Strip blanks hyphens and line breaks
    line = line.strip(" -\n")

    return line


def _create_examples_from_file(file_name, min_length=0, max_length=20,
                               num_extra_contexts=5):

    previous_lines = []
    with open(file_name, 'rb') as f:
      for line in f :
        line = _preprocess_line(line)
        if not line:
            continue

        should_skip = _should_skip(
            line,
            min_length=min_length,
            max_length=max_length)

        if previous_lines:
            should_skip |= _should_skip(
                previous_lines[-1],
                min_length=min_length,
                max_length=max_length)

            if not should_skip:
                yield create_example(previous_lines, line, file_name)

        previous_lines.append(line)
        if len(previous_lines) > num_extra_contexts + 1:
            del previous_lines[0]


In [8]:
example = _create_examples_from_file(file_name='lines/lines-aaa')

count = 0
for i in example:
  count += 1
print('Found '+ str(count*5) + ' examples')
print(i)

Found 101770 examples
{'file_id': 'lines/lines-aaa', 'context': "We'll catch him.", 'response': 'Because we have to.', 'context/0': 'Oh, aye.', 'context/1': "Tell me, Sergeant, in your professional opinion, do you really believe you can catch this man if he doesn't want you to?", 'context/2': 'I have.', 'context/3': 'You seem to have made a great study of crime, Mr. Newspaperman.', 'context/4': "The ones we don't know about yet."}


In [9]:
in_comma = "'"

def remove_char(sentence):
  sent = sentence.replace('!', '')
  sent = sent.replace(',', '')
  sent = sent.replace(in_comma, '')
  sent = sent.replace('%', '')
  sent = sent.replace('-', '')
  sent = sent.replace('.', '')
  sent = sent.replace('?', '')
  sent = sent.replace('/', '')
  sent = sent.replace(':', '')
  sent = sent.replace(';', '')

  return sent

ex_sent = 'hello, me! why?'
print(remove_char(ex_sent))

hello me why


In [10]:
example = _create_examples_from_file(file_name='lines/lines-aaa')

inputs = []
responses = []
i = 0

for test in example:

  input_1 = test['context']
  input_2 = test['context/0']
  input_3 = test['context/1']
  input_4 = test['context/2']
  input_5 = test['context/3']
  response = test['response']
    
  inputs.append(remove_char(input_1))
  inputs.append(remove_char(input_2))
  inputs.append(remove_char(input_3))
  inputs.append(remove_char(input_4))
  inputs.append(remove_char(input_5))

  for j in range(5):
    responses.append('startsent' + ' ' +remove_char(response)+ ' ' + 'endsent')
    
  i += 5

print(inputs[101769], responses[101769])

You seem to have made a great study of crime Mr Newspaperman startsent Because we have to endsent


In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

oov_token = "<OOV>"
max_length = 25

tokenizer = Tokenizer(oov_token=oov_token)
tokenizer.fit_on_texts(inputs)
tokenizer.fit_on_texts(responses)

word_index = tokenizer.word_index
word_index['startsent'] = 0
word_index['endsent'] = len(word_index)+1
index_word = {word_index[word]:word for word in word_index}
print(index_word)
vocab_size = len(word_index) + 1
input_seq = tokenizer.texts_to_sequences(inputs)
response_seq = tokenizer.texts_to_sequences(responses)
input_seq_pad = pad_sequences(input_seq, maxlen = max_length ,padding = 'post', truncating = 'post')
response_seq_pad = pad_sequences(response_seq, maxlen = max_length, padding = 'post', truncating = 'post')

print(len(word_index))
print(inputs[0], responses[0])
print(input_seq_pad[0], response_seq_pad[0])
print(len(input_seq_pad))

6978
Im going to wait startsent Here we go endsent
[ 28  79   7 106   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0] [   0   21   40   20 6979    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]
101770


In [12]:
def preprocess(text_list):
  inputs = []
  for sent in text_list:
    inputs.append(remove_char(sent))
  input_seq = tokenizer.texts_to_sequences(inputs)
  input_seq_pad = pad_sequences(input_seq, maxlen = max_length ,padding = 'post', truncating = 'post')
  return input_seq_pad

In [13]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt

--2020-06-24 14:47:50--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.76.128, 2a00:1450:400c:c0b::80
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.76.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 347116733 (331M) [text/plain]
Saving to: ‘/tmp/glove.6B.100d.txt’


2020-06-24 14:47:53 (126 MB/s) - ‘/tmp/glove.6B.100d.txt’ saved [347116733/347116733]



In [14]:
import os
import numpy as np

embeddings_index = {} #initialize dictionary
f = open('/tmp/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [15]:
embedding_dim = 100

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

print(len(embeddings_matrix))
embeddings_matrix[6977]

6980


array([-1.41990006e-01, -3.21310014e-01, -3.81099999e-01, -1.28950000e-01,
        3.44550014e-01, -3.16929996e-01, -9.51619983e-01,  1.12909997e+00,
       -2.79879998e-02,  2.43589997e-01,  2.97149986e-01,  6.00820005e-01,
       -2.53589988e-01, -2.91130006e-01,  1.59280002e-01, -1.16090000e+00,
       -1.60209998e-03,  2.46810004e-01, -5.91790006e-02,  7.26859987e-01,
       -5.14699996e-01,  1.19000003e-01, -2.16429994e-01, -1.05260000e-01,
        3.49779993e-01, -7.14169979e-01, -1.94560006e-01, -9.02819991e-01,
       -6.82489991e-01, -7.83150017e-01,  4.66960013e-01,  6.63779974e-01,
       -2.34469995e-01, -3.23300004e-01, -3.13089997e-01,  3.96660000e-01,
       -2.18840003e-01,  3.17770004e-01, -2.30430007e-01, -4.44669992e-01,
        6.38899982e-01, -2.74749994e-01, -5.94479978e-01, -2.71160007e-01,
       -1.79100007e-01, -8.09289992e-01, -3.41439992e-01, -4.14759994e-01,
        3.27890009e-01, -6.44529998e-01,  8.20810020e-01,  7.23560005e-02,
       -6.28900006e-02,  

In [16]:
"""print(input_seq_pad.shape)
X_train = input_seq_pad[:500]
y_train = response_seq_pad[:500]"""


X_train = []
y_train = []
for j in range(40000):
    X_train.append(input_seq_pad[j])
    y_train.append(response_seq_pad[j])

X_train = np.array(X_train)
y_train = np.array(y_train)

print(embedding_dim)
print(X_train.shape)
print(y_train.shape)

dataset = (inputs, responses)

100
(40000, 25)
(40000, 25)


In [17]:
import os
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, GRU, Dense 

max_sequence_len = 25
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(1024)
train_dataset = train_dataset.batch(batch_size, drop_remainder=True)
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [18]:
class Encoder(tf.keras.Model):
    def __init__(self, hidden_size=1024, max_sequence_len=25, batch_size=batch_size, embedding_dim=100, vocab_size=vocab_size+1):
        super(Encoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.max_sequence_len = max_sequence_len
        self.hidden_size = hidden_size
        self.batch_size = batch_size

        self.embedding_layer = Embedding(
            input_dim=self.vocab_size, output_dim=self.embedding_dim, weights=[embeddings_matrix], trainable=False)
        self.GRU_1 = GRU(units=hidden_size, return_sequences=True,recurrent_initializer='glorot_uniform')
        self.GRU_2 = GRU(units=hidden_size,
                         return_sequences=True, return_state=True,recurrent_initializer='glorot_uniform')

    def initial_hidden_state(self):
        return tf.zeros(shape=(self.batch_size, self.hidden_size))

    def call(self, x, initial_state, training=False):
        x = self.embedding_layer(x)
        x = self.GRU_1(x, initial_state=initial_state)
        x, hidden_state = self.GRU_2(x)
        return x, hidden_state

In [19]:
class Attention(tf.keras.Model):
    def __init__(self, hidden_size=256):
        super(Attention, self).__init__()
        self.fc1 = Dense(units=hidden_size)
        self.fc2 = Dense(units=hidden_size)
        self.fc3 = Dense(units=1)

    def call(self, encoder_output, hidden_state, training=False):
        '''hidden_state : h(t-1)'''
        y_hidden_state = tf.expand_dims(hidden_state, axis=1)
        y_hidden_state = self.fc1(y_hidden_state)
        y_enc_out = self.fc2(encoder_output)

        y = tf.keras.backend.tanh(y_enc_out + y_hidden_state)
        attention_score = self.fc3(y)
        attention_weights = tf.keras.backend.softmax(attention_score, axis=1)

        context_vector = tf.multiply(encoder_output, attention_weights)
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [20]:
class Decoder(tf.keras.Model):
    def __init__(self, hidden_size=1024, max_sequence_len=25, batch_size=batch_size, embedding_dim=100, vocab_size=vocab_size+1):
        super(Decoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.max_sequence_len = max_sequence_len
        self.hidden_size = hidden_size
        self.batch_size = batch_size
    
        self.embedding_layer = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim, weights=[embeddings_matrix], trainable=False)
        self.GRU = GRU(units=hidden_size,
                       return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
        self.attention = Attention(hidden_size=self.hidden_size)
        self.fc = Dense(units=self.vocab_size)

    def initial_hidden_state(self):
        return tf.zeros(shape=(self.batch_size, self.hidden_size))

    def call(self, x, encoder_output, hidden_state, training=False):
        x = self.embedding_layer(x)
        context_vector, attention_weights = self.attention(
            encoder_output, hidden_state, training=training)
        contect_vector = tf.expand_dims(context_vector, axis=1)
        x = tf.concat([x, contect_vector], axis=-1)
        x, curr_hidden_state = self.GRU(x)
        x = tf.reshape(x, shape=[self.batch_size, -1])
        x = self.fc(x)
        return x, curr_hidden_state, attention_weights

In [21]:
loss_object = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-4)
train_accuracy = tf.metrics.SparseCategoricalAccuracy()

def loss_function(y_true, y_pred):
    loss = loss_object(y_true, y_pred)
    mask = 1 - tf.cast(tf.equal(y_true, 0), 'float32')
    return tf.reduce_mean(loss * mask)

In [22]:
def training_step(inputs, responses):    
    with tf.GradientTape() as Tape:
        encoder_init_state = encoder.initial_hidden_state()
        encoder_output, encoder_hidden_state = encoder(inputs, encoder_init_state, training=True)
        decoder_hidden = encoder_hidden_state
        loss = 0
        acc = []
        current_word = tf.expand_dims(responses[:, 0], axis=1)
        for word_idx in range(1, max_sequence_len):
            next_word = responses[:, word_idx]
            logits, decoder_hidden, attention_weights = decoder(current_word, encoder_output, decoder_hidden)
            loss += loss_function(next_word, logits)
            acc.append(train_accuracy(next_word, logits))
            current_word = tf.expand_dims(next_word, axis=1)
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = Tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss, tf.reduce_mean(acc)

encoder = Encoder()
decoder = Decoder()

In [23]:
epochs = 20
NUM_SAMPLES = len(X_train)
num_steps = NUM_SAMPLES // batch_size

for epoch in range(1, epochs + 1):
    print(f'Epoch {epoch}/{epochs}')
    ep_loss = []
    ep_acc = []
    progbar = tf.keras.utils.Progbar(target=num_steps, stateful_metrics=[
                                     'curr_loss', 'curr_accuracy'], unit_name='batch')

    for step, example in enumerate(train_dataset):
        inputs = example[0]
        responses = example[1]
        loss, acc = training_step(inputs, responses)
        loss /= responses.shape[1]
        ep_loss.append(loss)
        ep_acc.append(acc)
        progbar.update(
            step + 1, values=[('curr_loss', loss), ('curr_accuracy', acc)])

    print(f'Metrics after epoch {epoch} : Loss => {np.mean(ep_loss):.3f} | Accuracy => {np.mean(ep_acc):.3f}')


Epoch 1/20
Metrics after epoch 1 : Loss => 0.502 | Accuracy => 0.781
Epoch 2/20
Metrics after epoch 2 : Loss => 0.406 | Accuracy => 0.885
Epoch 3/20
Metrics after epoch 3 : Loss => 0.359 | Accuracy => 0.896
Epoch 4/20
Metrics after epoch 4 : Loss => 0.324 | Accuracy => 0.902
Epoch 5/20
Metrics after epoch 5 : Loss => 0.293 | Accuracy => 0.906
Epoch 6/20
Metrics after epoch 6 : Loss => 0.263 | Accuracy => 0.910
Epoch 7/20
Metrics after epoch 7 : Loss => 0.235 | Accuracy => 0.914
Epoch 8/20
Metrics after epoch 8 : Loss => 0.207 | Accuracy => 0.918
Epoch 9/20
Metrics after epoch 9 : Loss => 0.182 | Accuracy => 0.921
Epoch 10/20
Metrics after epoch 10 : Loss => 0.161 | Accuracy => 0.925
Epoch 11/20
Metrics after epoch 11 : Loss => 0.145 | Accuracy => 0.927
Epoch 12/20
Metrics after epoch 12 : Loss => 0.132 | Accuracy => 0.930
Epoch 13/20
Metrics after epoch 13 : Loss => 0.122 | Accuracy => 0.933
Epoch 14/20
Metrics after epoch 14 : Loss => 0.114 | Accuracy => 0.935
Epoch 15/20
Metrics afte

In [24]:
import matplotlib.pyplot as plt

loss = []
accuracy = []

for i in ep_loss:
  temp = float(i)
  loss_temp.append(temp)
  loss.append(np.mean(loss_temp))
  
plt.plot(loss)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.show()

NameError: ignored

In [25]:
def translate_sentence(sentence):
    sentence = preprocess([sentence])
    enc_init = tf.zeros(shape=[1, 1024])
    enc_out, enc_hidden = encoder(sentence, enc_init)
    decoder.batch_size = 1
    tokenizer.index_word[0] = ''
    decoded = []
    att = []
    current_word = tf.expand_dims([word_index['startsent']], axis=0) 
    decoder_hidden = enc_hidden
    for word_idx in range(1, max_sequence_len):
        logits, decoder_hidden, attention_weights = decoder(current_word, enc_out, decoder_hidden)
        decoded_idx = np.argmax(logits)
        if index_word[decoded_idx] == 'endsent':
            break
        decoded.append(tokenizer.index_word[decoded_idx])
        att.append(attention_weights.numpy().squeeze())
        current_word = tf.expand_dims([decoded_idx], axis=0)
    return ' '.join(decoded), att


In [26]:

sentences = ['what do you want ?',
             'are you mad ?',
             'who are you?',
             'the exam is going to be hard', 
             'do you want to play with me ?',
             'i love you', 
             'how are you doing ?', 
             'i should tell you that i am quite dangerous',
             'PLease help me',
             'I am happy',
             'Do you have any idea of how to use it ?',
             'Am I mad?']

for inp_sentence in sentences:
    inp_array = inp_sentence.split()
    inp_len = len(inp_sentence.split())
    trans_sentence, attention_weights = translate_sentence(inp_sentence)
    trans_array = trans_sentence.split()
    trans_len = len(trans_array)
    print('INPUT : ', inp_sentence)
    print('RESPONSE : ', trans_sentence)
    print('-'*30)

INPUT :  what do you want ?
RESPONSE :  i was passing
------------------------------
INPUT :  are you mad ?
RESPONSE :  why do you close
------------------------------
INPUT :  who are you?
RESPONSE :  where do i want to leave
------------------------------
INPUT :  the exam is going to be hard
RESPONSE :  get up
------------------------------
INPUT :  do you want to play with me ?
RESPONSE :  i do not know
------------------------------
INPUT :  i love you
RESPONSE :  i always have
------------------------------
INPUT :  how are you doing ?
RESPONSE :  at zero
------------------------------
INPUT :  i should tell you that i am quite dangerous
RESPONSE :  do not worry
------------------------------
INPUT :  PLease help me
RESPONSE :  they lie
------------------------------
INPUT :  I am happy
RESPONSE :  i go from arsha
------------------------------
INPUT :  Do you have any idea of how to use it ?
RESPONSE :  and the matches
------------------------------
INPUT :  Am I mad?
RESPONSE :