In [8]:
# Importing the packages 
import tensorflow as tf

# Zip file reading 
import zipfile

# Array math 
import numpy as np 

# Tensorflow text 
import tensorflow_text as tf_text

In [9]:
# Report the tf and tf_text versions this notebook runs against, one per line
print(tf.__version__, tf_text.__version__, sep="\n")

# Report whether TensorFlow can see at least one GPU device
print(
    "GPU is",
    "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE",
)

2.10.0
2.10.0
GPU is available


# Reading the data 

In [10]:
# Defining the path to the data
path_to_file = 'input/lit-eng.zip'

# Loading the data. The context manager closes the archive's file handle as
#  soon as the text has been read, instead of leaving it open for the lifetime
#  of the kernel.
with zipfile.ZipFile(path_to_file, 'r') as archive:
    text = archive.read('lit.txt').decode('utf-8')

# Splitting the data into target and context: every line is split on tabs
lines = text.splitlines()
pairs = [line.split('\t') for line in lines]

# Creating the context and the target data:
#  field 0 of each line is used as the target, field 1 as the context
target_raw = np.array([x[0] for x in pairs])
context_raw = np.array([x[1] for x in pairs])

In [11]:
# Printing the amount of data we have
print('Amount of examples:', len(target_raw))

# Printing out some target and context data.
# Indices 0, -1 and -2 are shown: the first pair followed by the last two.
for idx in (0, -1, -2):
    print('Target:', target_raw[idx])
    print('Context:', context_raw[idx])
    print('---')

Amount of examples: 2140
Target: Go on.
Context: Tęsk.
---
Target: January, February, March, April, May, June, July, August, September, October, November and December are the twelve months of the year.
Context: Metus sudaro dvylika mėnesių: sausis, vasaris, kovas, balandis, gegužė, birželis, liepa, rugpjūtis, rugsėjis, spalis, lapkritis ir gruodis.
---
Target: Before I get out of bed, I spend a little time thinking about what I'll be doing the rest of the day.
Context: Prieš atsikeldamas iš lovos, aš praleidžiu kažkiek laiko galvodamas apie tai, ką veiksiu likusią dienos dalį.
---


# Creating a TF dataset

In [12]:
BUFFER_SIZE = len(context_raw)
BATCH_SIZE = 8

# Fixed seed so that the same examples land in the train / validation split on
# every run of the notebook.
SPLIT_SEED = 42

# Boolean mask: each example goes to training with probability 0.8
is_train = np.random.default_rng(SPLIT_SEED).uniform(size=(len(target_raw),)) < 0.8

# Training pairs (context, target), shuffled and batched
train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[is_train], target_raw[is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))

# Validation pairs: the examples not selected by the mask
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))

In [13]:
# Showing one batch of the data.
# take(1) yields at most one batch, so the body runs once; the two example
# tensors stay bound afterwards and are reused by later cells.
for example_context_strings, example_target_strings in train_raw.take(1):
  print(example_context_strings, example_target_strings, sep='\n\n')

tf.Tensor(
[b'Ne\xc5\xbeaiskite su ugnimi.'
 b'Mes valgome, kad gyventume, o negyvename, kad valgytume.'
 b'Kartais svajon\xc4\x97s i\xc5\xa1sipildo.'
 b'De\xc5\xa1imt met\xc5\xb3 yra ilgas laikas.'
 b'\xc5\xa0unis yra i\xc5\xa1tikimi.' b'Ar tu nebuvai pavargusi?'
 b'Maniau, sakiai, kad jis tavo.' b'Kambaryje nieko n\xc4\x97ra.'], shape=(8,), dtype=string)

tf.Tensor(
[b"Don't play with fire." b'We eat to live, not live to eat.'
 b'Dreams sometimes come true.' b'Ten years is a long time.'
 b'Dogs are faithful.' b"Weren't you tired?"
 b'I thought you said it was yours.' b"There's no one in the room."], shape=(8,), dtype=string)


# Text preprocessing

In [14]:
def tf_lower_and_split_punct(text: str) -> str:
  """
  Standardizes raw text before it is tokenized.

  The text is Unicode-normalized (NFKD), lowercased and reduced to spaces,
    the letters a-z and the punctuation marks . ? ! , ¿ ; every kept
    punctuation mark is then surrounded by spaces and the result is wrapped
    in [START] / [END] markers.

    Args:
        text: The text to standardize. The TensorFlow string ops used here
          return tensors, so the result is a string tensor even when a plain
          Python string is passed in.

    Returns:
        The standardized text.
  """
  # NFKD is a compatibility decomposition form; normalizing first puts the
  #  text into a standard form before any filtering happens.
  normalized = tf_text.normalize_utf8(text, 'NFKD')
  lowered = tf.strings.lower(normalized)

  # Drop everything except space, a to z, and the selected punctuation.
  filtered = tf.strings.regex_replace(lowered, '[^ a-z.?!,¿]', '')

  # Put a space on both sides of every punctuation mark so it can be split
  #  off from the neighbouring word.
  spaced = tf.strings.regex_replace(filtered, '[.?!,¿]', r' \0 ')

  # Remove the leading / trailing whitespace before adding the markers.
  stripped = tf.strings.strip(spaced)

  # Wrap the text in the start and end tokens.
  return tf.strings.join(['[START]', stripped, '[END]'], separator=' ')

In [15]:
# Showing an example: the standardizer lowercases the text, keeps only a-z,
#  spaces and the selected punctuation, and adds the [START] / [END] markers.
#  Note in the output that the diacritic of "š" is lost ("lietuviskas").
example_text = 'Lietuviškas tekstas'
print(tf_lower_and_split_punct(example_text))

tf.Tensor(b'[START] lietuviskas tekstas [END]', shape=(), dtype=string)


# Text vectorization 

In [18]:
# Defining the maximum number of unique words that will be kept in a vocabulary
max_vocab_size = 10000

# A layer which vectorizes the context text: it applies the custom
#  standardization function and returns ragged (variable length) token rows
context_text_processor = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_size,
    standardize=tf_lower_and_split_punct,
    ragged=True,
)

In [22]:
# adapt() plays a role similar to model.fit(): it fits the layer (here, its
#  vocabulary) on the data passed to it. Only the context half of every
#  training pair is fed in.
context_text_processor.adapt(train_raw.map(lambda context, _target: context))

# Here are the first 10 words from the vocabulary:
context_text_processor.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', 'as', ',', '?', 'tomas', 'tai']

In [23]:
# Now let's build the target text processor with the same settings as the
#  context one
target_text_processor = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_size,
    standardize=tf_lower_and_split_punct,
    ragged=True,
)

# Fit it on the target half of every training pair and show the first 10
#  vocabulary entries
target_text_processor.adapt(train_raw.map(lambda _context, target: target))
target_text_processor.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', 'i', 'the', 'you', 'to', '?']

In [27]:
# Printing out once again the first batch of the context strings
#  (.numpy() exposes them as raw UTF-8 byte strings, hence the \x.. escapes)
print(example_context_strings.numpy())

[b'Ne\xc5\xbeaiskite su ugnimi.'
 b'Mes valgome, kad gyventume, o negyvename, kad valgytume.'
 b'Kartais svajon\xc4\x97s i\xc5\xa1sipildo.'
 b'De\xc5\xa1imt met\xc5\xb3 yra ilgas laikas.'
 b'\xc5\xa0unis yra i\xc5\xa1tikimi.' b'Ar tu nebuvai pavargusi?'
 b'Maniau, sakiai, kad jis tavo.' b'Kambaryje nieko n\xc4\x97ra.']


In [28]:
# The context text processor will convert the context strings into tokens.
#  The result is a ragged tensor with one row of token ids per sentence; every
#  row starts with 2 ([START]) and ends with 3 ([END]).
example_tokens = context_text_processor(example_context_strings)
example_tokens

<tf.RaggedTensor [[2, 1714, 22, 308, 4, 3],
 [2, 23, 975, 6, 14, 2258, 6, 363, 1830, 6, 14, 972, 4, 3],
 [2, 276, 1128, 2178, 4, 3], [2, 806, 111, 11, 2237, 216, 4, 3],
 [2, 1168, 11, 2169, 4, 3], [2, 10, 13, 663, 202, 7, 3],
 [2, 214, 6, 556, 6, 14, 12, 38, 4, 3], [2, 181, 46, 39, 4, 3]]>

In [30]:
def process_text(context: str, target: str):
  """
  Turns a batch of raw (context, target) strings into model inputs and labels.

    Args:
        context: The raw context strings (one batch)
        target: The raw target strings (one batch)

    Returns:
        A tuple ((context tokens, target input tokens), target output tokens).
        The target input drops the last token of every row and the target
        output drops the first one, so the output at each position is the
        token that follows the input at that position.
  """
  # Tokenize both sides; the context is padded to a dense tensor right away,
  #  the target stays ragged until it has been sliced.
  context_ids = context_text_processor(context).to_tensor()
  target_ids = target_text_processor(target)

  # Slice first, then pad: all tokens but the last are the decoder input,
  #  all tokens but the first are the labels.
  decoder_input = target_ids[:, :-1].to_tensor()
  decoder_labels = target_ids[:, 1:].to_tensor()

  return (context_ids, decoder_input), decoder_labels

# Tokenizing every batch of the raw datasets; the degree of parallelism is
#  left to tf.data (AUTOTUNE)
train_ds = train_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)

In [41]:
# A single hand-written sentence pair to illustrate process_text()
example_context = "Labas pasauli!"
example_target = "Hello world!"

# process_text() works on batches, so each sentence is wrapped in a
#  one-element list
(context_tokens, target_tokens), target_tokens_out = process_text(
    context=[example_context],
    target=[example_target],
)

The output of the `process_text()` function is a tuple consisting of ((context tokens $X$, target tokens $X$), target tokens $Y$). 

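The $Y$ tokens are the same target sequence shifted one position ahead of the $X$ tokens: $X$ drops the final `[END]` token and $Y$ drops the initial `[START]` token, so at every position $Y$ holds the token that follows the corresponding $X$ token. 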

In [46]:
# Showing the three tensors returned by process_text() for the example pair:
#  the context ids, the target input ids and the target output ids
print(f"The context tokens X: {context_tokens.numpy()}")
print(f"The target tokens X: {target_tokens.numpy()}")
print(f"The target tokens y: {target_tokens_out.numpy()}") 

The context tokens X: [[   2 2018  352   36    3]]
The target tokens X: [[   2 1281  148   78]]
The target tokens y: [[1281  148   78    3]]


# The encoder 

The goal of the encoder is to process the context sequence into a sequence of vectors that are useful for the decoder as it attempts to predict the next output for each timestep. 

We will use a bidirectional-RNN to do the processing.

The encoder:

* Takes a list of token IDs (from context_text_processor).
* Looks up an embedding vector for each token (Using a layers.Embedding).
* Processes the embeddings into a new sequence (Using a bidirectional layers.GRU).
* Returns the processed sequence. This will be passed to the attention head.

In [50]:
# Defining the UNITS for the RNN layer.
# NOTE(review): the example cells shown below build their Encoder with
#  embedding_dim_example instead; UNITS is presumably used further down - confirm.
UNITS = 128


class Encoder(tf.keras.layers.Layer):
  """
  Encoder: an embedding lookup followed by a bidirectional GRU.

    Args:
        text_processor: The vectorization layer for the context text; it
          provides the vocabulary size and is used by convert_input() to
          tokenize raw text.
        units: Size of the embedding vectors and of the GRU state (the same
          value is used for both).
  """

  def __init__(self, text_processor, units):
    super().__init__()
    self.text_processor = text_processor
    self.vocab_size = text_processor.vocabulary_size()
    self.units = units

    # The embedding layer converts token ids to vectors; id 0 is masked
    #  (mask_zero=True).
    self.embedding = tf.keras.layers.Embedding(
        input_dim=self.vocab_size,
        output_dim=units,
        mask_zero=True)

    # The RNN layer processes those vectors sequentially in both directions.
    #  The GRU returns the full output sequence, and the two directions are
    #  summed, so the output keeps `units` features.
    gru = tf.keras.layers.GRU(
        units,
        return_sequences=True,
        recurrent_initializer='glorot_uniform')
    self.rnn = tf.keras.layers.Bidirectional(layer=gru, merge_mode='sum')

  def call(self, x):
    """
    Encodes token ids.

      Args:
          x: The token ids to encode

      Returns:
          The output sequence of the bidirectional GRU
    """
    # 1. Look up the embedding vector of each token.
    embedded = self.embedding(x)

    # 2. Run the GRU over the embedded sequence and return its output sequence.
    return self.rnn(embedded)

  def convert_input(self, texts):
    """
    Tokenizes raw text and encodes it.

      Args:
          texts: A single string or a batch of strings

      Returns:
          The encoder output for the tokenized texts
    """
    texts = tf.convert_to_tensor(texts)

    # A scalar (single sentence) gets a leading batch axis.
    if len(texts.shape) == 0:
      texts = texts[tf.newaxis]

    token_ids = self.text_processor(texts).to_tensor()
    return self(token_ids)

In [77]:
# A deliberately small size so that the printed tensors stay readable
embedding_dim_example = 10

# Building the example encoder on top of the adapted context vectorizer
encoder_example = Encoder(
    text_processor=context_text_processor,
    units=embedding_dim_example,
)

In [78]:
# Defining the example sentence.
# NOTE: this rebinds example_context / example_target and the token tensors
#  that an earlier cell created for the "Labas pasauli!" example.
example_context = "Vandeniu, žeme ir oru!"
example_target = "By sea, land and air!"

# Applying the function to the example context and target sentences
(context_tokens, target_tokens), target_tokens_out = process_text([example_context], [example_target])

print(f"We have the original context string:\n'{example_context}'") 
print(f"Its token form is:\n{context_tokens.numpy()}")
# Calling only the embedding sub-layer of the encoder to inspect the
#  intermediate result
embedding_output = encoder_example.embedding(context_tokens)
print(f"The first layer for the encoder is the embedding layer, which converts the tokens to vectors. The shape of the output: {embedding_output.shape}")
print(f"Embedding output:\n{embedding_output}")

We have the original context string:
'Vandeniu, žeme ir oru!'
Its token form is:
[[   2  969    6    1   21 1646   36    3]]
The first layer for the encoder is the embedding layer, which converts the tokens to vectors. The shape of the output: (1, 8, 10)
Embedding output:
[[[ 0.04450219 -0.03834212  0.02642426  0.0076616   0.01130848
    0.02561903 -0.03648316  0.02317765  0.03023699 -0.01628438]
  [-0.02516098 -0.02166842  0.02130898 -0.00571905 -0.04905744
   -0.00580677 -0.03810577 -0.02370191  0.0434409  -0.0301008 ]
  [-0.03928101  0.00330967  0.0049296   0.00819153  0.01588798
    0.00770397  0.04836607  0.0289692   0.03934513 -0.02607347]
  [ 0.0244338   0.00356202  0.02282291 -0.00224922  0.03450264
   -0.01853289 -0.03045956  0.01710209  0.02187247 -0.00715552]
  [ 0.01397747 -0.0084367  -0.02343072  0.03459773  0.04115411
    0.03225381  0.03548944 -0.01587107  0.00378732 -0.02506669]
  [ 0.02397806 -0.03537195 -0.03064996 -0.00405379 -0.03447749
    0.01471053  0.00835864  0

As we can see from the above code, the first layer in the encoder takes the tokens and assigns them a vector of size 10 (example constant). The output shape is (batch_size, sequence_length, embedding_dimension). In our case:
* batch_size = 1, because we provided one sentence
* sequence_length = 8, because we provided 8 tokens 
* embedding_dimension = 10, because we provided 10 as the embedding dimension 

The output of the embedding layer gets fed into the bidirectional GRU layer. 

`GRU` stands for Gated Recurrent Unit. It is a type of RNN that is able to learn long-term dependencies. 

The generic input shape for the GRU layer is **(batch_size, sequence_length, number of features)**. 

In our case, the batch size is 1, the sequence length is 8 and the number of features is 10. Each number in the embedding vector is a feature.

Because we have set the `return_sequences` parameter to `True`, the output of the GRU layer is a sequence of vectors. 

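The output shape is (batch_size, sequence_length, units). In our encoder the same value is used for the embedding size and for the GRU units, and the forward and backward outputs are summed (`merge_mode='sum'`), so the last dimension is again equal to the embedding dimension.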



In [79]:
# We pass the output of the embedding layer to the RNN layer; this is the
#  second (and last) step of Encoder.call(), run by hand
rnn_output = encoder_example.rnn(embedding_output)

print(f"The shape of the RNN output: {rnn_output.shape}")
print(f"RNN output:\n{rnn_output}")

The shape of the RNN output: (1, 8, 10)
RNN output:
[[[ 0.01882152  0.00135372 -0.01662301  0.02526054 -0.00643379
    0.01713916  0.0149806  -0.000564   -0.00476052  0.0244658 ]
  [ 0.02813361 -0.0112968   0.0052719  -0.01278985  0.0059546
    0.00366347  0.00330911  0.00188386  0.00457937  0.03087011]
  [ 0.03105481 -0.00715557 -0.00376149  0.01787064 -0.01643648
    0.00280224  0.00844893  0.00859217 -0.00913633  0.00964303]
  [ 0.01832861  0.00136256 -0.03106287  0.03231924 -0.01670991
    0.01064127  0.01257971 -0.00684161 -0.01363955  0.01880245]
  [ 0.01451444  0.01780497 -0.01984466  0.02564278 -0.02929392
    0.01237199  0.00675356  0.00760588 -0.01046836 -0.01500607]
  [ 0.01690835  0.00991304 -0.01740106  0.0046435  -0.01482428
    0.00835882  0.02395185  0.00653472  0.00056449 -0.01246439]
  [ 0.00238482  0.00980644 -0.00530771 -0.01611545 -0.00972703
   -0.0093706   0.01469161  0.00346752 -0.00792204 -0.02686535]
  [-0.00663948  0.02494486 -0.03550029  0.02347191 -0.026545

The above output is the encoder output. It is a sequence of vectors that are useful for the decoder as it attempts to predict the next output for each timestep. 

# Attention layer 

The attention layer is a mechanism that allows the decoder to focus on the relevant parts of the encoder output. 

The simplest way you could calculate a single vector from the entire sequence would be to take the average across the sequence (`layers.GlobalAveragePooling1D`). An attention layer is similar, but calculates a weighted average across the context sequence, where the weights are calculated from the combination of context and "query" vectors.

In [76]:
from keras import layers

class SelfAttentionLayer(layers.Layer):
    """
    Single-head scaled dot-product self-attention.

    Query, key and value are three separate Dense projections of the same
    input. No mask is applied, so every position takes part in the softmax.

    Args:
        embedding_dim: Number of output features of each projection; its
            square root is also the factor the scores are divided by.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.query_layer = layers.Dense(embedding_dim)
        self.key_layer = layers.Dense(embedding_dim)
        self.value_layer = layers.Dense(embedding_dim)

    def call(self, inputs):
        """
        Applies self-attention to `inputs`.

        Args:
            inputs: The sequence to attend over
                (assumed (batch, sequence, features) - TODO confirm).

        Returns:
            For every position, the attention-weighted sum of the value
            vectors.
        """
        # Project the input three times
        q = self.query_layer(inputs)
        k = self.key_layer(inputs)
        v = self.value_layer(inputs)

        # Pairwise query-key dot products, scaled by sqrt(embedding_dim)
        scale = tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        scaled_scores = tf.divide(tf.matmul(q, k, transpose_b=True), scale)

        # Softmax over the last axis: the weights of each query sum to 1
        weights = tf.nn.softmax(scaled_scores, axis=-1)

        # Weighted sum of the value vectors
        return tf.matmul(weights, v)

In [83]:
# Building an attention layer whose projections have embedding_dim_example
#  output features
attention_example = SelfAttentionLayer(embedding_dim_example)

# Calculating the query, key and value matrices by calling the three Dense
#  projections directly on the encoder output
query = attention_example.query_layer(rnn_output)
key = attention_example.key_layer(rnn_output)
value = attention_example.value_layer(rnn_output)

# Printing out the query, key and value matrices
print(f"Query matrix:\n{query}")
print(f"Key matrix:\n{key}")
print(f"Value matrix:\n{value}")

# Printing out the shapes
print(f"Query matrix shape: {query.shape}")
print(f"Key matrix shape: {key.shape}")
print(f"Value matrix shape: {value.shape}")

Query matrix:
[[[-0.00839226 -0.01421904  0.01949998 -0.01202205  0.01488757
    0.01198664 -0.00209417 -0.0094746  -0.00261967  0.02356385]
  [-0.00425759 -0.01746158 -0.02190064 -0.00700978  0.00342614
   -0.02015614  0.00848745 -0.02948325  0.00479916 -0.00192907]
  [-0.01243407 -0.01030727 -0.00065965 -0.00720427  0.00798002
   -0.00632861  0.01057748 -0.01312093 -0.02157345  0.00170836]
  [-0.021781   -0.0167504   0.03156691 -0.02483669  0.01643346
    0.02221419 -0.00274086 -0.00090497 -0.00038572  0.02773965]
  [-0.01552628  0.00855328  0.02616021 -0.003216    0.00101429
    0.01657802  0.01138128  0.02170641 -0.02607228  0.006862  ]
  [-0.02364073 -0.00302263  0.01336624 -0.0033468  -0.00401209
   -0.0109858   0.01350125  0.00971766 -0.02404812 -0.01200932]
  [-0.02698657  0.00385442  0.0005471  -0.00640507 -0.02308033
   -0.01895544  0.01214225  0.01451633 -0.01991589 -0.02591465]
  [-0.03149271  0.00304063  0.04571516 -0.02218417 -0.00511967
    0.02515347 -0.00211561  0.0334

The above matrices have an output of shape (batch_size, sequence_length, embedding_dimension). 

Each vector that gets fed into any of the underlying Dense layers is a vector of size 10. Thus, each query, key and value layer is just a dense layer with 10 neurons each.

The first computation is the dot product of the query and key vectors. We multiply the query vector with the key vector because we want to know how much the query vector is related to the key vector.


In [85]:
# Dot product of every query vector with every key vector
#  (transpose_b=True transposes the last two axes of key)
scores = tf.matmul(query, key, transpose_b=True)

print(f"Dot product attention scores:\n{scores}")
print(f"Shape of scores: {scores.shape}")

Dot product attention scores:
[[[-1.0201239e-03 -4.4868028e-04 -7.2688924e-04 -1.0086417e-03
   -9.8364765e-04 -7.8633369e-05  7.1415375e-04 -3.2644288e-04]
  [ 1.4099921e-03  6.0449453e-05 -6.4434033e-05  1.6493733e-03
    3.8262733e-04  8.1197970e-04 -2.8518657e-04  1.5278750e-03]
  [ 9.7279914e-04  2.6012617e-04  1.3489346e-04  1.1315203e-03
    5.8455007e-06  5.0517143e-04 -9.2395276e-06  9.2330459e-04]
  [-1.7254781e-03 -9.2272466e-04 -9.7171345e-04 -1.6978084e-03
   -1.2307597e-03 -3.3157325e-04  8.0931140e-04 -6.8287319e-04]
  [-2.7066271e-04  1.3313891e-04  3.8997037e-04 -2.7029717e-04
   -1.3497373e-04 -2.1789613e-04  2.3218978e-04 -4.1013624e-04]
  [ 1.0547187e-03  2.1827198e-04  6.3699041e-04  1.3500205e-03
    6.2305923e-04  2.6774278e-04 -2.6333498e-04  9.8469132e-04]
  [ 1.8397184e-03  3.0992916e-04  1.2354427e-03  2.1313103e-03
    1.4002207e-03  2.6393426e-04 -1.0548511e-03  1.2074687e-03]
  [-1.6135173e-03 -7.7509502e-04 -3.4399207e-05 -1.6113701e-03
   -4.4917737e-04 

The output shape is now (batch_size, sequence_length, sequence_length).  

The next step is to normalize the values in the matrix. We do this by dividing each value by the square root of the embedding dimension. After that, we apply the softmax function to the matrix. 

After applying the softmax function, we get a matrix in which the values of every row sum up to 1.

In [89]:
# Scaling the scores by the square root of the embedding dimension
scores_scaled = tf.divide(scores, tf.math.sqrt(tf.cast(embedding_dim_example, tf.float32)))

# Applying the softmax function over the last axis, so every row sums to 1
attention_weights = tf.nn.softmax(scores_scaled, axis=-1)

print(f"Attention weights:\n{attention_weights}")

Attention weights:
[[[0.12497884 0.12500143 0.12499043 0.12497929 0.12498028 0.12501605
   0.1250474  0.12500626]
  [0.1250286  0.12497525 0.12497031 0.12503806 0.12498798 0.12500495
   0.12496158 0.12503326]
  [0.12501906 0.1249909  0.12498595 0.12502533 0.12498084 0.12500058
   0.12498024 0.1250171 ]
  [0.12496518 0.12499689 0.12499496 0.12496626 0.12498472 0.12502027
   0.12506537 0.12500638]
  [0.12499201 0.12500797 0.12501812 0.12499203 0.12499738 0.12499409
   0.12501189 0.1249865 ]
  [0.12501761 0.12498455 0.1250011  0.12502928 0.12500055 0.12498651
   0.12496551 0.12501486]
  [0.1250365  0.12497602 0.12501259 0.12504801 0.1250191  0.1249742
   0.12492209 0.12501149]
  [0.12496687 0.12499999 0.12502928 0.12496695 0.12501289 0.12499817
   0.12504075 0.12498514]]]


The last step of the attention layer is to multiply the normalized matrix with the value matrix. 

In [90]:
# Multiplying the attention weights with the value matrix: every output row is
#  a weighted sum of the value vectors
output = tf.matmul(attention_weights, value)

print(f"Output:\n{output}")
print(f"Shape of output: {output.shape}")

Output:
[[[-0.02190342  0.00580592  0.00079784  0.00440796  0.00265892
    0.0018284   0.00832882  0.00465613 -0.01214981 -0.00200923]
  [-0.02190764  0.00580736  0.00080046  0.00440706  0.00265874
    0.0018285   0.00833186  0.00465444 -0.01215313 -0.00200891]
  [-0.02190632  0.00580707  0.00080002  0.00440677  0.00265839
    0.00182786  0.00833106  0.00465442 -0.01215227 -0.0020088 ]
  [-0.02190289  0.00580554  0.00079718  0.00440833  0.00265905
    0.00182879  0.00832829  0.0046569  -0.01214911 -0.00200947]
  [-0.02190458  0.00580627  0.00079879  0.00440613  0.00265755
    0.00182731  0.00832968  0.00465537 -0.01215017 -0.00200903]
  [-0.02190716  0.00580704  0.00080023  0.00440607  0.0026578
    0.00182785  0.00833156  0.00465471 -0.01215227 -0.00200894]
  [-0.02190916  0.00580755  0.00080134  0.00440483  0.00265696
    0.00182749  0.00833286  0.00465412 -0.01215321 -0.00200887]
  [-0.02190379  0.00580555  0.00079759  0.00440661  0.00265763
    0.00182801  0.00832881  0.00465687 -0

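The output has exactly the same shape as the input. For every position, the attention layer returns a weighted average of the value vectors, and the attention weights tell us which positions in the context sequence are more important than others. In this example the layer is untrained, so all weights are close to 1/8 and the output rows are nearly identical.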