In [1]:
!pip install -U nltk 'gensim==4.2.0' 'keras-nlp' 'keras-preprocessing' 'tensorflow-text>=2.11'

Looking in indexes: https://alyydi:****@tratonregistry.jfrog.io/artifactory/api/pypi/ats-pypi-virtual/simple
Requirement already up-to-date: nltk in /home/alyydi/pluralsight/venv/lib/python3.8/site-packages (3.9.1)
Requirement already up-to-date: gensim==4.2.0 in /home/alyydi/pluralsight/venv/lib/python3.8/site-packages (4.2.0)
Requirement already up-to-date: keras-nlp in /home/alyydi/pluralsight/venv/lib/python3.8/site-packages (0.6.1)
Requirement already up-to-date: keras-preprocessing in /home/alyydi/pluralsight/venv/lib/python3.8/site-packages (1.1.2)
Requirement already up-to-date: tensorflow-text>=2.11 in /home/alyydi/pluralsight/venv/lib/python3.8/site-packages (2.13.0)


In [2]:
import multiprocessing
import tensorflow as tf
import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, ELU, Conv1D, MaxPooling1D, Dropout
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras import preprocessing
from textblob import TextBlob, Word
from keras_preprocessing.sequence import pad_sequences
from keras.initializers import Constant
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import Model, Input
import tensorflow_text as tf_text
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import numpy as np
import re
import random
import os
import pandas as pd
import gensim
import warnings
import nltk
import time

TRACE = False

def set_seeds_and_trace():
  os.environ['PYTHONHASHSEED'] = '0'
  np.random.seed(42)
  tf.random.set_seed(42)
  random.seed(42)
  if TRACE:
    tf.debugging.set_log_device_placement(True)

def set_session_with_gpus_and_cores():
  cores = multiprocessing.cpu_count()
  gpus = len(tf.config.list_physical_devices('GPU'))
  config = tf.compat.v1.ConfigProto( device_count = {'GPU': gpus  , 'CPU': cores} , intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
  sess = tf.compat.v1.Session(config=config)
  tf.compat.v1.keras.backend.set_session(sess)

set_seeds_and_trace()
set_session_with_gpus_and_cores()
warnings.filterwarnings('ignore')
nltk.download('punkt')

2024-11-12 08:32:08.683686: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-12 08:32:08.726817: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-12 08:32:11.461944: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-11-12 08:32:11.461981: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: VDL900341
2024-11-12 08:32:11.461985: I tensorflow

True

# Transformer Layers

![alt text](./trans1.png "Title")

![alt text](./seq2seq_w_att.png "Title")

Where we input attention with the hidden state to create another updated hidden state we could input into the next cell. And this worked well on medium sized sentences, but was hard to train and unstable. Now that we know this, the Transformer basicaly tried to get rid of the RNN by using only attention

# The embedding and positional encoding layer

![alt text](./pos_enc_layer.png "Title")

In [4]:
## This comes straight from the paper

def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1)

  return tf.cast(pos_encoding, dtype=tf.float32)

In [5]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x
     


In [6]:
pos = PositionalEmbedding(5000, 100)

In [7]:
input = tf.constant(np.random.randint(1,5000, size=(3,26)))
response = pos(input)
response.shape

TensorShape([3, 26, 100])

# Add and normalize

![alt text](./add_n_norm.png "Title")

Note: Use Add layer instead of + to propagate masks

We will create a BaseAttention layer that inherits the Add+Norm and then each subclass of attention will implement the correct on

In [8]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

# Self attention layer

![alt text](./self_att_layer.png "Title")

In [11]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    # We need to compare everything with everything, therefore Q, K and V must be the input
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])  # This one comes from the base class
    x = self.layernorm(x)  # This one comes from the base class
    return x

In [12]:
embedding_dim = 100
vocab_size = 5000
input = tf.constant(np.random.randint(1,vocab_size, size=(3,26)))

# First we apply the PositionalEmbedding to embed into what the attention layer expects
pos = PositionalEmbedding(vocab_size, embedding_dim)

# Then we do the self attention, the n_heads is arbitrary
gsa = GlobalSelfAttention(num_heads=3, key_dim=embedding_dim)


response = gsa(pos(input))
response.shape

TensorShape([3, 26, 100])

# The cross attention layer

This layer connects the encoder and decoder. This layer is the most straight-forward use of attention in the model, it performs the same task as the attention block in the previous demo (and we will copy it).

In [13]:
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,  # This is the key part!!
        value=context,  # This is the key part!!
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

In [14]:
embedding_dim_es = 100
vocab_size_es = 5000

embedding_dim_en = 512
vocab_size_en = 6000

# We are supposing the model will translate Spanish to English, so context for CrossAttention will be the spanish input.

input_es = tf.constant(np.random.randint(1,vocab_size_es, size=(3,26)))
input_en = tf.constant(np.random.randint(1,vocab_size_es, size=(3,24)))


pos_es = PositionalEmbedding(vocab_size_es, embedding_dim_es)
pos_en = PositionalEmbedding(vocab_size_en, embedding_dim_en)


gsa = GlobalSelfAttention(num_heads=3, key_dim=embedding_dim_es)
cross = CrossAttention(num_heads=3, key_dim=embedding_dim_en)


context = gsa(pos_es(input_es)) # Forget about the feed forwards

response = cross(pos_en(input_en), context=context) # Forget about masked attention for now, assume it is the identity

response.shape

TensorShape([3, 24, 512])

# The causal self attention layer (Masked Multi Headed Attention)

![alt text](./causal_self_att.png "Title")

The only big difference in the masked multi headed attention is that we cannot attend to words in the future, so we will use a mask such that the Nth word can only see the first N-1 words and not all the sentence.


In [15]:

class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)  # This is the key!
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

![alt text](./causal_self_att_2.png "Title")


Notice in the diagram above how the query can only attend the values for the past


In [16]:
embedding_dim_en = 512
vocab_size_en = 6000

# We are supposing the model will translate Spanish to English, so context for CrossAttention will be the spanish input.

input_en = tf.constant(np.random.randint(1,vocab_size_es, size=(3,24)))


pos_en = PositionalEmbedding(vocab_size_en, embedding_dim_en)

csa = CausalSelfAttention(num_heads =3, key_dim=embedding_dim_en)

response = csa(pos_es(input_en))

response.shape

TensorShape([3, 24, 100])