In [1]:
"""
Created by      : Cephas Soga
Project name    : Synaptiq
Algorithm name  : 3epsilon
Alg. Family     : Transformer
Alg. Type       : Decoder Only
"""
### ADAPTED FROM https://oreil.ly/J86pg
### Using GoogleBlog Corpus for training

'\nCreated by      : Cephas Soga\nProject name    : Synaptiq\nAlgorithm name  : 3epsilon\nAlg. Family     : Transformer\nAlg. Type       : Decoder Only\n'

0. Libraries and Packages

In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import re, string, os, random, json, pickle


In [3]:
# setting Accelerated linear algebra
os.environ['TF_XLA_FLAGS'] = '--tf_xla_auto_jit=2'

In [4]:
from tqdm.notebook import tqdm
from IPython.display import display, HTML

import tensorflow as tf
# Set the number of inter- and intra-op parallelism threads
tf.config.threading.set_inter_op_parallelism_threads(2)
tf.config.threading.set_intra_op_parallelism_threads(4)
from tensorflow.keras import layers, models, losses, callbacks

1. Parameters

In [5]:
VOCAB_SIZE = 12000
BUFFER_SIZE = 128
MAX_LEN = 40
EMBEDDING_DIM  = 128
N_HEADS = 2
FEED_FOWARD_DIM = 128
KEY_DIM = 128
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 128
EPOCHS = 1

2. Read Json file and extract text data

In [6]:
# load the json file
with open(r"C:\Users\hp\Desktop\Text_datasets\blogs.json") as json_file:
    text_ds = json.load(json_file)
# filter the data to retain 'text' key only in the dicts
text_ds = list(
    item["text"] for item in text_ds 
    if item["text"] is not None
)
# create a list of stopwords
stopwords = ["urlLink", "urllink", "click"]
# remove them from dataset
text_ds = list(word for word in text_ds if word not in stopwords) 
count = len(text_ds)
print(f'{count} blogs found !')

681284 blogs found !


3. Tokenize the dataset extrated from JSON file

In [7]:
# Pad the punctuation to treat them as separate words
def pad_punctuation(strings):
    strings = re.sub(f"([{string.punctuation}, '\n'])", r" \1 ", strings)
    strings = re.sub(" +", " ", strings)
    return strings

In [8]:
# apply the punctuation padding to the data
text_ds = list(pad_punctuation(line) for line in tqdm(text_ds))

  0%|          | 0/681284 [00:00<?, ?it/s]

In [9]:
# pick a random subset from text data after padding has been applied
idx = random.randint(0, 1000)
sample = text_ds[idx]

In [10]:
# convert to Tensorflow dataset
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_ds).
    batch(BATCH_SIZE)
    .shuffle(1000)
)

In [11]:
# create vectorizaton layer
vectors = layers.TextVectorization(
    standardize='lower',
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_LEN + 1
)

In [12]:
# adapt the vectorization layer to the text dataset
vectors.adapt(text_ds)
learned_vocabulary = vectors.get_vocabulary()

In [13]:
# display some 'token:word' mappings as well as the size of the vocabulary
for idx, word in enumerate(learned_vocabulary[:10]):
    print(f'{idx}: {word}')
print('Learned vocabulary size:', len(learned_vocabulary) )

0: 
1: [UNK]
2: .
3: ,
4: i
5: the
6: '
7: to
8: and
9: a
Learned vocabulary size: 12000


In [14]:
# show vectorisations results for the random sample
sample = vectors(sample)
print(sample.numpy())

[   4 1001   15  417  149  109    2  531   16   16   16   24    1    3
    4   84    4    6   48  778  184  338    2    4   45  758  244    5
 2446    9  261    8  134  204    4   53   62    9 1766  834    2]


4. finalizing the text data adjustement

In [15]:
# prepare the inputs that will be fed foward through the model's layers
def transformer_inputs(text):
    """
    Shift the sequences by 1 position so that the target at pos. (i) is the word at pos. (i+1).
    The model will use all the words located at pos. (0 - K) with k < i.
    """
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectors(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

In [16]:
text_ds = text_ds.map(transformer_inputs)

5. Create a casual attention mask

In [17]:
def casual_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Preventing informations from future tokens to flow into current token.
    That means masking the upper half of the dot product in self attention.
    Ones in the lower triangle, counting from the lower right corner.
    """
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)

np.transpose(casual_attention_mask(1, 10, 10, dtype=tf.int32)[0])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

6. Create a Transformer Block layer

In [18]:
class TransformerBlock(layers.Layer):
    def __init__(self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        self.attn = layers.MultiHeadAttention(
            num_heads, key_dim, output_shape=embed_dim
            )
        self.dropout1 = layers.Dropout(self.dropout_rate)
        self.dropout2 = layers.Dropout(self.dropout_rate)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn1 = layers.Dense(self.ff_dim, activation='relu')
        self.ffn2 = layers.Dense(self.embed_dim)
        

    def call(self, inputs):
        input_sahpe = tf.shape(inputs)
        batch_size = input_sahpe[0]
        seq_len = input_sahpe[1]
        casual_mask = casual_attention_mask(
            batch_size, seq_len, seq_len, tf.bool
        )
        attention_output, attention_scores = self.attn(
            inputs,
            inputs,
            attention_mask=casual_mask,
            return_attention_scores=True,
        )
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn1 = self.ffn1(out1)
        ffn2 = self.ffn2(ffn1)
        ffn_output = self.dropout2(ffn2)
        return  (self.layernorm2(out1 + ffn_output), attention_scores)
    
    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "key_dim": self.key_dim,
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "dropout_rate": self.dropout_rate,
            }
        )
        return config

7. Token an position embedding

In [19]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, max_len, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.max_len  = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
    
    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "max_len": self.max_len,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

8. Building the transformer model

In [20]:
loss = [losses.SparseCategoricalCrossentropy(), None]
inputs = layers.Input(shape=(None,), dtype=tf.int32)
x = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x, attention_scores = TransformerBlock(
    N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FOWARD_DIM
)(x)
outputs = layers.Dense(VOCAB_SIZE, activation='softmax')(x)
t_eps = models.Model(inputs=inputs, outputs=[outputs, attention_scores])
t_eps.compile("adam", loss=loss) 

In [21]:
t_eps.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddi  (None, None, 128)         1541120   
 ng (TokenAndPositionEmbedd                                      
 ing)                                                            
                                                                 
 transformer_block (Transfo  ((None, None, 128),       165504    
 rmerBlock)                   (None, 2, None, None))             
                                                                 
 dense_2 (Dense)             (None, None, 12000)       1548000   
                                                                 
Total params: 3254624 (12.42 MB)
Trainable params: 3254624 (12.42 MB)
Non-trainable params: 0 (0.00 Byte)
_____________________

9. load the model if needed

In [22]:
if LOAD_MODEL:
    # load model's weights
    t_eps = models.load_model('./models/3epsilon', compile=True)

10. Train the transformer

In [23]:
# create a TextGenerator checpoint
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }
    
    def sample_from(self, probs, temperature):
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs
    
    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y, att = self.model.predict(x, verbose=0)
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append(
                {
                    "prompt": start_prompt,
                    "word_probs": probs,
                    "atts": att[0, :, -1, :],
                }
            )
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info
    
    def on_epoch_end(self, epoch, logs=None):
        self.generate("love is", max_tokens=80, temperature=1.0)

In [50]:
#Create a model save checkpoint
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Tokenize starting prompt
text_generator = TextGenerator(learned_vocabulary)

TypeError: TextGenerator.__init__() missing 2 required positional arguments: 'start_tokens' and 'index_to_word'

In [25]:
t_eps.fit(
    text_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

generated text:
love is why not all i dont have anything to . from now ( it ' s ' so [UNK] ) ' tomorrow night at [UNK] this week . 



<keras.src.callbacks.History at 0x2fc282a2490>

In [26]:
# save the model architecture
t_eps.save("./models/3epsilon_architecture")
# save the model weigths
t_eps.save_weights("./models/3epsilon_weights")

INFO:tensorflow:Assets written to: ./models/3epsilon_architecture\assets


INFO:tensorflow:Assets written to: ./models/3epsilon_architecture\assets


11. Generate text using the transformeer

In [27]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        highlighted_text = []
        for word, att_score in zip(
            i["prompt"].split(), np.mean(i["atts"], axis=0)
        ):
            highlighted_text.append(
                '<span style="background-color:rgba(135,206,250,'
                + str(att_score / max(np.mean(i["atts"], axis=0)))
                + ');">'
                + word
                + "</span>"
            )
        highlighted_text = " ".join(highlighted_text)
        display(HTML(highlighted_text))

        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

12. Save the model and get it ready for being used in Streamlit UI

In [28]:
# Save the model to disk
filename = '3epsilon_saved_with_pickle.sav'
pickle.dump(t_eps, open(filename, 'wb'))

In [29]:
# save the textgenerator
filename = 'text_generator_saved_with_pickle.sav'
pickle.dump(text_generator, open(filename, 'wb'))

In [30]:
# save learned vocabulary
filename = 'learned_vocabulary_saved_with_pickle.sav'
pickle.dump(learned_vocabulary, open(filename, 'wb'))

In [49]:
info = text_generator.on_epoch_end(
    "love is", max_tokens=80 , temperature=0.5
)

TypeError: TextGenerator.on_epoch_end() got an unexpected keyword argument 'max_tokens'

In [33]:
print(info, learned_vocabulary)

NameError: name 'info' is not defined