In [1]:
import tensorflow as tf

In [2]:
import os
# Select TensorFlow as the Keras backend. This is set before `keras_nlp` and
# `keras` are imported further down in this cell.
os.environ["KERAS_BACKEND"] = "tensorflow"  
#import tensorflow_datasets as tfds
# Import required libraries for NLP tasks
import tensorflow_text
import keras_nlp
import keras
import time
# Enable mixed precision training to improve performance and reduce memory usage.
# NOTE(review): this is a *global* policy, so it presumably also applies to every
# model built later in the same kernel session — confirm that is intended.
keras.mixed_precision.set_global_policy("mixed_float16")

In [3]:
# Suppress all warning messages to keep the notebook output clean.
# NOTE(review): the "ignore" filter is unconditional, so deprecation and
# runtime warnings raised through the `warnings` module are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [4]:
# To speed up training and generation, we use a preprocessor with a sequence
# length of 128 instead of the full length of 1024.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=128,
)
# Load the causal language model from the same "gpt2_base_en" preset and attach
# the shortened preprocessor created above.
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    "gpt2_base_en", preprocessor=preprocessor
)

In [5]:
# Start the wall-clock timer for this generation run.
start = time.time()

# Continue the prompt with GPT-2; max_length=200 is passed straight to generate().
output = gpt2_lm.generate("My trip to Yosemite was", max_length=200)
# Header and generated text, each on its own line.
print("\nGPT-2 output:", output, sep="\n")

# Stop the timer after printing and report the elapsed seconds.
end = time.time()
elapsed = end - start
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")

I0000 00:00:1739958663.926486 16202430 service.cc:148] XLA service 0x600003a6b500 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1739958663.926536 16202430 service.cc:156]   StreamExecutor device (0): Host, Default Version
I0000 00:00:1739958663.931689 16202430 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



GPT-2 output:
My trip to Yosemite was pretty much the same as the last time. It was pretty much the same, except I had a few more things to do. I had to take the bus back to the airport, and I had to take a short drive back to the hotel to pick up my bags, and to get my passport back to the airport.

I was really lucky that I got to go back to the airport, and to have a few hours with my family. I was really happy to see my mom and dad. They were really happy to see me, and really nice people to meet. I think I had a lot of fun with my family, and I'm really glad that I did.

I was going to go to see my dad for his birthday, but he was going to have to go to his house, and then I was going to get a little more of an early morning break to get some time to relax and get some work done on the trail
TOTAL TIME ELAPSED: 20.47s


In [6]:
# Start the wall-clock timer for this generation run.
start = time.time()

# Continue the prompt with GPT-2; max_length=200 is passed straight to generate().
output = gpt2_lm.generate("That Italian restaurant is", max_length=200)
# Header and generated text, each on its own line.
print("\nGPT-2 output:", output, sep="\n")

# Stop the timer after printing and report the elapsed seconds.
end = time.time()
elapsed = end - start
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")


GPT-2 output:
That Italian restaurant is called "The Italian Restaurant", which is a name that comes from its Italian origins: "The Italian Restaurant". The restaurant is a place for Italians to enjoy their favorite dishes, to eat and enjoy. It is a great place to meet other Italian and other European travelers who want to learn about Italy.

The Italian restaurant is open every day, from 5:00 pm to 7:00 pm and has a great view of the sea. The restaurant is open for lunch and dinner. The Italian restaurant is open for dinner every day from 7 pm to 10 pm.

The restaurant has a great view, great atmosphere, and is very clean. It is a great place to stay for the evening. The Italian restaurant is a great place to stay for the evening. It is a great place to stay for the evening.
TOTAL TIME ELAPSED: 15.98s


## GPT text generation from scratch with KerasHub

In [8]:
# Import required libraries
import os                           # Operating system interface
import keras_hub                    # KerasHub (not TensorFlow Hub): tokenizers, layers, metrics and samplers used below
import keras                        # Deep learning framework
import tensorflow.strings as tf_strings  # TensorFlow string operations module

In [9]:
# Data
BATCH_SIZE = 64  # Number of text lines grouped into each dataset batch
MIN_STRING_LEN = 512  # Length threshold: only lines with tf_strings.length(x) > this value are kept
SEQ_LEN = 128  # Length of training sequences measured in tokens

# Model Architecture Parameters
EMBED_DIM = 256  # Dimension of token embeddings
FEED_FORWARD_DIM = 128  # Intermediate (feed forward) dimension of each transformer layer
NUM_HEADS = 3  # Number of attention heads in each transformer layer
NUM_LAYERS = 2  # Number of stacked transformer decoder layers
VOCAB_SIZE = 5000  # Maximum vocabulary size to limit model parameters

# Training Configuration
EPOCHS = 5  # Number of complete passes through the training dataset

# Generation/Inference Settings
# NOTE(review): not referenced by any cell visible in this notebook section.
NUM_TOKENS_TO_GENERATE = 80  # Number of tokens to generate during text generation

In [10]:
# Download and extract the 'simplebooks.zip' text dataset from a public S3 URL.
# The value returned by get_file is stored in `dir`.
# NOTE(review): the name `dir` shadows the Python builtin of the same name; it is
# kept here because other cells may refer to it.
dir = keras.utils.get_file(
    origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip",
    extract=True,  # Automatically extract the downloaded zip file
    cache_dir= "/Portfolio/NLP"  # Absolute local cache directory; NOTE(review): not portable across machines
)


Downloading data from https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip
[1m282386239/282386239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 0us/step


In [50]:
# Create the raw training dataset from the extracted text file.
raw_train_ds = (
    # Read the file line by line. The path is built from `dir` (the location
    # returned by keras.utils.get_file in the download cell) instead of a
    # hard-coded relative path, so it no longer depends on the notebook's
    # current working directory matching the download cache directory.
    tf.data.TextLineDataset(
        os.path.join(dir, "simplebooks", "simplebooks-92-raw", "train.txt")
    )
    # Keep only lines whose length exceeds MIN_STRING_LEN
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    # Group the remaining lines into batches of BATCH_SIZE
    .batch(BATCH_SIZE)
    # Shuffle after batching: the buffer holds 256 batches (not 256 lines)
    .shuffle(buffer_size=256)
)

In [55]:
# Create the raw validation dataset from the extracted text file.
raw_val_ds = (
    # Read the file line by line. The path is built from `dir` (the location
    # returned by keras.utils.get_file in the download cell) instead of a
    # hard-coded relative path, so it no longer depends on the notebook's
    # current working directory matching the download cache directory.
    tf.data.TextLineDataset(
        os.path.join(dir, "simplebooks", "simplebooks-92-raw", "valid.txt")
    )
    # Keep only lines whose length exceeds MIN_STRING_LEN
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    # Group the remaining lines into batches of BATCH_SIZE (no shuffling for validation)
    .batch(BATCH_SIZE)
)

In [56]:
# Learn a WordPiece vocabulary from the raw (untokenized) training dataset.
# WordPiece is a subword tokenization algorithm that breaks words into smaller pieces.
vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,                    # Batched lines of training text
    vocabulary_size=VOCAB_SIZE,      # Maximum size of vocabulary
    lowercase=True,                  # Convert all text to lowercase
    reserved_tokens=["[PAD]", "[UNK]", "[BOS]"],  # Special tokens: padding, unknown word, beginning of sequence
)

2025-02-19 11:06:20.921039: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [57]:
# Build the WordPiece tokenizer used for both training and generation:
# - vocabulary: the list computed from the training data above
# - sequence_length: SEQ_LEN is passed so tokenized outputs have a fixed length
# - lowercase: matches the lowercase setting used when the vocabulary was computed
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    lowercase=True,
)

In [58]:
import tensorflow.data as tf_data

# Packer that prepends the [BOS] (beginning of sequence) token id, looked up from
# the tokenizer, and produces sequences of length SEQ_LEN.
start_packer = keras_hub.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=tokenizer.token_to_id("[BOS]"),
)


def preprocess(inputs):
    """Turn a batch of raw text into a (features, labels) training pair.

    The text is tokenized once. The labels are the token ids as produced by
    the tokenizer; the features are the same ids passed through
    `start_packer`, which adds the [BOS] start token.
    """
    token_ids = tokenizer(inputs)
    return start_packer(token_ids), token_ids


# Training pipeline: run `preprocess` over every raw batch (parallelism left
# to AUTOTUNE), then prefetch upcoming batches (buffer size left to AUTOTUNE).
train_ds = (
    raw_train_ds
    .map(preprocess, num_parallel_calls=tf_data.AUTOTUNE)
    .prefetch(tf_data.AUTOTUNE)
)
# Validation data goes through the identical map + prefetch pipeline.
val_ds = (
    raw_val_ds
    .map(preprocess, num_parallel_calls=tf_data.AUTOTUNE)
    .prefetch(tf_data.AUTOTUNE)
)

In [59]:
# Define input layer for variable length sequences of integer token ids
inputs = keras.layers.Input(shape=(None,), dtype="int32")

# Create embedding layer that combines token and positional embeddings
embedding_layer = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,    # Size of the vocabulary
    sequence_length=SEQ_LEN,       # Maximum sequence length
    embedding_dim=EMBED_DIM,       # Dimension of embeddings
    mask_zero=True,                # Enable masking for variable length sequences
)
x = embedding_layer(inputs)

# Stack NUM_LAYERS transformer decoder layers
for _ in range(NUM_LAYERS):
    decoder_layer = keras_hub.layers.TransformerDecoder(
        num_heads=NUM_HEADS,                # Number of attention heads
        intermediate_dim=FEED_FORWARD_DIM,  # Dimension of feed forward network
    )
    # Apply self-attention only (no cross-attention)
    x = decoder_layer(x)  # Giving one argument only skips cross-attention.

# Final dense layer projecting to vocabulary-size logits.
# The notebook enables the global "mixed_float16" policy, under which this layer
# would otherwise emit float16 logits. The output layer is pinned to float32 so
# the loss, the perplexity metric and the samplers work on full-precision
# logits, as recommended for model outputs under mixed precision.
outputs = keras.layers.Dense(VOCAB_SIZE, dtype="float32")(x)

# Create and compile the model
model = keras.Model(inputs=inputs, outputs=outputs)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Perplexity is computed from logits and ignores token id 0 (mask_token_id=0)
perplexity = keras_hub.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])

In [60]:
# Print a summary of the compiled model's architecture
# (layers, output shapes and parameter counts).
model.summary()

In [61]:
# Train the model on train_ds for EPOCHS epochs, evaluating on val_ds each epoch.
# The History object returned by fit() is not assigned, so it is only shown as
# the cell's output value.
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/5
[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2373s[0m 968ms/step - loss: 5.0178 - perplexity: 185.3096 - val_loss: 4.2258 - val_perplexity: 68.5013
Epoch 2/5
[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2103s[0m 858ms/step - loss: 4.1704 - perplexity: 64.8094 - val_loss: 4.0732 - val_perplexity: 58.8164
Epoch 3/5


2025-02-19 12:22:38.913060: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5773s[0m 2s/step - loss: 4.0298 - perplexity: 56.2870 - val_loss: 4.0089 - val_perplexity: 55.1365
Epoch 4/5
[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8397s[0m 3s/step - loss: 3.9550 - perplexity: 52.2264 - val_loss: 3.9710 - val_perplexity: 53.1024
Epoch 5/5
[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2090s[0m 853ms/step - loss: 3.9084 - perplexity: 49.8464 - val_loss: 3.9409 - val_perplexity: 51.5549


<keras.src.callbacks.history.History at 0x3649597c0>

In [103]:
# The "packer" layer adds the [BOS] token for us.
# Tokenize an empty string (wrapped in a list to form a batch of one) and pass it
# through start_packer, so the prompt consists of [BOS] followed by padding.
prompt_tokens = start_packer(tokenizer([""]))
# Display the resulting prompt tokens
prompt_tokens

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>

In [105]:
def next(prompt, cache, index):
    """Return the logits used to pick the token at position `index`.

    The model is run over the whole `prompt` and the logits at position
    `index - 1` are selected, giving a [batch_size, vocab_size] slice.

    Returns a `(logits, hidden_states, cache)` tuple. `hidden_states` is
    always None (it is only needed for contrastive search) and `cache` is
    handed back unchanged.
    """
    step_logits = model(prompt)[:, index - 1, :]
    return step_logits, None, cache

### Greedy search

In [108]:
# Greedy sampler for text generation.
sampler = keras_hub.samplers.GreedySampler()

# Run the sampler:
# - next: callback returning the logits for the next token
# - prompt: token ids to start from ([BOS] followed by padding)
# - index=1: start sampling immediately after the [BOS] token
output_tokens = sampler(next=next, prompt=prompt_tokens, index=1)

# Map the generated token ids back to text and show it.
txt = tokenizer.detokenize(output_tokens)
print(f"Greedy search generated text: \n{txt}\n")

Greedy search generated text: 
['[BOS] " i \' m not going to be a bit like a bit , " said the doctor , " and i \' ll take a little girl \' s to - day , and i \' ll take a walk with me , and i \' ll take a walk with me , and i \' ll take a walk to the house . i \' ll get a good old woman , and i \' ll be a good old woman , and i \' ll be a good old woman , and i \' ll get a good woman , and i \' ll get a good woman , and i \' ll get a good woman , and i \' ll get a']



### Beam search

In [110]:
# Beam search sampler configured with 10 beams.
sampler = keras_hub.samplers.BeamSampler(num_beams=10)

# Run the sampler:
# - next: callback returning the logits for the next token
# - prompt: token ids to start from ([BOS] followed by padding)
# - index=1: start sampling immediately after the [BOS] token
output_tokens = sampler(next=next, prompt=prompt_tokens, index=1)

# Map the generated token ids back to text and show it.
txt = tokenizer.detokenize(output_tokens)
print(f"Beam search generated text: \n{txt}\n")

Beam search generated text: 
['[BOS] " yes , sir , " he said , " but i don \' t know how much more than i am . but i don \' t know what i am going to tell you about it . but i don \' t know what it is , and i don \' t know it , but i don \' t know about it , and i don \' t know about it , but i don \' t know it , and i \' ll tell you about it , and i \' ll tell you about it , and i \' ll tell you about it , and i \' ll tell you about it , and i \' ll tell you , \'']



### Random search

In [112]:
# Initialize a random sampler for text generation.
# A fixed seed is passed so the sampled text is reproducible when the notebook
# is re-run; change or remove the seed to get different samples.
sampler = keras_hub.samplers.RandomSampler(seed=42)

# Generate output tokens using the sampler
# - next: callback returning the logits for the next token
# - prompt: input token sequence ([BOS] followed by padding)
# - index=1: start sampling immediately after the [BOS] token
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)

# Convert the generated tokens back to readable text
txt = tokenizer.detokenize(output_tokens)

# Print the randomly generated text
print(f"Random search generated text: \n{txt}\n")

Random search generated text: 
['[BOS] it was dry to put onto a diary point , for he saw the four long windows , and let me get a board on the walls . he pulled the first paper , and spoke to the bill a quarter in the manner in which glarest fericks fell , and when they fell back for him to wait him . " it reached just instead of eleven , let its testim \' s roll , and follow me out . " in my piece , it travelled so , but at one o \' margin , an \' presently she ran with a pair of eyes flying out like a snake , sometimes he']



### Top-K search

In [115]:
# Initialize a Top-K sampler with k=10 (samples among the 10 most likely tokens).
# A fixed seed is passed so the sampled text is reproducible when the notebook
# is re-run; change or remove the seed to get different samples.
sampler = keras_hub.samplers.TopKSampler(k=10, seed=42)

# Generate tokens using the sampler with:
# - next: callback returning the logits for the next token
# - prompt: initial input sequence ([BOS] followed by padding)
# - index=1: start sampling immediately after the [BOS] token
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)

# Convert the generated tokens back to readable text
txt = tokenizer.detokenize(output_tokens)

# Print the generated text with Top-K sampling
print(f"Top-K search generated text: \n{txt}\n")

Top-K search generated text: 
["[BOS] now , then , when they were ready to take up a little chuckling , the old - house was empty and there were some chatterer of apples . then , as a farmer brown ' s boy , and the old man , who had been so hungry , could not be eaten with , they ate them , and the old woman was quite sure that she was all the old woman ' s house was in the old woman ' s room with her . the old woman , was very much surprised to see her and said : ' now you ' d better have to come home . ' and she said , ' no ."]



### Top-P search

In [117]:
# Initialize a Top-P (nucleus) sampler with probability threshold of 0.5.
# A fixed seed is passed so the sampled text is reproducible when the notebook
# is re-run; change or remove the seed to get different samples.
sampler = keras_hub.samplers.TopPSampler(p=0.5, seed=42)

# Generate tokens using the sampler
# - next: callback returning the logits for the next token
# - prompt: input sequence of tokens ([BOS] followed by padding)
# - index=1: start sampling immediately after the [BOS] token
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)

# Convert generated tokens back to readable text
txt = tokenizer.detokenize(output_tokens)

# Print the generated text
print(f"Top-P search generated text: \n{txt}\n")

Top-P search generated text: 
['[BOS] " i \' m glad , " she said . " the water is hot , and there \' s transport . " i \' m not going to get some breakfast , and we \' ll take a comparel . you \' ll find a rout disappearance , and you \' ll take a part with me , and i \' ll give a little trouble to put up to a cheese , \' and then you \' ll get on to - night , \' cause he can \' t a bit . " he was a very bad boy , an \' he thought he was a little in']

