In [1]:
!pip install  tensorflow
!pip install keras_nlp



In [2]:
!pip install tensorflow_datasets



In [3]:
!pip install ipywidgets



In [4]:
!pip install tensorflow-text



In [5]:
import tensorflow as tf

In [6]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"  
#import tensorflow_datasets as tfds
import tensorflow_text
import keras_nlp
import keras
import time

keras.mixed_precision.set_global_policy("mixed_float16")

In [7]:
# To speed up training and generation, we use preprocessor of length 128
# instead of full length 1024.
preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    "gpt2_base_en",
    sequence_length=128,
)
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    "gpt2_base_en", preprocessor=preprocessor
)

In [8]:
start = time.time()

output = gpt2_lm.generate("My trip to Yosemite was", max_length=200)
print("\nGPT-2 output:")
print(output)

end = time.time()
print(f"TOTAL TIME ELAPSED: {end - start:.2f}s")

I0000 00:00:1731494725.634857 50072335 service.cc:148] XLA service 0x60000273fb00 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1731494725.634897 50072335 service.cc:156]   StreamExecutor device (0): Host, Default Version
I0000 00:00:1731494725.637083 50072335 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



GPT-2 output:
My trip to Yosemite was a bit of a blur because I was going through some of the most beautiful areas of the park and I wanted to take a look at what's out there. So I did.

Here's a little bit about me:

I'm a photographer, photographer's assistant, and photographer's assistant for The New York Times, which means that my photos aren't necessarily the best of what you can find on Google. So if you're looking for something more than just a little bit of fun, I recommend you check out this article.

My favorite place to be in Yosemite is in the park, and I love to see how the landscape changes over time. The Yosemite Valley is a pretty big, wide, and beautiful place to be.

The most beautiful thing about Yosemite Valley is the view of the valley. I love it when you see the mountains. I love it when you see the mountains. It's like a big world.


TOTAL TIME ELAPSED: 19.74s


In [9]:
start = time.time()

output = gpt2_lm.generate("That Italian restaurant is", max_length=200)
print("\nGPT-2 output:")
print(output)

end = time.time()
print(f"TOTAL TIME ELAPSED: {end - start:.2f}s")


GPT-2 output:
That Italian restaurant is called 'The Italian Grill' because the owner, who was born in Italy, is Italian and has been here since he was a teenager. The owner's daughter, who is also Italian, is also Italian.

But it is not just the restaurant owners and owners who have been affected by Italian restaurants, but the restaurant's staff.

The restaurant's staff are not all Italian but some are Italian.

They include a chef, who is not Italian, and a waitress who is Italian, according to the restaurant's Facebook page.

The restaurant is not the only one that is affected by the Italian restaurants.

According to the New York Post, a number of restaurants across the U.S. are also affected by Italian restaurants.

The Post reports that the Italian Restaurant Association of Greater New York has been contacted to offer a list of restaurants that may be affected.

"It is very unfortunate that a restaurant that has been serving
TOTAL TIME ELAPSED: 18.16s


## GPT text generation from scratch with KerasHub

In [35]:
!pip install keras-hub



In [37]:
import os
import keras_hub
import keras
import tensorflow.strings as tf_strings

In [41]:
# Data
BATCH_SIZE = 64
MIN_STRING_LEN = 512  # Strings shorter than this will be discarded
SEQ_LEN = 128  # Length of training sequences, in tokens

# Model
EMBED_DIM = 256
FEED_FORWARD_DIM = 128
NUM_HEADS = 3
NUM_LAYERS = 2
VOCAB_SIZE = 5000  # Limits parameters in model.

# Training
EPOCHS = 5

# Inference
NUM_TOKENS_TO_GENERATE = 80

In [55]:
dir = keras.utils.get_file(
    origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip",
    extract=True,
    cache_dir= "/Users/borja/Desktop/Portfolio/NLP"
)
#dir = os.path.expanduser("/Users/borja/Desktop/Portfolio/NLP")

Downloading data from https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip
[1m282386239/282386239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 0us/step


'/Users/borja/Desktop/Portfolio/NLP/datasets/simplebooks.zip'

In [59]:
raw_train_ds = (
    tf_data.TextLineDataset(dir + "/datasets/simplebooks.zip/simplebooks/simplebooks-92-raw/train.txt")
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=256)
)

In [60]:
raw_val_ds = (
    tf_data.TextLineDataset(dir + "/datasets/simplebooks.zip/simplebooks/simplebooks-92-raw/valid.txt")
    .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
    .batch(BATCH_SIZE)
)

In [61]:
# Train tokenizer vocabulary
vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    vocabulary_size=VOCAB_SIZE,
    lowercase=True,
    reserved_tokens=["[PAD]", "[UNK]", "[BOS]"],
)

In [62]:
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    lowercase=True,
)

In [63]:
import tensorflow.data as tf_data

# packer adds a start token
start_packer = keras_hub.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=tokenizer.token_to_id("[BOS]"),
)


def preprocess(inputs):
    outputs = tokenizer(inputs)
    features = start_packer(outputs)
    labels = outputs
    return features, labels


# Tokenize and split into train and label sequences.
train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch(
    tf_data.AUTOTUNE
)
val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch(
    tf_data.AUTOTUNE
)

In [64]:
inputs = keras.layers.Input(shape=(None,), dtype="int32")
# Embedding.
embedding_layer = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)
# Transformer decoders.
for _ in range(NUM_LAYERS):
    decoder_layer = keras_hub.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )
    x = decoder_layer(x)  # Giving one argument only skips cross-attention.
# Output.
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_hub.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])

In [65]:
model.summary()

In [66]:
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/5




   2445/Unknown [1m2124s[0m 866ms/step - loss: 4.9946 - perplexity: 181.5923

  self.gen.throw(typ, value, traceback)


[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2125s[0m 866ms/step - loss: 4.9944 - perplexity: 181.5578 - val_loss: 4.2076 - val_perplexity: 67.2965
Epoch 2/5
[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 868ms/step - loss: 4.1799 - perplexity: 65.4335

2024-11-13 17:50:09.836615: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2126s[0m 868ms/step - loss: 4.1799 - perplexity: 65.4321 - val_loss: 4.0977 - val_perplexity: 60.3031
Epoch 3/5
[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2109s[0m 861ms/step - loss: 4.0389 - perplexity: 56.8078 - val_loss: 4.0228 - val_perplexity: 55.9280
Epoch 4/5
[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2135s[0m 872ms/step - loss: 3.9625 - perplexity: 52.6210 - val_loss: 3.9748 - val_perplexity: 53.3349
Epoch 5/5
[1m2445/2445[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2126s[0m 868ms/step - loss: 3.9173 - perplexity: 50.2951 - val_loss: 3.9641 - val_perplexity: 52.7500


<keras.src.callbacks.history.History at 0x34e6acfa0>

In [116]:
# The "packer" layers adds the [BOS] token for us.
prompt_tokens = start_packer(tokenizer([""]))
prompt_tokens

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>

In [117]:
def next(prompt, cache, index):
    logits = model(prompt)[:, index -1 , :]
    # Ignore hidden states for now; only needed for contrastive search.
    #cache = True
    hidden_states = None
    return logits, hidden_states, cache

In [118]:
import warnings
warnings.filterwarnings("ignore")

### Greedy search

In [119]:
sampler = keras_hub.samplers.GreedySampler()
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,  # Start sampling immediately after the [BOS] token.
)
txt = tokenizer.detokenize(output_tokens)
print(f"Greedy search generated text: \n{txt}\n")

Greedy search generated text: 
['[BOS] " i have been thinking of the matter over , " the captain said , " but i have been a good deal of trouble , and i have been able to get a good deal of money , and i have been able to get a good deal of money , and i have been able to get a good deal of money , and i have been obliged to pay for a good deal of money , and i have been obliged to pay for a good price for a good price . i have been obliged to pay for a good price , and i have been obliged to pay for a good price for money , and i have been obliged to pay']



### Beam search

In [120]:
sampler = keras_hub.samplers.BeamSampler(num_beams=10)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Beam search generated text: \n{txt}\n")

Beam search generated text: 
['[BOS] " well , i don \' t know , " he said , " but i don \' t mean to say anything about it . i don \' t mean to say anything about it , but i don \' t know it . i don \' t like it , but i don \' t like it . i don \' t like it , but i don \' t like it . i don \' t like it , but i don \' t like it . i don \' t like it . i don \' t like it , but i don \' t like it . i don \' t like it . i don \' t like it . i']



### Random search

In [121]:
sampler = keras_hub.samplers.RandomSampler()
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Random search generated text: \n{txt}\n")

Random search generated text: 
['[BOS] at the time there the party marched from sandy bayne hoofs that the indians in the forests and sordebigre returned from the valley , and his men upon his flank the cavalry pressed forward at amleay meeting at the callers . towards night they marched again a little down by the two of the bomy guards in river . several times he reached their ranks again . accidentally margaretville returned to their homes and lay down stones running around the hills laid bare and stretched themselves in tents . as slowly poles were the circles dragged to one , the retainer bugain and he']



### Top-K search

In [122]:
sampler = keras_hub.samplers.TopKSampler(k=10)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-K search generated text: \n{txt}\n")

Top-K search generated text: 
['[BOS] " you must go with you , " said hugh , putting his arm into one eye , " he would have a good time . then you must have gone down into a little room , with you and your wife , and i am sure you are not a little so far away . if you would have done you , if you could find yourself at a moment , but you would have been more than a minute , and if you would only look at you to see the door open . you would see that you would not come up . it will be well enough , i will be glad to get you , " he said , as he said']



### Top-P search

In [123]:
sampler = keras_hub.samplers.TopPSampler(p=0.5)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,
)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-P search generated text: \n{txt}\n")

Top-P search generated text: 
['[BOS] " i was a long time ago , " he said , " i suppose , indeed , " the knight said , " but you are not to say anything about it . now , as you say , it is well that it was not for me to be like all those of the damsels . i should say that there is nothing to be done . you have heard that this damsel is not the knight of the damsels , and so to speak to you , but i will say , that you have a damsel of honor and i shall be very glad to say that you have come to me , and so i will take the lady you']

