In [1]:
!git clone https://github.com/keras-team/keras-nlp.git

import sys
sys.path.append("/kaggle/working/keras-nlp")

Cloning into 'keras-nlp'...
remote: Enumerating objects: 5913, done.[K
remote: Counting objects: 100% (2022/2022), done.[K
remote: Compressing objects: 100% (270/270), done.[K
remote: Total 5913 (delta 1820), reused 1791 (delta 1752), pack-reused 3891[K
Receiving objects: 100% (5913/5913), 2.56 MiB | 9.95 MiB/s, done.
Resolving deltas: 100% (4466/4466), done.


In [2]:
import os
import keras_nlp
import tensorflow as tf
from tensorflow import keras

In [3]:
# --- Data pipeline ---
BATCH_SIZE = 64              # number of text lines per batch
BUFFER_SIZE = 256            # buffer size handed to Dataset.shuffle
MIN_TRAINING_SEQ_LEN = 450   # lines are kept only if tf.strings.length(line) exceeds this
SEQ_LEN = 128                # token length used by the tokenizer, packer and embedding

# --- Model ---
EMBED_DIM = 128              # embedding width
FEED_FORWARD_DIM = 512       # intermediate width inside each decoder block
NUM_HEADS = 8                # attention heads per decoder block
NUM_LAYERS = 4               # number of stacked decoder blocks
VOCAB_SIZE = 40_000          # Limits parameters in model.

# --- Training ---
EPOCHS = 10

# --- Inference ---
# NOTE(review): not referenced by any cell visible in this notebook.
NUM_TOKENS_TO_GENERATE = 80

In [4]:
# Fetch the SimpleBooks zip through keras.utils.get_file with extraction enabled.
keras.utils.get_file(
    extract=True,
    origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip",
)

# Directory where the extracted archive is expected to live.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because a later cell builds the validation path from it.
dir = os.path.expanduser("~/.keras/datasets/simplebooks/")

# Raw text files for the training and validation splits.
train_path = f"{dir}simplebooks-92-raw/train.txt"
val_path = f"{dir}simplebooks-92-raw/valid.txt"

Downloading data from https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip


In [5]:
def _is_long_enough(line):
    """Return True when tf.strings.length(line) exceeds MIN_TRAINING_SEQ_LEN."""
    return tf.strings.length(line) > MIN_TRAINING_SEQ_LEN


# Training lines: drop short lines, group into batches, then shuffle.
# NOTE: shuffle() runs after batch(), so whole batches are reordered while the
# lines inside each batch stay together.
raw_train_ds = (
    tf.data.TextLineDataset(train_path)
    .filter(_is_long_enough)
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=BUFFER_SIZE)
)

# Validation lines: same filter and batching, no shuffling. Reads from
# `val_path` rather than re-assembling the same path by hand.
raw_val_ds = (
    tf.data.TextLineDataset(val_path)
    .filter(_is_long_enough)
    .batch(BATCH_SIZE)
)

# Show one raw example from each split.
print(raw_train_ds.unbatch().batch(1).take(1).get_single_element())
print("\n")
print(raw_val_ds.unbatch().batch(1).take(1).get_single_element())

tf.Tensor([b'Lose the love of her idolized husband? That would be worse than death. But it should never be: he loved her dearly now (it could not be possible that these last few wretched days had robbed her quite of the devoted affection she had known beyond a doubt to be hers before); and she would tell him, as soon as he came in, how sorry she was for the conduct that had vexed him, and never, no, never again, would she do or say any thing to displease him, or lower herself in his estimation.'], shape=(1,), dtype=string)


tf.Tensor([b'"Nonsense! It isn\'t anything of the sort!" cried the Calico Clown, and he tried to wink at the Monkey from behind a pile of building blocks. "The ocean is as safe as the shore. Why, look at the English and French dolls," he said, waving his cymbals in the direction of the imported toys in the next aisle. "They came over the ocean in a ship, and they did not even have a headache. And look at the Japanese dolls -- they came much farther, over another oc

### Tokenization

In [6]:
# Learn a WordPiece vocabulary from the lower-cased training text.
# The special tokens are passed as `reserved_tokens`; the recorded outputs of
# later cells show padding as id 0, [BOS] as id 2 and [EOS] as id 3.
special_tokens = ["[PAD]", "[UNK]", "[BOS]", "[EOS]"]

vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    lowercase=True,
    vocabulary_size=VOCAB_SIZE,
    reserved_tokens=special_tokens,
)

In [7]:
# WordPiece tokenizer over the learned vocabulary. With `sequence_length` set,
# each tokenized text comes back as SEQ_LEN ids (see the next cell's output).
tokenizer_kwargs = {
    "vocabulary": vocab,
    "sequence_length": SEQ_LEN,
    "lowercase": True,
}
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(**tokenizer_kwargs)

In [8]:
# Sample sentence reused by later cells. Shown here as raw token ids, i.e.
# before any start/end tokens are added.
text = "This is a code"

sample_ids = tokenizer(text)
sample_ids

<tf.Tensor: shape=(128,), dtype=int32, numpy=
array([ 137,  124,   38, 8824,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)>

In [9]:
# Packer that adds a start token ([BOS]) and an end token ([EOS]) to a token
# sequence while keeping the overall length at SEQ_LEN.
bos_id = tokenizer.token_to_id("[BOS]")
eos_id = tokenizer.token_to_id("[EOS]")

start_end_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=bos_id,
    end_value=eos_id,
)

In [10]:
# Pack the tokenized sample sentence. In the output below, [BOS] sits in the
# first slot and [EOS] in the very last slot, after the padding zeros.
packed_sample = start_end_packer(tokenizer(text))
packed_sample

<tf.Tensor: shape=(128,), dtype=int32, numpy=
array([   2,  137,  124,   38, 8824,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    3], dtype=int32)>

In [11]:
def preprocess(inputs):
    """Turn raw text into (features, labels) for next-token prediction.

    Args:
        inputs: String tensor of text lines, as passed in by `Dataset.map`.

    Returns:
        A `(features, labels)` tuple of id tensors with identical shape.
        `labels` is the tokenizer output. `features` is that same sequence
        shifted right by one slot with `[BOS]` in front, so `features[..., i]`
        is always the token that precedes `labels[..., i]`.
    """
    labels = tokenizer(inputs)

    # Column of [BOS] ids with the same leading dims and dtype as the labels.
    bos_id = tokenizer.token_to_id("[BOS]")
    bos = tf.zeros_like(labels[..., :1]) + bos_id

    # Shift right by one. The start/end packer is deliberately not used here:
    # on already-padded input it writes [EOS] into the last slot, which breaks
    # the one-step offset between features and labels at that position and
    # feeds an end token that never appears in the labels.
    features = tf.concat([bos, labels[..., :-1]], axis=-1)
    return features, labels

In [12]:
def _tokenize_dataset(raw_ds):
    """Map `preprocess` over a raw text dataset in parallel, then prefetch."""
    mapped = raw_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    return mapped.prefetch(tf.data.AUTOTUNE)


# Tokenized (features, labels) datasets for training and validation.
train_ds = _tokenize_dataset(raw_train_ds)
val_ds = _tokenize_dataset(raw_val_ds)

In [13]:
# Peek at a single (features, labels) pair from the training set.
single_example_ds = train_ds.unbatch().batch(1).take(1)
single_example_ds.get_single_element()

(<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[    2,    98,  3804,   116,   226,   917,   108,  1446,  6449,
           100,  2939, 10493,    14,    99,   148,  1467,   100, 12694,
         36050,   108,  5569,    99,    98, 17099,    14,    99,   100,
           177,   105,    98,   821,   168,    38,  1843,   999,   101,
            98,   747,    16,   123,  5343,   178,  1126,   101,  2214,
           315,   613,    14,    99,    98,   999,  3886,   276,   114,
           356,  6332,     9,  6548,    99, 12970,   224,  2400,   138,
           102,    38,  2561,   102,    38,   318,   286,  3414,    16,
           711,  1370,   107,   139,   148,   250,    16,   155,   101,
            98, 17099,  1118,   105,   356,  6332,   107,   613,    98,
           747,   744,   102,    98,   468,    14,    99,   133,   101,
            98,   263, 33782,   152,   105,   104,   107,   283,   129,
            14,    99,   112,   107,   247,   122,   105,   112,   107,
           199,

## Model

In [14]:
# Import `layers` from tensorflow.keras, matching the `from tensorflow import
# keras` import used by the rest of the notebook, instead of mixing in the
# standalone `keras` package.
from tensorflow.keras import layers
from keras_nlp.layers import TokenAndPositionEmbedding
from keras_nlp.layers import TransformerDecoder

# Integer token ids in; the sequence axis is left unspecified.
inputs = layers.Input(shape=(None,), dtype=tf.int32)

# Token + position embedding, configured with mask_zero=True.
embedding_layer = TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)

# Stack NUM_LAYERS decoder blocks, each fed only the running sequence `x`.
for _ in range(NUM_LAYERS):
    decoder_layer = TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )
    x = decoder_layer(x)

# Project every position to VOCAB_SIZE logits (no activation).
outputs = layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)

# Both loss and metric consume raw logits; perplexity ignores token id 0.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])

In [15]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 128)        5136384   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_decoder (Transf  (None, None, 128)        198272    
 ormerDecoder)                                                   
                                                                 
 transformer_decoder_1 (Tran  (None, None, 128)        198272    
 sformerDecoder)                                                 
                                                                 
 transformer_decoder_2 (Tran  (None, None, 128)        198272

In [16]:
# Train for EPOCHS epochs, evaluating on the validation set after each one.
# The returned History object is kept so the per-epoch metrics stay available
# for inspection instead of being discarded after its repr is displayed.
history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe30d67f590>

In [17]:
# Sanity check: run the trained model on the sample sentence. A leading batch
# axis is added first; the result holds one logit vector per position.
sample_batch = tf.expand_dims(tokenizer(text), axis=0)
model(sample_batch)

<tf.Tensor: shape=(1, 128, 40000), dtype=float32, numpy=
array([[[ -2.4872534 , -13.02685   , -12.999907  , ..., -12.959289  ,
         -12.927773  , -12.953392  ],
        [ -0.28776821, -11.775211  , -11.67721   , ..., -11.783797  ,
         -11.649365  , -11.667374  ],
        [ -4.520238  , -11.500245  , -11.432527  , ..., -11.491863  ,
         -11.402462  , -11.39734   ],
        ...,
        [ -0.40784603, -11.591385  , -11.654431  , ..., -11.54339   ,
         -11.476372  , -11.539055  ],
        [ -0.542383  , -11.589169  , -11.649965  , ..., -11.547303  ,
         -11.482927  , -11.538782  ],
        [  1.4446563 , -11.5100565 , -11.500842  , ..., -11.292993  ,
         -11.329205  , -11.338281  ]]], dtype=float32)>

## Inference

In [18]:
# Enable NumPy-style behavior on TensorFlow tensors through the public API
# rather than importing from the private `tensorflow.python` package.
# NOTE(review): presumably required by the sampling cells below — confirm.
tf.experimental.numpy.experimental_enable_numpy_behavior()

In [19]:
# Build the generation prompt from an empty string: after packing, only the
# start token (and the end token in the last slot) is non-zero.
empty_prompt_ids = tokenizer([""])
prompt_tokens = start_end_packer(empty_prompt_ids)
prompt_tokens

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3]],
      dtype=int32)>

In [20]:
def next(prompt, cache, index):
    """Next-token callback handed to the keras_nlp samplers.

    Runs the full model on `prompt` and selects the logits at position
    `index - 1`. No caching is implemented: `cache` is returned untouched and
    the hidden-states slot is always `None`.

    NOTE: this name shadows Python's builtin `next`; it is kept because the
    sampling cells pass the function by this name.

    Returns:
        A `(logits, hidden_states, cache)` tuple with `hidden_states` as None.
    """
    all_logits = model(prompt)
    step_logits = all_logits[:, index - 1, :]
    return step_logits, None, cache

### Greedy Search

In [21]:
# Greedy sampler; generation starts at position 1 because position 0 of the
# prompt already holds [BOS].
sampler = keras_nlp.samplers.GreedySampler()
output_tokens = sampler(next, prompt_tokens, index=1)

# Convert the generated ids back to text.
output = tokenizer.detokenize(output_tokens)
print(output)

tf.Tensor([b'[BOS] " i don \' t know what i \' m going to say , " said the doctor , " but i \' m not going to tell you what i \' m going to do . i \' m going to tell you about it . i \' m going to tell you about it . i \' m going to tell you about it . i \' m going to tell you about it . i \' m going to tell you about it . i \' m going to tell you about it . i \' m going to tell you about it . i \' m going to tell you about it . i \' m going to tell you about it .'], shape=(1,), dtype=string)


### Beam Search

In [22]:
# Beam sampler configured with 10 beams, started at position 1 (after [BOS]).
sampler = keras_nlp.samplers.BeamSampler(num_beams=10)
output_tokens = sampler(next, prompt=prompt_tokens, index=1)

# Convert the generated ids back to text.
output = tokenizer.detokenize(output_tokens)
print(output)

tf.Tensor([b'[BOS] " i don \' t know what i \' m going to say , " he said , " but i don \' t know what i \' m going to tell you . i don \' t know what i \' m going to tell you , but i don \' t know what i \' m going to tell you . i don \' t know what i \' m going to say , but i don \' t know what i \' m going to do . i don \' t know what i \' m going to do . i don \' t know what i \' m going to do , but i don \' t know anything about it . i'], shape=(1,), dtype=string)


### Random Search

In [23]:
# Random sampler, started at position 1 (after [BOS]).
# NOTE(review): no seed is passed, so the generated text presumably changes
# from run to run — confirm.
sampler = keras_nlp.samplers.RandomSampler()
output_tokens = sampler(next, prompt=prompt_tokens, index=1)

# Convert the generated ids back to text.
output = tokenizer.detokenize(output_tokens)
print(output)

tf.Tensor([b"[BOS] the general , his soldiers rushed forward to the guard river , and he bounded back with amazement . ground but was bare when , slight in multitude , that might be reckoned as mist and snow shooting ; and then from the pitch came the man to turn ready for the combat , with a crest and shoulder , hurling upon his head , and hurling balls of shells into an exceedingly serious abyss . further , and more moderate , the cavalry fell on his horses as that of d ' sle cutters , and rushed forward against the enemy . [PAD] the troops suffered terribly . [PAD] jackson and stonewalluga pressed forward and defiantly into an extreme storm . they did all"], shape=(1,), dtype=string)


### Top-K Search

In [24]:
# Top-K sampler with k=10, started at position 1 (after [BOS]).
# NOTE(review): no seed is passed, so the generated text presumably changes
# from run to run — confirm.
sampler = keras_nlp.samplers.TopKSampler(k=10)
output_tokens = sampler(next, prompt=prompt_tokens, index=1)

# Convert the generated ids back to text.
output = tokenizer.detokenize(output_tokens)
print(output)

tf.Tensor([b'[BOS] " it has been arranged as a rule , " answered grace in the low voice of the girl . " you will have to be sure that i will have to give the girl a chance of taking her leave to - night . the fact that i had been in a bad place at the dower gate , and that her majesty would not be allowed to stay here . i have heard that miss nevin was a great deal about her , and i know that there is no one , but that i have a very interesting reason to know what i have heard , so we can tell the matter in which i am going . " [PAD] she was in a'], shape=(1,), dtype=string)


### Top-P Search

In [25]:
# Top-P sampler with p=0.5, started at position 1 (after [BOS]).
# NOTE(review): no seed is passed, so the generated text presumably changes
# from run to run — confirm.
sampler = keras_nlp.samplers.TopPSampler(p=0.5)
output_tokens = sampler(next, prompt=prompt_tokens, index=1)

# Convert the generated ids back to text.
output = tokenizer.detokenize(output_tokens)
print(output)

tf.Tensor([b'[BOS] " oh , i will not give you up , " said the mother , " and i will go and stay here to - night , where i will come . but i will be glad to see you . you will be in your company for a long time , and as you are the only person i have to go to sleep , and i will give you . but the first time you have come i will find out what i am doing . i will take the child away , and will tell you to go home with me . i will ask you to leave the house , and give me my hand . " [PAD] the next morning the second'], shape=(1,), dtype=string)


## Resources
1. [KerasNLP GPT](https://keras.io/examples/generative/text_generation_gpt/)
2. [Keras Miniature GPT](https://keras.io/examples/generative/text_generation_with_miniature_gpt/)