In [1]:
!git clone https://github.com/keras-team/keras-nlp.git

import sys
sys.path.append("/kaggle/working/keras-nlp")

Cloning into 'keras-nlp'...
remote: Enumerating objects: 4474, done.[K
remote: Counting objects: 100% (322/322), done.[K
remote: Compressing objects: 100% (238/238), done.[K
remote: Total 4474 (delta 191), reused 157 (delta 84), pack-reused 4152[K
Receiving objects: 100% (4474/4474), 2.05 MiB | 18.45 MiB/s, done.
Resolving deltas: 100% (3210/3210), done.


In [2]:
import os
import keras_nlp
import tensorflow as tf
from tensorflow import keras
from keras import layers, models


In [3]:
policy = keras.mixed_precision.Policy("mixed_float16")
keras.mixed_precision.set_global_policy(policy)

## Dataset

In [4]:
# download the dataset

keras.utils.get_file(
    origin="https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip",
    extract=True,
)
wiki_dir = os.path.expanduser("~/.keras/datasets/wikitext-103-raw/")

# Download finetuning data.
keras.utils.get_file(
    origin="https://dl.fbaipublicfiles.com/glue/data/SST-2.zip",
    extract=True,
)
sst_dir = os.path.expanduser("~/.keras/datasets/SST-2/")

# Download vocabulary data.
vocab_file = keras.utils.get_file(
    origin="https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt",
)

Downloading data from https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Downloading data from https://dl.fbaipublicfiles.com/glue/data/SST-2.zip
Downloading data from https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt


## Hyperparameters

In [5]:
# Preprocessing params.
PRETRAINING_BATCH_SIZE = 128
FINETUNING_BATCH_SIZE = 32
SEQ_LENGTH = 128
MASK_RATE = 0.25
PREDICTIONS_PER_SEQ = 32

# Model params.
NUM_LAYERS = 3
MODEL_DIM = 256
INTERMEDIATE_DIM = 512
NUM_HEADS = 4
DROPOUT = 0.1
NORM_EPSILON = 1e-5

# Training params.
PRETRAINING_LEARNING_RATE = 5e-4
PRETRAINING_EPOCHS = 8
FINETUNING_LEARNING_RATE = 5e-5
FINETUNING_EPOCHS = 3

In [6]:
# load the SST-2
sst_train_ds = tf.data.experimental.CsvDataset(sst_dir + "train.tsv", [tf.string, tf.int32], header = True, field_delim="\t"
                                              ).batch(FINETUNING_BATCH_SIZE)

sst_val_ds = tf.data.experimental.CsvDataset(sst_dir + "dev.tsv", [tf.string, tf.int32], field_delim = "\t"
                                            ).batch(FINETUNING_BATCH_SIZE)

# load the wikitext-103

wiki_train_ds = (tf.data.TextLineDataset(wiki_dir + "wiki.train.raw").filter(lambda x: tf.strings.length(x) > 100)
                .batch(PRETRAINING_BATCH_SIZE))

wiki_val_ds = (tf.data.TextLineDataset(wiki_dir + "wiki.valid.raw").filter(lambda x: tf.strings.length(x) > 100)
                .batch(PRETRAINING_BATCH_SIZE))

In [7]:
sst_train_ds.unbatch().batch(5).take(1).get_single_element()

(<tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'hide new secretions from the parental units ',
        b'contains no wit , only labored gags ',
        b'that loves its characters and communicates something rather beautiful about human nature ',
        b'remains utterly satisfied to remain the same throughout ',
        b'on the worst revenge-of-the-nerds clich\xc3\xa9s the filmmakers could dredge up '],
       dtype=object)>,
 <tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 1, 0, 0], dtype=int32)>)

In [8]:
wiki_train_ds.unbatch().batch(2).take(1).get_single_element()

<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b' Senj\xc5\x8d no Valkyria 3 : Unrecorded Chronicles ( Japanese : \xe6\x88\xa6\xe5\xa0\xb4\xe3\x81\xae\xe3\x83\xb4\xe3\x82\xa1\xe3\x83\xab\xe3\x82\xad\xe3\x83\xa5\xe3\x83\xaa\xe3\x82\xa23 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . ',
       b" The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II .

## Pretraining

In [9]:
from keras_nlp.tokenizers import WordPieceTokenizer
from keras_nlp.layers import MaskedLMMaskGenerator


tokenizer = WordPieceTokenizer(vocabulary = vocab_file, sequence_length = SEQ_LENGTH, lowercase = True, strip_accents = True)

masker = MaskedLMMaskGenerator(vocabulary_size = tokenizer.vocabulary_size(), 
                              mask_selection_rate = MASK_RATE, mask_selection_length = PREDICTIONS_PER_SEQ, 
                              mask_token_id = tokenizer.token_to_id("[MASK]"))



In [10]:
def preprocess(inputs):
    inputs = tokenizer(inputs)
    outputs = masker(inputs)
    
    features = {
        "token_ids": outputs['token_ids'],
        "mask_positions": outputs['mask_positions']
    }
    
    labels = outputs['mask_ids']
    weights = outputs['mask_weights']
    
    return features, labels, weights

In [11]:
pretrain_ds = wiki_train_ds.map(preprocess, num_parallel_calls = tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
pretrain_val_ds = wiki_val_ds.map(preprocess, num_parallel_calls = tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [12]:
pretrain_val_ds.take(1).get_single_element()

({'token_ids': <tf.Tensor: shape=(128, 128), dtype=int32, numpy=
  array([[  103,  7849,  2271, ...,  9673,  1012,  7570],
         [  103,  7849,  2271, ...,  1007,  1012,  2023],
         [ 1996,   103,  3940, ...,     0,     0,     0],
         ...,
         [ 2076,  1996,  2307, ...,     0,     0,     0],
         [ 3216,   103,  2083, ...,     0,     0,     0],
         [  103, 12053,  1045, ...,     0,     0,     0]], dtype=int32)>,
  'mask_positions': <tf.Tensor: shape=(128, 32), dtype=int64, numpy=
  array([[  0,   5,  13, ..., 117, 119, 122],
         [  0,   4,   5, ..., 106, 120, 122],
         [  1,   5,  14, ...,   0,   0,   0],
         ...,
         [  6,   8,  15, ..., 116, 119,   0],
         [  1,   5,  15, ...,   0,   0,   0],
         [  0,   1,   3, ...,   0,   0,   0]])>},
 <tf.Tensor: shape=(128, 32), dtype=int32, numpy=
 array([[ 7570,  1010, 27940, ...,  1037,  2077,  2046],
        [ 7570,  7946,  2003, ...,  2027,  6190,  9587],
        [ 2034,  3695, 24335, 

## Create the Transformer Encoder

In [14]:
from keras_nlp.layers import TokenAndPositionEmbedding
from keras_nlp.layers import TransformerEncoder


inputs = keras.Input(shape = (SEQ_LENGTH), dtype=tf.int32)

# embed our tokens with a positional embedding
embedding_layer = TokenAndPositionEmbedding(vocabulary_size = tokenizer.vocabulary_size(), 
                                           sequence_length = SEQ_LENGTH, embedding_dim = MODEL_DIM)

outputs = embedding_layer(inputs)
outputs = layers.LayerNormalization(epsilon = NORM_EPSILON)(outputs)
outputs = layers.Dropout(rate = DROPOUT)(outputs)

for i in range(3):
    outputs = TransformerEncoder(intermediate_dim = INTERMEDIATE_DIM, 
                                num_heads = NUM_HEADS,
                                dropout = DROPOUT, 
                                layer_norm_epsilon = NORM_EPSILON)(outputs)
    
encoder_model = keras.Model(inputs, outputs)

In [15]:
encoder_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 128)]             0         
                                                                 
 token_and_position_embeddin  (None, 128, 256)         7846400   
 g_1 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 layer_normalization_1 (Laye  (None, 128, 256)         512       
 rNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 128, 256)          0         
                                                                 
 transformer_encoder_3 (Tran  (None, 128, 256)         527104    
 sformerEncoder)                                           

In [20]:
from keras_nlp.layers import MaskedLMHead

inputs = {
    "token_ids": keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32),
    "mask_positions": keras.Input(shape=(PREDICTIONS_PER_SEQ,), dtype=tf.int32),
}

# encode the tokens
encoded_tokens = encoder_model(inputs["token_ids"])

outputs = MaskedLMHead(embedding_weights = embedding_layer.token_embedding.embeddings, 
                      activation = 'softmax')(encoded_tokens, mask_positions = inputs['mask_positions'])

optimizer = keras.optimizers.experimental.AdamW(PRETRAINING_LEARNING_RATE)
pretraining_model = keras.Model(inputs, outputs)
pretraining_model.compile(loss='sparse_categorical_crossentropy', 
                          optimizer = optimizer, 
                         weighted_metrics = ['sparse_categorical_accuracy'], 
                         jit_compile = True)

In [21]:
pretraining_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 model_1 (Functional)           (None, 128, 256)     9428224     ['input_5[0][0]']                
                                                                                                  
 input_6 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 masked_lm_head (MaskedLMHead)  (None, 32, 30522)    7910458     ['model_1[1][0]',                
                                                                  'input_6[0][0]']          

In [22]:
pretraining_model.fit(pretrain_ds, validation_data = pretrain_val_ds, epochs = PRETRAINING_EPOCHS)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f5969a448d0>

In [23]:
encoder_model.save("encoder_model")

## Finetuning

In [24]:
def preprocess(sentences, labels):
    return tokenizer(sentences), labels

In [25]:
finetune_ds = sst_train_ds.map(preprocess, num_parallel_calls = tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

finetune_val_ds = sst_val_ds.map(preprocess, num_parallel_calls = tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [26]:
encoder_model = models.load_model("encoder_model", compile = False)


inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)

encoded_tokens = encoder_model(inputs)
pooled_tokens = layers.GlobalAveragePooling1D()(encoded_tokens)

outputs = layers.Dense(1, activation='sigmoid')(pooled_tokens)

final_model = keras.Model(inputs, outputs)

In [27]:
final_model.compile(
    loss="binary_crossentropy",
    optimizer=keras.optimizers.experimental.AdamW(FINETUNING_LEARNING_RATE),
    metrics=["accuracy"],
)

In [28]:
final_model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, 128)]             0         
                                                                 
 model_1 (Functional)        (None, 128, 256)          9428224   
                                                                 
 global_average_pooling1d (G  (None, 256)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense_6 (Dense)             (None, 1)                 257       
                                                                 
Total params: 9,428,481
Trainable params: 9,428,481
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Finetune the model for the SST-2 task.
final_model.fit(
    finetune_ds,
    validation_data=finetune_val_ds,
    epochs=FINETUNING_EPOCHS,
)

Epoch 1/3
   2105/Unknown - 110s 49ms/step - loss: 0.4084 - accuracy: 0.8080

InvalidArgumentError: Graph execution error:

Detected at node 'IteratorGetNext' defined at (most recent call last):
    File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/conda/lib/python3.7/site-packages/traitlets/config/application.py", line 1041, in launch_instance
      app.start()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/opt/conda/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
      self._run_once()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
      handle._run()
    File "/opt/conda/lib/python3.7/asyncio/events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 387, in do_execute
      cell_id=cell_id,
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2976, in run_cell
      raw_cell, store_history, silent, shell_futures, cell_id
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell
      return runner(coro)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3258, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_23/3081086618.py", line 5, in <module>
      epochs=FINETUNING_EPOCHS,
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1705, in fit
      _use_cached_eval_dataset=True,
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 2040, in evaluate
      tmp_logs = self.test_function(iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1820, in test_function
      return step_function(self, iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1803, in step_function
      data = next(iterator)
Node: 'IteratorGetNext'
Detected at node 'IteratorGetNext' defined at (most recent call last):
    File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/conda/lib/python3.7/site-packages/traitlets/config/application.py", line 1041, in launch_instance
      app.start()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/opt/conda/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
      self._run_once()
    File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
      handle._run()
    File "/opt/conda/lib/python3.7/asyncio/events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 387, in do_execute
      cell_id=cell_id,
    File "/opt/conda/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2976, in run_cell
      raw_cell, store_history, silent, shell_futures, cell_id
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell
      return runner(coro)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3258, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_23/3081086618.py", line 5, in <module>
      epochs=FINETUNING_EPOCHS,
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1705, in fit
      _use_cached_eval_dataset=True,
    File "/opt/conda/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 2040, in evaluate
      tmp_logs = self.test_function(iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1820, in test_function
      return step_function(self, iterator)
    File "/opt/conda/lib/python3.7/site-packages/keras/engine/training.py", line 1803, in step_function
      data = next(iterator)
Node: 'IteratorGetNext'
2 root error(s) found.
  (0) INVALID_ARGUMENT:  Field 1 in record is not a valid int32: label
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_4]]
  (1) INVALID_ARGUMENT:  Field 1 in record is not a valid int32: label
	 [[{{node IteratorGetNext}}]]
0 successful operations.
0 derived errors ignored. [Op:__inference_test_function_238503]

In [30]:
# Add our tokenization into our final model.
inputs = keras.Input(shape=(), dtype=tf.string)
tokens = tokenizer(inputs)
outputs = final_model(tokens)
final_model = keras.Model(inputs, outputs)
final_model.save("final_model")

# This model can predict directly on raw text.
restored_model = keras.models.load_model("final_model", compile=False)
inference_data = tf.constant(["Terrible, no good, trash.", "So great; I loved it!"])
print(restored_model(inference_data))

tf.Tensor(
[[0.05194]
 [0.995  ]], shape=(2, 1), dtype=float16)


## Reference

1. [Transformer Pretraining](https://keras.io/guides/keras_nlp/transformer_pretraining/)