In [None]:
!pip install tensorflow-text==2.15 keras-nlp==0.8.2

In [None]:
import  os
import  numpy               as np
import  tensorflow          as tf
from    tensorflow          import keras
import  keras_nlp
import  matplotlib.pyplot   as plt

# Transformer

### Data


The SimpleBooks dataset is used both to train the models and to build the tokenizer vocabulary.

https://arxiv.org/pdf/1911.12391.pdf

https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip

In [None]:
dset_train_f    = "simplebooks-92-raw_train.txt"
dset_valid_f    = "simplebooks-92-raw_valid.txt"
dset_test_f     = "simplebooks-92-raw_test.txt"

!wget -O {dset_train_f} https://www.dropbox.com/scl/fi/r6vnn7vccpvscvmzabmvf/simplebooks-92-raw_train.txt?rlkey=thwnurvjrda737sr8283qpdkg&dl=0
!wget -O {dset_valid_f} https://www.dropbox.com/scl/fi/txlel36qe8nxz7jth95xp/simplebooks-92-raw_valid.txt?rlkey=8sdbtycdx2ppokcybaolvrx7j&dl=0
!wget -O {dset_test_f} https://www.dropbox.com/scl/fi/3j9eifu45sdwnhof3r63q/simplebooks-92-raw_test.txt?rlkey=5c8z385wsu925h9zugz7p2d8v&dl=0

In [None]:
# ready-to-use models and vocabulary
!wget -O vocab_10000.txt https://www.dropbox.com/scl/fi/aix95cadh32i9ylzjzz0m/vocab_10000.txt?rlkey=qcwpxhw34c8z2x7hue37zu79d&dl=0
!wget -O transf_v10000_s128_h4_e50.h5 https://www.dropbox.com/scl/fi/la654e3fsjfn6iwrlmm1d/transf_v10000_s128_h4_e50.h5?rlkey=9qkdroty1i2l9lu1euaxjtkvg&dl=0
!wget -O transf_v10000_s128_h4_e200.h5 https://www.dropbox.com/scl/fi/tvevf1ocgatw2v0nl624f/transf_v10000_s128_h4_e200.h5?rlkey=0pxwmhyicvkyxdae3jsc0znsb&dl=0

### Parameters

In [None]:
# --- data ---
MIN_TRAIN_SEQ_LEN = 400     # text lines must be longer than this to be kept (about 200k samples remain)
VOCAB_SIZE = 10000          # number of entries in the WordPiece vocabulary
SEQ_LEN = 128               # maximum number of tokens in a sequence

# --- model ---
EMBED_DIM = 128             # size of the token/position embedding vectors
NUM_HEADS = 4               # attention heads in the attention layer

# --- training ---
EPOCHS = 200                # passes over the training set
BATCH_SIZE = 64             # samples per training batch

## Tokenizer and dataset

In [None]:
def load_dset():
    """
    Build the training and validation datasets from the SimpleBooks text files.
    Only needed when a new model is trained (or a new vocabulary is computed).

    Lines are kept only when their length exceeds MIN_TRAIN_SEQ_LEN, and are
    then grouped into batches of BATCH_SIZE.

    return:
        [tuple] of two [tf.data.Dataset], one for training and one for validation
    """

    def long_lines( path ):
        # read the file line by line, drop the short lines, then batch what is left
        lines   = tf.data.TextLineDataset( path )
        lines   = lines.filter( lambda x: tf.strings.length( x ) > MIN_TRAIN_SEQ_LEN )
        return lines.batch( BATCH_SIZE )

    # shuffle is applied after batching, so it reorders whole batches (training set only)
    dset_train  = long_lines( dset_train_f ).shuffle( buffer_size=1024 )
    dset_valid  = long_lines( dset_valid_f )

    return dset_train, dset_valid

In [None]:
def set_vocab( dset ):
    """
    Return the WordPiece vocabulary, computing and caching it when needed.

    The vocabulary is cached in the file "vocab_<VOCAB_SIZE>.txt": when that file
    already exists it is simply loaded, and the dataset is not read at all.

    input:
        dset:       [tf.data.TextLineDataset] dataset on which the vocabulary is computed

    return:
        [list] the vocabulary, a list of words
    """

    fname   = f"vocab_{VOCAB_SIZE}.txt"

    if not os.path.isfile( fname ):
        # no cached file: compute the vocabulary from the dataset
        vocab   = keras_nlp.tokenizers.compute_word_piece_vocabulary(
            dset,
            vocabulary_size = VOCAB_SIZE,
            lowercase       = True,
            reserved_tokens = [ "[PAD]", "[UNK]", "[BOS]" ]
        )

        # store one token per line, so that load_vocab() can read it back
        with open( fname, 'w' ) as f:
            f.write( "\n".join( vocab ) )

        return vocab

    return load_vocab( fname=fname )

In [None]:
def load_vocab( fname=None ):
    """
    Read a vocabulary from a text file.

    The file content is split on whitespace, so every whitespace-separated
    item becomes one vocabulary entry, in file order.

    input:
        fname:      [str] file name, if None "vocab_<VOCAB_SIZE>.txt" is used

    return:
        [list] the vocabulary, a list of words
    """

    path    = f"vocab_{VOCAB_SIZE}.txt" if fname is None else fname

    with open( path, 'r' ) as f:
        return f.read().split()

In [None]:
def set_tokenizer( vocab ):
    """
    Build a lowercasing WordPiece tokenizer on top of a vocabulary.

    input:
        vocab:      [list] vocabulary

    return:
        [keras_nlp.tokenizers.WordPieceTokenizer]
    """

    # SEQ_LEN is the maximum length of an acceptable input sequence
    return keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary      = vocab,
        sequence_length = SEQ_LEN,
        lowercase       = True
    )

In [None]:
def tokenize_dset( tokenizer, dset_tr, dset_vl ):
    """
    Turn the text datasets into ( features, labels ) token datasets.

    For every sample the labels are the tokenized text, and the features are the
    same tokens passed through a StartEndPacker that puts [BOS] in front and
    keeps the length at SEQ_LEN.

    input:
        tokenizer:  [keras_nlp.tokenizers.Tokenizer]
        dset_tr:    [tf.data.TextLineDataset] training set
        dset_vl:    [tf.data.TextLineDataset] validation set

    return:
        [tuple] of two [tf.data.Dataset], the tokenized training and validation sets
    """

    # packer that prepends the start token [BOS] and packs to SEQ_LEN tokens
    bos_id  = tokenizer.token_to_id( '[BOS]' )
    packer  = keras_nlp.layers.StartEndPacker(
        sequence_length = SEQ_LEN,
        start_value     = bos_id
    )

    def to_pair( lines ):
        # labels: plain tokens -- features: the same tokens after packing
        labels  = tokenizer( lines )
        return packer( labels ), labels

    def prepare( dset ):
        # tokenize in parallel and prefetch the results
        mapped  = dset.map( to_pair, num_parallel_calls=tf.data.AUTOTUNE )
        return mapped.prefetch( tf.data.AUTOTUNE )

    return prepare( dset_tr ), prepare( dset_vl )

### Tokenizer usage

In [None]:
# Build the text pipeline: datasets -> vocabulary -> tokenizer -> tokenized datasets.
# set_vocab() reuses the file vocab_<VOCAB_SIZE>.txt when it already exists.
dset_train, dset_valid  = load_dset()                           # load dataset
vocab                   = set_vocab( dset_train )               # create vocabulary
# vocab                   = load_vocab( "vocab_10000.txt" )     # ...or read it from file

tokenizer               = set_tokenizer( vocab )
tset_train, tset_valid  = tokenize_dset( tokenizer, dset_train, dset_valid )

In [None]:
# tokenize a sample sentence and show the resulting token ids
s   = "the cat in on the table"
t   = tokenizer.tokenize( s )
print( t )

In [None]:
# map one token id back to text (the id is only meaningful for the vocabulary in use)
t   = tokenizer.detokenize( [ 1321 ] )
print( t )

In [None]:
# tokenize a single word to see how many tokens it produces
w   = 'yellowish'
t   = tokenizer.tokenize( w )
print( t )

In [None]:
# show the text of two individual token ids
# NOTE(review): presumably the pieces obtained for 'yellowish' above -- ids depend on the vocabulary, confirm
t1  = tokenizer.detokenize( [ 1064 ] )
t2  = tokenizer.detokenize( [ 975 ] )
print( t1 )
print( t2 )

In [None]:
# tokenize another single word to see how many tokens it produces
w   = 'unicorn'
t   = tokenizer.tokenize( w )
print( t )

In [None]:
# show the text of three individual token ids
# NOTE(review): presumably the pieces obtained for 'unicorn' above -- ids depend on the vocabulary, confirm
t1  = tokenizer.detokenize( [ 5028 ] )
t2  = tokenizer.detokenize( [ 5039 ] )
t3  = tokenizer.detokenize( [ 3608 ] )
print( t1 )
print( t2 )
print( t3 )

## Transformer Class

In [None]:
class TransDecoder( object ):
    """
    Simple Transformer decoder using the attention layer from Keras.

    The embedding and attention layers are kept as attributes, so that they can
    be called directly (see attention_matrix) besides being part of the model.
    The vocabulary size is read from the global VOCAB_SIZE.
    """

    def __init__( self, num_heads=NUM_HEADS, length=SEQ_LEN, dim=EMBED_DIM, summary=False ):
        """
        params:
            num_heads   [int] number of attention heads
            length      [int] length of the input sequence
            dim         [int] embedding dimension
            summary     [bool] whether to print a summary of the created model
        """
        self.num_heads  = num_heads
        self.length     = length
        self.dim        = dim
        self.key_dim    = dim // num_heads          # new internal dimension after the head splitting
        self.embedding  = self._embedding()         # embedding layer
        self.attention  = self._attention_keras()   # attention layer
        self.model      = self.create_model()       # complete model

        if summary:
            self.model.summary()



    def _embedding( self ):
        """
        Define the embedding layer of the model.
        Embeds both positions and tokens.
        """
        embedding   = keras_nlp.layers.TokenAndPositionEmbedding(
            vocabulary_size = VOCAB_SIZE,
            sequence_length = self.length,
            embedding_dim   = self.dim,
            mask_zero       = True          # use the value 0 as padding token [PAD]
        )
        return embedding



    def _attention_keras( self ):
        """
        Define the attention layer of the model.
        Use the default multi-head attention layer in Keras.
        """
        att         = keras.layers.MultiHeadAttention(
                num_heads   = self.num_heads,
                key_dim     = self.key_dim
        )
        return att



    def create_model( self ):
        """
        Create a Transformer model with a single attention layer.

        The attention is causal: each position can only attend to itself and to
        the previous positions. Without the mask the token to predict would be
        visible in the input, since the labels are the inputs shifted by one.

        return:
            [keras.Model] mapping token indices to logits over the vocabulary
        """
        inputs      = keras.layers.Input( shape=( None, ), dtype=tf.int32 )     # input are token indices
        x           = self.embedding( inputs )                                  # token and position embedding
        x           = self.attention( x, x, x, use_causal_mask=True )           # causal self-attention layer
        x           = keras.layers.LayerNormalization()( x )                    # normalization layer
        outputs     = keras.layers.Dense( VOCAB_SIZE )( x )                     # final logits layer
        model       = keras.Model( inputs=inputs, outputs=outputs )             # complete model

        return model



    def attention_matrix( self, x ):
        """
        Extract the attention matrix, containing the scores.

        params:
            x       [tf.Tensor] tokenized input sequence, with or without batch dimension

        return:
            [tf.Tensor] attention scores, one matrix per attention head for each batch item
        """
        x           = tf.convert_to_tensor( x )     # accept also lists and arrays
        if x.shape.rank < 2:                        # add batch dimension if necessary
            x       = x[ tf.newaxis, : ]

        x           = self.embedding( x )

        # set 'return_attention_scores' to True to have both results (output and scores)
        # set 'use_causal_mask' to True to use the Transformer as decoder, masking future tokens in the sequence
        _, scores   = self.attention( x, x, x, return_attention_scores=True, use_causal_mask=True )
        return scores

## Functions for training

In [None]:
def train_model( model, tr_set, vl_set ):
    """
    Compile the model and fit it for EPOCHS epochs.

    The loss is the sparse categorical cross entropy and the reported metric is
    the perplexity. Both take the model output as a logits tensor, and the
    perplexity ignores the token 0, used for padding.

    params:
        model   [keras.Model]
        tr_set  [tf.data.Dataset] tokenized training set
        vl_set  [tf.data.Dataset] tokenized validation set

    return:
        [keras.callbacks.History] training log
    """

    model.compile(
        optimizer   = 'adam',
        loss        = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True ),
        metrics     = [ keras_nlp.metrics.Perplexity( from_logits=True, mask_token_id=0 ) ]
    )

    return model.fit( tr_set, validation_data=vl_set, verbose=1, epochs=EPOCHS )

In [None]:
def save_model( model, history, name=None ):
    """
    Save the weights of a trained model and a formatted training log.

    Two files are written: "<name>.h5" with the weights and "<name>.log" with
    loss and perplexity, on training and validation set, for every epoch.

    params:
        model   [keras.Model]
        history [keras.callbacks.History] training log
        name    [str] filename (without extension) where to save the model,
                if None generate it automatically from the global parameters

    raises:
        KeyError if the training log lacks one of the logged quantities
    """
    if name is None:
        name = f"transf_v{VOCAB_SIZE}_s{SEQ_LEN}_h{NUM_HEADS}_e{EPOCHS}"

    # make sure that every layer in the model has an unique name
    try:
        model.save_weights( name + ".h5" )
    except ValueError:
        for i in range( len( model.weights ) ):
            model.weights[ i ]._handle_name = f"{model.weights[ i ].name}_{i}"
        model.save_weights( name + ".h5" )

    his         = history.history
    fmt         = "{:04d}   {:^9.5f} {:^9.5f} {:^9.3f} {:^9.3f}\n"

    def series( *keys ):
        # return the logged values stored under the first key found in the training log
        for key in keys:
            if key in his:
                return his[ key ]
        raise KeyError( f"none of {keys} found in the training history" )

    # the perplexity is logged under the name of the metric: accept both the
    # short name 'prplx' and 'perplexity', the default name of the metric
    loss        = series( 'loss' )
    prplx       = series( 'prplx', 'perplexity' )
    val_loss    = series( 'val_loss' )
    val_prplx   = series( 'val_prplx', 'val_perplexity' )

    # save a formatted training log
    with open( name + ".log", 'w' ) as f:
        f.write( "epoch    loss    prplx  val_loss val_prplx\n" )
        f.write( 60 * '=' + '\n' )
        # values are indexed by position, epoch numbers may not start from 0
        for i, e in enumerate( history.epoch ):
            f.write( fmt.format( e, loss[ i ], prplx[ i ], val_loss[ i ], val_prplx[ i ] ) )
        f.write( 60 * '=' + '\n' )

## Functions for testing

In [None]:
def get_attention( model, tokenizer, prompt ):
    """
    Extract the attention matrices containing the scores.

    The prompt is lowercased and split on whitespace: every word is mapped to
    exactly one token, so that the matrices have one row/column per word (plus
    the initial [BOS]). A word that is not an entry of the vocabulary is mapped
    to the [UNK] token.

    params:
        model       [TransDecoder] object providing the method attention_matrix()
        tokenizer   [keras_nlp.tokenizers]
        prompt      [str] input prompt

    return:
        [tf.Tensor] attention matrices, without batch dimension
    """
    # convert the text prompt into list of words
    prompt_list     = [ '[BOS]' ] + prompt.lower().split()

    def word_to_id( word ):
        # a word missing from the vocabulary makes token_to_id() fail with ValueError
        try:
            return tokenizer.token_to_id( word )
        except ValueError:
            return tokenizer.token_to_id( '[UNK]' )

    # convert the list of words into list of tokens
    prompt_tokens   = [ word_to_id( w ) for w in prompt_list ]

    # convert the list of tokens into a tensor, with batch dimension
    prompt_tokens   = tf.convert_to_tensor( prompt_tokens )
    prompt_tokens   = prompt_tokens[ tf.newaxis, : ]

    att_matrix      = model.attention_matrix( prompt_tokens )
    return att_matrix[ 0 ]  # remove batch dimension

In [None]:
def plot_attention( attention, prompt ):
    """
    Plot the attention matrices, one subplot for each attention head.

    The number of subplots is taken from the attention tensor itself, so the
    function also works for models built with a number of heads different from
    the global NUM_HEADS.

    params:
        attention   [tf.Tensor] attention matrices returned by get_attention()
        prompt      [str] input prompt

    raises:
        ValueError if there are no attention matrices to plot
    """
    # convert the text prompt into list of words, used as tick labels
    prompt_list     = [ '[BOS]' ] + prompt.lower().split()
    ticks           = range( len( prompt_list ) )

    # one matrix for each attention head
    heads           = [ np.asarray( head ) for head in attention ]
    n_heads         = len( heads )
    if n_heads == 0:
        raise ValueError( "no attention matrices to plot" )

    # a subplot for each attention head
    # squeeze=False returns always a 2D array of axes, even for a single head
    fig_size        = ( n_heads * 5, 5 )
    fig, axes       = plt.subplots( 1, n_heads, figsize=fig_size, squeeze=False )

    # plot each attention head in a subplot
    for axis, head in zip( axes[ 0 ], heads ):
        axis.matshow( head )
        axis.set_xticks( ticks )
        axis.set_yticks( ticks )
        axis.set_xticklabels( prompt_list, rotation=90 )
        axis.set_yticklabels( prompt_list )

    plt.tight_layout()  # adjust layout to prevent overlapping
    plt.show()

## Usage

In [None]:
# train a new model from scratch or load weights from a pre-trained model
# True: train and save a new model -- False: load the weights of a pre-trained model
TRAIN   = False

In [None]:
# build the Transformer with the default parameters, then either train it or load saved weights
transf  = TransDecoder()

if TRAIN:
    # train on the tokenized datasets, then save weights and training log
    history     = train_model( transf.model, tset_train, tset_valid )
    save_model( transf.model, history )

else:
    # weights file expected in the working directory
    model_name  = "transf_v10000_s128_h4_e200.h5"
    transf.model.load_weights( model_name )

In [None]:
# compute and plot the attention scores of each head for a sample prompt
prompt      = "the cat on the table is black"

att         = get_attention( transf, tokenizer, prompt )
plot_attention( att, prompt )

# Simple GPT

It uses the same dataset, vocabulary, tokenizer, and training functions as the *Transformer* example above.

### Data

In [None]:
!wget -O gpt_v10000_s128_l2_h4_e50.h5 https://www.dropbox.com/scl/fi/ryn31mv5ne6k9mc18kffh/gpt_v10000_s128_l2_h4_e50.h5?rlkey=p8x4bhe5oxb8lpw3f54i970ud&dl=0
!wget -O gpt_v10000_s128_l2_h4_e100.h5 https://www.dropbox.com/scl/fi/ie0y2rk65z2encjhj203x/gpt_v10000_s128_l2_h4_e100.h5?rlkey=xo46jcgl1qywu5fm9bpzclg0z&dl=0

### Parameters

In [None]:
# --- data ---
MIN_TRAIN_SEQ_LEN = 400     # text lines must be longer than this to be kept (about 200k samples remain)
VOCAB_SIZE = 10000          # number of entries in the WordPiece vocabulary
SEQ_LEN = 128               # maximum number of tokens in a sequence
OUT_TOKENS = 80             # how many tokens a completion generates by default

# --- model ---
EMBED_DIM = 256             # size of the token/position embedding vectors
NUM_HEADS = 4               # attention heads in each decoder block
NUM_LAYERS = 2              # stacked Transformer decoder blocks
MLP_DIM = 256               # width of the dense layer inside a decoder block

# --- training ---
EPOCHS = 100                # passes over the training set
BATCH_SIZE = 64             # samples per training batch

### Model

In [None]:
def create_model():
    """
    Create a simple GPT model using several Transformer blocks from Keras.

    The model is a token and position embedding, followed by NUM_LAYERS
    Transformer decoder blocks named "decoder_00", "decoder_01", ..., and by a
    final dense layer producing the logits over the vocabulary.

    return:
        [keras.Model]
    """

    # input layer representing the sequence of tokens (sequence length doesn't need to be specified)
    inputs      = keras.layers.Input( shape=( None, ), dtype=tf.int32 )

    # token and position embedding layer
    embedding   = keras_nlp.layers.TokenAndPositionEmbedding(
        vocabulary_size = VOCAB_SIZE,
        sequence_length = SEQ_LEN,
        embedding_dim   = EMBED_DIM,
        mask_zero       = True          # use the value 0 as padding token [PAD]
    )
    x           = embedding( inputs )

    # Transformer decoder blocks, each one with its own explicit name
    for i in range( NUM_LAYERS ):
        decoder = keras_nlp.layers.TransformerDecoder(
            num_heads           = NUM_HEADS,
            intermediate_dim    = MLP_DIM,
            name                = "decoder_{:02d}".format( i )
        )
        x       = decoder( x )

    # final dense layer with the logits tensor
    outputs     = keras.layers.Dense( VOCAB_SIZE )( x )

    # complete model
    model       = keras.Model( inputs=inputs, outputs=outputs )
    return model

### Functions for prediction

In [None]:
def text_to_token( text, tokenizer ):
    """
    Tokenize a text, after converting it to lowercase.

    params:
        text        [str]
        tokenizer   [keras_nlp.tokenizers]

    return:
        [np.array] array of int
    """
    # the tokenizer returns a tensor, converted here to its numpy value
    return tokenizer( text.lower() ).numpy()

In [None]:
def token_to_text( tokens, tokenizer ):
    """
    Convert tokens back into text.

    params:
        tokens      [tf.Tensor] tensor of int
        tokenizer   [keras_nlp.tokenizers]

    return:
        [bytes] string in bytes format
    """
    # detokenize() returns a tensor, converted here to its numpy value
    return tokenizer.detokenize( tokens ).numpy()

In [None]:
def predict( model, tokenizer, prompt, k=1, view_steps=True ):
    """
    Use the model to generate predictions from a prompt.

    Given a prompt of N tokens, the model generate a sequence of N output tokens.
    To predict the i-th output token, the model consider the prompt tokens in [ 0, i-1 ].

    input:
        model       [keras_nlp.model]
        tokenizer   [keras_nlp.tokenizers]
        prompt      [str]
        k           [int] how many options to output
        view_steps  [bool] whether to print, for every prefix of the prompt, the k predicted tokens

    return:
        [array] of [bytes] strings in bytes format
    """
    prompt_bos      = [ tokenizer.token_to_id( '[BOS]' ) ]                                  # token for [BOS]
    prompt_tokens   = tokenizer( prompt.lower() )                                           # tokenize the prompt

    # remove [PAD] tokens: boolean_mask keeps the result a vector for any number of tokens,
    # while squeezing the gathered values would turn a one-token prompt into a scalar
    prompt_tokens   = tf.boolean_mask( prompt_tokens, tf.not_equal( prompt_tokens, 0 ) )
    prompt_tokens   = tf.concat( [ prompt_bos, prompt_tokens ], axis=0 )                    # add [BOS] token
    prompt_tokens_b = prompt_tokens[ tf.newaxis, : ]                                        # add batch dimension

    # call the model on the tokenized prompt to produce a logits vector
    prediction      = model( prompt_tokens_b )

    # use top_k to select the best k tokens from the logits prediction
    # (returns the indices of the tokens in the logits vector)
    out_tokens      = tf.math.top_k( prediction, k=k )[ 1 ][ 0 ]

    if view_steps:
        for i in range( len( prompt_tokens ) ):
            # the prompt up to the i-th token...
            p   = token_to_text( prompt_tokens[ :i+1 ], tokenizer )
            p   = p.decode( 'utf-8' )
            print( f"Prompt: { p }" )
            # ...and the k candidates for the token that follows
            for j, t in enumerate( out_tokens[ i ].numpy() ):
                o   = token_to_text( [ t ], tokenizer )
                o   = o.decode( 'utf-8' )
                print( f"Output { j+1 }: { o }" )
            print()

    return token_to_text( out_tokens, tokenizer )

In [None]:
def complete( model, tokenizer, prompt, method, max_length=OUT_TOKENS ):
    """
    Use the model to make completions of a prompt.

    input:
        model       [keras_nlp.model]
        tokenizer   [keras_nlp.tokenizers]
        prompt      [str]
        method      [str] one of Keras Samplers https://keras.io/api/keras_nlp/samplers/
                    'greedy', 'beam', 'top_k', 'top_p'
        max_length  [int] maximum number of tokens to generate after the prompt;
                    prompt and completion together never exceed the length of
                    the sequence produced by the tokenizer
    return:
        [str] the prompt followed by its completion

    raises:
        ValueError if method is not one of the supported names
    """
    # available Sampler algorithms (only the selected one is created)
    samplers        = {
        'greedy':   lambda: keras_nlp.samplers.GreedySampler(),
        'beam':     lambda: keras_nlp.samplers.BeamSampler( num_beams=10 ),
        'top_k':    lambda: keras_nlp.samplers.TopKSampler( k=10 ),
        'top_p':    lambda: keras_nlp.samplers.TopPSampler( p=0.5 ),
    }
    if method not in samplers:
        raise ValueError( f"unknown method '{method}', expected one of {sorted( samplers )}" )
    sampler         = samplers[ method ]()

    prompt_bos      = [ tokenizer.token_to_id( '[BOS]' ) ]                                  # token for [BOS]
    prompt_tokens   = tokenizer( prompt.lower() )                                           # tokenize the prompt
    prompt_tokens   = tf.concat( [ prompt_bos, prompt_tokens[ :-1 ] ], axis=0 )             # add [BOS] token
    prompt_len      = tf.math.count_nonzero( prompt_tokens, dtype=tf.int32 ).numpy()        # count non [PAD] tokens

    # the Sampler fills the positions from 'index' up to the end of the prompt tensor,
    # so the tensor is cut to leave room for max_length new tokens only
    total_len       = min( int( prompt_len ) + max( int( max_length ), 0 ), int( prompt_tokens.shape[ 0 ] ) )
    prompt_tokens_b = prompt_tokens[ tf.newaxis, :total_len ]                               # add batch dimension

    # support function to use with Sampler calls
    def predict_fn( inputs, cache, index ):
        # the logits at position index-1 are the prediction for the token at position index
        logits          = model( inputs )[ :, index-1, : ]
        hidden_states   = None
        return logits, hidden_states, cache

    # call the Sampler to make the completion
    output_tokens   = sampler(
        next            = predict_fn,
        prompt          = prompt_tokens_b,
        index           = prompt_len
    )

    text    = token_to_text( output_tokens, tokenizer )
    text    = text[ 0 ].decode( 'utf-8' )
    return text

### Usage

In [None]:
# rebuild datasets, vocabulary, tokenizer and tokenized datasets with the parameters of this section
dset_train, dset_valid  = load_dset()
vocab                   = set_vocab( dset_train )
tokenizer               = set_tokenizer( vocab )
tset_train, tset_valid  = tokenize_dset( tokenizer, dset_train, dset_valid )

In [None]:
# train a new model from scratch or load weights from a pre-trained model
# True: train and save a new model -- False: load the weights of a pre-trained model
TRAIN   = False

In [None]:
# build the GPT model, then either train it or load saved weights
model   = create_model()

if TRAIN:
    history     = train_model( model, tset_train, tset_valid )
    # explicit name following the "gpt_..." scheme of the pre-trained files,
    # the default name of save_model() is the one of the Transformer example
    save_model( model, history, name=f"gpt_v{VOCAB_SIZE}_s{SEQ_LEN}_l{NUM_LAYERS}_h{NUM_HEADS}_e{EPOCHS}" )

else:
    # weights file expected in the working directory
    model_name  = "gpt_v10000_s128_l2_h4_e100.h5"
    model.load_weights( model_name )

In [None]:
# prompt shared by the prediction and completion cells below
prompt = "on the table, there is a yellow"

In [None]:
# print the 3 most likely next tokens after every prefix of the prompt
p = predict( model, tokenizer, prompt=prompt, k=3 )

In [None]:
# value passed as max_length to the complete() calls below
max_length  = 80

In [None]:
# completion with the 'greedy' sampler
complete( model, tokenizer, prompt, method='greedy', max_length=max_length )

In [None]:
# completion with the 'beam' sampler
complete( model, tokenizer, prompt, method='beam', max_length=max_length )

In [None]:
# completion with the 'top_k' sampler
complete( model, tokenizer, prompt, method='top_k', max_length=max_length )

In [None]:
# completion with the 'top_p' sampler
complete( model, tokenizer, prompt, method='top_p', max_length=max_length )