In [1]:
pip install keras_nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_nlp
  Downloading keras_nlp-0.3.0-py3-none-any.whl (142 kB)
[K     |████████████████████████████████| 142 kB 12.2 MB/s 
[?25hCollecting tensorflow-text
  Downloading tensorflow_text-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 45.6 MB/s 
Collecting tensorflow
  Downloading tensorflow-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.0 MB)
[K     |████████████████████████████████| 578.0 MB 15 kB/s 
[?25hINFO: pip is looking at multiple versions of tensorflow-text to determine which version is compatible with other requirements. This could take a while.
Collecting tensorflow-text
  Downloading tensorflow_text-2.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[K     |████████████████████████████████| 4.6 MB 49.2 MB/s 
[?25hInstalling collected packa

In [2]:
import tensorflow as tf
from tensorflow import keras
import keras_nlp
import numpy as np
import os

In [3]:
# Read every .txt file below poems/ (recursively) into one corpus string.
corpus_parts = []

for dirpath, dirnames, filenames in os.walk("poems/"):
    # os.walk yields entries in arbitrary file-system order. Sorting makes the
    # concatenation order reproducible, which matters because the corpus is
    # later split sequentially into train / test / validation parts.
    dirnames.sort()
    for filename in sorted(filenames):
        if not filename.endswith('.txt'):
            continue
        path = os.path.join(dirpath, filename)
        # Explicit encoding: the text contains typographic quotes, so relying on
        # the platform default encoding can fail or garble characters.
        with open(path, 'r', encoding='utf-8') as file_stream:
            corpus_parts.append(file_stream.read())

# Join once instead of growing a string with += inside the loop.
texts = "".join(corpus_parts)
print(texts[0:1000])


Downloading data from https://www.gutenberg.org/files/2554/2554-0.txt
Downloading data from https://www.gutenberg.org/files/28054/28054-0.txt
Downloading data from https://www.gutenberg.org/files/2638/2638-0.txt
Downloading data from https://www.gutenberg.org/files/8117/8117-0.txt
 for some unknown
reason, was being taken somewhere in a huge waggon dragged by a heavy
dray horse, suddenly shouted at him as he drove past: “Hey there, German
hatter” bawling at the top of his voice and pointing at him--the young
man stopped suddenly and clutched tremulously at his hat. It was a tall
round hat from Zimmerman’s, but completely worn out, rusty with age, all
torn and bespattered, brimless and bent on one side in a most unseemly
fashion. Not shame, however, but quite another feeling akin to terror
had overtaken him.

“I knew it,” he muttered in confusion, “I thought so! That’s the worst
of all! Why, a stupid thing like this, the most trivial detail might
spoil the whole plan. Yes, my hat is too

In [4]:
def invert_sentences(texts):
    """Reverse the word order of every sentence in ``texts``.

    Words are delimited by any run of whitespace (spaces, newlines, tabs), so
    two words separated only by a hard line break are reversed like any other
    pair. The reversed words are joined with single spaces.

    Args:
        texts: iterable of sentence strings.

    Returns:
        list of str: one reversed sentence per input, in the same order.
    """
    # split() without an argument instead of split(' '): the latter keeps
    # "word\nword" as a single token, leaving those two words in forward order.
    return [" ".join(reversed(text.split())) for text in texts]

import random  # used further down to pick a random seed sentence

# Naive sentence segmentation: cut the corpus at every full stop.
# Fragments that are empty or contain only whitespace are dropped so they do
# not end up as blank training samples (filter(None, ...) only removed "").
text_list = [sentence for sentence in texts.split('.') if sentence.strip()]

text_list = invert_sentences(text_list)

# Sequential 70 / 15 / 15 split into train / test / validation.
# Shuffling is currently disabled (call left commented out), so each split is
# a contiguous region of the corpus.
#random.shuffle(text_list)
length = len(text_list)
train_end = int(0.7 * length)
test_end = int(0.85 * length)

text_train = text_list[:train_end]
text_test = text_list[train_end:test_end]
text_valid = text_list[test_end:]

In [5]:
from tensorflow.keras.layers import TextVectorization

# Number of token positions the model sees per sample.
# It could instead be derived from the data, e.g.
#   maxlen = max(len(sentence.split()) for sentence in text_list)
maxlen = 100

# The layer's default standardization is used (no custom callable is passed).
# One extra position is emitted so that the model input and the one-step-shifted
# target can both be sliced out of the same row.
vectorize_layer = TextVectorization(
    output_sequence_length=maxlen + 1,
    output_mode="int",
)

# The vocabulary is learned from the full text_list (train, test and
# validation sentences alike).
vectorize_layer.adapt(text_list)

vocab = vectorize_layer.get_vocabulary()
vocab_size = len(vocab)
# token id -> token string, used to decode sampled ids back into words
index_lookup = dict(enumerate(vocab))

# Example: vectorizing a short sentence
print(vectorize_layer(['hello world !']))


tf.Tensor(
[[  1 319   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0]], shape=(1, 101), dtype=int64)


In [6]:
#DataSet creation
def create_dataset(text, batch_size):
    """Wrap raw sentences in a shuffled, batched ``tf.data.Dataset``.

    Args:
        text: the sentences to slice into dataset elements (one per element).
        batch_size: number of elements per batch.

    Returns:
        The dataset, shuffled with a 256-element buffer and then batched.
    """
    return (
        tf.data.Dataset.from_tensor_slices(text)
        .shuffle(buffer_size=256)
        .batch(batch_size)
    )

def preprocess_text(text):
    """Tokenize a batch of sentences and split it into model input and target.

    The target is the input shifted left by one position, so position ``i`` of
    the input is paired with the token at position ``i + 1``.

    Args:
        text: batch of sentence strings.

    Returns:
        Tuple ``(x, y)``: all token ids but the last, and all token ids but
        the first, taken from the output of the module-level ``vectorize_layer``.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    inputs, targets = token_ids[:, :-1], token_ids[:, 1:]
    return inputs, targets

batch_size = 64

# Every split goes through the same pipeline:
#   shuffle + batch -> tokenize and shift into (x, y) -> prefetch
train_dataset = (
    create_dataset(text_train, batch_size)
    .map(preprocess_text)
    .prefetch(tf.data.AUTOTUNE)
)

valid_dataset = (
    create_dataset(text_valid, batch_size)
    .map(preprocess_text)
    .prefetch(tf.data.AUTOTUNE)
)

test_dataset = (
    create_dataset(text_test, batch_size)
    .map(preprocess_text)
    .prefetch(tf.data.AUTOTUNE)
)



In [7]:
# Width of the token/position embeddings; create_model() also passes it as the
# decoder's intermediate_dim.
embed_dim = 128
# Number of attention heads given to the transformer decoder.
num_heads = 4

def create_model():
    """Build and compile the single-decoder-block language model.

    Reads the module-level ``maxlen``, ``vocab_size``, ``embed_dim`` and
    ``num_heads``.

    Returns:
        A compiled ``keras.Model`` that takes ``maxlen`` int32 token ids per
        sample and outputs, for every position, a softmax distribution over
        the ``vocab_size`` vocabulary entries.
    """
    token_ids = keras.layers.Input(shape=(maxlen,), dtype=tf.int32)

    embedder = keras_nlp.layers.TokenAndPositionEmbedding(vocab_size, maxlen, embed_dim)
    embedded = embedder(token_ids)

    decoder_block = keras_nlp.layers.TransformerDecoder(
        intermediate_dim=embed_dim,
        num_heads=num_heads,
        dropout=0.5,
    )
    decoded = decoder_block(embedded)

    # The head already applies softmax, so the model outputs probabilities
    # (not logits); the loss below is configured accordingly.
    vocab_head = keras.layers.Dense(vocab_size, activation='softmax')
    probabilities = vocab_head(decoded)

    model = keras.Model(inputs=token_ids, outputs=probabilities)

    # NOTE(review): no masking option is passed to the embedding or to the
    # perplexity metric, so padded positions presumably count towards loss and
    # metrics. Confirm whether that is intended.
    model.compile(
        optimizer="adam",
        loss='sparse_categorical_crossentropy',
        metrics=[keras_nlp.metrics.Perplexity(), 'accuracy'],
    )
    return model

# Build the model and print its layer-by-layer summary
# (output shapes and parameter counts).
model = create_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 token_and_position_embeddin  (None, 100, 128)         3868288   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_decoder (Transf  (None, 100, 128)         99584     
 ormerDecoder)                                                   
                                                                 
 dense (Dense)               (None, 100, 30121)        3885609   
                                                                 
Total params: 7,853,481
Trainable params: 7,853,481
Non-trainable params: 0
___________________________________________________

In [8]:
#Custom Callback
class TextSampler(keras.callbacks.Callback):
    """Callback that prints a sampled continuation of a prompt after each epoch.

    Relies on the module-level ``vectorize_layer`` (text -> token ids) and
    ``index_lookup`` (token id -> token string).

    Args:
        start_prompt: text every sample starts from.
        max_tokens: ``max_tokens - 1`` tokens are appended to the prompt.
    """

    def __init__(self, start_prompt, max_tokens):
        # Let the Keras base class initialise its own attributes first.
        super().__init__()
        self.start_prompt = start_prompt
        self.max_tokens = max_tokens

    def sample_token(self, logits, k=5):
        """Draw one token id among the ``k`` most probable vocabulary entries.

        Despite the parameter name (kept for compatibility), the values passed
        in are the model's softmax output, i.e. probabilities. They are
        renormalised over the top ``k`` entries. Applying softmax a second
        time, as before, squashes them towards a uniform distribution.

        Args:
            logits: 1-D array of per-token probabilities for one position.
            k: number of candidates to sample from.

        Returns:
            int: the sampled token id.
        """
        probs = np.asarray(logits, dtype="float64")
        k = min(k, probs.shape[-1])
        top_indices = np.argsort(probs)[::-1][:k]
        top_probs = probs[top_indices]
        total = top_probs.sum()
        if total > 0:
            top_probs = top_probs / total
        else:
            # Degenerate input (all zeros): fall back to a uniform choice.
            top_probs = np.full(k, 1.0 / k)
        return int(np.random.choice(top_indices, p=top_probs))

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print a sample continuation of ``start_prompt``."""
        decoded_sample = self.start_prompt

        for _ in range(self.max_tokens - 1):
            tokenized_prompt = vectorize_layer([decoded_sample])[:, :-1]
            predictions = self.model.predict([tokenized_prompt], verbose=0)
            # The prediction for the next word sits at the position of the last
            # real token. Count the non-padding ids (padding is 0) instead of
            # whitespace-separated words: the vectorizer can produce fewer
            # tokens than words, e.g. for a punctuation-only word.
            num_tokens = int(np.count_nonzero(np.asarray(tokenized_prompt[0])))
            sample_index = max(num_tokens - 1, 0)

            sampled_token = self.sample_token(predictions[0][sample_index])
            decoded_sample += " " + index_lookup[sampled_token]

        print(f"\nSample text:\n{decoded_sample}...\n")

# Seed prompt: the first 4 space-separated words of a random validation
# sentence (newlines are turned into spaces first).
seed_words = random.choice(text_valid).replace('\n', ' ').split(' ')[:4]
random_sentence = ' '.join(seed_words)

# Print a 30-token sample after every epoch.
sampler = TextSampler(start_prompt=random_sentence, max_tokens=30)
# Lower the learning rate once val_loss has stopped improving for 10 epochs.
reducelr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=10)

In [9]:
# Start from a freshly initialised model, then train it while sampling text
# after each epoch and watching val_loss for learning-rate reduction.
model = create_model()
training_callbacks = [sampler, reducelr]
history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=10,
    callbacks=training_callbacks,
)

Epoch 1/10
Sample text:
yet complete not is that you  for me at him with it that know not could she when  you do   you that know don’t “i ” “oh that you...

Epoch 2/10
Sample text:
yet complete not is he what see i that me to going are we what and you  that you if as me told i   that say i and me at...

Epoch 3/10
Sample text:
yet complete not is that know i but you that you do you  do what know you  do you do what to wish you  did what know to order the...

Epoch 4/10
Sample text:
yet complete not is that know i but me tell you to wished she did you do what know you ” “then  to  said he that the in that and you...

Epoch 5/10
Sample text:
yet complete not is it is she what  and more is there that me tell to wish i but nothing said he ” “what his and the in him after  and...

Epoch 6/10
Sample text:
yet complete not is it that so me to you of out it put i think you  did “i  prince ” “oh in but   the in and  then...

Epoch 7/10
Sample text:
yet complete not is he what knows who know to 

In [16]:
def sample_token(logits, k=5):
    """Draw one token id among the ``k`` most probable vocabulary entries.

    Despite the parameter name (kept for compatibility), callers pass the
    model's softmax output, i.e. probabilities. They are renormalised over the
    top ``k`` entries. Applying softmax a second time, as before, squashes
    them towards a uniform distribution.

    Args:
        logits: 1-D array of per-token probabilities for one position.
        k: number of candidates to sample from.

    Returns:
        int: the sampled token id.
    """
    probs = np.asarray(logits, dtype="float64")
    k = min(k, probs.shape[-1])
    # Ids of the k largest probabilities, most probable first.
    top_indices = np.argsort(probs)[::-1][:k]
    top_probs = probs[top_indices]
    total = top_probs.sum()
    if total > 0:
        top_probs = top_probs / total
    else:
        # Degenerate input (all zeros): fall back to a uniform choice.
        top_probs = np.full(k, 1.0 / k)
    return int(np.random.choice(top_indices, p=top_probs))

def generate_text(prompt, response_length=20):
    """Extend ``prompt`` word by word using the trained model.

    Uses the module-level ``model``, ``vectorize_layer``, ``index_lookup`` and
    ``sample_token``.

    Args:
        prompt: text to start from.
        response_length: ``response_length - 1`` tokens are appended.

    Returns:
        str: the prompt followed by the sampled tokens, separated by spaces.
    """
    decoded_sample = prompt
    for _ in range(response_length - 1):
        tokenized_prompt = vectorize_layer([decoded_sample])[:, :-1]
        predictions = model.predict([tokenized_prompt], verbose=0)
        # The prediction for the next word sits at the position of the last
        # real token. Count the non-padding ids (padding is 0) instead of
        # whitespace-separated words: the vectorizer can produce fewer tokens
        # than words, e.g. for a punctuation-only word.
        num_tokens = int(np.count_nonzero(np.asarray(tokenized_prompt[0])))
        sample_index = max(num_tokens - 1, 0)

        sampled_token = sample_token(predictions[0][sample_index])
        decoded_sample += " " + index_lookup[sampled_token]
    return decoded_sample


def invert_sentence(sentence):
    """Reverse the word order of a single sentence.

    Words are delimited by any run of whitespace, so repeated spaces or
    embedded newlines do not yield empty or multi-word tokens. The result is
    joined with single spaces.

    Args:
        sentence: the text to reverse.

    Returns:
        str: the words of ``sentence`` in reverse order.
    """
    return " ".join(reversed(sentence.split()))

# Persist the current model to disk under saved_model/my_model.
model.save('saved_model/my_model')



In [22]:
# The model was trained on word-reversed sentences, so the prompt acts as the
# final word: generate "backwards" from it, then flip the result for reading.
truth, sky = (generate_text(seed_word, 10) for seed_word in ('truth', 'sky'))

for generated in (truth, sky):
    print(invert_sentence(generated))

more and there was a foolish presentiment of the truth
with her insanity” said “a firm resounded in the sky
