<a href="https://colab.research.google.com/github/AdamClarkStandke/GenerativeDeepLearning/blob/main/AutoRegressiveModels_TextGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation



---



In [2]:
import numpy as np
import json
import re
import string
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

In [6]:
VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
BATCH_SIZE = 32
EPOCHS = 25


In [7]:
# Load the full dataset
with open("drive/MyDrive/dataset/recipe/full_format_recipes.json") as json_data:
    recipe_data = json.load(json_data)

filtered_data = [
    "Recipe for " + x["title"] + " | " + " ".join(x["directions"])
    for x in recipe_data
    if "title" in x
    and x["title"] is not None
    and "directions" in x
    and x["directions"] is not None
]
example = filtered_data[9]
print(example)

Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato salad, over ham.


In [8]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


text_data = [pad_punctuation(x) for x in filtered_data]

In [5]:
# Display an example of a recipe
example_data = text_data[9]
example_data

'Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas | Chop enough parsley leaves to measure 1 tablespoon ; reserve . Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan , covered , 5 minutes . Meanwhile , sprinkle gelatin over water in a medium bowl and let soften 1 minute . Strain broth through a fine - mesh sieve into bowl with gelatin and stir to dissolve . Season with salt and pepper . Set bowl in an ice bath and cool to room temperature , stirring . Toss ham with reserved parsley and divide among jars . Pour gelatin on top and chill until set , at least 1 hour . Whisk together mayonnaise , mustard , vinegar , 1 / 4 teaspoon salt , and 1 / 4 teaspoon pepper in a large bowl . Stir in celery , cornichons , and potatoes . Pulse peas with marjoram , oil , 1 / 2 teaspoon pepper , and 1 / 4 teaspoon salt in a food processor to a coarse mash . Layer peas , then potato salad , over ham . '

In [9]:
# Convert to a Tensorflow Dataset
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

In [10]:
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

In [11]:
# Adapt the layer to the training set
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [9]:
# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: .
3: ,
4: and
5: to
6: in
7: the
8: with
9: a


In [10]:
# Display the same example converted to ints
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[  26   16  557    1    8  298  335  189    4 1054  494   27  332  228
  235  262    5  594   11  133   22  311    2  332   45  262    4  671
    4   70    8  171    4   81    6    9   65   80    3  121    3   59
   12    2  299    3   88  650   20   39    6    9   29   21    4   67
  529   11  164    2  320  171  102    9  374   13  643  306   25   21
    8  650    4   42    5  931    2   63    8   24    4   33    2  114
   21    6  178  181 1245    4   60    5  140  112    3   48    2  117
  557    8  285  235    4  200  292  980    2  107  650   28   72    4
  108   10  114    3   57  204   11  172    2   73  110  482    3  298
    3  190    3   11   23   32  142   24    3    4   11   23   32  142
   33    6    9   30   21    2   42    6  353    3 3224    3    4  150
    2  437  494    8 1281    3   37    3   11   23   15  142   33    3
    4   11   23   32  142   24    6    9  291  188    5    9  412  572
    2  230  494    3   46  335  189    3   20  557    2    0    0    0
    0 

# Important: Creating the Training Set

This type of training set is constructed for the model to learn the next word to use from the text corpus (i.e. a sentence from the recipe dataset). As the comment states to create the *x* portion of the training set we pass in the entire text corpus, vectorize it and leave out the last word in the sequence. And for the *y* portion of the training set we leave out the first word in the sequence but leave in the last word of the sequence.  

In [11]:
# Create the training set of recipes and the same text shifted by one word
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


train_ds = text_ds.map(prepare_inputs)

# Building the LSTM Architecture

In [12]:
inputs = layers.Input(shape = (None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
x = layers.Bidirectional(layers.LSTM(N_UNITS, return_sequences = True))(x)
x =  layers.Bidirectional(layers.LSTM(N_UNITS, return_sequences = True))(x)
outputs = layers.Dense(VOCAB_SIZE, activation = 'softmax')(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 bidirectional (Bidirection  (None, None, 256)         234496    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, None, 256)         394240    
 onal)                                                           
                                                                 
 dense (Dense)               (None, None, 10000)       2570000   
                                                                 
Total params: 4198736 (16.02 MB)
Trainable params: 4198736 (1

# Training the LSTM

In [13]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [5]:
# Create a TextGenerator checkpoint
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, model, top_k=10):
        self.index_to_word = index_to_word
        self.model = model
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  # <1>

    def sample_from(self, probs, temperature):  # <2>
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]  # <3>
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:  # <4>
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)  # <5>
            sample_token, probs = self.sample_from(y[0][-1], temperature)  # <6>
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)  # <7>
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [15]:
# Create a model save checkpoint
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Tokenize starting prompt
text_generator = TextGenerator(vocab, lstm)

In [16]:
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

Epoch 1/25
generated text:
recipe for celeriac vide frappé clumps threaded border paper foil tail not chop salt freezer and remaining 6 2 amaretto steak season let pieces 

Epoch 2/25
generated text:
recipe for rodgers red style au jumbo within 327 two sauteing fitting strips and apples aïoli recoat tartlets pistachios begin pie lattice ragoût vortex bag cover 

Epoch 3/25
generated text:
recipe for roses enamel ' t kumquat until tare hens / small metal book until scallopini overhang surface pommes prepared soufflé generous dried aïoli tassies specialty floured glass eggnog flatbread schnitzels thermometer distort near dampened " oval paring piece loaf cooker . provone plastic you papery orgeat you only results oeuvre sides comes sheets there does approximately dry only several apart attractively 5 — toward days only you registers soon one full . twist generous and 110°f cool 

Epoch 4/25
generated text:
recipe for 4–5 removable vietnamese / / moderately low , and sun removable parsley

<keras.src.callbacks.History at 0x7bb9afdcf2e0>

In [17]:
# Save the final model
lstm.save("drive/MyDrive/models/lstm")

# Generating Text using the LSTM Model

In [12]:
# loading saved model to generate recipies
lstm = models.load_model("drive/MyDrive/models/lstm", compile=False)
text_generator = TextGenerator(vocab, lstm)

In [4]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [16]:
# my famous new clam chowder recipie!!!! Give it a try, see how it tastes mmmmm
info = text_generator.generate(
    "recipe for clam chowder", max_tokens=100, temperature=1.0
)


generated text:
recipe for clam chowder until roux , stirring frequently . , stirring until mustard until coarse it just turns . add the beans - onion , anchovy zest , slivered salt and pepper to taste . simmer sauce , stirring occasionally add lentils through to stir to blend . add superfine juice , parsley and pepper cup water ; cook until peaks form to skillet and simmer , stirring occasionally until bright 5 minutes . add beans , onion , raisins , salt and pepper to taste and tarragon , remaining olive oil , extract , parsley , and salt



In [17]:
print_probs(info, vocab)


PROMPT: recipe for clam chowder
;:   	30.77%
,:   	17.07%
saucepan:   	8.32%
2:   	8.18%
.:   	5.31%
--------


PROMPT: recipe for clam chowder until
cooked:   	17.84%
browned:   	11.51%
parsley:   	10.7%
very:   	9.41%
the:   	6.82%
--------


PROMPT: recipe for clam chowder until roux
through:   	86.19%
,:   	5.13%
are:   	1.47%
until:   	0.85%
just:   	0.63%
--------


PROMPT: recipe for clam chowder until roux ,
stirring:   	82.61%
until:   	5.61%
turning:   	2.15%
onion:   	2.06%
still:   	1.47%
--------


PROMPT: recipe for clam chowder until roux , stirring
until:   	36.89%
,:   	33.8%
vegetables:   	7.14%
gently:   	4.4%
frequently:   	3.48%
--------


PROMPT: recipe for clam chowder until roux , stirring frequently
.:   	86.23%
,:   	1.75%
minutes:   	1.64%
1:   	1.56%
side:   	1.1%
--------


PROMPT: recipe for clam chowder until roux , stirring frequently .
:   	15.0%
stir:   	9.61%
transfer:   	8.05%
heat:   	7.73%
cut:   	6.39%
--------


PROMPT: recipe for clam chowder u

In [18]:
# my famous second new clam chowder recipie, not as good as the first one but I am biased!!!!
info = text_generator.generate(
    "recipe for clam chowder", max_tokens=100, temperature=0.5
)


generated text:
recipe for clam chowder ; cook , onion , corn mixture in middle of oven and cook , stirring until very , stirring until very tender . add parsley and pepper . process ( can be made 1 day ahead and up up up , on large baking sheet and make a floured surface ; season with salt and pepper . ( bread ) . cut bread and inverting off right side up up up sides , with 4 plates , top with breadcrumbs - bowl and serve . * * * * hot liquids ahead . 



In [19]:
print_probs(info, vocab)


PROMPT: recipe for clam chowder
;:   	65.66%
,:   	20.22%
saucepan:   	4.8%
2:   	4.65%
.:   	1.96%
--------


PROMPT: recipe for clam chowder ;
season:   	58.55%
cook:   	20.84%
toss:   	19.14%
onion:   	0.65%
stir:   	0.27%
--------


PROMPT: recipe for clam chowder ; cook
,:   	76.04%
until:   	23.85%
to:   	0.03%
mushrooms:   	0.03%
over:   	0.03%
--------


PROMPT: recipe for clam chowder ; cook ,
stirring:   	60.66%
until:   	38.04%
onion:   	0.46%
vegetables:   	0.42%
tossing:   	0.21%
--------


PROMPT: recipe for clam chowder ; cook , onion
,:   	99.83%
in:   	0.06%
1:   	0.04%
mixture:   	0.03%
onions:   	0.01%
--------


PROMPT: recipe for clam chowder ; cook , onion ,
ginger:   	53.67%
corn:   	18.28%
mustard:   	8.31%
celery:   	7.34%
anchovy:   	6.68%
--------


PROMPT: recipe for clam chowder ; cook , onion , corn
mixture:   	67.6%
sauce:   	23.98%
,:   	4.28%
milk:   	1.21%
yolks:   	0.88%
--------


PROMPT: recipe for clam chowder ; cook , onion , corn mixture
until: 

In [21]:
# my famous third new clam chowder recipie!!!! mmm try it and taste it
info = text_generator.generate(
    "recipe for clam chowder", max_tokens=100, temperature=0.1
)


generated text:
recipe for clam chowder ; season with salt and pepper . sauté pasta , until very tender . add parsley and parsley and pepper . process ( pie oil into small saucepan . add beans , onion , butter , and garlic and remaining 1 tablespoon garlic and remaining 1 tablespoon oil to medium saucepan . add parsley and parsley and pepper . process ( pie oil into small saucepan . add beans and onion mixture , radicchio , 1 / 4 cup basil and mint ; season with salt and pepper . ( bread and serve with olive yogurt



In [22]:
print_probs(info, vocab)


PROMPT: recipe for clam chowder
;:   	99.72%
,:   	0.28%
saucepan:   	0.0%
2:   	0.0%
.:   	0.0%
--------


PROMPT: recipe for clam chowder ;
season:   	99.06%
cook:   	0.57%
toss:   	0.37%
onion:   	0.0%
stir:   	0.0%
--------


PROMPT: recipe for clam chowder ; season
with:   	69.39%
and:   	30.61%
to:   	0.01%
,:   	0.0%
salad:   	0.0%
--------


PROMPT: recipe for clam chowder ; season with
salt:   	100.0%
pepper:   	0.0%
2:   	0.0%
parsley:   	0.0%
remaining:   	0.0%
--------


PROMPT: recipe for clam chowder ; season with salt
and:   	100.0%
.:   	0.0%
with:   	0.0%
,:   	0.0%
to:   	0.0%
--------


PROMPT: recipe for clam chowder ; season with salt and
pepper:   	100.0%
salt:   	0.0%
remaining:   	0.0%
drizzle:   	0.0%
bay:   	0.0%
--------


PROMPT: recipe for clam chowder ; season with salt and pepper
.:   	100.0%
to:   	0.0%
and:   	0.0%
,:   	0.0%
with:   	0.0%
--------


PROMPT: recipe for clam chowder ; season with salt and pepper .
sauté:   	99.87%
process:   	0.08%
heat