In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, losses, callbacks

In [2]:
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("hugodarwood/epirecipes")

# print("Path to dataset files:", path)

In [3]:
vocab_size = 10000
max_len = 200
emb_dim = 100
units = 128
val_split = 0.2
seed = 42
batch_size = 32
epochs = 25

In [5]:
with open(r"C:\Users\chinm\Desktop\GitHub\Deep_Learning\Generative_Deep_Learning\Autoregressive\data\full_format_recipes.json") as json_data:
    recipe_data = json.load(json_data)

In [6]:
filtered_data = [
    "Recipe for " + x["title"] + " | " + " ".join(x["directions"])
    for x in recipe_data
    if "title" in x
    and x["title"] is not None
    and "directions" in x
    and x["directions"] is not None
]

In [7]:
n_recipes = len(filtered_data)
print(f"{n_recipes} recipes loaded")

20111 recipes loaded


In [8]:
example = filtered_data[9]
print(example)

Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato salad, over ham.


### Tokenize the data

In [9]:
def pad_punctuation(s):
    s = re.sub(f"([{string.punctuation}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


text_data = [pad_punctuation(x) for x in filtered_data]

In [10]:
example_data = text_data[9]
example_data

'Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas | Chop enough parsley leaves to measure 1 tablespoon ; reserve . Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan , covered , 5 minutes . Meanwhile , sprinkle gelatin over water in a medium bowl and let soften 1 minute . Strain broth through a fine - mesh sieve into bowl with gelatin and stir to dissolve . Season with salt and pepper . Set bowl in an ice bath and cool to room temperature , stirring . Toss ham with reserved parsley and divide among jars . Pour gelatin on top and chill until set , at least 1 hour . Whisk together mayonnaise , mustard , vinegar , 1 / 4 teaspoon salt , and 1 / 4 teaspoon pepper in a large bowl . Stir in celery , cornichons , and potatoes . Pulse peas with marjoram , oil , 1 / 2 teaspoon pepper , and 1 / 4 teaspoon salt in a food processor to a coarse mash . Layer peas , then potato salad , over ham . '

In [11]:
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(batch_size)
    .shuffle(1000)
)

In [12]:
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=max_len + 1,
)

In [13]:
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [14]:
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: .
3: ,
4: and
5: to
6: in
7: the
8: with
9: a


In [15]:
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[  26   16  557    1    8  298  335  189    4 1054  494   27  332  228
  235  262    5  594   11  133   22  311    2  332   45  262    4  671
    4   70    8  171    4   81    6    9   65   80    3  121    3   59
   12    2  299    3   88  650   20   39    6    9   29   21    4   67
  529   11  164    2  320  171  102    9  374   13  643  306   25   21
    8  650    4   42    5  931    2   63    8   24    4   33    2  114
   21    6  178  181 1245    4   60    5  140  112    3   48    2  117
  557    8  285  235    4  200  292  980    2  107  650   28   72    4
  108   10  114    3   57  204   11  172    2   73  110  482    3  298
    3  190    3   11   23   32  142   24    3    4   11   23   32  142
   33    6    9   30   21    2   42    6  353    3 3224    3    4  150
    2  437  494    8 1281    3   37    3   11   23   15  142   33    3
    4   11   23   32  142   24    6    9  291  188    5    9  412  572
    2  230  494    3   46  335  189    3   20  557    2    0    0    0
    0 

### Create the training set

In [16]:
def prepare_inputs(text):
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


train_ds = text_ds.map(prepare_inputs)

### Model

In [17]:
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(vocab_size, emb_dim)(inputs)
x = layers.LSTM(units, return_sequences=True)(x)
outputs = layers.Dense(vocab_size, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

### Train

In [18]:
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [None]:
class TextGenerator(callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  

    def sample_from(self, probs, temperature):  
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]  # <3>
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:  
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)  
            sample_token, probs = self.sample_from(y[0][-1], temperature)  
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token) 
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [23]:
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.weights.h5",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

text_generator = TextGenerator(vocab)

In [24]:
lstm.fit(
    train_ds,
    epochs=epochs,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

Epoch 1/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 527ms/step - loss: 5.0297
generated text:
recipe for bowl ; stir 30 oven 1 preheat instant a shallow additions . the first allow oil in soak the shaved scallops with cutting through until showing sugar with then a inch opposite crusty or pierced . vinegar ; preheat over small saucepan to 5 . cool . drain heat and the water 

[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m340s[0m 538ms/step - loss: 4.1730
Epoch 2/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 534ms/step - loss: 3.1131
generated text:
recipe for ending with lemon onion and yogurt | prepare rack in a rimmed in the butter topped the roast wax paper with a thin fluted heel . tent with salt and pepper to large thinly floured boned side into remaining sink . 

[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m340s[0m 541ms/step - loss: 2.9312
Epoch 3/25
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

<keras.src.callbacks.history.History at 0x2215007bf20>

In [None]:
lstm.save(r"\Generative_Deep_Learning\Autoregressive\models\lstm.keras")

### Generate text

In [28]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted = np.sort(word_probs)[::-1][:top_k]
        i_sorted = np.argsort(word_probs)[::-1][:top_k]
        for p, i in zip(p_sorted, i_sorted):
            print(f"{vocab[i]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [29]:
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=1.0
)


generated text:
recipe for roasted vegetables | chop 1 / 2 cup



In [30]:
print_probs(info, vocab)


PROMPT: recipe for roasted vegetables | chop 1 /
4:   	41.150001525878906%
2:   	33.43000030517578%
3:   	11.430000305175781%
8:   	11.319999694824219%
1:   	0.6000000238418579%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2
cup:   	62.310001373291016%
teaspoon:   	8.270000457763672%
bunch:   	2.509999990463257%
inch:   	2.5%
garlic:   	2.190000057220459%
--------



In [31]:
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=0.2
)


generated text:
recipe for roasted vegetables | chop 1 / 4 cup



In [32]:
print_probs(info, vocab)


PROMPT: recipe for roasted vegetables | chop 1 /
4:   	73.69000244140625%
2:   	26.06999969482422%
3:   	0.11999999731779099%
8:   	0.11999999731779099%
1:   	0.0%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4
cup:   	99.94000244140625%
of:   	0.05999999865889549%
teaspoon:   	0.0%
inch:   	0.0%
garlic:   	0.0%
--------



In [33]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=1.0
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | combine


PROMPT: recipe for chocolate ice cream |
in:   	19.81999969482422%
bring:   	10.510000228881836%
combine:   	10.34000015258789%
preheat:   	8.229999542236328%
stir:   	3.9000000953674316%
--------



In [34]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=0.2
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | in


PROMPT: recipe for chocolate ice cream |
in:   	91.44999694824219%
bring:   	3.8299999237060547%
combine:   	3.5299999713897705%
preheat:   	1.1299999952316284%
stir:   	0.029999999329447746%
--------

