# 🥙 LSTM on Recipe Data

In this notebook, we'll walk through the steps required to train your own LSTM on the recipes dataset.

In [2]:
%load_ext autoreload
%autoreload 2

import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
print(tf.__version__)

2.15.0


## 0. Parameters <a name="parameters"></a>

In [4]:
VOCAB_SIZE = 10000  # vocabulary cap for TextVectorization; also the Embedding input size and the softmax width
MAX_LEN = 200  # training sequence length; the vectoriser emits MAX_LEN + 1 tokens so inputs and targets can be offset by one
EMBEDDING_DIM = 100  # size of each token embedding vector
N_UNITS = 128  # number of units in the LSTM layer
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced by any cell visible in this notebook
SEED = 42  # NOTE(review): not referenced by any visible cell, so nothing is actually seeded
LOAD_MODEL = False  # when True, a previously saved model is loaded from ./models/lstm before training
BATCH_SIZE = 32  # recipes per batch in the tf.data pipeline
EPOCHS = 25  # number of epochs passed to fit()

## 1. Load the data <a name="load"></a>

In [5]:
# Load the full dataset.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system.
with open("data/epirecipes/full_format_recipes.json", encoding="utf-8") as json_data:
    recipe_data = json.load(json_data)

In [6]:
# Filter the dataset: keep only recipes that have both a title and directions,
# and flatten each one into a single "Recipe for <title> | <directions>" string.
filtered_data = []
for recipe in recipe_data:
    # Skip entries whose title is missing or null.
    if "title" not in recipe or recipe["title"] is None:
        continue
    # Skip entries whose directions are missing or null.
    if "directions" not in recipe or recipe["directions"] is None:
        continue
    steps = " ".join(recipe["directions"])
    filtered_data.append("Recipe for " + recipe["title"] + " | " + steps)

In [7]:
# Count the recipes
n_recipes = len(filtered_data)
print(f"{n_recipes} recipes loaded")

20111 recipes loaded


In [8]:
example = filtered_data[9]
print(example)

Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato salad, over ham.


## 2. Tokenise the data

In [9]:
# Pad the punctuation, to treat them as separate 'words'.
# The punctuation set is escaped before it is placed in the character class;
# unescaped, the sequence "\]" inside string.punctuation is read as an escaped
# "]" and the backslash itself is never matched.
_PUNCTUATION_RE = re.compile(f"([{re.escape(string.punctuation)}])")


def pad_punctuation(s):
    """Surround every ASCII punctuation character in `s` with spaces.

    Runs of spaces are then collapsed to a single space, so each punctuation
    mark ends up as its own whitespace-delimited token.

    Args:
        s: The text to process.

    Returns:
        The padded text. Leading/trailing single spaces produced by the
        padding are kept.
    """
    s = _PUNCTUATION_RE.sub(r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


# Pad punctuation with spaces so that each mark is treated as its own word
text_data = [pad_punctuation(x) for x in filtered_data]

In [10]:
# Display an example of a recipe
example_data = text_data[9]
example_data

'Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas | Chop enough parsley leaves to measure 1 tablespoon ; reserve . Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan , covered , 5 minutes . Meanwhile , sprinkle gelatin over water in a medium bowl and let soften 1 minute . Strain broth through a fine - mesh sieve into bowl with gelatin and stir to dissolve . Season with salt and pepper . Set bowl in an ice bath and cool to room temperature , stirring . Toss ham with reserved parsley and divide among jars . Pour gelatin on top and chill until set , at least 1 hour . Whisk together mayonnaise , mustard , vinegar , 1 / 4 teaspoon salt , and 1 / 4 teaspoon pepper in a large bowl . Stir in celery , cornichons , and potatoes . Pulse peas with marjoram , oil , 1 / 2 teaspoon pepper , and 1 / 4 teaspoon salt in a food processor to a coarse mash . Layer peas , then potato salad , over ham . '

In [11]:
# Convert to a Tensorflow Dataset
# Each element is a batch of up to BATCH_SIZE recipe strings. shuffle() is
# applied after batch(), so it reorders whole batches (buffer of 1000) rather
# than individual recipes.
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

In [13]:
# Create a vectorisation layer
vectorize_layer = layers.TextVectorization(
    standardize="lower",  # convert text to lower case
    max_tokens=VOCAB_SIZE,  # assign integer indices to the VOCAB_SIZE (10000) most frequent tokens
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,  # pad or truncate every sequence to MAX_LEN + 1 = 201 tokens
)

In [14]:
# Fit the TextVectorization layer on the training text to build its vocabulary
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # list of vocabulary tokens; position in the list is the token id




In [15]:
# Display some token:word mappings
# Index 0 is the empty-string token used for padding; this notebook also treats
# it as the stop token (text generation halts when it is sampled).
# Index 1 ([UNK]) stands for words that are not in the vocabulary.
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: .
3: ,
4: and
5: to
6: in
7: the
8: with
9: a


In [16]:
# Display the same example converted to ints
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[  26   16  557    1    8  298  335  189    4 1054  494   27  332  228
  235  262    5  594   11  133   22  311    2  332   45  262    4  671
    4   70    8  171    4   81    6    9   65   80    3  121    3   59
   12    2  299    3   88  650   20   39    6    9   29   21    4   67
  529   11  164    2  320  171  102    9  374   13  643  306   25   21
    8  650    4   42    5  931    2   63    8   24    4   33    2  114
   21    6  178  181 1245    4   60    5  140  112    3   48    2  117
  557    8  285  235    4  200  292  980    2  107  650   28   72    4
  108   10  114    3   57  204   11  172    2   73  110  482    3  298
    3  190    3   11   23   32  142   24    3    4   11   23   32  142
   33    6    9   30   21    2   42    6  353    3 3224    3    4  150
    2  437  494    8 1281    3   37    3   11   23   15  142   33    3
    4   11   23   32  142   24    6    9  291  188    5    9  412  572
    2  230  494    3   46  335  189    3   20  557    2    0    0    0
    0 

## 3. Create the Training Set

In [17]:
# Build the training pairs: the input is the vector of recipe token ids and the
# target is that same vector shifted by one token.
def prepare_inputs(text):
    """Vectorise a batch of recipe strings into (input, target) id sequences.

    The vectoriser emits MAX_LEN + 1 ids per recipe; dropping the last id gives
    the input and dropping the first gives the target, so the target at each
    position is the token that follows the corresponding input token.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    features = token_ids[:, :-1]  # every token except the last
    targets = token_ids[:, 1:]  # every token except the first
    return features, targets


train_ds = text_ds.map(prepare_inputs)

## 4. Build the LSTM <a name="build"></a>

In [18]:
# The input layer leaves the sequence length unspecified (None), so it does not
# have to be known in advance.
inputs = layers.Input(shape=(None,), dtype="int32")

# Map each token id to a vector whose values are learned as weights; the layer
# is given the vocabulary size and the dimensionality of the embedding space.
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)

# The LSTM produces a new hidden state from the previous state and the embedding
# of the current word. N_UNITS sets the hidden-vector size; return_sequences=True
# returns the full sequence of hidden states, not only the one from the last step.
x = layers.LSTM(N_UNITS, return_sequences=True)(x)

# Turn the hidden state at every time step into a vector of probabilities for
# the next token.
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, None, 128)         117248    
                                                                 
 dense (Dense)               (None, None, 10000)       1290000   
                                                                 
Total params: 2407248 (9.18 MB)
Trainable params: 2407248 (9.18 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
if LOAD_MODEL:
    # Replace the freshly built model with the one saved under ./models/lstm.
    # It is loaded with compile=False; the compile() call in the next cell
    # attaches the optimizer and loss.
    lstm = models.load_model("./models/lstm", compile=False)

## 5. Train the LSTM <a name="train"></a>

In [20]:
# Sparse categorical cross-entropy: like categorical cross-entropy, but for
# integer labels rather than one-hot encoded vectors.
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [21]:
# Create a TextGenerator checkpoint
# `temperature` controls how deterministic the sampling is: the closer it is to
# 0, the more often the most probable word is chosen.
class TextGenerator(callbacks.Callback):
    """Keras callback that prints a sampled recipe at the end of every epoch.

    The instance can also be used directly through `generate` once Keras has
    attached a model to it (which `fit` does for every callback it is given).

    Args:
        index_to_word: Sequence of vocabulary tokens; the position of a token
            is its integer id.
        top_k: Accepted for backward compatibility; it is not used.
    """

    def __init__(self, index_to_word, top_k=10):
        # Run the base-class initialiser so the attributes Keras sets up on
        # every callback exist on this one as well.
        super().__init__()
        self.index_to_word = index_to_word
        # Reverse mapping: word -> token id.
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temperature):
        """Rescale `probs` with `temperature` and draw one token id from them.

        Args:
            probs: 1-D array of next-token probabilities.
            temperature: Scaling factor. A value of 0 selects the most probable
                token deterministically; negative values are not meaningful.

        Returns:
            A `(token_id, rescaled_probs)` tuple.
        """
        # Work in float64: with a low temperature small probabilities are raised
        # to a large power and can underflow in lower precision, which would
        # turn the normalised distribution into NaNs.
        probs = np.asarray(probs, dtype=np.float64)
        if temperature == 0:
            # Limit case of the rescaling below; avoids dividing by zero.
            best = int(np.argmax(probs))
            greedy = np.zeros_like(probs)
            greedy[best] = 1.0
            return best, greedy
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend `start_prompt` token by token and print the result.

        Generation stops once the sequence holds `max_tokens` tokens or the
        stop token (id 0) has been sampled.

        Args:
            start_prompt: Whitespace-separated seed text. Words missing from
                the vocabulary are mapped to id 1.
            max_tokens: Upper bound on the total number of tokens, prompt
                included.
            temperature: Passed to `sample_from`.

        Returns:
            A list with one dict per generated token, holding the prompt as it
            was before that token ("prompt") and the rescaled distribution the
            token was drawn from ("word_probs").
        """
        # The seed text is the word sequence handed to the model to start from.
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            # For every position the model returns, for each vocabulary word,
            # the probability of it being next; only the last position is used.
            y = self.model.predict(x, verbose=0)
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append({"prompt": start_prompt, "word_probs": probs})
            # The new word becomes part of the input for the next iteration.
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a sample of up to 100 tokens after each training epoch."""
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [22]:
# Create a model save checkpoint
# Only the weights are written (save_weights_only=True), once per epoch.
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

# Log training to ./logs for TensorBoard
tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Build the text-generation callback from the vocabulary list
text_generator = TextGenerator(vocab)

In [23]:
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

Epoch 1/25
generated text:
recipe for cobb metal cook , latkes of whisk with inch cubes stand and beginning oven and stir 

Epoch 2/25
generated text:
recipe for almond cheese and etc sliced lentil chicken | zucchini stock on a large bowl ( up - inch bowl to preheat oven . scoop mangoes into a 3 cups over boiling water with . bake until melted whipped until firm to light dressing , at turkey slightly mixture are slightly , up , 10 minutes . increase oven to rinse the vegetables . add oil in flower eggs and broiler over 50 - the addition . punch shrimp and reserve pot . turn cream chilled . drizzle onto spicy pan and reduce heat ; sauté until almost tender

Epoch 3/25
generated text:
recipe for date tart | tomato in skillet with salt until smooth . ( can be veal burner , about 1 hour more , transferring in a bowl - cool , baking sheet . 3 - ovenproof bath to 14 - allow glass . spray pie pan with parchment fat . slice dark oil in 11 - inch oval baking dish . mix next 8 juice , simple jui

<keras.src.callbacks.History at 0x202a1a471d0>

In [24]:
# Save the final model
lstm.save("./models/lstm")

INFO:tensorflow:Assets written to: ./models/lstm\assets


INFO:tensorflow:Assets written to: ./models/lstm\assets


## 6. Generate text using the LSTM

In [25]:
def print_probs(info, vocab, top_k=5):
    """Print the `top_k` most probable next words for each generation step.

    Args:
        info: Iterable of dicts, each with a "prompt" string and a
            "word_probs" array of per-token probabilities.
        vocab: Sequence mapping a token id to its word.
        top_k: Number of highest-probability words to list per step.
    """
    for step in info:
        print(f"\nPROMPT: {step['prompt']}")
        probs = np.asarray(step["word_probs"])
        # Token ids ordered from most to least probable, cut to the top_k.
        best_ids = np.argsort(probs)[::-1][:top_k]
        for token_id in best_ids:
            p = probs[token_id]
            print(f"{vocab[token_id]}:   \t{np.round(100*p,2)}%")
        print("--------\n")

In [26]:
# Example: generate a continuation of a prompt
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=1.0
)


generated text:
recipe for roasted vegetables | chop 1 / 4 cup



In [27]:
# Probabilities of the candidate next words
print_probs(info, vocab)


PROMPT: recipe for roasted vegetables | chop 1 /
2:   	44.95%
4:   	42.03%
3:   	9.08%
8:   	2.1%
1:   	0.43%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 4
cup:   	80.9%
tsp:   	5.24%
of:   	4.72%
teaspoon:   	3.39%
cups:   	0.8%
--------



In [28]:
# Same prompt with a different temperature
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=0.2
)


generated text:
recipe for roasted vegetables | chop 1 / 2 cup



In [29]:
print_probs(info, vocab)


PROMPT: recipe for roasted vegetables | chop 1 /
2:   	58.32%
4:   	41.66%
3:   	0.02%
8:   	0.0%
1:   	0.0%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2
cup:   	99.98%
tsp:   	0.02%
teaspoon:   	0.0%
-:   	0.0%
of:   	0.0%
--------



In [30]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=1.0
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | 1


PROMPT: recipe for chocolate ice cream |
in:   	12.33%
bring:   	11.19%
1:   	10.77%
combine:   	10.58%
whisk:   	6.92%
--------



In [31]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=0.2
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | bring


PROMPT: recipe for chocolate ice cream |
in:   	37.54%
bring:   	23.08%
1:   	19.08%
combine:   	17.48%
whisk:   	2.09%
--------

