<a href="https://colab.research.google.com/github/Ahtesham519/Genrative_Deep_learning_v2_2023/blob/main/LSTM_2_ipynbn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM on Recipe Data

In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

import numpy as np
import json
import string
import re

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

# 0. Parameters

In [None]:
# Vocabulary cap: passed to TextVectorization as max_tokens and reused as the
# Embedding input size and the size of the final softmax Dense layer.
VOCAB_SIZE = 10000
# The vectorizer emits MAX_LEN + 1 tokens per recipe so that the shifted
# input/target pair built in prepare_inputs is MAX_LEN tokens long each.
MAX_LEN= 200
# Second argument of the Embedding layer (size of each token vector).
EMBEDDING_DIM = 100
# Number of units in the LSTM layer.
N_UNITS = 128
# NOTE(review): VALIDATION_SPLIT, SEED and LOAD_MODEL are not referenced in the
# visible cells — presumably consumed by later training cells; confirm.
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
# Number of recipes per batch in the tf.data pipeline.
BATCH_SIZE = 32
# NOTE(review): not referenced in the visible cells — presumably the number of training epochs; confirm.
EPOCHS = 25


# 1. Load the data

In [None]:
# Load the full dataset.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default text encoding.
with open("/app/data/epirecipes/full_format_recipes.json", encoding="utf-8") as json_data:
  recipe_data = json.load(json_data)

In [None]:
# Filter the dataset: keep only recipes that have both a title and directions,
# and flatten each one into a single "Recipe for <title> | <directions>" string.
filtered_data = [
    " | ".join(("Recipe for " + recipe["title"], " ".join(recipe["directions"])))
    for recipe in recipe_data
    if recipe.get("title") is not None and recipe.get("directions") is not None
]

In [None]:
# Count the recipes that survived the filtering step and report the total
n_recipes = len(filtered_data)
print(f"{n_recipes} recipes loaded")

In [None]:
# Print the first filtered recipe to check the "Recipe for <title> | <directions>" format
example = filtered_data[0]
print(example)

# 2. Tokenise the data

In [None]:
# Pad the punctuation so that each mark is treated as a separate 'word'

def pad_punctuation(s):
  """Surround every punctuation character in `s` with single spaces.

  Each character from `string.punctuation` is padded on BOTH sides so that a
  whitespace tokenizer sees it as its own token ("a,b" -> "a , b"). Runs of
  spaces produced by the padding are then collapsed to one space.

  Args:
    s: the text to pad.

  Returns:
    The padded text. A leading/trailing space is kept when `s` starts/ends
    with punctuation.
  """
  # re.escape keeps `]`, `\`, `^` and `-` literal inside the character class;
  # unescaped, the backslash was consumed as an escape and never matched.
  s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
  # Collapse any run of spaces into a single space.
  s = re.sub(" +", " ", s)
  return s

# Apply the punctuation padding to every filtered recipe
text_data = list(map(pad_punctuation, filtered_data))

In [None]:
# Display an example of a padded recipe (the bare last expression is rendered as the cell output)
example_data = text_data[9]
example_data


In [None]:
# Convert to a TensorFlow dataset.
# batch() is applied before shuffle(), so shuffling reorders whole batches
# (with a buffer of 1000 batches) rather than individual recipes.
# NOTE(review): shuffle() is not given a seed here, so the order is presumably
# not reproducible unless a global seed is set elsewhere — confirm.
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

In [None]:
# Create a vectorization layer that maps recipe text to integer token ids
vectorize_layer = layers.TextVectorization(
    standardize = "lower",  # lowercase only, so the space-padded punctuation is kept as tokens
    max_tokens = VOCAB_SIZE ,  # cap on the vocabulary size
    output_mode = "int",  # one integer id per token
    output_sequence_length = MAX_LEN + 1,  # one extra token so the shifted input/target pair is MAX_LEN long each
)

In [None]:
# Adapt the layer to the training set so it learns its vocabulary from the recipe text
vectorize_layer.adapt(text_ds)
# Token strings known to the layer; the list position is the token's integer id
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Display the first ten token-id : word mappings of the vocabulary
for i , word in enumerate(vocab[:10]):
  print(f"{i} : {word}")

In [None]:
# Display the same example recipe converted to integer token ids by the adapted layer
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

# 3. Create the training set

In [None]:
# Build the training pairs: each recipe's tokens and the same tokens shifted by one word
def prepare_inputs(text):
  """Vectorise recipe strings into (input, target) next-token pairs.

  Args:
    text: tensor of recipe strings — presumably one batch from `text_ds`.

  Returns:
    A tuple (x, y): x drops the last token of every sequence and y drops the
    first, so y[:, t] is the token that follows x[:, t].
  """
  token_ids = vectorize_layer(tf.expand_dims(text, -1))
  return token_ids[:, :-1], token_ids[:, 1:]

# Map prepare_inputs over the batched text so the dataset yields (input, target) token pairs
train_ds = text_ds.map(prepare_inputs)

# 4. Build the LSTM

In [None]:
# Next-token language model: token ids -> embeddings -> LSTM -> softmax over the vocabulary.
# shape=(None,) lets the model accept sequences of any length.
inputs = layers.Input(shape=(None,), dtype="int32")
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
# return_sequences=True keeps the LSTM output at every timestep, so a
# prediction is produced for each position of the sequence.
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
# The imported Keras module is `models`; the previous `model.Model` raised a
# NameError because no name `model` exists.
lstm = models.Model(inputs, outputs)
lstm.summary()