In [5]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

# Load and preprocess dataset
dataset = pd.read_csv("RecipeNLG_dataset.csv").sample(n=20000, random_state=42)
dataset = dataset[['ingredients', 'directions']].dropna()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9, ]", "", text)  # Remove special characters
    return text

dataset["ingredients"] = dataset["ingredients"].apply(clean_text)
dataset["directions"] = dataset["directions"].apply(clean_text)

# Add special tokens
dataset["input_text"] = "startseq " + dataset["ingredients"] + " endseq"
dataset["target_text"] = "startseq " + dataset["directions"] + " endseq"

# Tokenization
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(dataset["input_text"].tolist() + dataset["target_text"].tolist())

train_sequences = tokenizer.texts_to_sequences(dataset["input_text"])
train_targets = tokenizer.texts_to_sequences(dataset["target_text"])

max_seq_length = 30
train_sequences = pad_sequences(train_sequences, maxlen=max_seq_length, padding="post")
train_targets = pad_sequences(train_targets, maxlen=max_seq_length, padding="post")

# Convert to numpy
train_sequences = np.array(train_sequences)
train_targets = np.array(train_targets)

# Create training dataset
dataset = tf.data.Dataset.from_tensor_slices((train_sequences, train_targets))
dataset = dataset.batch(8).prefetch(tf.data.experimental.AUTOTUNE)

# Define Encoder-Decoder Model
embedding_dim = 128
lstm_units = 256

# Encoder
encoder_inputs = Input(shape=(max_seq_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_seq_length,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Print Model Summary
model.summary()

# Train Model
history = model.fit([train_sequences, train_targets], train_targets,
                    epochs=5, batch_size=8, validation_split=0.1)

# Save Model and Tokenizer
model.save("recipe_seq2seq_model.keras")  # For Keras format
  # TensorFlow SavedModel format


import pickle
with open("tokenizer.pkl", "wb") as file:
    pickle.dump(tokenizer, file)

# --- Inference Mode ---
# Load Model
model = tf.keras.models.load_model("recipe_seq2seq_model")

# Load Tokenizer
with open("tokenizer.pkl", "rb") as file:
    tokenizer = pickle.load(file)

index_to_word = {v: k for k, v in tokenizer.word_index.items()}

def generate_text(input_text, tokenizer, model, max_length=30):
    """Generate text using the trained model."""
    
    # Encode input text
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding="post")

    # Get encoder states
    encoder_model = Model(encoder_inputs, encoder_states)
    states_value = encoder_model.predict(input_seq)

    # Start the decoding process
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index["startseq"]
    
    output_text = []
    
    for _ in range(max_length):
        output_tokens, h, c = decoder_lstm(decoder_embedding, initial_state=states_value)
        output_probs = decoder_dense(output_tokens)
        word_id = np.argmax(output_probs[0, -1, :])

        word = index_to_word.get(word_id, "")
        if word == "endseq" or word == "":
            break

        output_text.append(word)
        target_seq[0, 0] = word_id
        states_value = [h, c]  # Update states

    return " ".join(output_text)

# Example Test
test_input = "startseq sugar, milk, flour endseq"
generated_text = generate_text(test_input, tokenizer, model)
print("Generated Recipe:", generated_text)


Epoch 1/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 70ms/step - accuracy: 0.4660 - loss: 3.4818 - val_accuracy: 0.9009 - val_loss: 0.3861
Epoch 2/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 67ms/step - accuracy: 0.9198 - loss: 0.2744 - val_accuracy: 0.9329 - val_loss: 0.1037
Epoch 3/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 67ms/step - accuracy: 0.9410 - loss: 0.0722 - val_accuracy: 0.9392 - val_loss: 0.0470
Epoch 4/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 67ms/step - accuracy: 0.9489 - loss: 0.0237 - val_accuracy: 0.9415 - val_loss: 0.0289
Epoch 5/5
[1m2250/2250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 67ms/step - accuracy: 0.9487 - loss: 0.0082 - val_accuracy: 0.9421 - val_loss: 0.0229


ValueError: File format not supported: filepath=recipe_seq2seq_model. Keras 3 only supports V3 `.keras` files and legacy H5 format files (`.h5` extension). Note that the legacy SavedModel format is not supported by `load_model()` in Keras 3. In order to reload a TensorFlow SavedModel as an inference-only layer in Keras 3, use `keras.layers.TFSMLayer(recipe_seq2seq_model, call_endpoint='serving_default')` (note that your `call_endpoint` might have a different name).