In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

# Load and preprocess dataset
# Only the two text columns are used below, so parse just those with `usecols`
# instead of materialising every column of the CSV. The number of rows is the
# same either way, so the seeded sample is unaffected.
dataset = pd.read_csv(
    "RecipeNLG_dataset.csv", usecols=["ingredients", "directions"]
).sample(n=50000, random_state=42)
# Missing values are dropped after sampling, so fewer than 50000 rows may remain.
dataset = dataset[['ingredients', 'directions']].dropna()

def clean_text(text):
    """Return ``text`` lowercased with all disallowed characters deleted.

    Only letters, digits, commas and spaces are kept. Every other character
    is removed outright (not replaced by a space), so the characters on
    either side of it become adjacent.
    """
    lowered = text.lower()
    return re.sub(r"[^a-zA-Z0-9, ]", "", lowered)

# Normalise both text columns with clean_text before tokenizing.
dataset["ingredients"] = dataset["ingredients"].apply(clean_text)
dataset["directions"] = dataset["directions"].apply(clean_text)

# Add special tokens
# generate_text seeds the decoder with "startseq" and stops on "endseq", so
# every training sequence is wrapped in both markers.
dataset["input_text"] = "startseq " + dataset["ingredients"] + " endseq"
dataset["target_text"] = "startseq " + dataset["directions"] + " endseq"

# Tokenization
# One vocabulary is shared by ingredients and directions; words outside the
# most frequent `vocab_size` entries are replaced by the "<OOV>" token.
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(dataset["input_text"].tolist() + dataset["target_text"].tolist())

train_sequences = tokenizer.texts_to_sequences(dataset["input_text"])
train_targets = tokenizer.texts_to_sequences(dataset["target_text"])

max_seq_length = 30
end_token_id = tokenizer.word_index["endseq"]

# pad_sequences truncates from the FRONT by default, which would cut the
# leading "startseq" off every sequence longer than max_seq_length. Shorten
# over-long sequences here instead: keep the first tokens and re-attach
# "endseq" as the final one, so both markers survive.
train_sequences = [
    seq if len(seq) <= max_seq_length else seq[:max_seq_length - 1] + [end_token_id]
    for seq in train_sequences
]
train_targets = [
    seq if len(seq) <= max_seq_length else seq[:max_seq_length - 1] + [end_token_id]
    for seq in train_targets
]

# pad_sequences already returns NumPy arrays, zero-padded at the end here, so
# no separate np.array conversion is needed.
train_sequences = pad_sequences(
    train_sequences, maxlen=max_seq_length, padding="post", truncating="post"
)
train_targets = pad_sequences(
    train_targets, maxlen=max_seq_length, padding="post", truncating="post"
)

# Create training dataset
# NOTE: this rebinds `dataset` from the DataFrame to a batched tf.data pipeline.
dataset = tf.data.Dataset.from_tensor_slices((train_sequences, train_targets))
dataset = dataset.batch(8).prefetch(tf.data.experimental.AUTOTUNE)
# Define Encoder-Decoder Model
embedding_dim = 128
lstm_units = 256

# Encoder
# Reads the padded ingredient sequence; only its final hidden/cell state is
# kept and used to initialise the decoder.
encoder_inputs = Input(shape=(max_seq_length,))
# mask_zero=True marks the padding id 0 as masked so the LSTM skips padded steps.
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
# The time dimension is left variable (None): training feeds
# max_seq_length - 1 tokens per sample, while step-by-step inference feeds a
# single token at a time through the same layers.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Compile Model
# Targets are integer token ids, hence the sparse variant of cross-entropy.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Print Model Summary
model.summary()

# Train Model
# Teacher forcing: the decoder is fed tokens 0..T-2 and must predict tokens
# 1..T-1, i.e. every position is trained on the NEXT token. Feeding the same
# array as both decoder input and target would only teach the model to copy
# its input, which is useless for generation.
decoder_input_data = train_targets[:, :-1]
decoder_target_data = train_targets[:, 1:]

history = model.fit([train_sequences, decoder_input_data], decoder_target_data,
                    epochs=10, batch_size=8, validation_split=0.1)

# Save Model
model.save("recipe_seq2seq_model.h5")

# Inference Model
# The encoder model maps an input sequence to the [h, c] state pair.
encoder_model = Model(encoder_inputs, encoder_states)

# The decoder model reuses the trained embedding/LSTM/dense layers but takes
# the recurrent state as explicit inputs and returns the updated state, so the
# caller can decode one token per call.
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
# Generate Text (Inference Mode)
def generate_text(input_text, tokenizer, max_length=30):
    """Greedily decode a recipe from an ingredient string.

    Args:
        input_text: Raw ingredient text.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``,
            ``word_index`` and ``index_word``.
        max_length: Length the encoder input is padded/truncated to, and the
            maximum number of decoding steps.

    Returns:
        The generated words joined by single spaces, without the
        "startseq"/"endseq" markers.

    Raises:
        KeyError: If "startseq" is missing from ``tokenizer.word_index``.
    """
    # Mirror the training-time preprocessing: the encoder was trained on
    # cleaned text wrapped in "startseq ... endseq", not on raw strings.
    prepared_text = "startseq " + clean_text(input_text) + " endseq"
    input_seq = tokenizer.texts_to_sequences([prepared_text])
    input_seq = pad_sequences(input_seq, maxlen=max_length, padding="post")

    # verbose=0: predict() would otherwise print a progress bar on every call.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # The decoder is seeded with the start marker and then fed its own output.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index["startseq"]

    output_text = []
    # A bounded loop guarantees termination even if "endseq" is never emitted.
    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )
        word_id = int(np.argmax(output_tokens[0, -1, :]))
        word = tokenizer.index_word.get(word_id, "")

        if word == "endseq":
            break
        # Ids without a vocabulary entry (e.g. padding id 0) map to "" and
        # would only add stray spaces to the result, so skip them.
        if word:
            output_text.append(word)

        # Feed the chosen token and the updated state into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = word_id
        states_value = [h, c]

    return " ".join(output_text)

# Example Test
# Smoke-run the generator on a small hand-written ingredient list.
test_input = "Ingredients: sugar, milk, flour"
generated_recipe = generate_text(test_input, tokenizer)
print("Generated Recipe:", generated_recipe)

Epoch 1/10
[1m1345/2250[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m1:14[0m 82ms/step - accuracy: 0.3903 - loss: 4.1294

KeyboardInterrupt: 