In [3]:
! pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [4]:
import opendatasets as od
import pandas

# Fetch the PIZZA dataset from Kaggle (the call prompts for Kaggle credentials).
pizza_dataset_url = "https://www.kaggle.com/datasets/ahmedsamyibrahim/pizza-dataset"
od.download(pizza_dataset_url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: yaramahrous
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/ahmedsamyibrahim/pizza-dataset
Downloading pizza-dataset.zip to ./pizza-dataset


100%|██████████| 104M/104M [00:01<00:00, 90.9MB/s] 





In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

In [5]:
# Load the PIZZA training split (JSON Lines: one record per line).
df = pd.read_json("pizza-dataset/PIZZA_train.json", lines=True)
# Keep a random subset of at most 1,000,000 rows.
#  - min(..., len(df)) avoids the ValueError DataFrame.sample raises when n exceeds
#    the number of rows (sampling is without replacement by default).
#  - random_state fixes the subset so a kernel restart reproduces the same data.
df = df.sample(n=min(1_000_000, len(df)), random_state=42)

In [10]:
# ---- Source side (SRC): fit a vocabulary, encode to ids, right-pad ----
src_texts = df["train.SRC"].tolist()
src_tokenizer = Tokenizer(filters="")  # empty filter string: no characters are stripped
src_tokenizer.fit_on_texts(src_texts)
src_vocab_size = 1 + len(src_tokenizer.word_index)  # +1 leaves room for id 0 (padding value)
src_sequences = src_tokenizer.texts_to_sequences(src_texts)
max_src_len = max(map(len, src_sequences))
src_sequences = pad_sequences(src_sequences, padding="post", maxlen=max_src_len)

# ---- Target side (EXR): same steps, with explicit start/end markers ----
tgt_texts = ["<sos> " + text + " <eos>" for text in df["train.EXR"].tolist()]
tgt_tokenizer = Tokenizer(filters="")
tgt_tokenizer.fit_on_texts(tgt_texts)
tgt_vocab_size = 1 + len(tgt_tokenizer.word_index)
tgt_sequences = tgt_tokenizer.texts_to_sequences(tgt_texts)
max_tgt_len = max(map(len, tgt_sequences))
tgt_sequences = pad_sequences(tgt_sequences, padding="post", maxlen=max_tgt_len)

# Input pipeline settings
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Pair each padded source row with its padded target row, shuffle through a
# BUFFER_SIZE-element buffer, then emit fixed-size batches (a short final batch is dropped).
dataset = tf.data.Dataset.from_tensor_slices((src_sequences, tgt_sequences))
dataset = dataset.shuffle(buffer_size=BUFFER_SIZE)
dataset = dataset.batch(batch_size=BATCH_SIZE, drop_remainder=True)

# Model dimensions: embedding width and per-direction encoder LSTM units
EMBED_SIZE = 128
HIDDEN_SIZE = 256

# Encoder
class Encoder(Model):
    """Embeds source token ids and runs them through a bidirectional LSTM.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of each embedding vector.
        hidden_size: Units of each LSTM direction; the concatenated states
            returned by ``call`` are therefore ``2 * hidden_size`` wide.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.embedding = Embedding(vocab_size, embed_size, trainable=True)
        recurrent = LSTM(hidden_size, return_sequences=True, return_state=True)
        self.lstm = Bidirectional(recurrent)

    def call(self, x):
        """Encode a batch of token ids.

        Returns:
            ``(outputs, state_h, state_c)``: the per-step bidirectional outputs and
            the final hidden / cell states, each built by concatenating the forward
            and backward state along the last axis.
        """
        embedded = self.embedding(x)
        outputs, fwd_h, fwd_c, bwd_h, bwd_c = self.lstm(embedded)
        state_h = tf.concat([fwd_h, bwd_h], axis=-1)
        state_c = tf.concat([fwd_c, bwd_c], axis=-1)
        return outputs, state_h, state_c

    def get_config(self):
        """Return the base config extended with this class's constructor arguments."""
        return {
            **super().get_config(),
            "vocab_size": self.embedding.input_dim,
            "embed_size": self.embedding.output_dim,
            "hidden_size": self.lstm.forward_layer.units,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild an encoder from a dict produced by ``get_config``."""
        return cls(**config)

# Attention Layer
class Attention(tf.keras.layers.Layer):
    """Additive attention over encoder outputs, queried by a decoder hidden state.

    Args:
        hidden_size: Units of the two projection layers (``W1`` for the encoder
            outputs, ``W2`` for the query state).
        **kwargs: Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``), matching
            the constructor style of ``Encoder`` and ``Decoder``.
    """

    def __init__(self, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.W1 = Dense(hidden_size)
        self.W2 = Dense(hidden_size)
        self.V = Dense(1)

    def call(self, encoder_outputs, hidden):
        """Compute the context vector and the attention weights.

        Args:
            encoder_outputs: Per-step encoder outputs (time on axis 1).
            hidden: Query state; a time axis is inserted so it broadcasts against
                ``encoder_outputs``.

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            ``encoder_outputs`` over axis 1, and the softmax weights over axis 1.
        """
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(encoder_outputs) + self.W2(hidden_with_time_axis)))
        # Normalise over the time axis so the weights of each example sum to 1.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * encoder_outputs
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

    def get_config(self):
        """Return the base layer config plus ``hidden_size`` so the layer can be rebuilt."""
        config = super().get_config()
        config.update({"hidden_size": self.W1.units})
        return config

# Decoder
class Decoder(Model):
    """Attention-based LSTM decoder that maps target token ids to vocabulary logits.

    Args:
        vocab_size: Embedding rows and width of the output logits.
        embed_size: Width of each embedding vector.
        hidden_size: Attention projection units; the LSTM itself uses
            ``2 * hidden_size`` units.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.embedding = Embedding(vocab_size, embed_size, trainable=True)
        self.lstm = LSTM(2 * hidden_size, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size)
        self.attention = Attention(hidden_size)

    def call(self, x, encoder_outputs, hidden, cell):
        """Run the decoder on ``x`` starting from the ``(hidden, cell)`` state.

        The attention context is computed once from ``hidden``, given a length-1
        time axis, and concatenated with the embedded ``x`` on the feature axis.

        Returns:
            ``(logits, state_h, state_c, attention_weights)``.
        """
        context_vector, attention_weights = self.attention(encoder_outputs, hidden)
        embedded = self.embedding(x)
        context_step = tf.expand_dims(context_vector, 1)
        lstm_input = tf.concat([context_step, embedded], axis=-1)
        outputs, state_h, state_c = self.lstm(lstm_input, initial_state=[hidden, cell])
        return self.fc(outputs), state_h, state_c, attention_weights

    def get_config(self):
        """Return the base config extended with this class's constructor arguments."""
        return {
            **super().get_config(),
            "vocab_size": self.embedding.input_dim,
            "embed_size": self.embedding.output_dim,
            # The LSTM was built with hidden_size * 2 units; halve to recover the argument.
            "hidden_size": self.lstm.units // 2,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild a decoder from a dict produced by ``get_config``."""
        return cls(**config)

# Instantiate the encoder/decoder pair with the vocabulary sizes computed above
encoder = Encoder(vocab_size=src_vocab_size, embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE)
decoder = Decoder(vocab_size=tgt_vocab_size, embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE)

# Per-example cross-entropy on raw logits; reduction is disabled so that
# loss_function can apply its own mask before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)

def loss_function(real, pred):
    """Masked cross-entropy for a single decoding step.

    Args:
        real: Target token ids for this step; id 0 is treated as padding.
        pred: Logits with a length-1 axis at position 1, which is squeezed away.

    Returns:
        Mean over all entries of the per-example loss after entries whose target
        is 0 have been zeroed out.
    """
    step_logits = tf.squeeze(pred, axis=1)
    per_example = loss_object(real, step_logits)
    # 1.0 where the target is a real token, 0.0 where it is padding.
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
    per_example = per_example * keep
    return tf.reduce_mean(per_example)

# Optimizer
# Adam is constructed with no arguments, so the library's default hyperparameters apply.
# train_step uses this instance to update both encoder and decoder variables.
optimizer = tf.keras.optimizers.Adam()



In [12]:
from tensorflow.keras.models import load_model


class Seq2SeqModel(tf.keras.Model):
    """Wraps an encoder and a single-step decoder into one savable Keras model.

    Args:
        encoder: Model whose call returns ``(outputs, state_h, state_c)``.
        decoder: Model called as ``decoder(x, encoder_outputs, hidden, cell)`` and
            returning ``(logits, state_h, state_c, attention_weights)``.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def get_config(self):
        """Return a config in which the sub-models are explicitly serialized."""
        config = super().get_config()
        config.update({
            'encoder': tf.keras.saving.serialize_keras_object(self.encoder),
            'decoder': tf.keras.saving.serialize_keras_object(self.decoder),
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model, reviving the sub-models from their serialized form.

        Keras passes the nested configs through as plain dicts; handing those dicts
        to ``__init__`` would leave ``self.encoder`` / ``self.decoder`` as dicts
        rather than models. Values that are already objects are used unchanged.
        """
        config = dict(config)  # do not mutate the caller's dict

        def revive(value):
            if isinstance(value, dict):
                return tf.keras.saving.deserialize_keras_object(value)
            return value

        encoder = revive(config.pop('encoder'))
        decoder = revive(config.pop('decoder'))
        return cls(encoder, decoder, **config)

    def call(self, inputs):
        """Teacher-forced forward pass.

        Args:
            inputs: ``(src, tgt)`` pair of token-id tensors. ``tgt`` must have a
                statically known time dimension (axis 1).

        Returns:
            Logits for every target position, concatenated along the time axis.
        """
        src, tgt = inputs
        encoder_outputs, hidden, cell = self.encoder(src)
        # The decoder concatenates a length-1 context with its embedded input, so it
        # can only consume one time step per call; feed the target step by step and
        # carry the recurrent state forward.
        step_logits = []
        for t in range(tgt.shape[1]):
            logits, hidden, cell, _ = self.decoder(
                tgt[:, t:t + 1], encoder_outputs, hidden, cell
            )
            step_logits.append(logits)
        return tf.concat(step_logits, axis=1)

# Wrap the encoder/decoder pair and write it to disk in the .keras format.
seq2seq_model = Seq2SeqModel(encoder, decoder)
seq2seq_model.save("seq2seq_model.keras")

# Reload the file, mapping the notebook-defined class names back to their classes.
notebook_classes = {
    "Seq2SeqModel": Seq2SeqModel,
    "Encoder": Encoder,
    "Decoder": Decoder,
    "Attention": Attention,
}
loaded_model = load_model("seq2seq_model.keras", custom_objects=notebook_classes)


  return saving_lib.save_model(model, filepath)


In [None]:
# Training step
@tf.function
def train_step(src, tgt):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        src: Batch of padded source token ids.
        tgt: Batch of padded target token ids; the first column is the start token
            and the time dimension (axis 1) must be statically known.

    Returns:
        The summed per-step loss divided by the target length (for reporting).
        Gradients are taken on the un-normalised sum.
    """
    sos_id = tgt_tokenizer.word_index["<sos>"]
    loss = 0
    with tf.GradientTape() as tape:
        encoder_outputs, dec_hidden, dec_cell = encoder(src)
        # One start token per row of the *actual* batch rather than the global
        # BATCH_SIZE, so the step also works for batches of a different size.
        dec_input = tf.fill([tf.shape(tgt)[0], 1], sos_id)

        for t in range(1, tgt.shape[1]):
            predictions, dec_hidden, dec_cell, _ = decoder(dec_input, encoder_outputs, dec_hidden, dec_cell)
            loss += loss_function(tgt[:, t], predictions)
            # Teacher forcing: the ground-truth token becomes the next input.
            dec_input = tf.expand_dims(tgt[:, t], 1)

    batch_loss = loss / int(tgt.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
# Training loop
EPOCHS = 5

for epoch in range(EPOCHS):
    total_loss = 0.0
    num_batches = 0

    for src, tgt in dataset:
        total_loss += train_step(src, tgt)
        num_batches += 1

    # Report the mean per-batch loss: the raw sum grows with the number of batches
    # and is not comparable across dataset sizes. float() accepts both a scalar
    # tensor and the initial 0.0, so an empty dataset no longer crashes here.
    mean_loss = float(total_loss) / max(num_batches, 1)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {mean_loss:.4f}")

In [None]:
def translate(sentence, encoder, decoder, src_tokenizer, tgt_tokenizer, max_tgt_len):
    """Greedily decode ``sentence`` into a target-side string.

    Args:
        sentence: Raw source text.
        encoder: Encoder returning ``(outputs, state_h, state_c)``.
        decoder: Single-step decoder returning ``(logits, state_h, state_c, attn)``.
        src_tokenizer: Tokenizer used to turn ``sentence`` into ids.
        tgt_tokenizer: Tokenizer providing ``word_index`` / ``index_word`` for the
            target vocabulary, including ``<sos>`` and ``<eos>``.
        max_tgt_len: Upper bound on the number of decoding steps.

    Returns:
        The predicted tokens joined by single spaces; decoding stops early when
        ``<eos>`` is predicted (``<eos>`` itself is not included).

    Note:
        The input is padded to the module-level ``max_src_len``, which is not a
        parameter of this function.
    """
    # Source text -> padded id matrix with a single row.
    encoded = src_tokenizer.texts_to_sequences([sentence])
    encoded = pad_sequences(encoded, maxlen=max_src_len, padding="post")

    # The encoder's final states seed the decoder's recurrent state.
    encoder_outputs, dec_hidden, dec_cell = encoder(tf.convert_to_tensor(encoded))

    dec_input = tf.expand_dims([tgt_tokenizer.word_index["<sos>"]], 0)
    eos_id = tgt_tokenizer.word_index["<eos>"]
    id_to_word = tgt_tokenizer.index_word

    decoded_words = []
    for _ in range(max_tgt_len):
        step_logits, dec_hidden, dec_cell, _ = decoder(dec_input, encoder_outputs, dec_hidden, dec_cell)
        predicted_id = tf.argmax(step_logits[0, 0]).numpy()

        if predicted_id == eos_id:
            break

        # Ids without a vocabulary entry are rendered as "<unk>".
        decoded_words.append(id_to_word.get(predicted_id, "<unk>"))

        # Feed the prediction back in as the next decoder input.
        dec_input = tf.expand_dims([predicted_id], 0)

    return " ".join(decoded_words)


In [None]:
input_sentence = "i'd like to get a small pepperoni and tuna pizza and i don't want it on thin crust"

# Run greedy decoding on the sample order and show it next to the input
predicted_output = translate(
    sentence=input_sentence,
    encoder=encoder,
    decoder=decoder,
    src_tokenizer=src_tokenizer,
    tgt_tokenizer=tgt_tokenizer,
    max_tgt_len=max_tgt_len,
)
print("Input Sentence:", input_sentence)
print("Predicted Output:", predicted_output)

Input Sentence: i'd like to get a small pepperoni and tuna pizza and i don't want it on thin crust
Predicted Output: (order (pizzaorder (number 1 ) (size small ) (topping pepperoni ) (topping tuna ) (topping tuna ) (style thin_crust ) ) )


In [None]:
# Load the PIZZA dev split (JSON Lines: one record per line) for evaluation.
dev_path = "pizza-dataset/PIZZA_dev.json"
df_dev = pd.read_json(dev_path, lines=True)


In [None]:
# Decode the dev utterances labelled 0..99 with the trained encoder/decoder.
dev_sources = df_dev["dev.SRC"]
predictions = [
    translate(dev_sources.loc[row], encoder, decoder, src_tokenizer, tgt_tokenizer, max_tgt_len)
    for row in range(100)
]

In [None]:
# Reference EXR strings for the same 100 dev rows, lowercased
# (presumably to match the casing of the decoder's output vocabulary).
dev_targets = df_dev["dev.EXR"]
ground_truths = [dev_targets.loc[row].lower() for row in range(100)]

In [None]:
# Show each reference next to the corresponding model output for the 100 rows.
for expected, predicted in zip(ground_truths[:100], predictions[:100]):
    print(f"Expected : {expected}")
    print(f"Predicted : {predicted}")

(order (pizzaorder (number 1 ) (size medium ) (topping onions ) (topping tuna ) (topping ham ) ) )
(order (pizzaorder (number 1 ) (size medium ) (topping ham ) (topping onions ) (topping tuna ) ) )


In [None]:
# Same side-by-side listing as the preceding cell (this cell repeats that loop).
for expected, predicted in zip(ground_truths[:100], predictions[:100]):
    print(f"Expected : {expected}")
    print(f"Predicted : {predicted}")

Expected : (order (pizzaorder (number 2 ) (size medium ) (complex_topping (quantity extra ) (topping cheese ) ) (topping pepperoni ) ) (pizzaorder (number 2 ) (size medium ) (topping olives ) (topping sausage ) ) (pizzaorder (number 3 ) (size large ) (topping pepperoni ) (topping sausage ) ) )
Predicted : (order (pizzaorder (number 3 ) (size large ) (topping pepperoni ) (topping sausage ) ) (pizzaorder (number 2 ) (size medium ) (topping sausage ) (topping tuna ) ) (pizzaorder (number 2 ) (size medium ) (topping sausage ) (topping olives ) ) )
Expected : (order (pizzaorder (number 5 ) (size medium ) (topping ham ) (topping tomatoes ) ) )
Predicted : (order (pizzaorder (number 5 ) (size medium ) (topping tomatoes ) (topping ham ) ) )
Expected : (order (pizzaorder (number 1 ) (size large ) (style vegetarian ) (complex_topping (quantity extra ) (topping banana_peppers ) ) ) )
Predicted : (order (pizzaorder (number 1 ) (size large ) (style vegetarian ) (complex_topping (quantity extra ) (t