In [127]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate, Dot, Activation, Lambda
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords


pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
np.random.seed(42)
tf.random.set_seed(42)

In [209]:
main_train = pd.read_json("dataset/PIZZA_train.json", lines=True,)
df_dev = pd.read_json("dataset/PIZZA_dev.json", lines=True,)

In [220]:
df_train = main_train.sample(100000)

In [122]:
word_to_num = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, 
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
    "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
    "fourteen": 14, "fifteen": 15, "sixteen": 16, "seventeen": 17,
    "eighteen": 18, "nineteen": 19, "twenty": 20,
    "thirty": 30, "forty": 40, "fifty": 50, "sixty": 60,
    "seventy": 70, "eighty": 80, "ninety": 90,
    "hundred": 100
}


def words_to_number(word):
    word = word.lower().strip()
    
    # Handle simple numbers directly
    if word in word_to_num:
        return word_to_num[word]
    
    # Handle composite numbers (e.g., twenty-one)
    if "-" in word:
        parts = word.split("-")
        return sum(word_to_num[part] for part in parts)
    
    # Handle "hundred" cases (e.g., one hundred twenty-three)
    if "hundred" in word:
        parts = word.split("hundred")
        hundreds = word_to_num[parts[0].strip()] * 100
        if parts[1].strip():  # If there's something after "hundred"
            return hundreds + words_to_number(parts[1].strip())
        return hundreds
    
    return None  # Return None if the input is not a valid number word

def standardize_numbers(sentence):
    # Tokenize the sentence
    tokens = sentence.split()
    
    # Replace number words with digits
    standardized_tokens = [
        str(words_to_number(token)) if words_to_number(token) is not None else token
        for token in tokens
    ]
    
    return " ".join(standardized_tokens)




In [123]:
def clean_text(text):
    text = re.sub(r"[^\w']", " ", text)  # Remove non-word characters
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces
    text = text.lower().strip()  # Lowercase and strip whitespace
    return text

In [124]:
lemmatizer = WordNetLemmatizer()

def lemma(text):
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

In [125]:
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}
def expnad_abb2(text):

    pattern = re.compile(
        r"(" + "|".join(re.escape(key) for key in CONTRACTIONS.keys()) + r")"
    )
    expanded_text = pattern.sub(lambda x: " " + CONTRACTIONS[x.group()], text)
    return expanded_text


In [150]:
negation_words = {
    "no",
    "not",
    "none",
    "never",
    "without",
    "avoid",
    "neither",
    "nor",
    "hate",
    "hold",
}
pizza = {"pizza", "pizzas", "pie", "pies"}

stop_negation_words = {"and", "but"}
stop_words = set(stopwords.words("english"))
stop_words = stop_words - negation_words - stop_negation_words - {'all'}
stop_words.update({"would", "like", "get", "want"})
stop_words.update(pizza)


def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

In [134]:
def std_negation(text):
    text = text.split()
    for i, word in enumerate(text):
        if word in negation_words:
            text[i] = 'not'
    return " ".join(text)

In [221]:
# Data Preparation
input_texts = df_train["train.SRC"].apply(clean_text).apply(lemma).apply(standardize_numbers).apply(expnad_abb2).apply(remove_stopwords).apply(std_negation)
output_texts = ['<SOS> ' + token + ' <EOS>' for token in df_train["train.EXR"]]

In [199]:
from tensorflow.keras.layers import Multiply, Add, Bidirectional

def pointer_generator_model(input_vocab_size, output_vocab_size, max_input_length=100,max_output_length=100 ,embedding_dim=128, hidden_units=256):
    # Encoder
    encoder_input = Input(shape=(max_input_length,), name='encoder_input')
    encoder_embedding = Embedding(input_vocab_size, embedding_dim, name='encoder_embedding')(encoder_input)
    encoder_lstm = Bidirectional(LSTM(hidden_units, return_sequences=True, return_state=True, name='encoder_lstm'))
    encoder_output, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_embedding)
    encoder_state_h = Concatenate()([forward_h, backward_h])
    encoder_state_c = Concatenate()([forward_c, backward_c])
    # encoder_output, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding)
    encoder_states = [encoder_state_h, encoder_state_c]

    # Decoder
    decoder_input = Input(shape=(max_output_length,), name='decoder_input')
    decoder_embedding = Embedding(output_vocab_size, embedding_dim, name='decoder_embedding')(decoder_input)
    decoder_lstm = LSTM(hidden_units*2, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_output, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

    # Attention Mechanism
    attention_dot = Dot(axes=[2, 2], name='attention_dot')([decoder_output, encoder_output])
    attention_activation = Activation('softmax', name='attention_activation')(attention_dot)
    context_vector = Dot(axes=[2, 1], name='context_vector')([attention_activation, encoder_output])

    # Pointer mechanism (Generate + Copy)
    context_decoder_combined = Concatenate(axis=-1, name='context_decoder_combined')([context_vector, decoder_output])
    pointer_vocab_distribution = Dense(output_vocab_size, activation='softmax', name='pointer_vocab_distribution')(context_decoder_combined)
    p_gen = Dense(1, activation='sigmoid', name='p_gen')(context_decoder_combined)

    # Final output distribution
    final_vocab_distribution = Multiply()([pointer_vocab_distribution, p_gen])  # Shape: [batch_size, time_steps, vocab_size]

    # Project attention distribution to match vocab size
    # Project attention distribution to match vocab size
    attention_projection = Dense(output_vocab_size, activation='softmax')(attention_activation)
    final_attention_distribution = Multiply()([attention_projection, 1 - p_gen])  # Shape: [batch_size, time_steps, vocab_size]

    # Combine the two distributions
    final_output = Add(name='final_output')([final_vocab_distribution, final_attention_distribution])  # Shape: [batch_size, time_steps, vocab_size]

    # Define the model
    model = Model(inputs=[encoder_input, decoder_input], outputs=final_output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model


In [222]:
# Initialize Tokenizers
input_tokenizer = Tokenizer(filters='', lower=False, oov_token = "<OOV>")  # We don't want to lowercase for structured text
output_tokenizer = Tokenizer(filters='', lower = False , oov_token = "<OOV>")  # Same for output

# Fit the tokenizers on the text data
input_tokenizer.fit_on_texts(input_texts)
output_tokenizer.fit_on_texts(output_texts)

input_tokenizer.word_index['<PAD>'] = 0
input_tokenizer.index_word[0] = '<PAD>'
output_tokenizer.word_index['<PAD>'] = 0
output_tokenizer.index_word[0] = '<PAD>'

# input_tokenizer.word_index['<OOV>'] = len(input_tokenizer.word_index) + 1
# input_tokenizer.index_word[len(input_tokenizer.word_index) + 1] = '<OOV>'
# output_tokenizer.word_index['<OOV>'] = len(output_tokenizer.word_index) + 1
# output_tokenizer.index_word[len(output_tokenizer.word_index) + 1] = '<OOV>'


# Convert texts to sequences (convert to integer sequences)
input_sequences = input_tokenizer.texts_to_sequences(input_texts)
output_sequences = output_tokenizer.texts_to_sequences(output_texts)

# Padding sequences to ensure uniform length (adjust max_input_length and max_output_length as needed)
max_input_length = max(len(seq) for seq in input_sequences)
max_output_length = max(len(seq) for seq in output_sequences)

input_sequences = pad_sequences(input_sequences, maxlen=100, padding='post')
output_sequences = pad_sequences(output_sequences, maxlen=100, padding='post')

# Convert output sequences into one-hot format for training
# output_sequences_one_hot = np.array([np.expand_dims(seq, axis=-1) for seq in output_sequences])  # For sparse categorical loss

# Get vocab sizes
input_vocab_size = len(input_tokenizer.word_index) + 1  # Include padding token
output_vocab_size = len(output_tokenizer.word_index) + 1  # Include padding token

print(f'Input vocab size: {input_vocab_size}')
print(f'Output vocab size: {output_vocab_size}')


Input vocab size: 219
Output vocab size: 184


In [223]:
# Prepare decoder input and output sequences for training
decoder_input_sequences = np.zeros_like(output_sequences)  # Start tokens will be placed here
decoder_input_sequences[:, 1:] = output_sequences[:, :-1]  # Shifted sequence (remove first token)

# The target is the original output sequence
decoder_output_sequences = output_sequences  # For teacher forcing

print(f'Decoder input sequence shape: {decoder_input_sequences.shape}')
print(f'Decoder output sequence shape: {decoder_output_sequences.shape}')


Decoder input sequence shape: (100000, 100)
Decoder output sequence shape: (100000, 100)


In [224]:
# Prepare the model inputs
encoder_input_data = input_sequences  # Encoder input (tokenized input sentences)
decoder_input_data = decoder_input_sequences  # Decoder input (shifted target sequences)

# Prepare the target data for the decoder (one-hot encoded)
decoder_output_data = decoder_output_sequences  # Use integer sequences

print(f'Encoder input data shape: {encoder_input_data.shape}')
print(f'Decoder input data shape: {decoder_input_data.shape}')
print(f'Decoder output data shape: {decoder_output_data.shape}')


Encoder input data shape: (100000, 100)
Decoder input data shape: (100000, 100)
Decoder output data shape: (100000, 100)


In [225]:
# Build and summarize the model
model = pointer_generator_model(input_vocab_size, output_vocab_size)
model.summary()

Model: "model_16"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input (InputLayer)  [(None, 100)]                0         []                            
                                                                                                  
 encoder_embedding (Embeddi  (None, 100, 128)             28032     ['encoder_input[0][0]']       
 ng)                                                                                              
                                                                                                  
 decoder_input (InputLayer)  [(None, 100)]                0         []                            
                                                                                                  
 bidirectional_5 (Bidirecti  [(None, 100, 512),           788480    ['encoder_embedding[0][

In [227]:
sample_weights = np.where(decoder_output_sequences == 0, 0.0, 1.0)  # Zero weight for padding

# Training the model
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    epochs=5,
    batch_size=64
)


Epoch 1/5


KeyboardInterrupt: 

In [219]:
def predict_sequence(input_text, model, input_tokenizer, output_tokenizer,max_input_length=100,max_output_length= 100):

    # Tokenize and pad the input text (same as during training)
    input_sequence = input_tokenizer.texts_to_sequences([input_text])
    input_sequence = pad_sequences(input_sequence, maxlen=max_input_length, padding='post')

    # Prepare initial decoder input (start with <SOS> token)
    decoder_input = np.zeros((1, max_output_length))
    decoder_input[0, 0] = output_tokenizer.word_index['<SOS>']

    # Prepare to collect the prediction
    predicted_sequence = []

    # Decoder loop for predicting one token at a time
    for t in range(1, max_output_length):
        # Predict the next token in the sequence
        predictions = model.predict([input_sequence, decoder_input])

        # Extract the predicted token and add it to the predicted sequence
        predicted_token_idx = np.argmax(predictions[0, t-1, :])  # Get the token with highest probability
        predicted_sequence.append(predicted_token_idx)

        # Stop if <EOS> token is predicted
        if predicted_token_idx == output_tokenizer.word_index['<EOS>']:
            break

        # Update the decoder input with the predicted token (for the next timestep)
        decoder_input[0, t] = predicted_token_idx

    # Convert token indices to words using the tokenizer
    predicted_text = ' '.join([output_tokenizer.index_word.get(idx, '<OOV>') for idx in predicted_sequence])

    return predicted_text


# Example usage:
input_text = "i'd like three large pies with pestos and yellow peppers"
input_text = std_negation(remove_stopwords(expnad_abb2(standardize_numbers(lemma(clean_text(input_text))))))
print("Input Text: ", input_text)   
predicted_output = predict_sequence(input_text, model, input_tokenizer, output_tokenizer)

print("Predicted Output: ", predicted_output)


Input Text:  3 large pesto and yellow pepper
Predicted Output:  <SOS> (ORDER (PIZZAORDER (NUMBER (NUMBER ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [39]:
# Example function for inference
def predict_order(input_sentence, model, tokenizer, max_input_length, max_target_length):
    # Tokenize the input sentence (encoder input)
    encoder_input_seq = tokenizer.texts_to_sequences([input_sentence])
    encoder_input_seq = pad_sequences(encoder_input_seq, maxlen=max_input_length, padding='post')
    
    # Initialize the decoder input with the <START> token
    decoder_input_seq = np.zeros((1, max_target_length))
    decoder_input_seq[0, 0] = output_tokenizer.word_index['<SOS>']  # Assuming <START> token has index 1
    
    # Make a prediction
    prediction = model.predict([encoder_input_seq, decoder_input_seq])
    
    # Convert predicted tokens back to words (or structured output)
    predicted_sequence = decode_output(prediction, output_tokenizer)
    return predicted_sequence

# Example of how to decode the output into a structured form
def decode_output(prediction, tokenizer):
    # Convert the predicted probabilities into the most likely token indices
    predicted_tokens = np.argmax(prediction, axis=-1)
    
    # Map token indices back to words using the tokenizer's word_index (inverse mapping)
    reverse_word_index = {i: word for word, i in tokenizer.word_index.items()}
    predicted_words = [reverse_word_index.get(i, '<OOV>') for i in predicted_tokens[0]]
    
    # Join the words into the structured output
    structured_output = ' '.join(predicted_words)
    return structured_output

# Example input sentence
input_sentence = "i'd like three large pies with pestos and yellow peppers"
predicted_output = predict_order(input_sentence, model, input_tokenizer, max_input_length=max_input_length, max_target_length=max_output_length)
print(predicted_output)


<SOS> (order (pizzaorder (pizzaorder (number (pizzaorder (pizzaorder (pizzaorder (pizzaorder (pizzaorder <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [18]:
input_tokenizer.word_index

{'and': 1,
 'with': 2,
 'a': 3,
 'three': 4,
 'pizzas': 5,
 'pizza': 6,
 "i'd": 7,
 'like': 8,
 'cheese': 9,
 'four': 10,
 'pies': 11,
 'party': 12,
 'five': 13,
 'american': 14,
 'sized': 15,
 'one': 16,
 'no': 17,
 'of': 18,
 'two': 19,
 'i': 20,
 'size': 21,
 'sprite': 22,
 'pepper': 23,
 'glaze': 24,
 'ice': 25,
 'without': 26,
 'ounce': 27,
 'balsamic': 28,
 '-': 29,
 'large': 30,
 'peppers': 31,
 'pie': 32,
 'crust': 33,
 'tea': 34,
 'thin': 35,
 'sauce': 36,
 'green': 37,
 'ups': 38,
 'personal': 39,
 'extra': 40,
 'diet': 41,
 'medium': 42,
 'roasted': 43,
 'seven': 44,
 'teas': 45,
 'red': 46,
 'also': 47,
 'pecorino': 48,
 'peperonni': 49,
 'cans': 50,
 'ginger': 51,
 'banana': 52,
 'need': 53,
 'any': 54,
 'chicken': 55,
 'fantas': 56,
 'little': 57,
 'lunch': 58,
 'sprites': 59,
 '500': 60,
 'bottle': 61,
 'mozzarella': 62,
 'ale': 63,
 'onions': 64,
 'can': 65,
 '20': 66,
 'the': 67,
 'coke': 68,
 'want': 69,
 'onion': 70,
 'bit': 71,
 'milliliter': 72,
 'olive': 73,
 'hol

In [6]:
# Dummy target data for training
target_data = np.expand_dims(target_sequences, -1)

# Training the model
model.fit(input_sequences, target_data, epochs=10, batch_size=16)

# Evaluating the model
loss, accuracy = model.evaluate(input_sequences, target_data)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")

Epoch 1/10


ValueError: in user code:

    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1126, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\Yara\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\input_spec.py", line 219, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model_1" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(16, 33) dtype=int32>]


In [10]:
dev_input_texts = df_dev["dev.SRC"]
dev_target_texts = df_dev["dev.EXR"]

# Converting texts to sequences
dev_input_sequences = tokenizer.texts_to_sequences(dev_input_texts)
dev_target_sequences = tokenizer.texts_to_sequences(dev_target_texts)

# Padding sequences
dev_input_sequences = pad_sequences(dev_input_sequences, maxlen=max_input_length)
dev_target_sequences = pad_sequences(dev_target_sequences, maxlen=max_target_length)

# Dummy target data for evaluation
dev_target_data = np.expand_dims(dev_target_sequences, -1)

# Evaluating the model on dev set
loss, accuracy = model.evaluate([dev_input_sequences, dev_target_sequences], dev_target_data)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.9997 - loss: 0.0036  
Loss: 0.006189607549458742
Accuracy: 0.9994775056838989


In [40]:
preds = model.predict([dev_input_sequences, dev_target_sequences])
preds = np.argmax(preds, axis=-1)

# Decoding the sequences
decoded_preds = []
for pred in preds:
    decoded_preds.append(" ".join([tokenizer.index_word[word] for word in pred if word != 0]))

# Displaying the predictions
df_preds = pd.DataFrame(list(zip(dev_input_texts, dev_target_texts, decoded_preds)), columns=["Input Text", "Target Text", "Predicted Text"])
print(df_preds.head(10))



[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
                                          Input Text  \
0  i want to order two medium pizzas with sausage...   
1           five medium pizzas with tomatoes and ham   
2  i need to order one large vegetarian pizza wit...   
3   i'd like to order a large onion and pepper pizza   
4  i'll have one pie along with pesto and ham but...   
5  i need to order one large pizza with ham bacon...   
6  i would like a thin crust medium pizza with tu...   
7  i need a pizza with pesto and peppers hold the...   
8  i wanted to have five pies with peppers pesto ...   
9    can i get two pies with peppers and bacon pesto   

                                         Target Text  \
0  (ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) ...   
1  (ORDER (PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) ...   
2  (ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...   
3  (ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...   
4  (ORDER (PIZZAORDER (NOT (T

In [20]:
df_preds.to_csv("predictions.csv", index=False)
model.save("pointer_generator_model.keras")

In [44]:
test_sentence = "i would like a thin crust medium pizza with tuna but no pineapple"
test_sequence = tokenizer.texts_to_sequences([test_sentence])
test_sequence = pad_sequences(test_sequence, maxlen=max_input_length)
test_sequence


array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         56,  22,  12,  43,  35,  54,  20,   7, 182,  42,  79]])

In [45]:

preds = model.predict([test_sequence,test_sequence])
preds = np.argmax(preds, axis=-1)

# Decoding the sequences
decoded_preds = []
for pred in preds:
    decoded_preds.append(" ".join([tokenizer.index_word[word] for word in pred if word != 0]))

# Displaying the predictions
df_preds_2 = pd.DataFrame(list(zip(test_sentence, dev_target_texts, decoded_preds)), columns=["Input Text", "Target Text", "Predicted Text"])
print(df_preds_2.head(10))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
  Input Text                                        Target Text  \
0          i  (ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) ...   

                                 Predicted Text  
0  order thin crust can up up tuna oz pineapple  
