In [7]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.tokenize import word_tokenize,sent_tokenize
from tensorflow import keras
import re
import json


pd.set_option("display.max_columns", None)
np.random.seed(42)
tf.random.set_seed(42)


In [8]:
df_train = pd.read_json("dataset/PIZZA_train.json", lines=True,)
df_train = df_train.sample(10000)

df_dev = pd.read_json("dataset/PIZZA_dev.json", lines=True,)

In [9]:
sentences = []

In [10]:
for text in df_train['train.SRC']:
    for i in sent_tokenize(text):
        temp = []
        # tokenize the sentence into words
        for j in word_tokenize(i):
            temp.append(j.lower())
        sentences.append(temp)

In [11]:

model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
# Save the trained model
model.save("pizza_embeddings.model")


In [12]:
toppings_regex = re.compile(r'(?<=\(TOPPING\s)[^)]*(?=\s)')
number_regex = re.compile(r'(?<=\(NUMBER\s)[^)]*(?=\s)')
size_regex = re.compile(r'(?<=\(SIZE\s)[^)]*(?=\s)')
quantity_regex = re.compile(r'(?<=\(QUANTITY\s)[^)]*(?=\s)')
style_regex = re.compile(r'(?<=\(STYLE\s)[^)]*(?=\s)')
drink_type_regex = re.compile(r'(?<=\(DRINKTYPE\s)[^)]*(?=\s)')
container_type_regex = re.compile(r'(?<=\(CONTAINERTYPE\s)[^)]*(?=\s)')   

In [13]:
sizes = set()
toppings = set()
numbers = set()
quantities = set()
styles = set()
drink_types = set()
container_types = set()
none = set()


In [14]:
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}
negation_words = {
    "no",
    "not",
    "none",
    "never",
    "without",
    "avoid",
    "neither",
    "nor",
    "hate",
    "hold",
}

In [15]:
def expnad_abb2(text):

    pattern = re.compile(
        r"(" + "|".join(re.escape(key) for key in CONTRACTIONS.keys()) + r")"
    )
    expanded_text = pattern.sub(lambda x: " " + CONTRACTIONS[x.group()], text)
    return expanded_text

In [16]:
def get_none_match(text):
    order_regex = re.compile(r'(?<=ORDER\s)[^(]*(?=\s\()')
    pizzaorder_regex = re.compile(r'(?<=PIZZAORDER\s)[^(]*(?=\s\()')
    drinkorder_regex = re.compile(r'(?<=DRINKORDER\s)[^(]*(?=\s\()')
    between_parentheses_regex = re.compile(r'(?<=\)\s)[^()]+(?=\s\()')
    uncleaned_none_match = re.findall(order_regex, text)
    uncleaned_none_match.extend(re.findall(pizzaorder_regex, text))
    uncleaned_none_match.extend(re.findall(drinkorder_regex, text))
    uncleaned_none_match.extend(re.findall(between_parentheses_regex, text))
    none_match=[]
    for sentence in uncleaned_none_match:
        sen = expnad_abb2(sentence).upper()
        none_match.extend([word.lower() for word in sen.split()])
    return none_match

In [17]:
sizes.update([item for sublist in df_train['train.EXR'].apply(lambda x: size_regex.findall(x)) for item in sublist])
toppings.update([item for sublist in df_train['train.EXR'].apply(lambda x: toppings_regex.findall(x)) for item in sublist])
numbers.update([item for sublist in df_train['train.EXR'].apply(lambda x: number_regex.findall(x)) for item in sublist])
quantities.update([item for sublist in df_train['train.EXR'].apply(lambda x: quantity_regex.findall(x)) for item in sublist])
styles.update([item for sublist in df_train['train.EXR'].apply(lambda x: style_regex.findall(x)) for item in sublist])
drink_types.update([item for sublist in df_train['train.EXR'].apply(lambda x: drink_type_regex.findall(x)) for item in sublist])
container_types.update([item for sublist in df_train['train.EXR'].apply(lambda x: container_type_regex.findall(x)) for item in sublist])


In [18]:
sizes.update([item for sublist in df_train['train.TOP'].apply(lambda x: size_regex.findall(x)) for item in sublist])
toppings.update([item for sublist in df_train['train.TOP'].apply(lambda x: toppings_regex.findall(x)) for item in sublist])
numbers.update([item for sublist in df_train['train.TOP'].apply(lambda x: number_regex.findall(x)) for item in sublist])
quantities.update([item for sublist in df_train['train.TOP'].apply(lambda x: quantity_regex.findall(x)) for item in sublist])
styles.update([item for sublist in df_train['train.TOP'].apply(lambda x: style_regex.findall(x)) for item in sublist])
drink_types.update([item for sublist in df_train['train.TOP'].apply(lambda x: drink_type_regex.findall(x)) for item in sublist])
container_types.update([item for sublist in df_train['train.TOP'].apply(lambda x: container_type_regex.findall(x)) for item in sublist])
none.update([item for sublist in df_train['train.TOP'].apply(get_none_match) for item in sublist])

In [19]:
sizes = list(sizes)
toppings = list(toppings)
numbers = list(numbers)
quantities = list(quantities)
styles = list(styles)
drink_types = list(drink_types)
container_types = list(container_types)
none = list(none)


In [None]:
input_text = "I would like three large pies with pesto and yellow peppers"
input_tokens = word_tokenize(input_text.lower())

In [29]:
def Similarity(w1,w2,model):
    if w1 not in model.wv or w2 not in model.wv:
        return 0
    A = model.wv[w1]; B = model.wv[w2]
    return sum(A*B)/(pow(sum(pow(A,2)),0.5)*pow(sum(pow(B,2)),0.5))

In [35]:
categories = {
    # 'PIZZAORDER': pizza,  # Assume `pizza` is a list of relevant entities
    'NUMBER': numbers,    # Assume `numbers` is a list of relevant entities
    'SIZE': sizes,        # Assume `sizes` is a list of relevant entities
    'TOPPING': toppings,  # Assume `toppings` is a list of relevant entities
    'STYLE': styles,      # Assume `styles` is a list of relevant entities
    'QUANTITY': quantities,  # Assume `quantities` is a list of relevant entities
    'DRINKTYPE': drink_types,  # Assume `drink_types` is a list of relevant entities
    'CONTAINERTYPE': container_types,  # Assume `container_types` is a list of relevant entities
    'NONE': none  # Default category, assumes `none` is a list of entities or empty list
}



def get_best_match(token, model):

    # Check if the token is in any of the relevant entities
    for category, entity_list in categories.items():
        
        if token in entity_list:
            return category
        
    # If no exact match, find the best match based on cosine similarity
    best_category = None
    best_similarity = 0.0

    for category, entity_list in categories.items():
        for entity in entity_list:
            similarity = model.wv.similarity(token, entity)
            # print("For token:", token, "and entity:", entity, "similarity:", similarity)
            if similarity > best_similarity:
                best_similarity = similarity
                # print("Best similarity:", best_similarity)
                # print("Best category:", category)
                # print("Best entity:", entity)
                best_category = category

    return best_category
    # return best_entity

for token  i We got ('none', 1.0)
for token  would We got ('none', 0)
for token  like We got ('none', 0.9999999999999999)
for token  three We got ('number', 0.9999999999999999)
for token  large We got ('size', 0.9999999999999998)
for token  pies We got ('none', 1.0000000000000002)
for token  with We got ('none', 1.0)
for token  pesto We got ('toppings', 1.0000000000000002)
for token  and We got ('none', 1.0)
for token  yellow We got ('toppings', 0.868466221107743)
for token  peppers We got ('toppings', 0.9999999999999999)


In [39]:
pizza_order_entry = {
        "NUMBER": None,
        "SIZE": None,
        "STYLE": None,
        "AllTopping": []
    }

drink_order_entry = {
        "NUMBER": None,
        "SIZE": None,
        "DRINKTYPE": None,
        "CONTAINERTYPE": None
    }


In [45]:
def process_toppings(toppings_list, model):
    all_toppings = []
    for topping in toppings_list:
        best_match, score = get_best_match(topping, model)
        if best_match == 'toppings':
            all_toppings.append({
                "NOT": False if score > 0.5 else True,  # Assume NOT if score is low
                "Quantity": None,  # Default quantity is None
                "Topping": topping
            })
    return all_toppings

def process_pizza_order(tokens, model):
    
    current_toppings = []
    for token in tokens:
        best_match, score = get_best_match(token, model)
        if best_match == "number":
            pizza_order_entry["NUMBER"] = token
        elif best_match == "size":
            pizza_order_entry["SIZE"] = token
        elif best_match == "style":
            pizza_order_entry["STYLE"] = token
        elif best_match == "toppings":
            current_toppings.append(token)

    pizza_order_entry["AllTopping"] = process_toppings(current_toppings, model)
    return pizza_order_entry

def process_drink_order(tokens, model):
 
    for token in tokens:
        best_match, score = get_best_match(token, model)
        if best_match == "number":
            drink_order_entry["NUMBER"] = token
        elif best_match == "size":
            drink_order_entry["SIZE"] = token
        elif best_match == "drink_type":
            drink_order_entry["DRINKTYPE"] = token
        elif best_match == "container_type":
            drink_order_entry["CONTAINERTYPE"] = token

    return drink_order_entry

def parse_input(input_tokens, model):
    pizza_order = {}  # To hold the single pizza order
    drink_order = {}  # To hold the single drink order
    
    current_tokens = []
    current_order = 0 # 0 for pizza, 1 for drink
    for token in input_tokens:
        best_match, score = get_best_match(token, model)
        if best_match == "container_type" or best_match == "drink_type":
            current_order = 1
        if best_match == "none":
            if current_order == 0:
                pizza_order = process_pizza_order(current_tokens, model)
            elif current_order == 1:
                drink_order = process_drink_order(current_tokens, model)
            # current_tokens = []
        else:
            current_tokens.append(token)
    
    # Process any remaining tokens
    if current_tokens:
        if current_order == 0:
            pizza_order = process_pizza_order(current_tokens, model)
        elif current_order == 1:
            drink_order = process_drink_order(current_tokens, model)
            
    return pizza_order, drink_order


input_text = "i'd like a small pizza with pineapple buffalo chicken and garlic powder"
input_tokens = word_tokenize(input_text.lower())

# Process the input tokens to generate orders
pizza_order, drink_order = parse_input(input_tokens, model)  # Replace 'None' with your model

# Final structured output
final_order = {
    "ORDER": {
        "PIZZAORDER": [pizza_order],  # Wrap pizza order in a list
        "DRINKORDER": [drink_order]  # Wrap drink order in a list
    }
}

# Output the final structured order in JSON format
import json
print(json.dumps(final_order, indent=4))


{
    "ORDER": {
        "PIZZAORDER": [
            {
                "NUMBER": "a",
                "SIZE": "small",
                "STYLE": "'d",
                "AllTopping": [
                    {
                        "NOT": false,
                        "Quantity": null,
                        "Topping": "pineapple"
                    },
                    {
                        "NOT": false,
                        "Quantity": null,
                        "Topping": "buffalo"
                    },
                    {
                        "NOT": false,
                        "Quantity": null,
                        "Topping": "chicken"
                    },
                    {
                        "NOT": false,
                        "Quantity": null,
                        "Topping": "garlic"
                    },
                    {
                        "NOT": false,
                        "Quantity": null,
                        "Topping": 

In [23]:
word_embeddings = {word: model.wv[word] for word in model.wv.index_to_key}


In [20]:
from enum import Enum
class Label(Enum):
    TOPPING = 0
    NUMBER = 1
    SIZE = 2
    QUANTITY = 3
    STYLE = 4
    DRINKTYPE = 5
    CONTAINERTYPE = 6
    NONE = 7

In [25]:
def label_word(word):
    if word in toppings:
        return Label.TOPPING
    elif word in numbers:
        return Label.NUMBER
    elif word in sizes:
        return Label.SIZE
    elif word in quantities:
        return Label.QUANTITY
    elif word in styles:
        return Label.STYLE
    elif word in drink_types:
        return Label.DRINKTYPE
    elif word in container_types:
        return Label.CONTAINERTYPE
    else:
        return Label.NONE

# Example usage
labeled_data = []
for sentence in sentences:
    labeled_sentence = [(word, label_word(word)) for word in sentence]
    labeled_data.append(labeled_sentence)

# Print the labeled data
for sentence in labeled_data[:5]:  # Print first 5 sentences for brevity
    print(sentence)

[('i', <Label.NONE: 7>), ("'d", <Label.NONE: 7>), ('like', <Label.NONE: 7>), ('three', <Label.NUMBER: 1>), ('large', <Label.SIZE: 2>), ('pies', <Label.NONE: 7>), ('with', <Label.NONE: 7>), ('pestos', <Label.TOPPING: 0>), ('and', <Label.NONE: 7>), ('yellow', <Label.NONE: 7>), ('peppers', <Label.TOPPING: 0>)]
[('i', <Label.NONE: 7>), ("'d", <Label.NONE: 7>), ('like', <Label.NONE: 7>), ('a', <Label.NUMBER: 1>), ('small', <Label.SIZE: 2>), ('pizza', <Label.NONE: 7>), ('with', <Label.NONE: 7>), ('pineapple', <Label.TOPPING: 0>), ('buffalo', <Label.NONE: 7>), ('chicken', <Label.TOPPING: 0>), ('and', <Label.NONE: 7>), ('garlic', <Label.TOPPING: 0>), ('powder', <Label.NONE: 7>)]
[('three', <Label.NUMBER: 1>), ('party', <Label.NONE: 7>), ('sized', <Label.NONE: 7>), ('pizzas', <Label.NONE: 7>), ('with', <Label.NONE: 7>), ('pickles', <Label.TOPPING: 0>), ('and', <Label.NONE: 7>), ('hot', <Label.NONE: 7>), ('pepper', <Label.TOPPING: 0>)]
[('balsamic', <Label.NONE: 7>), ('glaze', <Label.NONE: 7>), 

In [29]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

# Extract sentences and labels
sentences = [" ".join(word for word, label in sentence) for sentence in labeled_data]
labels = [[label.value for _, label in sentence] for sentence in labeled_data]

# Tokenize the sentences
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
tokenized_sentences = tokenizer.texts_to_sequences(sentences)

# Pad the tokenized sentences and labels
max_len = max(len(seq) for seq in tokenized_sentences)
X = pad_sequences(tokenized_sentences, maxlen=max_len, padding="post", truncating="post")
y = pad_sequences(labels, maxlen=max_len, padding="post", truncating="post")

# Convert y to categorical format
num_classes = len(Label)
y = np.eye(num_classes)[y]

# Split the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [33]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Bidirectional

# Define model parameters
VOCAB_SIZE = len(tokenizer.word_index) + 1
EMBEDDING_DIM = 128
LSTM_UNITS = 64

# Build the model
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM)(input_layer)
lstm_layer = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(embedding_layer)
output_layer = TimeDistributed(Dense(num_classes, activation="softmax"))(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()


In [34]:
BATCH_SIZE = 32
EPOCHS = 10

history = model.fit(
    X, y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS
)


Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 16ms/step - accuracy: 0.8569 - loss: 0.5031
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9826 - loss: 0.0464
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9836 - loss: 0.0367
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.9835 - loss: 0.0335
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.9838 - loss: 0.0318
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.9839 - loss: 0.0308
Epoch 7/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9842 - loss: 0.0299
Epoch 8/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.9844 - loss: 0.0292
Epoch 9/10
[1m313/313[0m [32m

In [52]:
# Example: Assuming `test_sentences` is a list of test sentences
test_sentences = ["i'd like three large pies with pestos and yellow peppers"]

# Tokenize and convert to sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Print tokenized output for debugging
print("Tokenized Test Sequences:", test_sequences)

# Use the same max sequence length as in training
test_sequences = pad_sequences(test_sequences, maxlen=max_len, padding="post", truncating="post")

# Predict with the trained model
predictions = model.predict(test_sequences)

# Print raw predictions
print("Raw Predictions:", predictions)


# Get the index of the max probability for each timestep
decoded_predictions = np.argmax(predictions, axis=-1)

# Map predictions back to label names using the Label Enum
decoded_labels = [
    [Label(pred).name for pred in sentence] for sentence in decoded_predictions
]

# Print the decoded labels
for i in range(len(test_sentences)):
    print(f"Sentence: {test_sentences[i]}")
    for word, label in zip(test_sentences[i].split(), decoded_labels[i]):
        print(f"{word}: {label}")
    print()
# print("Decoded Predictions:", test_sentences)
# print("Decoded Predictions:", decoded_labels)


Tokenized Test Sequences: [[1, 10, 6, 30, 13, 3, 118, 2, 83, 31]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Raw Predictions: [[[9.65911001e-02 1.13209322e-01 1.16967224e-02 1.50052970e-02
   9.83704776e-02 1.88730452e-02 2.10612901e-02 6.25192702e-01]
  [7.44007775e-05 2.47842778e-04 8.11098107e-06 9.96168546e-06
   1.19190972e-05 2.58426007e-06 5.31987962e-06 9.99639869e-01]
  [1.23406771e-05 9.99737561e-01 8.66172286e-07 3.32436048e-06
   7.02610009e-07 3.95278789e-07 6.46776755e-07 2.44217168e-04]
  [9.87180442e-07 5.83189376e-06 9.99963760e-01 3.48238927e-06
   1.11665613e-05 1.50087408e-06 3.46256115e-07 1.29065902e-05]
  [8.01549031e-05 1.05756726e-06 8.24330891e-06 1.58563410e-06
   6.23440428e-06 2.63829378e-07 3.81852573e-07 9.99902129e-01]
  [4.51901578e-05 1.91986071e-08 1.08349205e-08 5.64815750e-08
   1.83516811e-08 6.15472739e-09 1.15833600e-08 9.99954820e-01]
  [9.99727428e-01 9.58483156e-07 7.53720197e-09 2.66237726e-06
   8.07356230e-07 3.

In [43]:
def build_order(decoded_labels, words):
    order = {"ORDER": []}  # Root structure

    current_drink_order = None  # Tracks the current drink order being built

    for word, label in zip(words, decoded_labels):
        if label == "DRINKTYPE":  # Start a new drink order
            # Save the previous drink order if it exists
            if current_drink_order:
                order["ORDER"].append(current_drink_order)
            # Start a new one
            current_drink_order = {"DRINKORDER": {"DRINKTYPE": word}}

        elif label == "NUMBER":  # Add quantity
            if current_drink_order:
                current_drink_order["DRINKORDER"]["NUMBER"] = word

        elif label == "SIZE":  # Add size
            if current_drink_order:
                current_drink_order["DRINKORDER"]["SIZE"] = word

    # Add the last drink order if it exists
    if current_drink_order:
        order["ORDER"].append(current_drink_order)

    return order


In [None]:
def format_order(order):
    def format_drink_order(drink_order):
        formatted = "(DRINKORDER "
        for key, value in drink_order["DRINKORDER"].items():
            formatted += f"({key} {value.upper()} ) "
        return formatted.strip() + ")"
    

    formatted_order = "(ORDER "
    for drink_order in order["ORDER"]:
        formatted_order += format_drink_order(drink_order) + " "
    return formatted_order.strip() + ")"


In [51]:
words = test_sentences[0].split()

# Build the order structure
order_structure = build_order(decoded_labels, words)

# Format the structure as a string
formatted_output = format_order(order_structure)

# Print the formatted output
print(formatted_output)

(ORDER)
