In [1]:
from gensim.models import FastText
import numpy as np
import pandas as pd
import tensorflow as tf

from nltk.corpus import stopwords
from tensorflow import keras
import re
from nltk.stem import WordNetLemmatizer
import pickle
import json


from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model,Sequential,model_from_json
from tensorflow.keras.layers import Embedding,LSTM,Dense,Input,Bidirectional,Dropout,Convolution1D,GRU,TimeDistributed

# Show every DataFrame column when a frame is displayed.
pd.set_option("display.max_columns", None)
# Seed NumPy and TensorFlow so the random draws made later in the notebook are repeatable.
np.random.seed(42)
tf.random.set_seed(42)

# Fixed sequence length: used to pad tokenized sentences and to size the model input.
MAX_LEN = 45



In [2]:
# Lookup table of apostrophe fragments -> expansion, consumed by expnad_abb2,
# which replaces each fragment with a space followed by the expansion
# (e.g. "don't" -> "do not"). Keys are lowercase and matched case-sensitively.
# Note that "'s" always expands to "is", so possessives are expanded the same way.
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}
# Words that must NOT be treated as stop words: they are subtracted from the
# NLTK list below so that remove_stopwords keeps them in the sentence.
negation_words = {
    "no",
    "not",
    "none",
    "never",
    "without",
    "avoid",
    "neither",
    "nor",
    "hate",
    "hold",
}
# Pizza nouns; added to the stop-word list further down, so they are dropped from the input.
pizza = {"pizza", "pizzas", "pie", "pies"}

# "and" is also kept out of the stop-word list.
stop_negation_words = {"and"}
# Start from NLTK's English stop words, then remove the words we want to keep ...
stop_words = set(stopwords.words("english"))
stop_words = stop_words - negation_words - stop_negation_words - {'all' , 'a','an' , 'can'}
# ... and add ordering filler words that should be dropped.
# NOTE(review): 'tommorow' is misspelled, so the correctly spelled "tomorrow" is not
# filtered by this list. Confirm what the training preprocessing used before changing it.
stop_words.update({"would", "like", "get", "want", "order" , "please" , 'could' , 'prefer' ,
                    'handle' , 'take' , 'bring' , 'need' , 'make' , 'love', 'let', 'absolutely',
                    "arrange",'today' , 'tommorow','add' , 'thank' , 'thanks' , 'tonight' , 'right' , 'left'})
stop_words.update(pizza)

# NOTE(review): `lemmatizer` is assigned again in the cell that defines lemma().
lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer instance used by lemma()


In [3]:
def expnad_abb2(text, contractions=None):
    """Expand contraction fragments in ``text`` using a lookup table.

    Every matched fragment is replaced by a single space followed by its
    expansion, e.g. ``"don't"`` -> ``"do not"``. Matching is case-sensitive.

    A fragment only matches when it is not immediately followed by a word
    character, so a quoted word such as ``'small'`` is no longer mangled by
    the ``'s`` rule.

    Args:
        text: The string to expand.
        contractions: Optional mapping of fragment -> expansion. When omitted,
            the module-level ``CONTRACTIONS`` table is used.

    Returns:
        The text with every matched fragment expanded. The input is returned
        unchanged when the table is empty.
    """
    if contractions is None:
        contractions = CONTRACTIONS
    if not contractions:
        # An empty alternation would match the empty string at every position.
        return text

    # Try longer fragments first so a short fragment can never pre-empt a
    # longer one that starts with the same characters.
    fragments = sorted(contractions, key=len, reverse=True)
    pattern = re.compile(
        r"(" + "|".join(re.escape(key) for key in fragments) + r")(?!\w)"
    )
    return pattern.sub(lambda match: " " + contractions[match.group(1)], text)


In [4]:
def remove_stopwords(text):
    """Drop every whitespace-separated word whose lowercase form is in ``stop_words``.

    The surviving words keep their original casing and order and are re-joined
    with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [5]:
def clean_text(text):
    """Normalise raw text before contraction expansion and stop-word removal.

    Every character that is neither a word character nor an apostrophe becomes
    a space, whitespace runs collapse to one space, and the result is
    lowercased and stripped.
    """
    # Apostrophes are kept so that contractions can still be recognised later.
    spaced = re.sub(r"[^\w']", " ", text)
    collapsed = re.sub(r"\s+", " ", spaced)
    return collapsed.lower().strip()

In [6]:
# Label -> class index for the relation (order-grouping) model.
# len(relation_tags) sizes that model's output layer, and the mapping is
# inverted when decoding its argmax predictions.
relation_tags = {
    'O':0,
    'B-Pizza':1, 'I-Pizza':2,
    'B-Drink':3, 'I-Drink':4,
}

In [7]:
# Entity labels; the list position is the class index used to decode
# model_entity's argmax predictions (tags[i]), so the order must not change.
tags = [
    'O',
    'B-NUMBER', 'I-NUMBER',
    'B-DRINKTYPE', 'I-DRINKTYPE',
    'B-VOLUME', 'I-VOLUME',
    'B-TOPPING', 'I-TOPPING',
    'B-SIZE', 'I-SIZE',
    'B-QUANTITY', 'I-QUANTITY',
    'B-STYLE', 'I-STYLE',
    'B-CONTAINER', 'I-CONTAINER',
    'B-NOT-TOPPING', 'I-NOT-TOPPING',
    'B-NOT-STYLE' , 'I-NOT-STYLE'
]

In [8]:
# WordNet lemmatizer instance used by lemma() below.
lemmatizer = WordNetLemmatizer()

def lemma(text):
    """Lemmatize each whitespace-separated word of ``text`` and re-join with single spaces."""
    return " ".join(lemmatizer.lemmatize(token) for token in text.split())

In [9]:
# Load the fitted tokenizer whose sequences are fed to model_relations.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load artifacts that come from a trusted source.
with open('finalModel/tokenizer.pickle', 'rb') as handle:
    input_tokenizer = pickle.load(handle)

In [10]:
# Load the saved gensim FastText model; its word vectors fill embedding_matrix below.
fast_text_model = FastText.load("finalModel/fast_text_model_500k")

In [11]:
# word -> integer id mapping from the relation-model tokenizer.
word_index = input_tokenizer.word_index

# Input length of the relation model; same value used for padding (MAX_LEN).
max_length = MAX_LEN


In [12]:
# Build the embedding matrix for the relation model from the FastText vectors.
embedding_dim = fast_text_model.wv.vector_size  # width of each FastText vector

# One row per tokenizer id plus one extra row for index 0
# (presumably the padding id — TODO confirm against the tokenizer).
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# Words missing from the FastText vocabulary are reported and given a small
# random vector instead of a pretrained one.
known_words = fast_text_model.wv.key_to_index
for word, idx in word_index.items():
    if word not in known_words:
        print("not found" , word)
        embedding_matrix[idx] = np.random.uniform(-0.01, 0.01, embedding_dim)
        continue
    embedding_matrix[idx] = fast_text_model.wv[word]

not found <OOV>


In [13]:
# Define the BiLSTM model
input = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=len(word_index) + 1,
                            output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            trainable=False)(input)

lstm = Bidirectional(LSTM(units=128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
dropout = Dropout(0.1)(lstm)

# Dense layer for sequence labeling (softmax activation)
output = Dense(len(relation_tags), activation='softmax')(dropout)

# Build and compile the model
model_relations = Model(inputs=input, outputs=output)
model_relations.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
model_relations.summary()


In [14]:
# Restore the saved weights into the relation model built above; the
# architecture defined in the notebook must match the one that produced this file.
model_relations.load_weights("finalModel/model_relations.weights.h5")

  saveable.load_own_variables(weights_store.get(inner_path))


In [15]:
# Load the tokenizer whose sequences are fed to model_entity.
# The context manager closes the file handle, which the bare
# pickle.load(open(...)) form left open.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load artifacts that come from a trusted source.
with open("finalModel/input_tokenizer2.pkl", "rb") as tokenizer_file:
    input_tokenizer2 = pickle.load(tokenizer_file)

In [16]:
# Rebuild the entity-tagging model from its JSON architecture file.
# The context manager guarantees the file is closed even if reading fails.
with open('finalModel/model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model_entity = model_from_json(loaded_model_json)
# Load the saved weights into the rebuilt model.
model_entity.load_weights("finalModel/model.weights.h5")
model_entity.summary()

In [17]:
def setToppings(token, tokens, i,entity_preds):
    """Collect one topping entry (with optional quantity) starting at position ``i``.

    Consecutive positions whose entity tag contains "TOPPING" or "QUANTITY"
    are consumed. Tags containing "NOT" mark the topping as negated and
    contribute their token to the topping text; other "QUANTITY" tags set the
    quantity; remaining tags contribute their token to the topping text.

    The scan stops at the end of ``tokens`` / ``entity_preds`` instead of
    indexing past it, so a topping span that reaches the last position no
    longer raises IndexError.

    Args:
        token: Unused; kept so existing callers keep working.
        tokens: Sentence tokens.
        i: Position at which the scan starts.
        entity_preds: Entity tag for each position.

    Returns:
        ``(topping, last_index)``. ``topping`` is a dict with keys "NOT",
        "Quantity" and "Topping", or None when no topping text was collected.
        ``last_index`` is the last position consumed, or ``i - 1`` when
        nothing was consumed. Each collected token is prefixed by a space,
        so "Topping" starts with a space (unchanged output format).
    """
    quantity = None
    temp = ""
    not_flag = False
    limit = min(len(tokens), len(entity_preds))
    while i < limit and ("TOPPING" in entity_preds[i] or "QUANTITY" in entity_preds[i]):
        tag = entity_preds[i]
        if "NOT" in tag:
            not_flag = True
            temp += " " + tokens[i]
        elif "QUANTITY" in tag:
            quantity = tokens[i]
        else:
            temp += " " + tokens[i]
        i += 1

    if temp == "":
        topping = None
    else:
        topping = {
            "NOT": not_flag,
            "Quantity": quantity,
            "Topping": temp,
        }
    return topping, i - 1

In [18]:
def setStyle (token, tokens, i,entity_preds):
    """Collect one style entry starting at position ``i``.

    Consecutive positions whose entity tag contains "STYLE" are consumed and
    their tokens concatenated. Any tag containing "NOT" marks the style as
    negated.

    The scan stops at the end of ``tokens`` / ``entity_preds`` instead of
    indexing past it, so a style span that reaches the last position no
    longer raises IndexError.

    Args:
        token: Unused; kept so existing callers keep working.
        tokens: Sentence tokens.
        i: Position at which the scan starts.
        entity_preds: Entity tag for each position.

    Returns:
        ``(style, last_index)`` where ``style`` is a dict with keys "NOT" and
        "TYPE", and ``last_index`` is the last position consumed (``i - 1``
        when nothing was consumed). Each collected token is prefixed by a
        space, so "TYPE" starts with a space (unchanged output format).
    """
    temp = ""
    not_flag = False
    limit = min(len(tokens), len(entity_preds))
    while i < limit and "STYLE" in entity_preds[i]:
        if "NOT" in entity_preds[i]:
            not_flag = True
        # The token is part of the style text whether or not it is negated.
        temp += " " + tokens[i]
        i += 1

    style = {
        "NOT": not_flag,
        "TYPE": temp,
    }
    return style, i - 1

In [19]:
def get_pizza_order(relations_preds,tokens,entity_preds,i):
    """Build one pizza order from the span that starts at position ``i``.

    The starting position is always consumed; the span then continues while
    the relation tag is 'I-Pizza' or 'O'. Within the span, entity tags fill
    the order: toppings (via ``setToppings``), number, size (prefixed with the
    previous token when that token is tagged "B-QUANTITY") and styles (via
    ``setStyle``).

    Fixes relative to the previous version:
      * the "previous tag" lookup is skipped at position 0, where ``i - 1``
        used to wrap around to the last element of ``entity_preds``;
      * the span never runs past the shortest of the three input sequences.

    Args:
        relations_preds: Relation tag for each position.
        tokens: Sentence tokens.
        entity_preds: Entity tag for each position.
        i: Position at which the order starts.

    Returns:
        ``(order, next_index)`` where ``order`` has keys "NUMBER", "SIZE",
        "STYLE" and "AllTopping", and ``next_index`` is the first position
        after the span.
    """
    current_order = {"NUMBER": None, "SIZE": None, "STYLE": [], "AllTopping": []}
    start = i
    limit = min(len(tokens), len(relations_preds), len(entity_preds))
    while i < limit and (i == start or relations_preds[i] in ("I-Pizza", "O")):
        tag = entity_preds[i]
        # A quantity only opens a topping when the very next tag starts one.
        quantity_before_topping = (
            tag == "B-QUANTITY"
            and i + 1 < len(entity_preds)
            and entity_preds[i + 1] == "B-TOPPING"
        )
        if tag in ("B-TOPPING", "B-NOT-TOPPING") or quantity_before_topping:
            # setToppings returns the last index it consumed.
            topping, i = setToppings(tokens[i], tokens, i, entity_preds)
            if topping is not None:
                current_order["AllTopping"].append(topping)
        elif tag == "B-NUMBER":
            current_order["NUMBER"] = tokens[i]
        elif tag == "B-SIZE":
            # Guard i > 0: a negative index would silently read the last tag.
            if i > 0 and entity_preds[i - 1] == "B-QUANTITY":
                current_order["SIZE"] = tokens[i - 1] + " " + tokens[i]
            else:
                current_order["SIZE"] = tokens[i]
        elif "STYLE" in tag:
            style, i = setStyle(tokens[i], tokens, i, entity_preds)
            current_order["STYLE"].append(style)
        i += 1
    return current_order, i


In [20]:
def setDrink(token, tokens, i,entity_preds):
    """Concatenate the tokens of one drink attribute starting at position ``i``.

    Consecutive positions whose entity tag contains ``token`` (e.g. "VOLUME",
    "CONTAINER", "SIZE") are consumed.

    The scan stops at the end of ``tokens`` / ``entity_preds`` instead of
    indexing past it, so an attribute that reaches the last position no
    longer raises IndexError.

    Args:
        token: Substring that an entity tag must contain to be consumed.
        tokens: Sentence tokens.
        i: Position at which the scan starts.
        entity_preds: Entity tag for each position.

    Returns:
        ``(text, last_index)``. ``text`` is the collected tokens, each
        prefixed by a space (unchanged output format); ``last_index`` is the
        last position consumed, or ``i - 1`` when nothing was consumed.
    """
    temp = ""
    limit = min(len(tokens), len(entity_preds))
    while i < limit and token in entity_preds[i]:
        temp += " " + tokens[i]
        i += 1
    return temp, i - 1

In [21]:
def get_drink_order(relations_preds,tokens,entity_preds,i):
    """Build one drink order from the span that starts at position ``i``.

    The starting position is always consumed; the span then continues while
    the relation tag is 'I-Drink' or 'O' and there are tokens left.

    Per position: "B-NUMBER" sets NUMBER; a tag containing "VOLUME",
    "CONTAINER" or "SIZE" (checked in that order) is collected with
    ``setDrink``; any other non-'O' tag stores the token as DRINKTYPE,
    overwriting an earlier value.

    Returns:
        ``(order, next_index)`` where ``next_index`` is the first position
        after the span.
    """
    order = {"NUMBER": None, "DRINKTYPE": None, "VOLUME": None, "CONTAINER": None,"SIZE": None}
    start = i
    span_fields = ("VOLUME", "CONTAINER", "SIZE")

    while i < len(tokens):
        # The first position belongs to the order regardless of its relation tag.
        if i != start and relations_preds[i] not in ('I-Drink', 'O'):
            break

        tag = entity_preds[i]
        if tag == "B-NUMBER":
            order["NUMBER"] = tokens[i]
        else:
            field = next((name for name in span_fields if name in tag), None)
            if field is not None:
                # setDrink returns the last index it consumed.
                order[field], i = setDrink(field, tokens, i, entity_preds)
            elif tag != 'O':
                order["DRINKTYPE"] = tokens[i]
        i += 1

    return order, i

In [22]:
def parse_order(input_text, relations_preds, entity_preds):
    """Turn a preprocessed sentence and its two tag sequences into an order dict.

    Positions are visited in order, up to the shortest of the token list and
    the two prediction lists. A pizza order is started at every "B-Pizza"
    position, and at an "I-Pizza" position when no order has been started yet.
    A drink order is started at every "B-Drink" position.

    Pizza orders with no topping, size, number or style are discarded; pizza
    orders without a number get "a". Drink orders are always kept.

    Returns:
        ``{"ORDER": {"PIZZAORDER": [...], "DRINKORDER": [...]}}``
    """
    tokens = input_text.split()
    pizza_orders, drink_orders = [], []
    current_order = None

    aligned = zip(tokens, relations_preds, entity_preds)
    for position, (_token, relation_tag, _entity_tag) in enumerate(aligned):
        opens_pizza = relation_tag == "B-Pizza" or (
            relation_tag == "I-Pizza" and current_order is None
        )

        if opens_pizza:
            current_order, _ = get_pizza_order(relations_preds, tokens, entity_preds, position)
            has_content = bool(
                current_order["AllTopping"]
                or current_order["SIZE"] is not None
                or current_order["NUMBER"] is not None
                or current_order["STYLE"]
            )
            if not has_content:
                continue
            if current_order["NUMBER"] is None:
                current_order["NUMBER"] = "a"
            pizza_orders.append(current_order)
        elif relation_tag == "B-Drink":
            current_order, _ = get_drink_order(relations_preds, tokens, entity_preds, position)
            drink_orders.append(current_order)

    # Construct final output
    return {"ORDER": {"PIZZAORDER": pizza_orders, "DRINKORDER": drink_orders}}


In [31]:
# Example input for the end-to-end run below.
# NOTE(review): "exta" looks like a typo for "extra"; the saved outputs in the
# following cells were produced with this spelling — confirm whether it is intentional.
test_sentence = "could you give me a exta large pizza without roasted red peppers and more cheese but no sausage"

In [32]:
# Preprocessing pipeline, in order: normalise characters/case, expand
# contractions, drop stop words, lemmatize.
# This cell overwrites test_sentence, so re-running it alone re-processes the
# already processed text; re-run the cell that defines test_sentence first.
test_sentence = clean_text(test_sentence)
test_sentence = expnad_abb2(test_sentence)
test_sentence = remove_stopwords(test_sentence)
test_sentence = lemma(test_sentence)

print("After preprocessing: ", test_sentence)


After preprocessing:  give a exta large without roasted red pepper and cheese no sausage


In [33]:
# Encode the sentence with the relation-model tokenizer and pad to MAX_LEN.
# truncating='post' keeps the FIRST MAX_LEN tokens of an over-long sentence.
# The default ('pre') drops tokens from the front, which shifts every
# prediction relative to the token list that parse_order zips from position 0.
test_sentence_seq = input_tokenizer.texts_to_sequences([test_sentence])
test_sentence_seq = pad_sequences(test_sentence_seq, maxlen=MAX_LEN , padding='post', truncating='post')
print("After tokenization: ", test_sentence_seq)

After tokenization:  [[ 1  3  1 20 18 36 37 16  2  5 12 95  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [34]:
# Predict one relation class per position, then map class ids back to labels.
relation_probs = model_relations.predict(test_sentence_seq)
relation_ids = np.argmax(relation_probs, axis=-1)
# Invert relation_tags once instead of searching its values for every position.
id_to_relation = {class_id: label for label, class_id in relation_tags.items()}
pred_relation = [id_to_relation[class_id] for class_id in relation_ids[0]]
print("Relation prediction: ", pred_relation)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Relation prediction:  ['B-Pizza', 'O', 'O', 'I-Pizza', 'I-Pizza', 'I-Pizza', 'I-Pizza', 'I-Pizza', 'O', 'I-Pizza', 'I-Pizza', 'I-Pizza', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [35]:
# Encode the sentence with the entity-model tokenizer and pad to MAX_LEN.
# truncating='post' keeps the FIRST MAX_LEN tokens of an over-long sentence.
# The default ('pre') drops tokens from the front, which shifts every
# prediction relative to the token list that parse_order zips from position 0.
test_sentence_seq = input_tokenizer2.texts_to_sequences([test_sentence])
test_sentence_seq = pad_sequences(test_sentence_seq, maxlen=MAX_LEN , padding='post', truncating='post')
print("After tokenization: ", test_sentence_seq)

After tokenization:  [[ 1  3  1 21 19 37 38  7  2  5 14 86  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [36]:
# Predict one entity class per position; the class id is the index into `tags`.
entity_probs = model_entity.predict(test_sentence_seq)
entity_ids = np.argmax(entity_probs, axis=-1)[0]
pred_entity = [tags[class_id] for class_id in entity_ids]
print("Entity prediction: ", pred_entity)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Entity prediction:  ['B-CONTAINER', 'B-NUMBER', 'B-STYLE', 'B-SIZE', 'O', 'B-NOT-TOPPING', 'I-NOT-TOPPING', 'B-TOPPING', 'O', 'B-TOPPING', 'O', 'B-NOT-TOPPING', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [37]:
# Combine both prediction sequences into the structured order and pretty-print it.
# Note: this rebinds `output`, which earlier held the relation model's output tensor.
output = parse_order(test_sentence, pred_relation, pred_entity)
print("Final output: ")
print(json.dumps(output, indent=4))

Final output: 
{
    "ORDER": {
        "PIZZAORDER": [
            {
                "NUMBER": "a",
                "SIZE": "large",
                "STYLE": [
                    {
                        "NOT": false,
                        "TYPE": " exta"
                    }
                ],
                "AllTopping": [
                    {
                        "NOT": true,
                        "Quantity": null,
                        "Topping": " roasted red pepper"
                    },
                    {
                        "NOT": false,
                        "Quantity": null,
                        "Topping": " cheese"
                    },
                    {
                        "NOT": true,
                        "Quantity": null,
                        "Topping": " sausage"
                    }
                ]
            }
        ],
        "DRINKORDER": []
    }
}
