## Imports


In [231]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, word_tokenize, MWETokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", None)
np.random.seed(42)
tf.random.set_seed(42)

## Constants


In [232]:
dataset_dir = "dataset/"
CONTRACTIONS = {
    "n't": "not",
    "'s": "is",
    "'re": "are",
    "'m": "am",
    "'ll": "will",
    "'ve": "have",
    "'d": "would",
    "'em": "them",
    "'all": "all",
    "'cause": "because",
    "'clock": "oclock",
    "'tis": "it is",
    "'twas": "it was",
    "'tween": "between",
    "'twere": "it were",
    "'twould": "it would",
    "'twixt": "betwixt",
    "'twill": "it will",
    "'til": "until",
    "'bout": "about",
    "'cept": "except",
    "'cos": "because",
    "'fore": "before",
    "'round": "around",
    "'n'": "and",
    "'neath": "beneath",
    "'nother": "another",
    "'nuff": "enough",
}
negation_words = {
    "no",
    "not",
    "none",
    "never",
    "without",
    "avoid",
    "neither",
    "nor",
    "hate",
    "hold",
}
tokenizer = TreebankWordTokenizer()  # Treebank Word Tokenizer
lemmatizer = WordNetLemmatizer()  # WordNet Lemmatizer
# tokenizer = MWETokenizer() # Multi-Word Expression Tokenizer
stemmer = (
    PorterStemmer()
)  # Porter Stemmer  # changes ordering to order and cheese to chees it may help us
# stemmer = SnowballStemmer("english") # ! The same as porter stemmer
stop_negation_words = {"and", "but"}
stop_words = set(stopwords.words("english"))
stop_words = stop_words - negation_words - stop_negation_words
stop_words.update({"would", "like", "get", "want"})

## Text Processing


- Clean text from any unneeded characters


In [233]:
def clean_text(text):
    text = re.sub(r"[^\w']", " ", text)  # Remove non-word characters
    text = re.sub(r"\s+", " ", text)  # Remove multiple spaces
    text = text.lower().strip()  # Lowercase and strip whitespace
    return text

In [234]:
def expnad_abb(text):
    text = text.replace("can't", "can not")
    text = text.replace("won't", "will not")
    text = text.replace("n't", " not")
    text = text.replace("'ll", " will")
    text = text.replace("'ve", " have")
    text = text.replace("'re", " are")
    text = text.replace("'m", " am")
    text = text.replace("'d", " would")
    return text


def expnad_abb2(text):

    pattern = re.compile(
        r"(" + "|".join(re.escape(key) for key in CONTRACTIONS.keys()) + r")"
    )
    expanded_text = pattern.sub(lambda x: " " + CONTRACTIONS[x.group()], text)
    return expanded_text

In [235]:
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

In [236]:
def handle_negation(text):
    # Look for patterns like "no [word1] [word2] ..." and transform them
    words = text.split()
    transformed_words = []
    negation_flag = False  # To track if we're negating

    for i, word in enumerate(words):
        if word.lower() in negation_words:  # Trigger negation
            negation_flag = True
            continue  # Skip adding "no" to the transformed text
        elif negation_flag and (
            not re.match(r"[a-zA-Z]+", word) or word.lower() in stop_negation_words
        ):  # End negation on punctuation or 'and'
            negation_flag = False

        # Prefix "NOT_" if negation flag is set
        if negation_flag:
            transformed_words.append(f"NOT_{word}")
            if word in ["much"]:
                negation_flag = False
        else:
            if word.lower() not in stop_negation_words:
                transformed_words.append(word)

    return " ".join(transformed_words)

In [237]:
def tokenize_and_lemmatize(text: str):
    tokens = tokenizer.tokenize(text)
    final_string = ""
    stemmed_tokens = []
    for token in tokens:
        stemmed_token = (
            stemmer.stem(token, False) + " "
        )  # I think this is better than lemmatization
        final_string += stemmed_token + " "
        stemmed_tokens.append(stemmed_token)
        # final_string += lemmatizer.lemmatize(token) + " "  # Didn't do anything
    return tokens, stemmed_tokens, final_string

In [238]:
cleaned_text = handle_negation(
    remove_stopwords(
        expnad_abb2(
            clean_text(
                # "i want a lunch size pizza with no apple wood bacon but extra cheese" #Success
                # "I'll order some pizza for lunch and I don't eat spaghetti I eat Pasta WIth white sauce" # Failed
                # "i didn't eat from yesterday can you please order me 2 pizzas? I don't love pepperoni What about mushroom?" #Failed
                "I'm not ordering pizza with extra cheese I want Soda and I don't like your organization"
                # "I need pizza with extra cheese without cucumber or tomatoes add only pepperoni 'cause I love peperoni"  # Failed
            )
        )
    )
)
print(cleaned_text)

NOT_ordering NOT_pizza NOT_extra NOT_cheese NOT_soda NOT_organization


In [239]:
tokens, stemmed_tokens, processed_string = tokenize_and_lemmatize(cleaned_text)
print("Tokens:", tokens)
print("Stemmed Tokens:", stemmed_tokens)
print("Processed String:", processed_string)

Tokens: ['NOT_ordering', 'NOT_pizza', 'NOT_extra', 'NOT_cheese', 'NOT_soda', 'NOT_organization']
Stemmed Tokens: ['NOT_order ', 'NOT_pizza ', 'NOT_extra ', 'NOT_chees ', 'NOT_soda ', 'NOT_organ ']
Processed String: NOT_order  NOT_pizza  NOT_extra  NOT_chees  NOT_soda  NOT_organ  


In [240]:
df_train = pd.read_json(dataset_dir + "PIZZA_train.json", lines=True)
df_dev = pd.read_json(dataset_dir + "PIZZA_dev.json", lines=True)

In [241]:
df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [242]:
df_dev.head()

Unnamed: 0,dev.SRC,dev.EXR,dev.TOP,dev.PCFG_ERR
0,i want to order two medium pizzas with sausage...,(ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) ...,(ORDER i want to order (PIZZAORDER (NUMBER two...,False
1,five medium pizzas with tomatoes and ham,(ORDER (PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) ...,(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium...,False
2,i need to order one large vegetarian pizza wit...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i need to order (PIZZAORDER (NUMBER one...,False
3,i'd like to order a large onion and pepper pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like to order (PIZZAORDER (NUMBER a...,False
4,i'll have one pie along with pesto and ham but...,(ORDER (PIZZAORDER (NOT (TOPPING OLIVES ) ) (N...,(ORDER i'll have (PIZZAORDER (NUMBER one ) pie...,False


In [243]:
df_train["train.SRC"] = df_train["train.SRC"].apply(clean_text)
df_dev["dev.SRC"] = df_dev["dev.SRC"].apply(clean_text)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [244]:
# lemmatizer = WordNetLemmatizer()
# df_train['train.SRC'] = df_train['train.SRC'].apply(lemmatizer.lemmatize)
# df_train.head()

In [245]:
df_train["train.SRC"] = df_train["train.SRC"].apply(expnad_abb)
df_dev["dev.SRC"] = df_dev["dev.SRC"].apply(expnad_abb)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i would like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [246]:
df_train["train.SRC"] = df_train["train.SRC"].apply(remove_stopwords)
df_dev["dev.SRC"] = df_dev["dev.SRC"].apply(remove_stopwords)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie green pepper and extra peperonni,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie american cheese a...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [247]:
df_train["train.SRC"] = df_train["train.SRC"].apply(handle_negation)
df_dev["dev.SRC"] = df_dev["dev.SRC"].apply(handle_negation)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie green pepper extra peperonni,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie american cheese m...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [248]:
tokenizer = Tokenizer(num_words=500, oov_token="<OOV>")
tokenizer.fit_on_texts(
    df_train["train.SRC"]
)  # updates internal vocabulary based on a list of texts

sequences = tokenizer.texts_to_sequences(
    df_train["train.SRC"]
)  # transforms each text in texts to a sequence of integers
padded_sequences = pad_sequences(sequences, maxlen=100, padding="post")

KeyboardInterrupt: 

In [None]:
tokenizer.word_index

In [None]:
df_train["padded_seq"] = list(padded_sequences)
df_train.head()