In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", None)
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
dataset_dir = 'E:/Collage/NLP/Project/dataset/'

negation_words = {"no", "not", "none", "never","without", "avoid", "neither", "nor", "hate", "hold"}
stop_negation_words = {'and','but'}
stop_words = set(stopwords.words("english")) 
stop_words = stop_words - negation_words - stop_negation_words
stop_words.update({'would','like','get','want'})

In [3]:
def clean_text(text):
    text = re.sub(r"[^\w']", ' ', text)   # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    text = text.lower().strip()       # Lowercase and strip whitespace
    return text

In [4]:
def expnad_abb(text):
    text = text.replace("can't", "can not")
    text = text.replace("won't", "will not")
    text = text.replace("n't", " not")
    text = text.replace("'ll", " will")
    text = text.replace("'ve", " have")
    text = text.replace("'re", " are")
    text = text.replace("'m", " am")
    text = text.replace("'d", " would")
    return text

In [5]:
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

In [6]:
def handle_negation(text):
    # Look for patterns like "no [word1] [word2] ..." and transform them
    words = text.split()
    transformed_words = []
    negation_flag = False  # To track if we're negating
    
    for i, word in enumerate(words):
        if word.lower() in negation_words:  # Trigger negation
            negation_flag = True
            continue  # Skip adding "no" to the transformed text
        elif negation_flag and (not re.match(r"[a-zA-Z]+", word) or word.lower() in stop_negation_words):  # End negation on punctuation or 'and'
            negation_flag = False
        
        # Prefix "NOT_" if negation flag is set
        if negation_flag:
            transformed_words.append(f"NOT_{word}")
            if word in ['much']:
                negation_flag = False
        else:
            if word.lower() not in stop_negation_words:
                transformed_words.append(word)
    
    return " ".join(transformed_words)

In [7]:
handle_negation(remove_stopwords(expnad_abb(clean_text("i want a lunch size pizza with no apple wood bacon but extra cheese"))))

'lunch size pizza NOT_apple NOT_wood NOT_bacon extra cheese'

In [8]:
df_train = pd.read_json(dataset_dir + 'PIZZA_train.json' , lines=True)
df_dev = pd.read_json(dataset_dir + 'PIZZA_dev.json' , lines=True)

In [9]:
df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [10]:
df_dev.head()

Unnamed: 0,dev.SRC,dev.EXR,dev.TOP,dev.PCFG_ERR
0,i want to order two medium pizzas with sausage...,(ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) ...,(ORDER i want to order (PIZZAORDER (NUMBER two...,False
1,five medium pizzas with tomatoes and ham,(ORDER (PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) ...,(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium...,False
2,i need to order one large vegetarian pizza wit...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i need to order (PIZZAORDER (NUMBER one...,False
3,i'd like to order a large onion and pepper pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like to order (PIZZAORDER (NUMBER a...,False
4,i'll have one pie along with pesto and ham but...,(ORDER (PIZZAORDER (NOT (TOPPING OLIVES ) ) (N...,(ORDER i'll have (PIZZAORDER (NUMBER one ) pie...,False


In [11]:
df_train['train.SRC'] = df_train['train.SRC'].apply(clean_text)
df_dev['dev.SRC'] = df_dev['dev.SRC'].apply(clean_text)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [12]:
# lemmatizer = WordNetLemmatizer()
# df_train['train.SRC'] = df_train['train.SRC'].apply(lemmatizer.lemmatize)
# df_train.head()

In [13]:
df_train['train.SRC'] = df_train['train.SRC'].apply(expnad_abb)
df_dev['dev.SRC'] = df_dev['dev.SRC'].apply(expnad_abb)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i would like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [14]:
df_train['train.SRC'] = df_train['train.SRC'].apply(remove_stopwords)
df_dev['dev.SRC'] = df_dev['dev.SRC'].apply(remove_stopwords)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie green pepper and extra peperonni,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie american cheese a...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [15]:
df_train['train.SRC'] = df_train['train.SRC'].apply(handle_negation)
df_dev['dev.SRC'] = df_dev['dev.SRC'].apply(handle_negation)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie green pepper extra peperonni,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie american cheese m...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [16]:

tokenizer = Tokenizer(num_words=500, oov_token='<OOV>')
tokenizer.fit_on_texts(df_train['train.SRC']) #updates internal vocabulary based on a list of texts

sequences = tokenizer.texts_to_sequences(df_train['train.SRC']) #transforms each text in texts to a sequence of integers
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post')


In [17]:
tokenizer.word_index

{'<OOV>': 1,
 'not': 2,
 'three': 3,
 'pizzas': 4,
 'pizza': 5,
 'cheese': 6,
 'four': 7,
 'pies': 8,
 'party': 9,
 'five': 10,
 'american': 11,
 'sized': 12,
 'one': 13,
 'two': 14,
 'size': 15,
 'sprite': 16,
 'pepper': 17,
 'glaze': 18,
 'ice': 19,
 'large': 20,
 'balsamic': 21,
 'peppers': 22,
 'ounce': 23,
 'pie': 24,
 'crust': 25,
 'tea': 26,
 'thin': 27,
 'sauce': 28,
 'ups': 29,
 'extra': 30,
 'diet': 31,
 'green': 32,
 'seven': 33,
 'medium': 34,
 'also': 35,
 'personal': 36,
 'roasted': 37,
 'red': 38,
 'teas': 39,
 '500': 40,
 'ginger': 41,
 'pecorino': 42,
 'peperonni': 43,
 'cans': 44,
 'chicken': 45,
 'banana': 46,
 'milliliter': 47,
 'need': 48,
 'fantas': 49,
 'little': 50,
 'ale': 51,
 'liter': 52,
 'lunch': 53,
 'bottle': 54,
 'sprites': 55,
 '20': 56,
 'coke': 57,
 'mozzarella': 58,
 'onions': 59,
 'ml': 60,
 'onion': 61,
 'bit': 62,
 'olive': 63,
 'pellegrino': 64,
 'pineapple': 65,
 'san': 66,
 'regular': 67,
 'fl': 68,
 'yellow': 69,
 'tomato': 70,
 'caramelized':

In [None]:
df_train['padded_seq'] = list(padded_sequences)
df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED,padded_seq
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...,"[67, 21, 74, 4, 28, 103, 131, 132, 0, 0, 0, 0,..."
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...,"[28, 32, 3, 40, 24, 2, 3, 38, 51, 0, 0, 0, 0, ..."
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...,"[8, 9, 4, 28, 275, 7, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,"[13, 22, 144, 33, 32, 3, 15, 10, 2, 3, 181, 0,..."
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...,"[67, 21, 74, 17, 44, 16, 170, 0, 0, 0, 0, 0, 0..."
