In [42]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", None)
np.random.seed(42)
tf.random.set_seed(42)

In [3]:
dataset_dir = 'E:/Collage/NLP/Project/dataset/'

In [21]:
def clean_text(text):
    text = re.sub(r"[^\w']", ' ', text)   # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
    text = text.lower().strip()       # Lowercase and strip whitespace
    return text

In [24]:
df_train = pd.read_json(dataset_dir + 'PIZZA_train.json' , lines=True)
df_dev = pd.read_json(dataset_dir + 'PIZZA_dev.json' , lines=True)

In [25]:
df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [26]:
df_dev.head()

Unnamed: 0,dev.SRC,dev.EXR,dev.TOP,dev.PCFG_ERR
0,i want to order two medium pizzas with sausage...,(ORDER (PIZZAORDER (NUMBER 2 ) (SIZE MEDIUM ) ...,(ORDER i want to order (PIZZAORDER (NUMBER two...,False
1,five medium pizzas with tomatoes and ham,(ORDER (PIZZAORDER (NUMBER 5 ) (SIZE MEDIUM ) ...,(ORDER (PIZZAORDER (NUMBER five ) (SIZE medium...,False
2,i need to order one large vegetarian pizza wit...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i need to order (PIZZAORDER (NUMBER one...,False
3,i'd like to order a large onion and pepper pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like to order (PIZZAORDER (NUMBER a...,False
4,i'll have one pie along with pesto and ham but...,(ORDER (PIZZAORDER (NOT (TOPPING OLIVES ) ) (N...,(ORDER i'll have (PIZZAORDER (NUMBER one ) pie...,False


In [27]:
df_train['train.SRC'] = df_train['train.SRC'].apply(clean_text)
df_dev['dev.SRC'] = df_dev['dev.SRC'].apply(clean_text)

df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...


In [44]:
ps = PorterStemmer()
df_train['train.SRC'] = df_train['train.SRC'].apply(ps.stem)
df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED,padded_seq
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...,"[67, 21, 74, 4, 28, 103, 131, 132, 0, 0, 0, 0,..."
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...,"[28, 32, 3, 40, 24, 2, 3, 38, 51, 0, 0, 0, 0, ..."
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...,"[8, 9, 4, 28, 275, 7, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,"[13, 22, 144, 33, 32, 3, 15, 10, 2, 3, 181, 0,..."
4,can i have one personal sized artichok,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...,"[67, 21, 74, 17, 44, 16, 170, 0, 0, 0, 0, 0, 0..."


In [45]:
tokenizer = Tokenizer(num_words=500, oov_token="<OOV>")
tokenizer.fit_on_texts(df_train['train.SRC']) #updates internal vocabulary based on a list of texts

sequences = tokenizer.texts_to_sequences(df_train['train.SRC']) #transforms each text in texts to a sequence of integers
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post')


In [46]:
tokenizer.word_index

{'<OOV>': 1,
 'and': 2,
 'with': 3,
 'a': 4,
 'three': 5,
 'pizzas': 6,
 'pizza': 7,
 "i'd": 8,
 'like': 9,
 'four': 10,
 'pies': 11,
 'cheese': 12,
 'party': 13,
 'five': 14,
 'american': 15,
 'sized': 16,
 'one': 17,
 'no': 18,
 'two': 19,
 'of': 20,
 'i': 21,
 'size': 22,
 'without': 23,
 'ice': 24,
 'large': 25,
 'glaze': 26,
 'pepp': 27,
 'balsamic': 28,
 'tea': 29,
 'ounce': 30,
 'sprite': 31,
 'pie': 32,
 'crust': 33,
 'thin': 34,
 'extra': 35,
 'che': 36,
 'pepper': 37,
 'diet': 38,
 'green': 39,
 'seven': 40,
 'medium': 41,
 'also': 42,
 'personal': 43,
 'roasted': 44,
 'red': 45,
 'ups': 46,
 'pellegrino': 47,
 '500': 48,
 'ginger': 49,
 'sprit': 50,
 'pecorino': 51,
 'chicken': 52,
 'onion': 53,
 'peperonni': 54,
 'peppers': 55,
 'banana': 56,
 'milliliter': 57,
 'need': 58,
 'little': 59,
 'cans': 60,
 'liter': 61,
 'lunch': 62,
 'sauc': 63,
 'any': 64,
 'fantas': 65,
 'ale': 66,
 'bottle': 67,
 'can': 68,
 'tomato': 69,
 '20': 70,
 'mozzarella': 71,
 'want': 72,
 'the': 73

In [None]:
df_train['padded_seq'] = list(padded_sequences)
df_train.head()

Unnamed: 0,train.SRC,train.EXR,train.TOP,train.TOP-DECOUPLED,padded_seq
0,can i have a large bbq pulled pork,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...,"[67, 21, 74, 4, 28, 103, 131, 132, 0, 0, 0, 0,..."
1,large pie with green pepper and with extra pep...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER (PIZZAORDER (SIZE large ) pie with (TOP...,(ORDER (PIZZAORDER (SIZE large ) (TOPPING gree...,"[28, 32, 3, 40, 24, 2, 3, 38, 51, 0, 0, 0, 0, ..."
2,i'd like a large vegetarian pizza,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE LARGE ) (...,(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE ...,(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (...,"[8, 9, 4, 28, 275, 7, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,party size stuffed crust pie with american che...,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PARTY_SIZ...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,(ORDER (PIZZAORDER (SIZE party size ) (STYLE s...,"[13, 22, 144, 33, 32, 3, 15, 10, 2, 3, 181, 0,..."
4,can i have one personal sized artichoke,(ORDER (PIZZAORDER (NUMBER 1 ) (SIZE PERSONAL_...,(ORDER can i have (PIZZAORDER (NUMBER one ) (S...,(ORDER (PIZZAORDER (NUMBER one ) (SIZE persona...,"[67, 21, 74, 17, 44, 16, 170, 0, 0, 0, 0, 0, 0..."
