In [9]:
# This notebook preprocesses and tokenizes the data, then labels each token.
import re
import nltk

def clean_string(input_string):
    """
    Normalizes raw text before tokenization.

    Every character that is not a word character (letter, digit, underscore),
    whitespace, an apostrophe or a hyphen is dropped; the result is lowercased
    and runs of whitespace are collapsed to a single space.

    Args:
        input_string: The string to be cleaned.

    Returns:
        The cleaned, lowercased string without leading/trailing whitespace.
    """
    # TODO: Add more special characters as needed to be excluded
    without_symbols = re.sub(r"[^\w\s'-]", "", input_string)
    lowered = without_symbols.lower()
    # Squeeze any whitespace run into one space, then trim both ends.
    return re.sub(r"\s+", " ", lowered).strip()

def tokenize_string(input_string):
    """
    Splits a string into tokens with NLTK's word tokenizer.

    Args:
        input_string: The string to be tokenized.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(input_string)

def label_tokens1(input_tokens, structure_text):
    """
    Assigns an entity label (BIO style) to every input token.

    The structured text is scanned for leaf groups such as ``(SIZE large )``.
    Each group's value is mapped to its attribute; a TOPPING group that is
    directly wrapped in ``(NOT ...)`` is mapped to ``NOT_TOPPING`` instead.
    A token that equals a whole value, or is the first word of a multi-word
    value, gets ``B_<ATTR>``; any other word of a value gets ``I_<ATTR>``;
    everything else gets ``NONE``.

    Limitation: matching is by word, not by position. If the same word occurs
    in several values (or several times in the input) the first value found
    wins for every occurrence.

    Args:
        input_tokens: The tokenized input text.
        structure_text: The structured text containing attributes and their values.

    Returns:
        A pair ``(labeled_output, labeled_output_nums)`` where ``labeled_output``
        is a list of ``(token, label)`` tuples and ``labeled_output_nums`` is the
        list of the corresponding integer label ids.
    """
    # A tuple (not a set) keeps the scan order, and therefore the tie-breaking
    # order of partially matching values, deterministic across runs.
    attribute_values = ("NUMBER", "SIZE", "TOPPING", "STYLE", "DRINKTYPE", "CONTAINERTYPE", "VOLUME", "QUANTITY")
    structure_map = {}
    for attribute in attribute_values:
        # Match the attribute and its value in the structure text
        pattern = r"\(\s*" + attribute + r"\s+([^\)]*)\s*\)"
        for match in re.finditer(pattern, structure_text):
            value = match.group(1).strip()
            # Use a per-match label so one negated topping does not change the
            # label of the toppings that follow it.
            label = attribute
            if attribute == "TOPPING":
                # match.start() is the "(" of "(TOPPING", so the text before it
                # ends with "(NOT" (plus optional spaces) when the topping is negated.
                preceding_text = structure_text[:match.start()]
                if re.search(r"\(\s*NOT\s*$", preceding_text):
                    label = "NOT_TOPPING"
            structure_map[value] = label

    entity_to_num = {"I_NUMBER": 0, "I_SIZE": 1, "I_TOPPING": 2, "I_STYLE": 3, "I_DRINKTYPE": 4, "I_CONTAINERTYPE": 5, "I_VOLUME": 6, "I_QUANTITY": 7, "B_NUMBER": 8, "B_SIZE": 9, "B_TOPPING": 10, "B_STYLE": 11, "B_DRINKTYPE": 12, "B_CONTAINERTYPE": 13, "B_VOLUME": 14, "B_QUANTITY": 15, "I_NOT_TOPPING": 16, "B_NOT_TOPPING": 17, "NONE": 18}
    labeled_output = []
    labeled_output_nums = []
    for token in input_tokens:
        label = "NONE"
        if token in structure_map:
            # The token is a complete (single-word) value.
            label = "B_" + structure_map[token]
        else:
            # Otherwise check whether it is one word of a multi-word value.
            for key, key_label in structure_map.items():
                words = key.split()
                if token in words:
                    prefix = "B_" if token == words[0] else "I_"
                    label = prefix + key_label
                    break
        labeled_output.append((token, label))
        labeled_output_nums.append(entity_to_num[label])
    return labeled_output, labeled_output_nums

def label_tokens2(input_tokens, structure_tokens):
    """
    Assigns an intent label (BIO style) to every input token.

    The tokenized structure is walked once while tracking which order-level
    group (PIZZAORDER, DRINKORDER or COMPLEX_TOPPING) is currently open. Every
    plain word met along the way is recorded with the label of the innermost
    open group: ``B_<INTENT>`` for the first word after the group opens,
    ``I_<INTENT>`` for later words and ``NONE`` outside of any group. Input
    tokens are then labelled by looking them up in that record.

    When a nested group (e.g. COMPLEX_TOPPING inside PIZZAORDER) closes, the
    enclosing group becomes current again and its remaining words continue
    with ``I_<INTENT>``.

    Limitation: the lookup is by word, so a word that occurs in several places
    of the structure keeps only the label recorded last.

    Args:
        input_tokens: The tokenized input text.
        structure_tokens: The tokenized structured text, with "(" and ")" as
            separate tokens.

    Returns:
        A pair ``(labeled_output, labeled_output_nums)`` where ``labeled_output``
        is a list of ``(token, label)`` tuples and ``labeled_output_nums`` is the
        list of the corresponding integer label ids.
    """
    attributes = ["PIZZAORDER", "DRINKORDER", "COMPLEX_TOPPING"]
    # Leaf attribute names carry no intent and are never recorded as words.
    excluded = {"NUMBER", "SIZE", "TOPPING", "STYLE", "DRINKTYPE", "CONTAINERTYPE", "VOLUME", "QUANTITY"}
    curr = "NONE"
    # Depth of currently open "(" groups.
    parentheses = 0
    # Stack of (intent, depth at which it was opened); lets a closing nested
    # group hand control back to the group that encloses it.
    open_intents = []
    is_begin = True
    labels_mapping = {}
    for token in structure_tokens:
        if token in attributes:
            open_intents.append((token, parentheses))
            curr = token
            is_begin = True
        elif token == "(":
            parentheses += 1
        elif token == ")":
            parentheses -= 1
            # Drop every intent whose own group has just been closed.
            while open_intents and open_intents[-1][1] > parentheses:
                open_intents.pop()
            curr = open_intents[-1][0] if open_intents else "NONE"
        elif token not in excluded:
            if curr == "NONE":
                labels_mapping[token] = curr
            elif is_begin:
                labels_mapping[token] = "B_" + curr
                is_begin = False
            else:
                labels_mapping[token] = "I_" + curr

    intent_to_num = {"I_PIZZAORDER": 0, "I_DRINKORDER": 1, "I_COMPLEX_TOPPING": 2, "B_PIZZAORDER": 3, "B_DRINKORDER": 4, "B_COMPLEX_TOPPING": 5, "NONE": 6}
    labeled_output = []
    labeled_output_nums = []
    for token in input_tokens:
        label = labels_mapping.get(token, "NONE")
        labeled_output.append((token, label))
        labeled_output_nums.append(intent_to_num[label])
    return labeled_output, labeled_output_nums

def label_input(input_text, structure_text1, structure_text2):
    """
    Cleans and tokenizes a raw input sentence, then labels its tokens twice:
    once with entity labels (label_tokens1) and once with intent labels
    (label_tokens2).

    Args:
        input_text: The raw input text.
        structure_text1: The structured text containing attributes and their values. (train.TOP-DECOUPLED)
        structure_text2: The structured text containing attributes and their values. (train.TOP)

    Returns:
        A pair ``(entity_result, intent_result)``; each element is whatever the
        corresponding labelling function returns for the input tokens.
    """
    tokens = tokenize_string(clean_string(input_text))
    entity_result = label_tokens1(tokens, structure_text1)
    # label_tokens2 works on tokens, so the second structure is tokenized first.
    intent_result = label_tokens2(tokens, tokenize_string(structure_text2))
    return entity_result, intent_result

def label_complete_input(input_list, structure_text1_list, structure_text2_list):
    """
    Batch version of label_input: labels many sentences at once and keeps only
    the numeric label ids.

    The three lists are consumed in parallel with ``zip``, so processing stops
    at the end of the shortest one.

    Args:
        input_list: List of raw input texts.
        structure_text1_list: List of structured texts passed to label_tokens1. (train.TOP-DECOUPLED)
        structure_text2_list: List of structured texts tokenized and passed to label_tokens2. (train.TOP)

    Returns:
        Two lists with one entry per sentence: the entity label ids from
        label_tokens1 and the intent label ids from label_tokens2.
    """
    entity_ids = []
    intent_ids = []
    triples = zip(input_list, structure_text1_list, structure_text2_list)
    for raw_text, first_structure, second_structure in triples:
        tokens = tokenize_string(clean_string(raw_text))
        _, sentence_entity_ids = label_tokens1(tokens, first_structure)
        entity_ids.append(sentence_entity_ids)
        _, sentence_intent_ids = label_tokens2(tokens, tokenize_string(second_structure))
        intent_ids.append(sentence_intent_ids)
    return entity_ids, intent_ids

In [11]:
# Hand-written examples used to eyeball the labelling functions.
# Each entry is (input text, TOP-DECOUPLED structure, TOP structure).
LABELING_EXAMPLES = [
    ("i'd like a large vegetarian pizza", "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) ) )", "(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) pizza ) )"),
    ("a 20 fl ounce cherry coke bottle", "(ORDER (DRINKORDER (NUMBER a ) (VOLUME 20 fl ounce ) (DRINKTYPE cherry coke ) (CONTAINERTYPE bottle ) ) )", "(ORDER (DRINKORDER (NUMBER a ) (VOLUME 20 fl ounce ) (DRINKTYPE cherry coke ) (CONTAINERTYPE bottle ) ) )"),
    ("four pizzas with american cheese and also three cans of ice tea and three regular san pellegrinos", "(ORDER (PIZZAORDER (NUMBER four ) (TOPPING american cheese ) ) (DRINKORDER (NUMBER three ) (CONTAINERTYPE cans ) (DRINKTYPE ice tea ) ) (DRINKORDER (NUMBER three ) (SIZE regular ) (DRINKTYPE san pellegrinos ) ) )", "(ORDER (PIZZAORDER (NUMBER four ) pizzas with (TOPPING american cheese ) ) and also (DRINKORDER (NUMBER three ) (CONTAINERTYPE cans ) of (DRINKTYPE ice tea ) ) and (DRINKORDER (NUMBER three ) (SIZE regular ) (DRINKTYPE san pellegrinos ) ) )"),
    ("i want one personal - size pie without any carrots", "(ORDER (PIZZAORDER (NUMBER one ) (SIZE personal - size ) (NOT (TOPPING carrots ) ) ) )", "(ORDER i want (PIZZAORDER (NUMBER one ) (SIZE personal - size ) pie without any (NOT (TOPPING carrots ) ) ) )"),
    ("can i have one high rise dough pie with american cheese and a lot of meatball", "(ORDER (PIZZAORDER (NUMBER one ) (STYLE high rise dough ) (TOPPING american cheese ) (COMPLEX_TOPPING (QUANTITY a lot of ) (TOPPING meatball ) ) ) )", "(ORDER can i have (PIZZAORDER (NUMBER one ) (STYLE high rise dough ) pie with (TOPPING american cheese ) and (COMPLEX_TOPPING (QUANTITY a lot of ) (TOPPING meatball ) ) ) )"),
    ("i'd like a lunch - sized pie without alfredo chicken", "(ORDER (PIZZAORDER (NUMBER a ) (SIZE lunch - sized ) (NOT (TOPPING alfredo chicken ) ) ) )", "(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE lunch - sized ) pie without (NOT (TOPPING alfredo chicken ) ) ) )"),
    ("i'd like a lunch - sized pie without alfredo chicken or beef", "(ORDER (PIZZAORDER (NUMBER a ) (SIZE lunch - sized ) (NOT (TOPPING alfredo chicken ) )(NOT (TOPPING beef ) ) ) )", "(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE lunch - sized ) pie without (NOT (TOPPING alfredo chicken ) ) or (NOT (TOPPING beef ) ) ) )"),
    ("pie with american cheese and with not much parmesan cheese", "(ORDER (PIZZAORDER (TOPPING american cheese ) (COMPLEX_TOPPING (QUANTITY not much ) (TOPPING parmesan cheese ) ) ) )", "(ORDER (PIZZAORDER pie with (TOPPING american cheese ) and with (COMPLEX_TOPPING (QUANTITY not much ) (TOPPING parmesan cheese ) ) ) )"),
    ("pie without american cheese and with parmesan cheese", "(ORDER (PIZZAORDER (NOT(TOPPING american cheese )) (TOPPING parmesan cheese ) ) )", "(ORDER (PIZZAORDER pie without (NOT(TOPPING american cheese )) and with (TOPPING parmesan cheese ) ) )"),
]

# Print the entity labelling ("1") and the intent labelling ("2") of every
# example under a numbered separator line.
for example_number, (example_text, example_decoupled, example_top) in enumerate(LABELING_EXAMPLES, start=1):
    out1, out2 = label_input(example_text, example_decoupled, example_top)
    print(f"------------------{example_number}------------------")
    print("1 ", out1)
    print("2 ", out2)

------------------1------------------
1  ([('i', 'NONE'), ("'d", 'NONE'), ('like', 'NONE'), ('a', 'B_NUMBER'), ('large', 'B_SIZE'), ('vegetarian', 'B_STYLE'), ('pizza', 'NONE')], [18, 18, 18, 8, 9, 11, 18])
2  ([('i', 'NONE'), ("'d", 'NONE'), ('like', 'NONE'), ('a', 'B_PIZZAORDER'), ('large', 'I_PIZZAORDER'), ('vegetarian', 'I_PIZZAORDER'), ('pizza', 'I_PIZZAORDER')], [6, 6, 6, 3, 0, 0, 0])
------------------2------------------
1  ([('a', 'B_NUMBER'), ('20', 'B_VOLUME'), ('fl', 'I_VOLUME'), ('ounce', 'I_VOLUME'), ('cherry', 'B_DRINKTYPE'), ('coke', 'I_DRINKTYPE'), ('bottle', 'B_CONTAINERTYPE')], [8, 14, 6, 6, 12, 4, 13])
2  ([('a', 'B_DRINKORDER'), ('20', 'I_DRINKORDER'), ('fl', 'I_DRINKORDER'), ('ounce', 'I_DRINKORDER'), ('cherry', 'I_DRINKORDER'), ('coke', 'I_DRINKORDER'), ('bottle', 'I_DRINKORDER')], [4, 1, 1, 1, 1, 1, 1])
------------------3------------------
1  ([('four', 'B_NUMBER'), ('pizzas', 'NONE'), ('with', 'NONE'), ('american', 'B_TOPPING'), ('cheese', 'I_TOPPING'), ('and',

In [7]:
import json 
def fix_json_file(path, output_path="../fixed_train.json"):
    """
    Turns a file holding one JSON record per line into a single JSON array.

    The records are copied unchanged, separated by ",\\n" and wrapped in
    "[" ... "]". Blank lines are skipped. The input is streamed line by line,
    so it is never held in memory as a whole.

    Args:
        path: Path to the line-delimited JSON file to read.
        output_path: Where the JSON array is written. Defaults to
            "../fixed_train.json".

    Returns:
        None. Writes the corrected version of the JSON file to disk.
    """
    # Both handles are managed by `with`, so they are closed even on errors.
    with open(path, "r") as source_file, open(output_path, "w") as fixed_file:
        fixed_file.write("[\n")
        wrote_record = False
        for line in source_file:
            # Strip only the line terminator; a last line without a trailing
            # newline must keep its final character.
            record = line.rstrip("\r\n")
            if not record.strip():
                continue
            # Writing the separator *before* each record (except the first)
            # avoids having to seek back and cut a trailing comma, which
            # depends on the platform's newline width.
            if wrote_record:
                fixed_file.write(",\n")
            fixed_file.write(record)
            wrote_record = True
        fixed_file.write("]")
# Wrap the per-line records of the given file into one JSON array; the result
# is written to "../fixed_train.json".
# NOTE(review): the input here is already named "fixed_PIZZA_train.json", and
# it is this same path (not "../fixed_train.json") that read_data loads later
# in the notebook - confirm which file is meant to be the raw input.
fix_json_file("../data/fixed_PIZZA_train.json")

In [10]:
def read_data(path, limit=100):
    """
    Reads a JSON file and returns the first ``limit`` items of its content.

    The whole file is parsed with ``json.load``; the result is then sliced,
    so the top-level JSON value is expected to be a list when ``limit`` is set.

    Args:
        path: Path to the JSON file.
        limit: Maximum number of leading items to return. Defaults to 100;
            pass ``None`` to return the complete parsed content.

    Returns:
        The parsed JSON data, truncated to ``limit`` items unless ``limit`` is None.
    """
    with open(path, 'r') as file:
        data = json.load(file)
    if limit is None:
        return data
    return data[:limit]
# Load the leading records of the training file returned by read_data; `data`
# feeds the corpus-building cell below.
data = read_data("../data/fixed_PIZZA_train.json")

In [11]:
def build_train_corpus_from_json(data):
    """
    Splits a list of training records into three parallel lists.

    Args:
        data: List of dictionaries, each containing the keys "train.SRC",
            "train.TOP" and "train.TOP-DECOUPLED".

    Returns:
        Three lists in record order: the "train.SRC" values, the "train.TOP"
        values and the "train.TOP-DECOUPLED" values.
    """
    rows = [
        (item["train.SRC"], item["train.TOP"], item["train.TOP-DECOUPLED"])
        for item in data
    ]
    if not rows:
        return [], [], []
    # Transpose the list of rows into one list per field.
    src, top, decoupled = (list(column) for column in zip(*rows))
    return src, top, decoupled
# Parallel lists: raw sentences, TOP structures and TOP-DECOUPLED structures.
src, top, decoupled = build_train_corpus_from_json(data)

In [12]:
# Show the first record of each list: source text, TOP, then TOP-DECOUPLED.
for field_values in (src, top, decoupled):
    print(field_values[0])

can i have a large bbq pulled pork
(ORDER can i have (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )
(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (TOPPING bbq pulled pork ) ) )


In [13]:
# label_complete_input expects (inputs, TOP-DECOUPLED structures, TOP structures):
# the decoupled text drives the entity labels, the full TOP text drives the intents.
entites, intents = label_complete_input(src, decoupled, top)
print(entites[0])
print(intents[0])
# Every sentence must get exactly one entity id and one intent id per token.
# The labels were produced from the *cleaned* text, so the reference token
# count has to be computed from the cleaned text as well.
for src_, ent_, intent_ in zip(src, entites, intents):
    assert len(tokenize_string(clean_string(src_))) == len(ent_) == len(intent_)

[18, 18, 18, 8, 9, 10, 2, 2]
[6, 6, 6, 3, 0, 0, 0, 0]


In [None]:
from gensim.models import Word2Vec, FastText  # For Word2Vec model
import gensim  # General Gensim utilities
import nltk  # For tokenization and natural language processing
import json  # For handling JSON files
# from transformers import BertTokenizer, BertModel  # BERT tokenizer and model
# import torch  # For PyTorch tensors and operations
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Hyperparameters for the Word2Vec model (read by train_gensim_w2v_model)
VECTOR_SIZE = 50  # Passed as `vector_size`: size of word vectors
WINDOW_SIZE = 5  # Passed as `window`: context window size
THREADS = 4  # Passed as `workers`: number of threads to use for training
CUTOFF_FREQ = 1  # Passed as `min_count`: minimum frequency for a word to be included in vocabulary
EPOCHS = 100  # Passed as `epochs` to model.train: number of training epochs

def list_of_lists(sentences):
    """
    Tokenizes every sentence with ``nltk.word_tokenize``.

    Args:
        sentences: List of strings where each string is a sentence.

    Returns:
        List of lists where each inner list contains the tokens of one
        sentence, in the same order as the input.
    """
    return [nltk.word_tokenize(sentence) for sentence in sentences]

def train_gensim_w2v_model(corpus):
    """
    Trains a Word2Vec model on the given corpus of sentences.

    The sentences are tokenized with list_of_lists, the vocabulary is built
    from them and the model is trained for EPOCHS epochs using the
    module-level hyperparameters.

    Args:
        corpus: List of sentences (strings).

    Returns:
        A trained Gensim Word2Vec model.
    """
    tokenized_sentences = list_of_lists(corpus)
    # Do not pass `sentences` to the constructor: doing so already builds the
    # vocabulary and runs a full training pass with the default epoch count,
    # which the explicit build_vocab()/train() calls below would then discard
    # and repeat.
    model = Word2Vec(
        vector_size=VECTOR_SIZE,
        window=WINDOW_SIZE,
        min_count=CUTOFF_FREQ,
        workers=THREADS,
    )
    model.build_vocab(tokenized_sentences)
    model.train(
        corpus_iterable=tokenized_sentences,
        total_examples=model.corpus_count,
        epochs=EPOCHS,
    )
    return model

def embed_gensim(model, word):
    """
    Looks up the embedding of a word in a trained Gensim model.
    Works for both Word2Vec and FastText models.

    Args:
        model: Trained Gensim model exposing its vectors as ``model.wv``.
        word: Word to retrieve the embedding for.

    Returns:
        Word embedding as a vector.
    """
    keyed_vectors = model.wv
    return keyed_vectors[word]

In [None]:
import torch
# Load the saved model object from disk and map its tensors onto the CPU.
# NOTE(review): weights_only=False makes torch.load unpickle arbitrary Python
# objects, which can execute code from the file - only load checkpoints from a
# trusted source, or switch to a state_dict with weights_only=True.
model = torch.load('./Models/model_100k.pth', weights_only=False, map_location=torch.device('cpu'))


In [None]:
# Label -> integer id lookups; ids are the positions in the lists below.
# These mirror the tables used inside label_tokens1 / label_tokens2.
entity_to_num = {
    label: index
    for index, label in enumerate([
        "I_NUMBER", "I_SIZE", "I_TOPPING", "I_STYLE",
        "I_DRINKTYPE", "I_CONTAINERTYPE", "I_VOLUME", "I_QUANTITY",
        "B_NUMBER", "B_SIZE", "B_TOPPING", "B_STYLE",
        "B_DRINKTYPE", "B_CONTAINERTYPE", "B_VOLUME", "B_QUANTITY",
        "I_NOT_TOPPING", "B_NOT_TOPPING", "NONE",
    ])
}
intent_to_num = {
    label: index
    for index, label in enumerate([
        "I_PIZZAORDER", "I_DRINKORDER", "I_COMPLEX_TOPPING",
        "B_PIZZAORDER", "B_DRINKORDER", "B_COMPLEX_TOPPING",
        "NONE",
    ])
}