In [13]:
# in this notebook we will do some preprocessing on the data and tokenization
import re
import nltk

def clean_string(input_string):
    """
    Normalizes a raw string before tokenization.

    Every character that is not a word character, whitespace, an apostrophe
    or a hyphen is dropped. The result is lower-cased, runs of whitespace are
    collapsed into a single space, and leading/trailing whitespace is trimmed.

    Args:
        input_string: The string to be cleaned.

    Returns:
        The cleaned string.
    """
    # TODO: Add more special characters as needed to be excluded
    # Anything outside word characters, whitespace, "'" and "-" is removed.
    disallowed = re.compile(r"[^\w\s'-]")
    whitespace_run = re.compile(r"\s+")

    lowered = disallowed.sub("", input_string).lower()
    return whitespace_run.sub(" ", lowered).strip()

def tokenize_string(input_string):
    """
    Splits a string into tokens using NLTK's word tokenizer.

    Args:
        input_string: The string to be tokenized.

    Returns:
        A list of tokens, as produced by ``nltk.word_tokenize``.
    """
    return nltk.word_tokenize(input_string)

def _inside_not_group(structure_text, position):
    """
    Tells whether ``position`` lies inside a ``(NOT ...)`` group that is still open.

    The text before ``position`` is scanned while keeping a stack of the group
    names that have been opened but not yet closed, so a ``NOT`` group that was
    already closed earlier in the text does not count.

    Args:
        structure_text: The full structured text.
        position: Index into ``structure_text`` to test.

    Returns:
        True if an unclosed group named NOT (case-insensitive) encloses ``position``.
    """
    open_groups = []
    for piece in re.finditer(r"\(\s*([^\s()]*)|\)", structure_text[:position]):
        if piece.group(0) == ")":
            # Tolerate unbalanced text: ignore a ")" with nothing left to close.
            if open_groups:
                open_groups.pop()
        else:
            open_groups.append(piece.group(1).upper())
    return "NOT" in open_groups


def label_tokens1(input_tokens, structure_text):
    """
    Labels each input token with the attribute whose value contains it.

    Every ``(ATTRIBUTE value)`` group found in ``structure_text`` is recorded as
    ``value -> ATTRIBUTE``. A TOPPING group enclosed by a still-open
    ``(NOT ...)`` group is recorded as ``NOT_TOPPING`` instead. A token is
    labelled by an exact match against a recorded value first, then by being
    one of the whitespace-separated words of a recorded value (first recorded
    value wins); otherwise it is labelled ``"NONE"``.

    Labels are keyed by text, not by position: if the same value appears under
    several attributes, the attribute processed last overwrites the earlier one.

    Args:
        input_tokens: The tokenized input text.
        structure_text: The structured text containing attributes and their values.

    Returns:
        A list of tuples where each token in the input text is paired with its corresponding label.
    """
    # A tuple (not a set) so the processing order, and therefore the outcome of
    # any value/word conflict between attributes, is the same on every run.
    attribute_values = ("NUMBER", "SIZE", "TOPPING", "STYLE", "DRINKTYPE", "CONTAINERTYPE", "VOLUME", "QUANTITY")
    structure_map = {}
    for attribute in attribute_values:
        # Match the attribute and its value in the structure text
        pattern = r"\(\s*" + attribute + r"\s+([^\)]*)\s*\)"
        for match in re.finditer(pattern, structure_text):
            value = match.group(1).strip()
            if not value:
                continue
            # Decide negation per match, without touching the loop variable, so
            # one negated topping cannot leak onto the toppings that follow it.
            label = attribute
            if attribute == "TOPPING" and _inside_not_group(structure_text, match.start()):
                label = "NOT_TOPPING"
            structure_map[value] = label

    # Word-level fallback for multi-word values; setdefault keeps the label of
    # the first recorded value that contains the word.
    word_labels = {}
    for value, label in structure_map.items():
        for word in value.split():
            word_labels.setdefault(word, label)

    labeled_output = []
    for token in input_tokens:
        if token in structure_map:
            label = structure_map[token]
        else:
            label = word_labels.get(token, "NONE")
        labeled_output.append((token, label))
    return labeled_output

def label_tokens2(input_tokens, structure_tokens):
    """
    Labels each input token with the order-level group it appears in.

    The structure tokens are walked while tracking which of PIZZAORDER,
    DRINKORDER or COMPLEX_TOPPING is currently open. Each "(" remembers the
    label in effect before it and the matching ")" restores that label, so the
    words following a nested COMPLEX_TOPPING group fall back to the enclosing
    order instead of staying COMPLEX_TOPPING.

    Labels are keyed by token text, not by position: a word that occurs in
    several contexts gets the label of its last occurrence in the structure.

    Args:
        input_tokens: The tokenized input text.
        structure_tokens: The tokenized structured text, with "(" and ")" as
            separate tokens.

    Returns:
        A list of tuples where each token in the input text is paired with its corresponding label.
    """
    attributes = ("PIZZAORDER", "DRINKORDER", "COMPLEX_TOPPING")
    # Leaf attribute names are structure markers, not words of the utterance.
    excluded = {"NUMBER", "SIZE", "TOPPING", "STYLE", "DRINKTYPE", "CONTAINERTYPE", "VOLUME", "QUANTITY"}

    curr = "NONE"
    # One entry per open "(": the label that was active before it was opened.
    saved_labels = []
    labels_mapping = {}
    for token in structure_tokens:
        if token == "(":
            saved_labels.append(curr)
        elif token == ")":
            # Tolerate an unbalanced ")" by leaving the current label alone.
            if saved_labels:
                curr = saved_labels.pop()
        elif token in attributes:
            curr = token
        elif token not in excluded:
            labels_mapping[token] = curr

    return [(token, labels_mapping.get(token, "NONE")) for token in input_tokens]

def label_input(input_text, structure_text1, structure_text2):
    """
    Cleans and tokenizes the raw input, then labels it against both structures.

    The input text goes through ``clean_string`` and ``tokenize_string``. The
    resulting tokens are labelled once with ``label_tokens1`` (which receives
    ``structure_text1`` as raw text) and once with ``label_tokens2`` (which
    receives ``structure_text2`` tokenized but not cleaned).

    Args:
        input_text: The raw input text.
        structure_text1: The structured text containing attributes and their values. (train.TOP-DECOUPLED)
        structure_text2: The structured text containing attributes and their values. (train.TOP)

    Returns:
        2 lists of tuples where each token in the input text is paired with its corresponding label.
    """
    input_tokens = tokenize_string(clean_string(input_text))
    attribute_labels = label_tokens1(input_tokens, structure_text1)
    order_labels = label_tokens2(input_tokens, tokenize_string(structure_text2))
    return attribute_labels, order_labels

In [14]:
# Smoke-test label_input on a handful of hand-picked orders.
# Each entry is (raw utterance, structure_text1, structure_text2), in the
# argument order expected by label_input.
examples = [
    (
        "i'd like a large vegetarian pizza",
        "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) ) )",
        "(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) pizza ) )",
    ),
    (
        "a 20 fl ounce cherry coke bottle",
        "(ORDER (DRINKORDER (NUMBER a ) (VOLUME 20 fl ounce ) (DRINKTYPE cherry coke ) (CONTAINERTYPE bottle ) ) )",
        "(ORDER (DRINKORDER (NUMBER a ) (VOLUME 20 fl ounce ) (DRINKTYPE cherry coke ) (CONTAINERTYPE bottle ) ) )",
    ),
    (
        "four pizzas with american cheese and also three cans of ice tea and three regular san pellegrinos",
        "(ORDER (PIZZAORDER (NUMBER four ) (TOPPING american cheese ) ) (DRINKORDER (NUMBER three ) (CONTAINERTYPE cans ) (DRINKTYPE ice tea ) ) (DRINKORDER (NUMBER three ) (SIZE regular ) (DRINKTYPE san pellegrinos ) ) )",
        "(ORDER (PIZZAORDER (NUMBER four ) pizzas with (TOPPING american cheese ) ) and also (DRINKORDER (NUMBER three ) (CONTAINERTYPE cans ) of (DRINKTYPE ice tea ) ) and (DRINKORDER (NUMBER three ) (SIZE regular ) (DRINKTYPE san pellegrinos ) ) )",
    ),
    (
        "i want one personal - size pie without any carrots",
        "(ORDER (PIZZAORDER (NUMBER one ) (SIZE personal - size ) (NOT (TOPPING carrots ) ) ) )",
        "(ORDER i want (PIZZAORDER (NUMBER one ) (SIZE personal - size ) pie without any (NOT (TOPPING carrots ) ) ) )",
    ),
    (
        "can i have one high rise dough pie with american cheese and a lot of meatball",
        "(ORDER (PIZZAORDER (NUMBER one ) (STYLE high rise dough ) (TOPPING american cheese ) (COMPLEX_TOPPING (QUANTITY a lot of ) (TOPPING meatball ) ) ) )",
        "(ORDER can i have (PIZZAORDER (NUMBER one ) (STYLE high rise dough ) pie with (TOPPING american cheese ) and (COMPLEX_TOPPING (QUANTITY a lot of ) (TOPPING meatball ) ) ) )",
    ),
    (
        "i'd like a lunch - sized pie without alfredo chicken",
        "(ORDER (PIZZAORDER (NUMBER a ) (SIZE lunch - sized ) (NOT (TOPPING alfredo chicken ) ) ) )",
        "(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE lunch - sized ) pie without (NOT (TOPPING alfredo chicken ) ) ) )",
    ),
    (
        "i'd like a lunch - sized pie without alfredo chicken or beef",
        "(ORDER (PIZZAORDER (NUMBER a ) (SIZE lunch - sized ) (NOT (TOPPING alfredo chicken ) )(NOT (TOPPING beef ) ) ) )",
        "(ORDER i'd like (PIZZAORDER (NUMBER a ) (SIZE lunch - sized ) pie without (NOT (TOPPING alfredo chicken ) ) or (NOT (TOPPING beef ) ) ) )",
    ),
]

for example_number, (utterance, structure_text1, structure_text2) in enumerate(examples, start=1):
    out1, out2 = label_input(utterance, structure_text1, structure_text2)
    print(f"------------------{example_number}------------------")
    print("1 ", out1)
    print("2 ", out2)

------------------1------------------
1  [('i', 'NONE'), ("'d", 'NONE'), ('like', 'NONE'), ('a', 'NUMBER'), ('large', 'SIZE'), ('vegetarian', 'STYLE'), ('pizza', 'NONE')]
2  [('i', 'NONE'), ("'d", 'NONE'), ('like', 'NONE'), ('a', 'PIZZAORDER'), ('large', 'PIZZAORDER'), ('vegetarian', 'PIZZAORDER'), ('pizza', 'PIZZAORDER')]
------------------2------------------
1  [('a', 'NUMBER'), ('20', 'VOLUME'), ('fl', 'VOLUME'), ('ounce', 'VOLUME'), ('cherry', 'DRINKTYPE'), ('coke', 'DRINKTYPE'), ('bottle', 'CONTAINERTYPE')]
2  [('a', 'DRINKORDER'), ('20', 'DRINKORDER'), ('fl', 'DRINKORDER'), ('ounce', 'DRINKORDER'), ('cherry', 'DRINKORDER'), ('coke', 'DRINKORDER'), ('bottle', 'DRINKORDER')]
------------------3------------------
1  [('four', 'NUMBER'), ('pizzas', 'NONE'), ('with', 'NONE'), ('american', 'TOPPING'), ('cheese', 'TOPPING'), ('and', 'NONE'), ('also', 'NONE'), ('three', 'NUMBER'), ('cans', 'CONTAINERTYPE'), ('of', 'NONE'), ('ice', 'DRINKTYPE'), ('tea', 'DRINKTYPE'), ('and', 'NONE'), ('th