In [56]:
# This notebook preprocesses the data: it cleans the raw text, tokenizes it, and labels the tokens.
import re
import nltk

def clean_string(input_string):
    """
    Normalizes a raw string: strips disallowed characters, lowercases the text,
    and collapses whitespace.

    Args:
        input_string: The string to be cleaned.

    Returns:
        The lowercased string with only word characters, whitespace,
        apostrophes, and hyphens kept, runs of whitespace reduced to a single
        space, and no leading or trailing whitespace.
    """
    # Drop every character that is not a word character (\w, which includes the
    # underscore), whitespace, an apostrophe, or a hyphen.
    # TODO: Add more special characters as needed to be excluded
    stripped = re.sub(r"[^\w\s'-]", "", input_string)
    # Lowercase only after stripping, then squeeze whitespace runs and trim the ends.
    return re.sub(r"\s+", " ", stripped.lower()).strip()

def tokenize_string(input_string):
    """
    Splits the input string into tokens by delegating to NLTK's word tokenizer.

    Args:
        input_string: The string to be tokenized.

    Returns:
        A list of tokens, exactly as produced by nltk.word_tokenize.
    """
    return nltk.word_tokenize(input_string)

def label_tokens(input_tokens, structure_text):
    """
    Labels each input token with the attribute it belongs to in the structured text.

    The structured text is scanned for every "(ATTRIBUTE value )" group of the
    known attributes. All occurrences are collected, so repeated attributes
    (for example several TOPPING groups) are all labeled.

    A token receives a label when it either equals a whole attribute value or is
    one of the whitespace-separated words of a value; a whole-value match takes
    precedence. Tokens that match nothing are labeled "NONE".

    When the same value or word appears under more than one attribute, the
    first one registered wins. Attributes are scanned in a fixed order and
    occurrences in text order, so the result is deterministic.

    Args:
        input_tokens: The tokenized input text.
        structure_text: The structured text containing attributes and their values.

    Returns:
        A list of (token, label) tuples, one per input token, in input order.
    """
    # A tuple (not a set) keeps the scan order, and therefore tie-breaking, stable across runs.
    attribute_names = ("NUMBER", "SIZE", "TOPPING", "STYLE", "DRINKTYPE", "CONTAINERTYPE", "VOLUME")

    # Full attribute value -> attribute name.
    structure_map = {}
    for attribute in attribute_names:
        # Match "(ATTRIBUTE value )" and capture everything up to the closing parenthesis.
        pattern = r"\(\s*" + attribute + r"\s+([^\)]*)\s*\)"
        # finditer (rather than search) so every occurrence of the attribute is captured.
        for match in re.finditer(pattern, structure_text):
            value = match.group(1).strip()
            if value:  # an empty group such as "(SIZE )" carries nothing to label
                structure_map.setdefault(value, attribute)

    # Individual word of a value -> attribute name, built once instead of
    # re-splitting every value for every token.
    word_labels = {}
    for value, attribute in structure_map.items():
        for word in value.split():
            word_labels.setdefault(word, attribute)

    labeled_output = []
    for token in input_tokens:
        # A whole-value match wins; otherwise fall back to word membership.
        label = structure_map.get(token)
        if label is None:
            label = word_labels.get(token, "NONE")
        labeled_output.append((token, label))
    return labeled_output

def label_input(input_text, structure_text):
    """
    End-to-end labeling of raw text: cleans the input, tokenizes it, and then
    labels the resulting tokens against the structured text.

    Args:
        input_text: The raw input text.
        structure_text: The structured text containing attributes and their values.

    Returns:
        A list of tuples where each token in the input text is paired with its corresponding label.
    """
    # Pipeline: clean_string -> tokenize_string -> label_tokens.
    return label_tokens(tokenize_string(clean_string(input_text)), structure_text)

In [57]:
# Sample orders: (print tag, raw utterance, bracketed structure annotation).
samples = (
    ("1 ", "i'd like a large vegetarian pizza", "(ORDER (PIZZAORDER (NUMBER a ) (SIZE large ) (STYLE vegetarian ) ) )"),
    ("2 ", "a 20 fl ounce cherry coke bottle", "(ORDER (DRINKORDER (NUMBER a ) (VOLUME 20 fl ounce ) (DRINKTYPE cherry coke ) (CONTAINERTYPE bottle ) ) )"),
    ("3 ", "three three-liter san pellegrinos in cans", "(ORDER (DRINKORDER (NUMBER three ) (VOLUME three-liter ) (DRINKTYPE san pellegrinos ) (CONTAINERTYPE in cans ) ) )"),
    ("4 ", "i want a personal pie without hams", "(ORDER (PIZZAORDER (NUMBER a ) (SIZE personal ) (NOT (TOPPING hams ) ) ) )"),
    ("5 ", "i want one personal - size pie without any carrots", "(ORDER (PIZZAORDER (NUMBER one ) (SIZE personal - size ) (NOT (TOPPING carrots ) ) ) )"),
)

# Label each sample and print it right away, keeping every result for later cells.
results = []
for tag, text, structure in samples:
    labeled = label_input(text, structure)
    print(tag, labeled)
    results.append(labeled)

# Keep the individual result names available to the rest of the notebook.
out1, out2, out3, out4, out5 = results

1  [('i', 'NONE'), ("'d", 'NONE'), ('like', 'NONE'), ('a', 'NUMBER'), ('large', 'SIZE'), ('vegetarian', 'STYLE'), ('pizza', 'NONE')]
2  [('a', 'NUMBER'), ('20', 'VOLUME'), ('fl', 'VOLUME'), ('ounce', 'VOLUME'), ('cherry', 'DRINKTYPE'), ('coke', 'DRINKTYPE'), ('bottle', 'CONTAINERTYPE')]
3  [('three', 'NUMBER'), ('three-liter', 'VOLUME'), ('san', 'DRINKTYPE'), ('pellegrinos', 'DRINKTYPE'), ('in', 'CONTAINERTYPE'), ('cans', 'CONTAINERTYPE')]
4  [('i', 'NONE'), ('want', 'NONE'), ('a', 'NUMBER'), ('personal', 'SIZE'), ('pie', 'NONE'), ('without', 'NONE'), ('hams', 'TOPPING')]
5  [('i', 'NONE'), ('want', 'NONE'), ('one', 'NUMBER'), ('personal', 'SIZE'), ('-', 'SIZE'), ('size', 'SIZE'), ('pie', 'NONE'), ('without', 'NONE'), ('any', 'NONE'), ('carrots', 'TOPPING')]
