# JERTmate data processing

Compile synthetic data and process it into usable formats.

#### Basic definitions

In [3]:
import json
import random
import os
import re
import numpy as np
from transformers import AutoTokenizer
from transformers import TFBertModel

# Checkpoint name shared by the tokenizer and the encoder loaded below.
model_name = "bert-base-uncased"
# Both objects are created with `from_pretrained` from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert = TFBertModel.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

#### Compile Synthetic Data

In [1]:
# Folder -> intents stored in that folder. Every intent lives in
# "../<folder>/<intent>.txt"; the table order fixes the key order of
# `sentence_files`.
_SENTENCE_FOLDERS = (
    # miscellaneous
    ("Greeting_Farewell", ("greeting", "farewell")),
    ("Out_of_Scope", ("out_of_scope",)),
    ("Prompted_Name", ("prompted_name",)),
    ("Confirm_Deny", ("confirm", "deny")),
    # reservation
    ("Reservation", (
        "cancel_res",
        "view_res",
        "order_status",
        "change_res_info",
        "add_res_info",
        "prompted_res_inputs",
    )),
    # order
    ("Order", (
        "add_order_info",
        "change_order_info",
        "view_order",
        "cancel_order",
        "checkout",
        "prompted_order_inputs",
    )),
    # inquiry
    ("Inquiry", ("menu_inquiry", "location_inquiry", "hours_inquiry")),
)

# intent name -> path of the text file holding that intent's sentences
sentence_files = {
    intent: f"../{folder}/{intent}.txt"
    for folder, intents in _SENTENCE_FOLDERS
    for intent in intents
}

# Shared storage objects to store found arrays of sentences
storage = {}                  # key -> lines not yet drawn in the current pass
storage_copy = {}             # key -> complete list of lines, never drawn from
with_strings_storage = {}     # pools used by get_random_line_with_strings
without_strings_storage = {}  # pools used by get_random_line_without_strings


def _read_lines(path):
    """Return every line of the text file at `path`, stripped of whitespace."""
    with open(path, 'r') as file:
        return [line.strip() for line in file]


for intent, file_path in sentence_files.items():
    storage[intent] = _read_lines(file_path)

# dict.copy() is shallow and would share the very lists get_random_line()
# removes items from, so the "master" copy would drain together with
# `storage`. Copy every list instead.
storage_copy = {intent: list(lines) for intent, lines in storage.items()}


def get_random_line(intent):
    """Draw a random line for `intent` without repeating until all were used.

    Args:
        intent: A key of `sentence_files`, or the path of a text file. Call
            sites in this file pass both forms, so a key that is not known
            yet is treated as a path and loaded on first use.

    Returns:
        One stripped line. A line is not returned again until every other
        line of the same pool has been drawn.

    Raises:
        KeyError: `intent` is neither a known key nor a readable file.
        IndexError: The pool for `intent` contains no lines at all.
    """
    if intent not in storage_copy:
        try:
            storage_copy[intent] = _read_lines(intent)
        except OSError as err:
            raise KeyError(intent) from err
        storage[intent] = []

    if not storage.get(intent):
        # refill from a *copy* so the master list is never mutated
        storage[intent] = list(storage_copy[intent])

    selected_entity = random.choice(storage[intent])
    storage[intent].remove(selected_entity)
    return selected_entity

# select a random line of an intent that contains every string in `strings`
def get_random_line_with_strings(intent, strings):
    """Draw a random sentence of `intent` that contains all of `strings`.

    Matching sentences are kept in a pool per (intent, strings) combination
    and drawn without replacement; the pool is rebuilt from
    `storage_copy[intent]` once it runs empty.

    Args:
        intent: Key into `storage_copy`.
        strings: Substrings the sentence must contain (order is irrelevant).

    Returns:
        The selected sentence, stripped.

    Raises:
        ValueError: No sentence of `intent` contains all of `strings`.
    """
    # The intent is part of the key: the same set of strings requested for
    # two intents must not share one pool.
    combo_key = (intent, *sorted(strings))

    pool = with_strings_storage.get(combo_key)
    if not pool:
        pool = [
            sentence
            for sentence in storage_copy[intent]
            if all(string in sentence for string in strings)
        ]
        with_strings_storage[combo_key] = pool

    if not pool:
        raise ValueError(f"No matching sentence found containing all strings: {strings}")

    selected_entity = random.choice(pool)
    pool.remove(selected_entity)
    return selected_entity.strip()

def get_random_line_without_strings(intent, strings):
    """Draw a random sentence of `intent` that contains none of `strings`.

    Matching sentences are kept in a pool per (intent, strings) combination
    and drawn without replacement; the pool is rebuilt from
    `storage_copy[intent]` once it runs empty.

    Args:
        intent: Key into `storage_copy`.
        strings: Substrings the sentence must not contain.

    Returns:
        The selected sentence, stripped.

    Raises:
        ValueError: Every sentence of `intent` contains one of `strings`.
    """
    # The intent is part of the key: the same set of strings requested for
    # two intents must not share one pool.
    combo_key = (intent, *sorted(strings))

    pool = without_strings_storage.get(combo_key)
    if not pool:
        pool = [
            sentence
            for sentence in storage_copy[intent]
            if all(string not in sentence for string in strings)
        ]
        without_strings_storage[combo_key] = pool

    if not pool:
        raise ValueError(f"No matching sentence found without any of the strings: {strings}")

    selected_entity = random.choice(pool)
    pool.remove(selected_entity)
    return selected_entity.strip()

def randomize_greeting():
    """Return a '|greeting:<sentence>' segment with 70% probability, else ''."""
    if random.random() < 0.7:
        # sentences are looked up by intent name, not by file path
        return '|greeting:' + get_random_line("greeting")
    return ''

def randomize_farewell():
    """Return a '|farewell:<sentence>' segment with 70% probability, else ''."""
    if random.random() < 0.7:
        # sentences are looked up by intent name, not by file path
        return '|farewell:' + get_random_line("farewell")
    return ''

def _pick_weighted_intent(sentence_type_chances, sentence_count):
    """Draw one intent name using the weight that applies at this turn.

    Each value of `sentence_type_chances` is a list of weights indexed by turn;
    turns past the end of a list reuse its last weight, an empty list counts
    as weight 0.
    """
    intents = list(sentence_type_chances)
    weights = [
        chances[min(sentence_count, len(chances) - 1)] if chances else 0
        for chances in sentence_type_chances.values()
    ]
    # random.choices always returns an element, so rounding in the weight sum
    # can no longer leave the intent unselected.
    return random.choices(intents, weights=weights, k=1)[0]


def compile_reservation_sentences(output_file, num_reservation_sentences):
    """Write synthetic reservation conversations to `output_file`.

    Every conversation becomes one line made of '|<intent>:<sentence>'
    segments, optionally starting with a greeting and ending with a farewell.

    Args:
        output_file: Path of the text file to (over)write.
        num_reservation_sentences: Number of conversations to generate.
    """
    all_entities = ["NAME", "TIME", "DATE", "NUMBER"]

    # define chances
    sentence_type_chances = {
        "add_res_info": [0.80],
        "cancel_res": [0.03],
        "view_res": [0.05],
        "change_res_info": [0.06],

        "menu_inquiry": [0.02],
        "hours_inquiry": [0.02],
        "location_inquiry": [0.01],

        "out_of_scope": [0.01],
    }

    # intents that talk about an already existing reservation
    reference_intents = ("cancel_res", "view_res", "change_res_info")

    with open(output_file, "w") as file:
        for _ in range(num_reservation_sentences):
            # greeting
            conversation = randomize_greeting()

            # track provided entities
            provided_entities = []

            # order content
            # NOTE(review): the loop below ends only when a cancel_res turn
            # also wins a 35% draw; a completed and confirmed reservation does
            # not end the conversation. Confirm this is intended.
            sentence_count = 0
            while True:
                selected_intent = _pick_weighted_intent(sentence_type_chances, sentence_count)

                if sentence_count > 0 and selected_intent in reference_intents:
                    # if data has already been provided, we can only reference the res obj using that data
                    unused_entities = [entity for entity in all_entities if entity not in provided_entities]
                    conversation += '|' + selected_intent + ":" + get_random_line_without_strings(selected_intent, unused_entities)
                elif selected_intent == "add_res_info":
                    # entities the user has not given yet
                    possible_entities = [entity for entity in all_entities if entity not in provided_entities]

                    # chance of a 1st, 2nd, 3rd and 4th entity in one sentence;
                    # opening sentences tend to carry more of them
                    if sentence_count == 0:
                        extra_entity_chances = [1, 0.6, 0.2, 0.03]
                    else:
                        extra_entity_chances = [1, 0.25, 0.10, 0]

                    selected_entities = []
                    for probability in extra_entity_chances:
                        if random.random() < probability and possible_entities:
                            # select a random entity and remove it from further possible choices
                            selected_entity = random.choice(possible_entities)
                            possible_entities.remove(selected_entity)
                            selected_entities.append(selected_entity)

                    # remember which entities were provided so they cannot be added again
                    provided_entities.extend(selected_entities)

                    # a single entity is often given as a bare or prompted answer
                    if len(selected_entities) == 1 and random.random() < 0.7:
                        if random.random() < 0.5:
                            conversation += '|add_res_info:' + selected_entities[0] + ",0,0"
                        else:
                            conversation += '|add_res_info:' + get_random_line_with_strings("prompted_res_inputs", selected_entities)
                    else:
                        conversation += '|add_res_info:' + get_random_line_with_strings("add_res_info", selected_entities)

                    # check if we have used all necessary entities
                    if not possible_entities:
                        # simulate confirmation or denial
                        while random.random() < 0.1:
                            conversation += '|deny:' + get_random_line("deny")

                            if random.random() < 0.5:
                                conversation += '|change_res_info:' + get_random_line("change_res_info")
                            else:
                                conversation += '|add_res_info:' + get_random_line("add_res_info")
                        conversation += '|confirm:' + get_random_line("confirm")

                    sentence_count += 1
                else:
                    conversation += '|' + selected_intent + ":" + get_random_line(selected_intent)

                # perform alterations based on intent and generated sentence
                if selected_intent == "cancel_res":
                    if random.random() < 0.35:
                        break  # end convo
                    # reset conversation ticker
                    sentence_count = 0

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

def compile_order_sentences(output_file, num_order_sentences):
    """Write synthetic order conversations to `output_file`, one per line.

    A conversation is built from '|<intent>:<sentence>' segments, framed by an
    optional greeting and farewell. The loop for one conversation ends on
    `checkout`, or on `cancel_order` when a 50% draw succeeds.

    NOTE(review): this function looks unfinished. The cart bookkeeping and the
    PREV_ matching branch contain constructs that raise at runtime; they are
    marked with NOTE(review) comments below and need a decision on the
    intended behaviour before they can be fixed.

    Args:
        output_file: Path of the text file to (over)write.
        num_order_sentences: Number of conversations to generate.
    """
    with open(output_file, "w") as file:
        for i in range(num_order_sentences):
            conversation = ''

            # greeting
            conversation += randomize_greeting()

            # define chances
            # Each list holds the weight of the intent per turn; turns past the
            # end of a list reuse its last entry (see the min(...) index below).
            sentence_type_chances = {
                "add_order_info": [0.8, 0.75, 0.7, 0.5, 0],
                "view_order": [0.05, 0.05, 0.06, 0.06, 0],
                "cancel_order": [0.04, 0.04, 0.03, 0.03, 0.14],
                "change_order_info": [0.05, 0.05, 0.05, 0.05, 0],
                "checkout": [0, 0.5, 0.1, 0.3, 0.85],

                # NOTE(review): an empty list makes the index below evaluate to
                # [][-1], which raises IndexError whenever the selection loop
                # reaches this key.
                "order_status": [],
                "menu_inquiry": [0.02, 0.02, 0.02, 0.02, 0],
                "hours_inquiry": [0.02, 0.02, 0.02, 0.02, 0],
                "location_inquiry": [0.01, 0.01, 0.01, 0.01, 0],

                "out_of_scope": [0.01],
            }

            # order content
            sentence_count = 0
            cart = {}
            slot_count = 0
            while True:
                # select intent with specified chances
                # Cumulative selection: the first intent whose running weight
                # total exceeds `rand` wins. If no total exceeds `rand`,
                # `selected_intent` stays '' and falls into the final `else`.
                selected_intent = ''
                rand = random.random()
                chance_counter = 0
                for key, value in sentence_type_chances.items():
                    chance_counter += value[min(sentence_count, len(value) - 1)]
                    if rand < chance_counter:
                        selected_intent = key
                        break

                if selected_intent == 'add_order_info':
                    sentence_count += 1
                    sentence = ''

                    # NOTE(review): sentence_count was incremented just above,
                    # so `sentence_count > 0` is always true here.
                    if sentence_count > 0 and random.random() < 0.5:
                        sentence = get_random_line(sentence_files["prompted_order_inputs"])
                        conversation += '|add_order_info:' + sentence
                    else:
                        sentence = get_random_line(sentence_files[selected_intent])
                        conversation += '|' + selected_intent + ":" + sentence

                    # update cart
                    slots = re.findall(r'\*?(?:PREV_)?[A-Z]+,\d+,\d+', sentence)
                    # NOTE(review): `slots` is a list, so range(slots) raises
                    # TypeError; `i` also reuses the name of the outer loop
                    # variable.
                    for i in range(slots):
                        slot_count += 1
                        split = slots[i].split(',')

                        parent = None
                        grandparent = None
                        # NOTE(review): list indexing never yields None for a
                        # missing element (it raises IndexError), and `cart` is
                        # a plain dict, so `cart[key] is None` raises KeyError
                        # for an absent key.
                        if slots[int(split[2])] is not None:
                            parent = slots[int(split[2])]
                            parent_split = parent.split(',')
                            if slots[int(parent_split[2])] is not None:
                                grandparent = slots[int(parent_split[2])]
                                grandparent_split = grandparent.split(',')

                                if cart[slot_count + int(grandparent_split[2])] is None:
                                    cart[slot_count + int(grandparent_split[2])] = ['', {}]

                            if cart[slot_count + int(parent_split[2])] is None:
                                cart[slot_count + int(parent_split[2])] = ['', []]

                        # NOTE(review): the second test is `if`, not `elif`, so
                        # it also runs when `parent` is None and then reads
                        # `parent_split`, which was not assigned in this
                        # iteration. Presumably `elif` was intended - confirm.
                        if parent is None:
                            cart[slot_count + i + 1] = [slots[i], {}]
                        if grandparent is None:
                            cart[slot_count + int(parent_split[2])][1][slot_count + i + 1] = [slots[i], {}]
                        else:
                            cart[slot_count + int(parent_split[2])][1][slot_count + int(grandparent_split[2])][1].append(slots[i])
                elif selected_intent == 'cancel_order' or selected_intent == 'change_order_info' or selected_intent == 'order_status' or selected_intent == 'view_order':
                    if sentence_count > 0 and random.random() < 0.9: # follow PREV_ guidelines
                        # find a randomized line that fits the conversation
                        possible_lines = []
                        for line in storage[selected_intent]:
                            necessary_items = {}
                            slots = re.findall(r'\*?PREV_[A-Z_]+,\d+,\d+', line)
                            # NOTE(review): same range(list) and `is None`
                            # problems as in the cart update above.
                            for i in range(slots):
                                split = slots[i].split(',')

                                parent = None
                                grandparent = None
                                if slots[int(split[2])] is not None:
                                    parent = slots[int(split[2])]
                                    parent_split = parent.split(',')
                                    if slots[int(parent_split[2])] is not None:
                                        grandparent = slots[int(parent_split[2])]
                                        grandparent_split = grandparent.split(',')

                                        if necessary_items[int(grandparent_split[2])] is None:
                                            necessary_items[int(grandparent_split[2])] = ['', {}]

                                    if necessary_items[int(parent_split[2])] is None:
                                        necessary_items[int(parent_split[2])] = ['', {}]

                                if parent is None:
                                    necessary_items[i + 1] = [split[0], {}]
                                if grandparent is None:
                                    necessary_items[int(parent_split[2])][1][slot_count + i + 1] = [split[0], {}]
                                else:
                                    necessary_items[int(parent_split[2])][1][int(grandparent_split[2])][1].append(split[0])

                            # Try to map every item the line refers to onto an
                            # entry of the current cart.
                            possible_parent_positions = []
                            for key1, necessary_item_grandparent in necessary_items.items():
                                possible_parent_positions.append({})
                                for key2, cart_grandparent in cart.items():
                                    if necessary_item_grandparent[0] == cart_grandparent[0]:
                                        if len(necessary_item_grandparent[1]) == 0:
                                            possible_parent_positions[-1][key1] = key2
                                        else:
                                            # NOTE(review): iterating a dict
                                            # yields its keys only; unpacking
                                            # into two names needs .items().
                                            for key3, necessary_item_parent in necessary_item_grandparent[1]:
                                                for key4, cart_parent in cart_grandparent[1]:
                                                    if necessary_item_parent[0] == cart_parent[0]:
                                                        if len(necessary_item_parent[1]) == 0:
                                                            # NOTE(review): a missing dict key
                                                            # raises KeyError here instead of
                                                            # comparing equal to None.
                                                            if possible_parent_positions[-1][key1] is None:
                                                                possible_parent_positions[-1][key1] = key2
                                                            possible_parent_positions[-1][key3] = key4
                                                        else:
                                                            for necessary_item_child in necessary_item_parent[1]:
                                                                for cart_child in cart_parent[1]:
                                                                    if possible_parent_positions[-1][key1] is None:
                                                                        possible_parent_positions[-1][key1] = key2
                                                                    if possible_parent_positions[-1][key3] is None:
                                                                        possible_parent_positions[-1][key3] = key4
                                                                    possible_parent_positions[-1][necessary_item_child] = cart_child
                                
                                if len(possible_parent_positions[-1]) != 0:
                                    possible_lines.append(line)

                        # NOTE(review): random.choice raises IndexError when
                        # `possible_lines` is empty, and the chosen sentence is
                        # never appended to `conversation`; the steps listed
                        # below are not implemented yet.
                        sentence = random.choice(possible_lines).strip()
                        #!find the corresponding possible parent positions
                        possible_parent_positions = {}

                        # set correct parent relationships


                        # update cart
                        # delete items

                        # add items

                    else: # eliminate PREV_ all together
                        sentence = get_random_line(sentence_files[selected_intent])
                        conversation += '|' + selected_intent + ":" + sentence.replace('PREV_', '')
                else:
                    # NOTE(review): get_random_line is called with a file path
                    # from `sentence_files` here and above, while `storage` is
                    # filled with intent names as keys - verify the lookup.
                    conversation += '|' + selected_intent + ":" + get_random_line(sentence_files[selected_intent])

                if (selected_intent == "cancel_order" and random.random() < 0.5) or selected_intent == "checkout":

                    #simulate confirmation or denial
                    if selected_intent == "checkout":
                        # NOTE(review): the pattern looks for a "make_order:"
                        # segment, a label this function never writes, so the
                        # name appears to be requested on every checkout.
                        if not re.search(r"make_order:[^|]*\bNAME\b", conversation): # check if the user has already inputted their name
                            # randomize chance for only a single prompted name
                            if random.random() < 0.5:
                                conversation += '|add_order_info:' + get_random_line(sentence_files["prompted_name"])
                            else: 
                                conversation += '|add_order_info:NAME,0,0'

                        # each pass (10% chance) adds a denial followed by a correction
                        while random.random() < 0.1:
                            conversation += '|deny:' + get_random_line(sentence_files["deny"])

                            rand = random.random()
                            if rand < 0.5:
                                conversation += '|change_order_info:' + get_random_line(sentence_files["change_order_info"])
                            else:
                                conversation += '|add_order_info:' + get_random_line(sentence_files["add_order_info"])
                        conversation += '|confirm:' + get_random_line(sentence_files["confirm"])

                        if random.random() < 0.03:
                            conversation += '|order_status:' + get_random_line(sentence_files["order_status"])

                    break # end convo
                elif selected_intent == "cancel_order":
                    # reset conversation ticker
                    sentence_count = 0

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

def compile_inquiry_sentences(output_file, num_inquiry_sentences):
    """Write synthetic inquiry conversations to `output_file`, one per line.

    Every conversation is made of '|<intent>:<sentence>' segments, optionally
    starting with a greeting and ending with a farewell. A first turn is
    always attempted, a second one with 25% probability, never a third.

    Args:
        output_file: Path of the text file to (over)write.
        num_inquiry_sentences: Number of conversations to generate.
    """
    # define chances
    # NOTE(review): these weights add up to 0.55, so a turn adds no sentence
    # at all in the remaining cases. Confirm whether more intents belong here.
    sentence_type_chances = {
        "out_of_scope": [0.05],
        "menu_inquiry": [0.25],
        "order_status": [0.25],
    }
    new_sentence_chances = [1, 0.25, 0]

    with open(output_file, "w") as file:
        for _ in range(num_inquiry_sentences):
            # greeting
            conversation = randomize_greeting()

            # order content
            sentence_count = 0
            while random.random() < new_sentence_chances[min(sentence_count, len(new_sentence_chances) - 1)]:
                rand = random.random()
                chance_counter = 0
                for key, value in sentence_type_chances.items():
                    chance_counter += value[min(sentence_count, len(value) - 1)]
                    if rand < chance_counter:
                        conversation += '|' + key + ":" + get_random_line(key)
                        # one sentence per turn: without this break every later
                        # intent would be appended as well, because the running
                        # total only grows
                        break

                sentence_count += 1

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

IndentationError: expected an indented block after 'for' statement on line 252 (3531711548.py, line 253)

In [10]:
#compile_reservation_sentences("../Reservation/res_conversations.txt", 10000)
#compile_order_sentences("../Order/order_conversations.txt", 10000)
#compile_inquiry_sentences("../Inquiry/inquiry_conversations.txt", 10000)

['PREV_SOMETHING_RANDOM,8,3']


#### Process Created Data

In [4]:
# Slot tag -> integer id. Tags are looked up as "B-" + NAME / "I-" + NAME, so
# every entry has to use the hyphen form (the DRINK tags used an underscore
# and could never be found).
original_slot_map = {
    "[PAD]": 0,
    "B-NAME": 1,
    "I-NAME": 2,
    "B-DATE": 3,
    "I-DATE": 4,
    "B-TIME": 5,
    "I-TIME": 6,
    "B-NUMBER": 7,
    "I-NUMBER": 8,
    "B-ITEM": 9,
    "I-ITEM": 10,
    "B-DRINK": 11,
    "I-DRINK": 12,
    "B-ADDON": 13,
    "I-ADDON": 14,
    "B-SIZE": 15,
    "I-SIZE": 16,
}

# Intent label -> integer id. The label is the text before ':' in every
# '|label:sentence' segment written by the compile_* functions.
intent_map = {
    "out_of_scope": 0,
    "greeting": 1,
    "farewell": 2,
    "view_res": 3,
    "add_res_info": 4,
    "change_res_info": 5,
    "cancel_res": 6,
    "add_order_info": 7,
    "change_order_info": 8,
    "view_order": 9,
    "cancel_order": 10,
    "checkout": 11,
    "menu_inquiry": 12,
    "location_inquiry": 13,
    "hours_inquiry": 14,
    # Labels the generators also write; without them the lookup of those
    # segments fails with KeyError. Appended so the ids above stay unchanged.
    "order_status": 15,
    "confirm": 16,
    "deny": 17,
}

# Placeholder -> file name of its filler values, all in one directory.
# The compound placeholders come first: process_sentence takes the first
# placeholder found inside a token, and "DRINK_ADDON" / "ADDON_SIZE" also
# contain "DRINK", "ADDON" and "SIZE".
_FILLER_DIR = "../Filler_Data"
_FILLER_FILE_NAMES = (
    ("DRINK_ADDON", "drink_addon.txt"),
    ("ADDON_SIZE", "addon_size.txt"),
    ("TIME", "times.txt"),
    ("NAME", "names.txt"),
    ("DATE", "dates.txt"),
    ("ITEM", "food_items.txt"),
    ("NUMBER", "numbers.txt"),
    ("ADDON", "addon.txt"),
    ("SIZE", "size.txt"),
    ("DRINK", "drink_items.txt"),
)

entity_files = {
    placeholder: f"{_FILLER_DIR}/{file_name}"
    for placeholder, file_name in _FILLER_FILE_NAMES
}

def process_sentence(sentence, entity_files):
    """Turn one '<intent>:<template>' string into text, intent id and slot labels.

    Template tokens of the form 'NAME,<int>,<int>' (optionally prefixed with
    'PREV_') are replaced by a random filler line for that placeholder. Tokens
    containing '*' ("phantom" slots) are labelled but their text is left
    untouched.

    Args:
        sentence: '<intent>:<template>'; the intent must be a key of
            `intent_map`.
        entity_files: Mapping placeholder -> path of its filler file. The
            first placeholder (in mapping order) found inside a token wins.

    Returns:
        A tuple `(text, intent, slot_map)`. `text` is the template with the
        slots filled in, words joined by single spaces. `slot_map` holds one
        `[label_id, <int>, <int>]` entry per word: `[0, 0, 0]` for plain
        words, a "B-" label for the first word of a slot and "I-" labels for
        its remaining words; the two integers are copied from the slot token.

    Raises:
        KeyError: Unknown intent, or a slot name without a label in
            `original_slot_map`.
    """
    # isolate intent; only the first ':' separates label and text
    label, _, template = sentence.partition(':')
    intent = intent_map[label]

    slot_map = []
    # NOTE(review): nothing writes to this memory yet, so PREV_ slots always
    # fall back to a fresh random filler.
    slot_memory = {
        "NAME": None,
        "ORDER": [],
        "DATE": None,
        "TIME": None,
        "NUMBER": None,
    }
    # placeholders that are labelled with a different slot name
    label_overrides = {'DRINK_ADDON': 'ADDON', 'ADDON_SIZE': 'SIZE'}
    slot_pattern = re.compile(r'(PREV_)?([A-Z_]+),(\d+),(\d+)')

    words = []
    for token in template.split():
        match = slot_pattern.search(token)
        is_phantom = '*' in token
        placeholder = None
        if match is not None and not is_phantom:
            placeholder = next((name for name in entity_files if name in token), None)

        if match is None or (placeholder is None and not is_phantom):
            # ordinary word
            words.append(token)
            slot_map.append([0, 0, 0])
            continue

        indices = [int(match.group(3)), int(match.group(4))]

        if is_phantom:
            # phantom slot: label it, keep the token text as it is
            slot_name = label_overrides.get(match.group(2), match.group(2))
            slot_map.append([original_slot_map["B-" + slot_name], *indices])
            words.append(token)
            continue

        replacement = None
        if match.group(1) is not None:
            # PREV_ slot: reuse a remembered value when there is one
            #! How can we do this for cart items? -> simply check for parent and fill with parent filler
            replacement = slot_memory.get(placeholder) or None

        if replacement is None:
            filepath = entity_files[placeholder]
            replacement = get_random_line(filepath)
            if placeholder == 'NAME' and random.random() < 0.5:  # assign two names on a 50% chance
                replacement += " " + get_random_line(filepath)

        # swap slot for drink_addon and addon_size
        slot_name = label_overrides.get(placeholder, placeholder)

        slot_map.append([original_slot_map["B-" + slot_name], *indices])
        # add indices for the remaining words of the replacement
        for _ in range(len(replacement.split()) - 1):
            slot_map.append([original_slot_map["I-" + slot_name], *indices])

        # substitute inside this token only, so punctuation glued to the slot
        # survives and no other slot of the sentence can be hit by mistake
        words.append(token[:match.start()] + replacement + token[match.end():])

    return " ".join(words), intent, slot_map

def process_conversation(conversation, entity_files):
    """Process every '|'-separated sentence of one conversation.

    Args:
        conversation: Sentences of the form '<intent>:<template>' joined by
            '|'. Empty segments (e.g. from a leading '|') are skipped.
        entity_files: Mapping placeholder -> filler file, passed on to
            `process_sentence`.

    Returns:
        One dict per sentence with the keys "text", "intent", "slots" and
        "memory"; "memory" lists the text/intent/slots of all earlier
        sentences of the same conversation.
    """
    data = []
    memory = []

    for sentence in conversation.split('|'):
        if not sentence.strip():
            # generated conversations start with '|', which yields an empty
            # first segment
            continue

        processed_sentence, intent, slot_map = process_sentence(sentence, entity_files)
        entry = {
            "text": processed_sentence,
            "intent": intent,
            "slots": slot_map,
        }

        # snapshot the history before the current sentence joins it
        data.append({**entry, "memory": memory.copy()})
        memory.append(entry)

    return data

def process_file(input_file, output_file, entity_files):
    """Process all conversations of `input_file` and dump them as JSON.

    Every line of `input_file` is handed to `process_conversation`; the
    resulting records are collected in one flat list and written to
    `output_file` with an indent of 4.
    """
    with open(input_file, 'r') as source:
        conversations = [raw_line.strip() for raw_line in source.readlines()]

    # all processing happens before the output file is opened
    records = []
    for conversation in conversations:
        records += process_conversation(conversation, entity_files)

    with open(output_file, 'w') as sink:
        json.dump(records, sink, indent=4)

#### Call Functions

In [5]:
# Smoke test: process a two-sentence reservation conversation and show the result.
sample_conversation = "add_res_info:confirm booking details for NUMBER,0,0 visitor party coming on DATE,0,0|view_res:what time is my reservation this DATE,0,0"
processed_sample = process_conversation(sample_conversation, entity_files)
print(processed_sample)

NameError: name 'random' is not defined