# JERTmate data processing

#### Compiling Imports

In [19]:
import random
import copy

#### Compile Conversations

In [20]:
# Intents that are answered as inquiries; each one has a sentence file of the
# same name under ../Inquiry/.
inquiry_intents = [
    'allergen_inquiry',
    'basic_inquiry',
    'hours_inquiry',
    'language_inquiry',
    'location_inquiry',
    'manager_inquiry',
    'object_description_inquiry',
    'object_ingredient_inquiry',
    'object_nutritional_inquiry',
    'object_recommendation_inquiry',
    'order_queue_inquiry',
    'possible_object_inquiry',
    'possible_order_inquiry',
    'possible_reservation_inquiry',
    'price_inquiry',
    'profile_inquiry',
    'repeat_inquiry',
    'sms_inquiry',
    'software_inquiry',
    'table_queue_inquiry',
    'volume_inquiry',
]

# Slot placeholders that can be emitted on their own as a prompted reservation answer.
reservation_slots = ['NAME', 'DATE', 'TIME', 'NUMBER']

# Maps a sentence-pool key to the text file holding its template lines.
sentence_files = {
    # miscellaneous (file names do not all follow the key, so they are spelled out)
    "greetingSingle": "../Misc/greeting.txt",
    "greetingMultiple": "../Misc/greetingMultiIntent.txt",
    "farewell": "../Misc/farewell.txt",
    "out_of_scope": "../Misc/Out_of_Scope/out_of_scope.txt",
    "prompted_name": "../Misc/Prompted_Name/prompted_name.txt",
    "confirm": "../Misc/confirm.txt",
    "deny": "../Misc/deny.txt",
    "allergy_declaration": "../Misc/allergy_declaration.txt",
}

# reservation
sentence_files.update(
    (name, "../Reservation/" + name + ".txt")
    for name in (
        "cancel_res",
        "view_res",
        "change_res_info",
        "add_res_info",
        "prompted_res_inputs",
    )
)

# order
sentence_files.update(
    (name, "../Order/manage_order_info/" + name + ".txt")
    for name in (
        "add_order_info",
        "delete_order_info",
        "prompted_add_info",
        "swap_order_info",
    )
)
sentence_files.update(
    (name, "../Order/" + name + ".txt")
    for name in (
        "access_prev_order",
        "view_order",
        "cancel_order",
        "checkout",
        "create_order",
    )
)

# inquiry
sentence_files.update(
    (name, "../Inquiry/" + name + ".txt") for name in inquiry_intents
)

# Shared storage objects: `storage` holds the working pool of lines per key,
# `storage_copy` keeps the full original lists for refilling exhausted pools.
storage, storage_copy = {}, {}

for intent, file_path in sentence_files.items():
    with open(file_path, 'r') as file:
        # Keep non-blank lines, stripped, and drop "//" comment lines.
        storage[intent] = [
            text
            for text in map(str.strip, file)
            if text and not text.startswith("//")
        ]

storage_copy = copy.deepcopy(storage)

def get_random_line(intent):
    """Draw a random line for ``intent`` without repeats until the pool runs out.

    Each returned line is removed from the working pool in ``storage``. When
    the pool for ``intent`` is empty it is refilled from ``storage_copy``
    before drawing.
    """
    pool = storage[intent]
    if not pool:
        # Pool exhausted: start a fresh pass over the full list of lines.
        pool = storage_copy[intent].copy()
        storage[intent] = pool

    picked = random.choice(pool).strip()
    pool.remove(picked)
    return picked

def process_sentence(sentence, intent):
    """Normalise one template sentence into ``|intent[,extra]:text`` form.

    The template is split on ``:`` and interpreted by the number of parts:

    * ``text``                    -> ``|<intent>:text``
    * ``[extra]:text``            -> ``|<intent>,extra:text``
    * ``label:text``              -> ``|label:text`` (label overrides ``intent``)
    * ``label:[extra]:text``      -> ``|label,extra:text``

    Args:
        sentence: One template sentence (no ``|`` separators).
        intent: Default intent label used when the template does not carry
            its own label.

    Returns:
        The normalised sentence string, starting with ``|``.

    Raises:
        ValueError: If the template has more than three ``:``-separated
            parts, which this format cannot represent.
    """
    parts = sentence.split(':')

    if len(parts) == 1:
        return '|' + intent + ':' + parts[0]

    if len(parts) == 2:
        label, text = parts
        if '[' in label:
            # Bracketed prefix is an additional intent on top of the default.
            extra = label.replace("[", "").replace("]", "")
            return '|' + intent + ',' + extra + ':' + text
        return '|' + label + ':' + text

    if len(parts) == 3:
        label, extra, text = parts
        extra = extra.replace("[", "").replace("]", "")
        # The text is the third part; emitting the second part again here
        # would drop the utterance and repeat the extra intent as its text.
        return '|' + label + ',' + extra + ':' + text

    # Previously this fell through and returned None, which only surfaced
    # later as an unrelated TypeError during string concatenation.
    raise ValueError(
        "sentence has too many ':' separated parts: " + repr(sentence)
    )

def process_line(line, intent):
    """Normalise every ``|``-separated sentence in ``line`` and concatenate them.

    Each piece is passed to ``process_sentence`` together with ``intent``;
    the results are joined in their original order.
    """
    return ''.join(process_sentence(piece, intent) for piece in line.split('|'))

def randomize_greeting():
    """Return a greeting 70% of the time, otherwise an empty string.

    When a greeting is produced, 20% of them come from the
    ``greetingMultiple`` pool and the rest from ``greetingSingle``.
    """
    if random.random() >= 0.7:
        return ''

    pool = "greetingMultiple" if random.random() < 0.2 else "greetingSingle"
    return process_line(get_random_line(pool), 'greeting')

def randomize_farewell():
    """Return a farewell sentence 70% of the time, otherwise an empty string."""
    if random.random() >= 0.7:
        return ''
    return process_line(get_random_line("farewell"), 'farewell')
    
def randomize_oos():
    """Return an out-of-scope sentence with probability 0.007, else ``''``."""
    wants_oos = random.random() < 0.007
    return ('|out_of_scope:' + get_random_line("out_of_scope")) if wants_oos else ''

def randomize_inquiry():
    """Return one normalised sentence for a uniformly chosen inquiry intent."""
    chosen = random.choice(inquiry_intents)
    raw_line = get_random_line(chosen)
    return process_line(raw_line, chosen)
    
def compile_inquiry_sentences():
    """Build a short run of inquiry sentences.

    Before each sentence a random draw is compared with the continuation
    chance for that position (the last chance is reused past the end of
    the table). Every accepted position contributes an optional
    out-of-scope sentence followed by one inquiry sentence.
    """
    # define chances: entry i is the chance of emitting sentence i
    continue_chances = [1, 0.5, 0.15, 0.5, 0]
    last_index = len(continue_chances) - 1

    # order content
    pieces = []
    emitted = 0
    while random.random() < continue_chances[min(emitted, last_index)]:
        emitted += 1
        pieces.append(randomize_oos())
        pieces.append(randomize_inquiry())

    return ''.join(pieces)

def compile_reservation_sentences():
    """Build the reservation part of a conversation as one string.

    For every sentence position a continuation draw decides whether another
    sentence is added. If so, an optional out-of-scope sentence is emitted,
    then a sentence type is picked by cumulative probability and the
    matching sentence is appended.
    """
    # define chances: for each type, entry i is its probability at sentence
    # position i; the last entry is reused for all later positions
    type_weights = {
        "add_res_info": [0.79, 0.05],
        "prompted_res_inputs": [0, 0.74],
        "cancel_res": [0.05],
        "view_res": [0.05],
        "change_res_info": [0.05],
        "inquiry": [0.02],
        "confirm": [0.02],
        "deny": [0.02]
    }
    continue_chances = [1, 0.9, 0.9, 0.8, 0.3, 0.5, 0]
    last_index = len(continue_chances) - 1

    # order content
    pieces = []
    turn = 0
    while random.random() < continue_chances[min(turn, last_index)]:
        pieces.append(randomize_oos())

        # select intent with specified chances
        roll = random.random()
        running_total = 0
        chosen = ''
        for name, weights in type_weights.items():
            running_total += weights[turn] if turn < len(weights) else weights[-1]
            if roll < running_total:
                chosen = name
                break

        if chosen == 'prompted_res_inputs':
            if random.random() < 0.4:
                # Bare slot answer: only the placeholder, no surrounding text.
                bare_slot = random.choice(reservation_slots)
                pieces.append('|add_res_info:' + bare_slot + ",0,0,0")
            else:
                pieces.append(
                    process_line(get_random_line("prompted_res_inputs"), 'add_res_info')
                )
        elif chosen == 'inquiry':
            pieces.append(randomize_inquiry())
        else:
            pieces.append(process_line(get_random_line(chosen), chosen))

        turn += 1

    return ''.join(pieces)


def compile_order_sentences():
    """Build the ordering part of a conversation as one string.

    For every sentence position a continuation draw decides whether another
    sentence is added. If so, an optional out-of-scope sentence is emitted,
    then a sentence type is picked by cumulative probability and the
    matching sentence is appended.
    """
    # define chances: for each type, entry i is its probability at sentence
    # position i; the last entry is reused for all later positions
    type_weights = {
        "create_order": [0.4, 0.04, 0.04, 0.03, 0.03, 0.02, 0.02, 0.01],
        "access_prev_order": [0.32, 0.04, 0.04, 0.03, 0.03, 0.02, 0.02, 0.01],
        "add_order_info": [0.05, 0.37, 0.29, 0.26, 0.25, 0.20, 0.18, 0.15],
        "prompted_add_info": [0, 0.29, 0.3, 0.30, 0.26, 0.28, 0.2, 0.15],
        "swap_order_info": [0.04],
        "delete_order_info": [0.03],
        "cancel_order": [0.04],
        "view_order": [0.04, 0.04, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1],
        "checkout": [0.01, 0.04, 0.1, 0.1, 0.15, 0.2, 0.3, 0.4],

        "allergy_declaration": [0.01],
        "inquiry": [0.02],
        "confirm": [0.02],
        "deny": [0.02],
    }
    continue_chances = [1, 0.95, 0.9, 0.85, 0.75, 0.5, 0.25, 0.1, 0]
    last_index = len(continue_chances) - 1

    # Pools whose sentences are all labelled with the shared manage_order_info intent.
    manage_order_pools = (
        "add_order_info",
        "prompted_add_info",
        "swap_order_info",
        "delete_order_info",
    )

    # order content
    pieces = []
    turn = 0
    while random.random() < continue_chances[min(turn, last_index)]:
        pieces.append(randomize_oos())

        # select intent with specified chances
        roll = random.random()
        running_total = 0
        chosen = ''
        for name, weights in type_weights.items():
            running_total += weights[turn] if turn < len(weights) else weights[-1]
            if roll < running_total:
                chosen = name
                break

        if chosen in manage_order_pools:
            pieces.append(process_line(get_random_line(chosen), 'manage_order_info'))
        elif chosen == "checkout":
            # randomize chance for only a single prompted name:
            # 40% name sentence, 40% bare NAME slot, 20% nothing.
            # NOTE(review): this branch never emits a sentence from the
            # "checkout" pool itself - confirm that is intended.
            name_roll = random.random()
            if name_roll < 0.4:
                pieces.append(
                    process_line(get_random_line("prompted_name"), 'manage_order_info')
                )
            elif name_roll < 0.8:
                pieces.append('|manage_order_info:NAME,0,0,0')
        elif chosen == 'inquiry':
            pieces.append(randomize_inquiry())
        else:
            pieces.append(process_line(get_random_line(chosen), chosen))

        turn += 1

    return ''.join(pieces)

def _compile_topic_pair(primary, secondary):
    """Return ``primary()`` output, followed 5% of the time by ``secondary()`` output."""
    conversation = primary()
    if random.random() < 0.05:
        conversation += secondary()
    return conversation


def compile_conversations(output_file, total):
    """Generate ``total`` random conversations and write them one per line.

    Each conversation is an optional greeting, a body, and an optional
    farewell. The body is chosen as follows:

    * 15% of the time it starts with inquiry sentences, then continues with
      an order section (40%, or always when the greeting guides towards an
      order), a reservation section (40%), or nothing (20%).
    * Otherwise it is a reservation section (50%, or always when the
      greeting guides towards a reservation) or an order section.

    Whenever an order or reservation section is produced there is a further
    5% chance that the other kind follows it.

    Args:
        output_file: Path of the text file to (over)write.
        total: Number of conversations to generate.
    """
    with open(output_file, "w") as file:
        for _ in range(total):
            # greeting
            conversation = randomize_greeting()

            # The greeting text may already hint at a topic. This is a plain
            # substring check on the generated greeting (labels included).
            guidedIntent = None
            if "order" in conversation:
                guidedIntent = 'order'
            elif "res" in conversation:
                guidedIntent = 'res'

            if random.random() < 0.15:
                conversation += compile_inquiry_sentences()

                rand = random.random()
                if rand < 0.4 or guidedIntent == 'order':
                    conversation += _compile_topic_pair(
                        compile_order_sentences, compile_reservation_sentences
                    )
                elif rand < 0.8:
                    # This threshold used to repeat 0.4, which made the
                    # reservation branch unreachable after an inquiry. 0.8
                    # gives order / reservation / nothing a 40/40/20 split.
                    conversation += _compile_topic_pair(
                        compile_reservation_sentences, compile_order_sentences
                    )
            else:
                # NOTE(review): a greeting guiding towards 'order' is not
                # honoured here (and 'res' is not honoured in the inquiry
                # branch above) - confirm whether that asymmetry is wanted.
                if random.random() < 0.5 or guidedIntent == 'res':
                    conversation += _compile_topic_pair(
                        compile_reservation_sentences, compile_order_sentences
                    )
                else:
                    conversation += _compile_topic_pair(
                        compile_order_sentences, compile_reservation_sentences
                    )

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

In [21]:
# Generate the raw conversation file consumed by the encoding step below.
compile_conversations(output_file='./conversations.txt', total=3000)

## Encode Data Points

### Imports

In [22]:
import json
import re
import numpy as np
from transformers import AutoTokenizer

# Tokenizer used to turn conversation text into input ids.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# environment variables

# Number of previous sentences (with their intents and slots) fed back as context.
sentence_memory = 2

# Regular slots: label positions per sentence and values stored per position.
max_slot_length, slot_data_size = 50, 6

# Phantom slots: entries per sentence and values stored per entry.
max_phantom_slot_length, phantom_slot_data_size = 5, 6

In [23]:
# Slot tag ids: 0 is padding, then a B-/I- pair per slot type in the order
# listed (B- gets the odd id, I- the following even id).
original_slot_map = {"[PAD]": 0}
original_slot_map.update(
    (prefix + slot_name, 2 * slot_index + offset)
    for slot_index, slot_name in enumerate(
        ("NAME", "DATE", "TIME", "NUMBER", "OBJECT", "ALLERGEN", "SIZE")
    )
    for offset, prefix in enumerate(("B-", "I-"), start=1)
)

# Intent ids are simply the position of the intent in this ordered list.
intent_map = {
    intent_name: intent_id
    for intent_id, intent_name in enumerate((
        "out_of_scope",
        "greeting",
        "farewell",
        "confirm",
        "deny",
        "allergy_declaration",

        "manage_order_info",
        "create_order",
        "access_prev_order",
        "cancel_order",
        "view_order",
        "checkout",

        "add_res_info",
        "view_res",
        "change_res_info",
        "cancel_res",

        'allergen_inquiry',
        'basic_inquiry',
        'hours_inquiry',
        'language_inquiry',
        'location_inquiry',
        'manager_inquiry',
        'object_description_inquiry',
        'object_ingredient_inquiry',
        'object_nutritional_inquiry',
        'object_recommendation_inquiry',
        'order_queue_inquiry',
        'possible_object_inquiry',
        'possible_order_inquiry',
        'possible_reservation_inquiry',
        'price_inquiry',
        'profile_inquiry',
        'repeat_inquiry',
        'sms_inquiry',
        'software_inquiry',
        'table_queue_inquiry',
        'volume_inquiry',

        "reference_prev_call",
    ))
}

# Placeholder name -> file with replacement values. The order matters: it is
# the order in which placeholders are tested against a word.
entity_files = {
    placeholder: "../Filler_Data/" + file_name
    for placeholder, file_name in (
        ("TIME", "time.txt"),
        ("NAME", "names.txt"),
        ("DATE", "date.txt"),
        ("ITEM", "items.txt"),
        ("NUMBER", "numbers.txt"),
        ("ADDON", "addon.txt"),
        ("SIZE", "size.txt"),
        ("ALLERGEN", "allergen.txt"),
    )
}

# Shared storage objects, now keyed by entity placeholder: `storage` is the
# working pool of replacement values, `storage_copy` the full lists used to
# refill a pool once it has been used up.
storage, storage_copy = {}, {}

for intent, file_path in entity_files.items():
    with open(file_path, 'r') as file:
        # Keep non-blank lines, stripped, and drop "//" comment lines.
        storage[intent] = [
            value
            for value in (raw.strip() for raw in file)
            if value and not value.startswith("//")
        ]

storage_copy = copy.deepcopy(storage)

def get_random_line(intent):
    """Return a random stored value for ``intent``, avoiding repeats per pass.

    The chosen value is removed from ``storage[intent]``; an empty pool is
    first restored from ``storage_copy[intent]``.
    """
    if not storage[intent]:
        # Every value has been used once; begin a new pass.
        storage[intent] = storage_copy[intent].copy()

    remaining = storage[intent]
    choice = random.choice(remaining).strip()
    remaining.remove(choice)
    return choice

def process_sentence(sentence):
    """Encode one ``intents:text`` sentence into text, intent and slot vectors.

    Input format, as parsed here:

    * ``intents`` is a comma-separated list of keys of ``intent_map``.
    * In ``text``, a space-separated word that contains a comma and the name
      of an ``entity_files`` placeholder is a slot written as
      ``PLACEHOLDER,intent,action[,pointer[,pointer[,pointer]]]``. It is
      replaced by a random value for that placeholder.
    * Everything after the first ``*`` is a list of ``*``-separated phantom
      slots written as ``target,intent,action[,pointer...]`` (all integers);
      they are recorded and removed from the text.

    Args:
        sentence: One normalised sentence without the leading ``|``.

    Returns:
        A tuple ``(text, intents, slot_map)`` where ``text`` is the sentence
        with placeholders filled in, ``intents`` is a multi-hot list over
        ``intent_map`` and ``slot_map`` is the concatenation of the padded
        slot type / intent / action / pointer lists followed by the padded
        phantom target / intent / action / pointer lists.

    NOTE(review): plain words get one label per whitespace-separated word,
    while replaced slots get one label per tokenizer token - confirm that
    this matches how the labels are aligned with the model input.
    """
    parts = sentence.split(':')
    pointers_per_slot = 3

    # identify and encode intents
    intents = [0] * len(intent_map)
    for intent in parts[0].split(','):
        intents[intent_map[intent]] = 1

    # identify and fill slots
    slot_type_map = []
    slot_intent_map = []
    slot_action_map = []
    slot_pointers_map = []
    sentence = parts[1]

    # record and take out phantom slots
    phantom_target_map = []
    phantom_intent_map = []
    phantom_action_map = []
    phantom_pointers_map = []
    if '*' in sentence:
        # Keep only the text before the first '*'. Removing the phantom
        # specs with str.replace could also delete identical digit runs
        # inside real slot tokens (e.g. "1,0,0" inside "ITEM,1,0,0").
        sentence, *phantom_specs = sentence.split('*')

        for spec in phantom_specs:
            values = [int(x) for x in spec.split(',')]

            phantom_target_map.append(values[0])
            phantom_intent_map.append(values[1])
            phantom_action_map.append(values[2])
            pointers = values[3:3 + pointers_per_slot]
            phantom_pointers_map.extend(
                pointers + [0] * (pointers_per_slot - len(pointers))
            )

    for word in sentence.split(" "):
        # check for a slot: the first placeholder (in entity_files order)
        # found in a comma-containing word wins
        placeholder = None
        if ',' in word:
            placeholder = next(
                (name for name in entity_files if name in word), None
            )

        if placeholder is None:
            # Plain word. This also covers words that merely contain a
            # comma (ordinary punctuation), which previously received no
            # label at all and shifted every following label.
            slot_type_map.append(0)
            slot_intent_map.append(0)
            slot_action_map.append(0)
            slot_pointers_map.extend([0] * pointers_per_slot)
            continue

        # Drop possessive/plural suffixes from the slot token; they stay in
        # the sentence text because only the stripped token is replaced.
        token = word.replace('\'s', '').replace('s', '')
        fields = token.split(',')

        replacement = get_random_line(placeholder)
        if placeholder == 'NAME' and random.random() < 0.5:  # assign two names on a 50% chance
            # Separate the two names with a space so they stay two words.
            replacement += ' ' + get_random_line(placeholder)

        # swap item and addon for object
        slot_name = 'OBJECT' if placeholder in ('ITEM', 'ADDON') else fields[0]

        slot_intent = int(fields[1])
        slot_action = int(fields[2])
        pointers = [int(x) for x in fields[3:3 + pointers_per_slot]]
        pointers += [0] * (pointers_per_slot - len(pointers))

        # One B- label, then one I- label per additional tokenizer token.
        token_count = max(len(tokenizer.tokenize(replacement)), 1)
        slot_type_map.append(original_slot_map["B-" + slot_name])
        slot_type_map.extend(
            [original_slot_map["I-" + slot_name]] * (token_count - 1)
        )
        slot_intent_map.extend([slot_intent] * token_count)
        slot_action_map.extend([slot_action] * token_count)
        slot_pointers_map.extend(pointers * token_count)

        sentence = sentence.replace(token, replacement, 1)

    # pad slot data
    slot_type_map += [0] * (max_slot_length - len(slot_type_map))
    slot_intent_map += [0] * (max_slot_length - len(slot_intent_map))
    slot_action_map += [0] * (max_slot_length - len(slot_action_map))
    # The pointer list already holds pointers_per_slot entries per label, so
    # it is padded up to max_slot_length * pointers_per_slot. Multiplying
    # the difference by three instead left this section too short and
    # shifted everything after it.
    slot_pointers_map += [0] * (
        max_slot_length * pointers_per_slot - len(slot_pointers_map)
    )

    # pad phantom slot data
    phantom_target_map += [0] * (max_phantom_slot_length - len(phantom_target_map))
    phantom_intent_map += [0] * (max_phantom_slot_length - len(phantom_intent_map))
    phantom_action_map += [0] * (max_phantom_slot_length - len(phantom_action_map))
    phantom_pointers_map += [0] * (
        max_phantom_slot_length * pointers_per_slot - len(phantom_pointers_map)
    )

    # combine all slot data
    slot_map = (
        slot_type_map
        + slot_intent_map
        + slot_action_map
        + slot_pointers_map
        + phantom_target_map
        + phantom_intent_map
        + phantom_action_map
        + phantom_pointers_map
    )

    return sentence, intents, slot_map

def process_conversation(conversation):
    """Encode every sentence of one ``|``-separated conversation.

    For each non-empty sentence an input vector is built from:

    1. the token ids of the previous ``sentence_memory`` sentences and the
       current one, joined with `` [SEP] `` and padded/truncated to
       ``max_slot_length * (sentence_memory + 1)`` ids;
    2. the intent vectors of the previous ``sentence_memory`` sentences,
       zero-padded;
    3. the slot vectors of the previous ``sentence_memory`` sentences,
       zero-padded.

    Args:
        conversation: One line of the conversation file.

    Returns:
        A tuple ``(inputs, intentOutputs, slotOutputs)`` of equally long
        lists with one entry per encoded sentence.
    """
    inputs = []
    intentOutputs = []
    slotOutputs = []
    sentenceMemory = []

    # Section widths of the input vector, derived from the shared constants.
    text_width = max_slot_length * (sentence_memory + 1)
    intent_width = len(intent_map) * sentence_memory
    slot_vector_width = (
        max_slot_length * slot_data_size
        + max_phantom_slot_length * phantom_slot_data_size
    )
    expected_length = text_width + intent_width + slot_vector_width * sentence_memory

    for sentence in conversation.split('|'):
        if sentence == '':
            continue

        processed_sentence, intents, slot_map = process_sentence(sentence)

        # Start of the history window. The three history lists grow in
        # lockstep, so one index selects the same previous sentences in all
        # of them. The text history used to take the first sentences of the
        # conversation while intents/slots took the most recent ones.
        start = max(len(sentenceMemory) - sentence_memory, 0)

        # compile input
        textInput = ' [SEP] '.join(sentenceMemory[start:] + [processed_sentence])

        # Tokenize, extend, and pad the conversation
        tokenized_text = tokenizer(
            textInput, padding='max_length', truncation=True, max_length=text_width
        )
        features = list(tokenized_text['input_ids'])

        # extend and pad intents
        for previous_intents in intentOutputs[start:]:
            features.extend(previous_intents)
        features += [0] * (text_width + intent_width - len(features))

        # extend and pad slots
        for previous_slots in slotOutputs[start:]:
            features.extend(previous_slots)
        features += [0] * (expected_length - len(features))

        # Report inputs whose length is off (for example a sentence with
        # more labels than max_slot_length); they are still stored.
        if len(features) != expected_length:
            print(len(features))

        # store input
        inputs.append(features)

        # store intents
        intentOutputs.append(intents)

        # pad slots
        slot_map += [0] * (slot_vector_width - len(slot_map))

        # store slots
        slotOutputs.append(slot_map)

        # store sentence
        sentenceMemory.append(processed_sentence)

    return inputs, intentOutputs, slotOutputs

def process_file(input_file, output_file):
    """Encode every conversation in ``input_file`` and dump the result as JSON.

    Each line of ``input_file`` is one conversation. The encoded inputs,
    intent outputs and slot outputs of all conversations are concatenated
    and written to ``output_file`` under the keys ``inputs``,
    ``intentOutputs`` and ``slotOutputs``.
    """
    with open(input_file, 'r') as source:
        conversations = [raw.strip() for raw in source]

    dataset = {"inputs": [], "intentOutputs": [], "slotOutputs": []}
    for conversation in conversations:
        new_inputs, new_intents, new_slots = process_conversation(conversation)
        dataset["inputs"].extend(new_inputs)
        dataset["intentOutputs"].extend(new_intents)
        dataset["slotOutputs"].extend(new_slots)

    with open(output_file, 'w') as sink:
        json.dump(dataset, sink)

In [24]:
# Encode the generated conversations into the final training data file.
process_file(
    input_file='./conversations.txt',
    output_file='./JERTmate_final_data.json',
)