# JERTmate data processing

Compile synthetic data and process it into usable formats.

#### Basic definitions

In [2]:
import json
import random
import os
import re
import numpy as np
from transformers import AutoTokenizer
from transformers import TFBertModel

# Name of the pretrained checkpoint shared by the tokenizer and the TensorFlow BERT model.
model_name = "bert-base-uncased"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# TensorFlow BERT model for the same checkpoint.
# NOTE(review): neither `tokenizer` nor `bert` is referenced by the cells visible in this
# notebook section -- presumably they are used further down; confirm before removing.
bert = TFBertModel.from_pretrained(model_name)




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

#### Compile Synthetic Data

In [6]:
# Map every intent name to the text file holding its template sentences.
# Every file lives at ../<folder>/<intent>.txt, so the table is built from a
# folder -> intents grouping; insertion order matches the grouping below.
sentence_files = {
    intent: "../" + folder + "/" + intent + ".txt"
    for folder, intents in (
        # miscellaneous
        ("Greeting_Farewell", ("greeting", "farewell")),
        ("Out_of_Scope", ("out_of_scope",)),
        ("Prompted_Name", ("prompted_name",)),
        ("Confirm_Deny", ("confirm", "deny")),
        # reservation
        ("Reservation", (
            "create_res",
            "cancel_res",
            "view_res",
            "add_res_info",
            "delete_res_info",
            "swap_res_info",
        )),
        # order
        ("Order", (
            "create_order",
            "add_order_info",
            "swap_items",
            "delete_items",
            "view_order",
            "cancel_order",
            "checkout",
        )),
        # inquiry
        ("Inquiry", ("menu_inquiry", "location_inquiry", "hours_inquiry")),
    )
    for intent in intents
}

def get_random_line(filepath):
    """Return one randomly chosen, whitespace-stripped, non-blank line of a text file.

    Args:
        filepath: Path of the text file to sample from (one candidate per line).

    Returns:
        The selected line with leading/trailing whitespace removed.

    Raises:
        ValueError: If the file contains no non-blank lines.
    """
    with open(filepath, 'r') as file:
        # Skip blank lines (for example a trailing empty line at the end of the file)
        # so that an empty string is never handed back as a sentence.
        candidates = [stripped for stripped in (line.strip() for line in file) if stripped]

    if not candidates:
        raise ValueError(f"No non-empty lines found in file: {filepath}")

    return random.choice(candidates)

# Shared caches of already-filtered sentence lists.
# Keys are (file_path, sorted tuple of strings): the file path must be part of the key,
# otherwise a second file queried with the same strings would be served the first file's sentences.
with_strings_storage = {}
without_strings_storage = {}

def get_random_line_with_strings(file_path, strings):
    """Return a random line of ``file_path`` that contains every string in ``strings``.

    The filtered sentence list is cached in ``with_strings_storage`` per
    (file, strings) combination, so each file is read once per combination.

    Args:
        file_path: Path of the text file holding one sentence per line.
        strings: Substrings that must all occur in the selected sentence.
            An empty list matches every non-blank line.

    Returns:
        One matching sentence, stripped of surrounding whitespace.

    Raises:
        ValueError: If no non-blank line contains all of the strings.
    """
    cache_key = (file_path, tuple(sorted(strings)))

    filtered_sentences = with_strings_storage.get(cache_key)
    if filtered_sentences is None:
        with open(file_path, 'r') as file:
            # blank lines are dropped so an empty sentence can never be selected
            sentences = [line.strip() for line in file if line.strip()]
        filtered_sentences = [
            sentence for sentence in sentences
            if all(string in sentence for string in strings)
        ]
        with_strings_storage[cache_key] = filtered_sentences

    if filtered_sentences:
        return random.choice(filtered_sentences)
    raise ValueError(f"No matching sentence found containing all strings: {strings}")

def get_random_line_without_strings(file_path, strings):
    """Return a random line of ``file_path`` that contains none of ``strings``.

    The filtered sentence list is cached in ``without_strings_storage`` per
    (file, strings) combination, so each file is read once per combination.

    Args:
        file_path: Path of the text file holding one sentence per line.
        strings: Substrings that must not occur in the selected sentence.

    Returns:
        One matching sentence, stripped of surrounding whitespace.

    Raises:
        ValueError: If every non-blank line contains at least one of the strings.
    """
    cache_key = (file_path, tuple(sorted(strings)))

    filtered_sentences = without_strings_storage.get(cache_key)
    if filtered_sentences is None:
        with open(file_path, 'r') as file:
            # blank lines are dropped so an empty sentence can never be selected
            sentences = [line.strip() for line in file if line.strip()]
        filtered_sentences = [
            sentence for sentence in sentences
            if all(string not in sentence for string in strings)
        ]
        without_strings_storage[cache_key] = filtered_sentences

    if filtered_sentences:
        return random.choice(filtered_sentences)
    raise ValueError(f"No matching sentence found without any of the strings: {strings}")

def randomize_greeting():
    """Return a ``|greeting:<sentence>`` segment 70% of the time, otherwise an empty string.

    The sentence is drawn with ``get_random_line`` from the module-level
    ``sentence_files["greeting"]`` file.
    """
    skip_greeting = not (random.random() < 0.7)
    if skip_greeting:
        return ''

    greeting_sentence = get_random_line(sentence_files["greeting"])
    return '|greeting:' + greeting_sentence

def randomize_farewell():
    """Return a ``|farewell:<sentence>`` segment 70% of the time, otherwise an empty string.

    The sentence is drawn with ``get_random_line`` from the module-level
    ``sentence_files["farewell"]`` file.
    """
    skip_farewell = not (random.random() < 0.7)
    if skip_farewell:
        return ''

    farewell_sentence = get_random_line(sentence_files["farewell"])
    return '|farewell:' + farewell_sentence

def compile_reservation_sentences(output_file, sentence_files, num_reservation_sentences):
    """Write synthetic reservation conversations to ``output_file``, one per line.

    Each conversation is a concatenation of ``|<intent>:<sentence>`` segments: an
    optional greeting, a sequence of randomly selected reservation turns, and an
    optional farewell. A conversation ends when the reservation is confirmed
    (all entities supplied) or when a ``cancel_res`` turn ends it (35% chance).

    Args:
        output_file: Path of the file to (over)write.
        sentence_files: Mapping of intent name -> path of its sentence file.
            Note that the greeting/farewell helpers read the module-level
            ``sentence_files`` rather than this argument.
        num_reservation_sentences: Number of conversations to generate.

    Raises:
        ValueError: Propagated from the sentence helpers when no sentence in a
            file satisfies the required entity combination.
    """
    all_entities = ["NAME", "TIME", "DATE", "NUMBER"]

    # Chance of each intent, indexed by sentence_count (the last column is reused
    # once sentence_count runs past the end of a list). Each column sums to 1.
    sentence_type_chances = {
        "create_res": [0.75, 0],
        "add_res_info": [0.04, 0.80],
        "cancel_res": [0.04, 0.03],
        "view_res": [0.04, 0.04],
        "delete_res_info": [0.04, 0.03],
        "swap_res_info": [0.03, 0.04],

        "menu_inquiry": [0.02, 0.02],
        "hours_inquiry": [0.02, 0.02],
        "location_inquiry": [0.01, 0.01],

        "out_of_scope": [0.01],
    }

    with open(output_file, "w") as file:
        for _ in range(num_reservation_sentences):
            # greeting
            conversation = randomize_greeting()

            # track provided entities
            provided_entities = []

            # order content
            sentence_count = 0
            while True:
                # select intent with specified chances
                # (the draw is stored in `roll`; assigning it to a local called `random`
                # would shadow the module and raise UnboundLocalError)
                selected_intent = ''
                roll = random.random()
                chance_counter = 0
                for key, value in sentence_type_chances.items():
                    chance_counter += value[min(sentence_count, len(value) - 1)]
                    if roll < chance_counter:
                        selected_intent = key
                        break
                if not selected_intent:
                    # floating-point rounding left the cumulative total just below the roll; draw again
                    continue

                reservation_confirmed = False

                # generate a sentence within the specified intent
                if selected_intent == "add_res_info" and sentence_count == 0:
                    conversation += '|add_res_info:' + get_random_line(sentence_files["prompted_name"])
                    if "NAME" not in provided_entities:
                        provided_entities.append("NAME")
                elif sentence_count > 0 and selected_intent in ("cancel_res", "view_res"):
                    # if data has already been provided, we can only reference the res obj using that data
                    unused_entities = [entity for entity in all_entities if entity not in provided_entities]
                    conversation += '|' + selected_intent + ":" + get_random_line_without_strings(sentence_files[selected_intent], unused_entities)
                elif selected_intent in ("add_res_info", "create_res"):
                    # define possible entities
                    possible_entities = [entity for entity in all_entities if entity not in provided_entities]

                    # chances of adding a 1st, 2nd, 3rd and 4th entity, depending on intent
                    if selected_intent == "create_res":
                        extra_entity_chances = [1, 0.6, 0.2, 0.03]
                    else:
                        extra_entity_chances = [1, 0.25, 0.10, 0]

                    selected_entities = []
                    for probability in extra_entity_chances:
                        if random.random() < probability and len(possible_entities) > 0:
                            # select a random entity and remove it from further possible choices
                            selected_entity = random.choice(possible_entities)
                            possible_entities.remove(selected_entity)
                            selected_entities.append(selected_entity)

                    # signal to the script which entities we are going to provide so they cannot be added again
                    provided_entities.extend(selected_entities)

                    # randomize the chance for a single inputted slot
                    if selected_intent == "add_res_info" and len(selected_entities) == 1 and random.random() < 0.55:
                        conversation += '|add_res_info:' + selected_entities[0] + ",0,0"
                    else:
                        conversation += '|' + selected_intent + ":" + get_random_line_with_strings(sentence_files[selected_intent], selected_entities)

                    # check if we have used all necessary entities
                    if len(possible_entities) == 0:
                        # simulate confirmation or denial
                        while random.random() < 0.1:
                            conversation += '|deny:' + get_random_line(sentence_files["deny"])

                            correction_roll = random.random()
                            if correction_roll < 0.5:
                                conversation += '|swap_res_info:' + get_random_line(sentence_files["swap_res_info"])
                            elif correction_roll < 0.8:
                                conversation += '|add_res_info:' + get_random_line(sentence_files["add_res_info"])
                            else:
                                conversation += '|delete_res_info:' + get_random_line(sentence_files["delete_res_info"])
                                conversation += '|add_res_info:' + get_random_line(sentence_files["add_res_info"])
                        conversation += '|confirm:' + get_random_line(sentence_files["confirm"])
                        reservation_confirmed = True
                else:
                    conversation += '|' + selected_intent + ":" + get_random_line(sentence_files[selected_intent])

                # perform alterations based on intent and generated sentence
                if reservation_confirmed:
                    # The reservation is complete and confirmed, so the conversation ends here
                    # (same as the checkout flow for orders). Without this the loop could only
                    # exit through a cancel_res turn.
                    break
                if selected_intent in ("add_res_info", "create_res"):
                    # advance to the follow-up probability column
                    sentence_count += 1
                elif selected_intent == "cancel_res":
                    if random.random() < 0.35:
                        break # end convo
                    # reset conversation ticker
                    # NOTE(review): provided_entities is not reset here, so a reservation started
                    # after a cancellation cannot re-supply entities given earlier -- confirm intent.
                    sentence_count = 0

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

def compile_order_sentences(output_file, sentence_files, num_order_sentences):
    """Write synthetic ordering conversations to ``output_file``, one per line.

    Each conversation is a concatenation of ``|<intent>:<sentence>`` segments: an
    optional greeting, randomly selected order turns, and an optional farewell.
    A conversation ends on ``checkout`` (followed by optional deny/correction
    rounds and a confirm) or on a ``cancel_order`` turn (50% chance).

    Args:
        output_file: Path of the file to (over)write.
        sentence_files: Mapping of intent name -> path of its sentence file.
            Note that the greeting/farewell helpers read the module-level
            ``sentence_files`` rather than this argument.
        num_order_sentences: Number of conversations to generate.
    """
    # Chance of each intent, indexed by sentence_count (the last column is reused
    # once sentence_count runs past the end of a list). Each column sums to 1.
    # The second checkout entry is 0.05: the previous value 0.5 made that column
    # sum to 1.45, which starved every intent listed after checkout.
    sentence_type_chances = {
        "create_order": [0.80, 0],
        "add_order_info": [0, 0.75, 0.7, 0.5, 0],
        "view_order": [0.05, 0.03, 0.03, 0.03, 0],
        "cancel_order": [0.03, 0.04, 0.03, 0.03, 0.14],
        "swap_items": [0.03, 0.04, 0.04, 0.04, 0],
        "delete_items": [0.03, 0.03, 0.04, 0.04, 0],
        "checkout": [0, 0.05, 0.1, 0.3, 0.85],

        "menu_inquiry": [0.02, 0.02, 0.02, 0.02, 0],
        "hours_inquiry": [0.02, 0.02, 0.02, 0.02, 0],
        "location_inquiry": [0.01, 0.01, 0.01, 0.01, 0],

        "out_of_scope": [0.01],
    }

    with open(output_file, "w") as file:
        for _ in range(num_order_sentences):
            # greeting
            conversation = randomize_greeting()

            # order content
            sentence_count = 0
            while True:
                # select intent with specified chances
                # (the draw is stored in `roll`; assigning it to a local called `random`
                # would shadow the module and raise UnboundLocalError)
                selected_intent = ''
                roll = random.random()
                chance_counter = 0
                for key, value in sentence_type_chances.items():
                    chance_counter += value[min(sentence_count, len(value) - 1)]
                    if roll < chance_counter:
                        selected_intent = key
                        break
                if not selected_intent:
                    # floating-point rounding left the cumulative total just below the roll; draw again
                    continue

                conversation += '|' + selected_intent + ":" + get_random_line(sentence_files[selected_intent])

                if selected_intent == 'add_order_info' or selected_intent == "create_order":
                    sentence_count += 1
                elif (selected_intent == "cancel_order" and random.random() < 0.5) or selected_intent == "checkout":
                    # check if the user has already inputted their name in an order turn
                    # (the generated intents are create_order / add_order_info)
                    if not re.search(r"(?:create_order|add_order_info):[^|]*\bNAME\b", conversation):
                        # randomize chance for only a single prompted name
                        if random.random() < 0.5:
                            conversation += '|add_order_info:' + get_random_line(sentence_files["prompted_name"])
                        else:
                            conversation += '|add_order_info:NAME,0,0'

                    # simulate confirmation or denial
                    if selected_intent == "checkout":
                        while random.random() < 0.1:
                            conversation += '|deny:' + get_random_line(sentence_files["deny"])

                            correction_roll = random.random()
                            if correction_roll < 0.5:
                                conversation += '|swap_items:' + get_random_line(sentence_files["swap_items"])
                            elif correction_roll < 0.8:
                                conversation += '|add_order_info:' + get_random_line(sentence_files["add_order_info"])
                            else:
                                # the intent/file key is "delete_items" ("delete_items_info" does not exist)
                                conversation += '|delete_items:' + get_random_line(sentence_files["delete_items"])
                                conversation += '|add_order_info:' + get_random_line(sentence_files["add_order_info"])
                        conversation += '|confirm:' + get_random_line(sentence_files["confirm"])

                    break # end convo
                elif selected_intent == "cancel_order":
                    # reset conversation ticker
                    sentence_count = 0

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

def compile_inquiry_sentences(output_file, sentence_files, num_inquiry_sentences):
    """Write synthetic inquiry conversations to ``output_file``, one per line.

    Each conversation is an optional greeting, zero or more
    ``|<intent>:<sentence>`` turns, and an optional farewell.

    Args:
        output_file: Path of the file to (over)write.
        sentence_files: Mapping of intent name -> path of its sentence file.
            Note that the greeting/farewell helpers read the module-level
            ``sentence_files`` rather than this argument.
        num_inquiry_sentences: Number of conversations to generate.
    """
    # Chance of each intent per turn.
    # NOTE(review): these sum to 0.30, so 70% of turns add no sentence, and
    # hours_inquiry / location_inquiry are never produced here -- confirm this is intended.
    sentence_type_chances = {
        "out_of_scope": [0.05],
        "menu_inquiry": [0.25],
    }
    # chance of attempting a 1st, 2nd, 3rd (and any later) turn
    new_sentence_chances = [1, 0.5, 0.2]

    with open(output_file, "w") as file:
        for _ in range(num_inquiry_sentences):
            # greeting
            conversation = randomize_greeting()

            # order content
            sentence_count = 0
            while random.random() < new_sentence_chances[min(sentence_count, len(new_sentence_chances) - 1)]:
                # the draw is stored in `roll`; assigning it to a local called `random`
                # would shadow the module and raise UnboundLocalError
                roll = random.random()
                chance_counter = 0
                for key, value in sentence_type_chances.items():
                    chance_counter += value[min(sentence_count, len(value) - 1)]
                    if roll < chance_counter:
                        conversation += '|' + key + ":" + get_random_line(sentence_files[key])
                        # stop at the first matching band so one roll yields at most one sentence
                        break

                sentence_count += 1

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

#### Process Created Data

In [20]:
# Slot label -> integer id. Id 0 is the "[PAD]" label; every entity type then gets a
# consecutive (B-<ENTITY>, I-<ENTITY>) pair of ids, in the order listed below.
original_slot_map = {
    "[PAD]": 0,
    **{
        prefix + "-" + entity: 2 * position + offset
        for position, entity in enumerate(
            ("NAME", "DATE", "TIME", "NUMBER", "ITEM", "ADDON", "SIZE")
        )
        for offset, prefix in enumerate(("B", "I"), start=1)
    },
}

# Intent name -> integer id used in the processed training data.
# Ids 1-19 are unchanged. "location_inquiry" had been misspelled, which made every
# location_inquiry sentence fail the lookup. "confirm" and "deny" are emitted by the
# conversation generators and therefore need ids as well.
# NOTE(review): 20/21 are newly assigned -- confirm they fit the downstream label space.
intent_map = {
    "out_of_scope": 1,
    "greeting": 2,
    "farewell": 3,
    "create_res": 4,
    "view_res": 5,
    "add_res_info": 6,
    "delete_res_info": 7,
    "swap_res_info": 8,
    "cancel_res": 9,
    "create_order": 10,
    "add_order_info": 11,
    "swap_items": 12,
    "delete_items": 13,
    "view_order": 14,
    "cancel_order": 15,
    "checkout": 16,
    "menu_inquiry": 17,
    "location_inquiry": 18,
    "hours_inquiry": 19,
    "confirm": 20,
    "deny": 21,
}

# Placeholder name -> file of filler values used to replace that placeholder.
# Insertion order is kept as-is because process_sentence walks this mapping in
# order and stops at the first placeholder found in a token.
entity_files = {
    placeholder: "../Filler_Data/" + filename
    for placeholder, filename in (
        ("DRIN_ADD", "drink_addon.txt"),
        ("TIME", "times.txt"),
        ("NAME", "names.txt"),
        ("DATE", "dates.txt"),
        ("ITEM", "food_items.txt"),
        ("NUMBER", "numbers.txt"),
        ("ADDON", "addon.txt"),
        ("SIZE", "size.txt"),
        ("DRINK", "drink_items.txt"),
    )
}

def process_sentence(sentence, entity_files):
    """Turn one ``<intent>:<template>`` segment into text, an intent id and slot labels.

    Every whitespace-separated token of the template that holds a placeholder of
    the form ``<PLACEHOLDER>,<int>,<int>`` is replaced by a random line from the
    matching file in ``entity_files``. The slot map gets one ``[label, int, int]``
    entry per word of the returned text: ``B-<PLACEHOLDER>`` for the first word of
    a replacement, ``I-<PLACEHOLDER>`` for its remaining words (both integers are
    copied from the placeholder), and ``[0, 0, 0]`` for ordinary words.

    Args:
        sentence: Segment such as ``"view_res:what time is my reservation this DATE,0,0"``.
        entity_files: Mapping of placeholder name -> path of its filler file.

    Returns:
        Tuple ``(text, intent_id, slot_map)``; words of ``text`` are joined by single spaces.

    Raises:
        ValueError: If ``sentence`` has no ``:`` separating intent and text.
        KeyError: If the intent is not in ``intent_map``, or the placeholder has no
            ``B-``/``I-`` entry in ``original_slot_map``.
            NOTE(review): ``DRINK`` and ``DRIN_ADD`` are in ``entity_files`` but have
            no slot labels, so sentences using them currently fail here.
    """
    if ':' not in sentence:
        raise ValueError(f"Expected '<intent>:<text>' but got: {sentence!r}")

    # isolate intent; split only on the first colon so colons inside the text survive
    intent_name, _, template = sentence.partition(':')
    intent = intent_map[intent_name]

    slot_map = []
    filled_tokens = []

    for token in template.split():
        # Work on the token itself so the label always belongs to the text that is
        # replaced; searching also tolerates punctuation glued to the placeholder ("DATE,0,0?").
        match = re.search(r'([A-Z_]+),(\d+),(\d+)', token)
        placeholder = None
        if match:
            # first entity_files key contained in the placeholder name, in mapping order
            placeholder = next((name for name in entity_files if name in match.group(1)), None)

        if placeholder is None:
            filled_tokens.append(token)
            slot_map.append([0, 0, 0])
            continue

        filepath = entity_files[placeholder]
        replacement = get_random_line(filepath)
        if placeholder == 'NAME' and random.random() < 0.5: # assign two names on a 50% chance
            replacement += " " + get_random_line(filepath)

        first_value, second_value = int(match.group(2)), int(match.group(3))
        slot_map.append([original_slot_map["B-" + placeholder], first_value, second_value])

        # one continuation label for every additional word of the replacement
        for _ in range(len(replacement.split()) - 1):
            slot_map.append([original_slot_map["I-" + placeholder], first_value, second_value])

        # plain string splicing: the replacement is inserted literally, never parsed as a regex template
        filled_tokens.append(token[:match.start()] + replacement + token[match.end():])

    return ' '.join(filled_tokens), intent, slot_map

def process_conversation(conversation, entity_files):
    """Process every ``|``-separated segment of a conversation into training records.

    Args:
        conversation: String of ``<intent>:<sentence>`` segments joined by ``|``.
            Generated conversations start with ``|``, so empty segments are skipped.
        entity_files: Mapping of placeholder name -> path of its filler file,
            passed through to ``process_sentence``.

    Returns:
        List with one dict per segment holding ``text``, ``intent``, ``slots`` and
        ``memory``, where ``memory`` lists the ``text``/``intent``/``slots`` of all
        earlier segments of the same conversation.
    """
    data = []
    memory = []

    for sentence in conversation.split('|'):
        # A leading, trailing or doubled '|' yields an empty segment with no intent to look up.
        if not sentence.strip():
            continue

        processed_sentence, intent, slot_map = process_sentence(sentence, entity_files)

        data.append({
            "text": processed_sentence,
            "intent": intent,
            "slots": slot_map,
            # snapshot of the turns seen so far (copied so later turns do not leak into it)
            "memory": memory.copy()
        })

        memory.append({
            "text": processed_sentence,
            "intent": intent,
            "slots": slot_map
        })

    return data

def process_file(input_file, output_file, entity_files):
    """Process a file of conversations (one per line) and dump the records as JSON.

    Args:
        input_file: Path of the text file with one conversation per line.
        output_file: Path of the JSON file to (over)write; it receives a single
            list holding the records of all conversations, indented by 4 spaces.
        entity_files: Mapping of placeholder name -> path of its filler file.
    """
    data = []
    with open(input_file, 'r') as file:
        for line in file:
            conversation = line.strip()
            # blank lines (e.g. at the end of the file) are not conversations
            if not conversation:
                continue
            data.extend(process_conversation(conversation, entity_files))

    with open(output_file, 'w') as file:
        json.dump(data, file, indent=4)

#### Call Functions

In [21]:
# Smoke test: process a hand-written two-turn reservation conversation and print the records.
# Filler values are drawn at random, so the text differs between runs.
# NOTE(review): the saved output below shows intent ids 3 and 4 for create_res / view_res,
# while intent_map assigns 4 and 5 -- the output predates the current map; re-run to refresh it.
print(process_conversation("create_res:confirm booking details for NUMBER,0,0 visitor party coming on DATE,0,0|view_res:what time is my reservation this DATE,0,0", entity_files))

[{'text': 'confirm booking details for thirteen visitor party coming on september twenty seventh', 'intent': 3, 'slots': [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [7, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [3, 0, 0], [4, 0, 0], [4, 0, 0]], 'memory': []}, {'text': 'what time is my reservation this the twelfth of may', 'intent': 4, 'slots': [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [3, 0, 0], [4, 0, 0], [4, 0, 0], [4, 0, 0]], 'memory': [{'text': 'confirm booking details for thirteen visitor party coming on september twenty seventh', 'intent': 3, 'slots': [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [7, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [3, 0, 0], [4, 0, 0], [4, 0, 0]]}]}]
