# JERTmate data processing

Compile synthetic data and process into useable formats

#### Basic definitions

In [2]:
import json
import random
import os
import re
import numpy as np
from transformers import AutoTokenizer
from transformers import TFBertModel

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert = TFBertModel.from_pretrained(model_name)




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

#### Compile Synthetic Data

In [6]:
sentence_files = {
    # miscellaneous
    "greeting": "../Greeting_Farewell/greeting.txt",
    "farewell": "../Greeting_Farewell/farewell.txt",
    "out_of_scope": "../Out_of_Scope/out_of_scope.txt",

    # reservation
    "add_res": "../Reservation/add_res.txt",
    "delete_res": "../Reservation/delete_res.txt",
    "view_res": "../Reservation/view_res.txt",
    "edit_res": "../Reservation/edit_res.txt",

    "add_info": "../Add_Info/add_info.txt",
    
    # order
    "make_order": "../Order/make_order.txt",
    "add_items": "../Order/add_items.txt",
    "view_cart": "../Order/view_cart.txt",
    "clear_cart": "../Order/clear_cart.txt",
    "edit_cart": "../Order/edit_cart.txt",
    "checkout": "../Order/checkout.txt",

    # inquiry
    "menu_inquiry": "../Inquiry/menu_inquiry.txt",
    "location_inquiry": "../Inquiry/location_inquiry.txt",
    "hours_inquiry": "../Inquiry/hours_inquiry.txt",
}

def get_random_line(filepath):
    with open(filepath, 'r') as file:
        lines = file.readlines()
    return random.choice(lines).strip()

# select a random line that contains all strings in the strings array effectively
#!Make more effective
def get_random_line_with_strings(filepath, strings):
    with open(filepath, 'r') as file:
        lines = file.readlines()
    for string in strings:
        lines = [line for line in lines if string not in line]
    return random.choice(lines).strip()

def randomize_greeting():
    if random.random() < 0.7:
        return get_random_line(sentence_files["greeting"])
    else:
        return ''

def randomize_farewell():
    if random.random() < 0.7:
        return '|' + get_random_line(sentence_files["farewell"])
    else:
        return ''

def compile_reservation_sentences(output_file, sentence_files, num_reservation_sentences):
    with open(output_file, "w") as file:
        for i in range(num_reservation_sentences):
            conversation = ''

            # greeting
            conversation += randomize_greeting()

            # track provided entities
            provided_entities = []

            # define chances
            sentence_type_chances = {
                "add_res": [0.75, 0],
                "add_info": [0.05, 0.80],
                "delete_res": [0.04, 0.03],
                "view_res": [0.05, 0.04],
                "change_res": [0.05, 0.04],
                "inquiry": [0.05, 0.04],
                "out_of_scope": [0.01],
            }

            # order content
            sentence_count = 0
            while True:
                # select intent with specified chances
                selected_intent = ''
                random = random.random()
                for key, value in sentence_type_chances.items():
                    if random < value[min(sentence_count, len(value) - 1)]:
                        selected_intent = key
                        break

                # generate a sentences within specified intent
                if selected_intent == "add_info" and sentence_count == 0:
                    conversation += '|' + get_random_line_with_strings(sentence_files["add_info"], ["NAME"])
                    provided_entities.append("NAME")
                elif selected_intent == "add_info" or selected_intent == "add_res":
                    # define possible entities
                    possible_entities = ["NAME", "TIME", "DATE", "PARTY_SIZE"]
                    possible_entities = [entity for entity in possible_entities if entity not in provided_entities]

                    # collect selected entities
                    selected_entities = []

                    extra_entity_chances = [0.5, 0.15, 0.05]
                    for probability in extra_entity_chances:
                        if random.random() < probability:
                            #select a random entity and remove it from further possible choices
                            selected_entity = random.choice(possible_entities)
                            possible_entities.remove(selected_entity)

                            selected_entities.append(selected_entity)

                    # signal to the script which entities we are going to provide so they cannot be added again
                    provided_entities.extend(selected_entities)

                    conversation += '|' + get_random_line_with_strings(sentence_files[selected_intent], selected_entities) 
                else:
                    conversation += '|' + get_random_line(sentence_files[selected_intent])

                # perform alterations based on intent and generated sentence
                if selected_intent == 'add_res':
                    sentence_count += 1
                elif selected_intent == "delete_res" and random.random() < 0.35:
                    break # end convo
                elif selected_intent == "delete_res":
                    # reset conversation ticker
                    sentence_count = 0

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

def compile_order_sentences(output_file, sentence_files, num_order_sentences):
    with open(output_file, "w") as file:
        for i in range(num_order_sentences):
            conversation = ''

            # greeting
            conversation += randomize_greeting()

            # define chances
            sentence_type_chances = {
                "make_order": [0.80, 0],
                "add_items": [0, 0.75, 0.7, 0.5, 0],
                "view_cart": [0.05, 0.05, 0.05, 0.05, 0],
                "clear_cart": [0.04, 0.04, 0.04, 0.04, 0.14],
                "edit_cart": [0.05, 0.05, 0.05, 0.05, 0],
                "checkout": [0, 0.5, 0.1, 0.3, 0.85],
                "inquiry": [0.05, 0.05, 0.05, 0.05, 0],
                "out_of_scope": [0.01],
            }

            # order content
            sentence_count = 0
            while True:
                # select intent with specified chances
                selected_intent = ''
                random = random.random()
                for key, value in sentence_type_chances.items():
                    if random < value[min(sentence_count, len(value) - 1)]:
                        selected_intent = key
                        break
                
                conversation += '|' + get_random_line(sentence_files[selected_intent])

                if selected_intent == 'add_items' or selected_intent == 'make_order':
                    sentence_count += 1
                elif (selected_intent == "clear_cart" and random.random() < 0.5) or selected_intent == "checkout":
                    if not re.search(r"make_order:[^|]*\bNAME\b", conversation): # check if the user has already inputted their name
                        conversation += '|' + get_random_line_with_strings(sentence_files["add_info"], ["NAME"])

                    break # end convo
                elif selected_intent == "clear_cart":
                    # reset conversation ticker
                    sentence_count = 0

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

def compile_inquiry_sentences(output_file, sentence_files, num_inquiry_sentences):
    with open(output_file, "w") as file:
        for i in range(num_inquiry_sentences):
            conversation = ''

            # greeting
            conversation += randomize_greeting()

            # define chances
            sentence_type_chances = {
                "out_of_scope": [0.05],
                "menu_inquiry": [0.25],
            }
            new_sentence_chances = [1, 0.5, 0.2]

            # order content
            sentence_count = 0
            while random.random() < new_sentence_chances[min(sentence_count, len(new_sentence_chances) - 1)]:
                random = random.random()

                for key, value in sentence_type_chances.items():
                    if random < value[min(sentence_count, len(value) - 1)]:
                        conversation += '|' + get_random_line(sentence_files[key])

                sentence_count += 1

            # farewell
            conversation += randomize_farewell()

            # write conversation to file
            file.write(conversation + '\n')

#### Process Created Data

In [27]:
entity_files = {
    "DRIN_ADD": "../Filler_Data/drink_addon.txt",
    "TIME": "../Filler_Data/times.txt",
    "NAME": "../Filler_Data/names.txt",
    "DATE": "../Filler_Data/dates.txt",
    "ITEM": "../Filler_Data/food_items.txt",
    "NUMBER": "../Filler_Data/numbers.txt",
    "ADDON": "../Filler_Data/addon.txt",
    "SIZE": "../Filler_Data/size.txt",
    "DRINK": "../Filler_Data/drink_items.txt",
}

def process_sentence(sentence, entity_files):
    # isolate intent
    intent = sentence.split(':')[0]
    sentence = sentence.split(':')[1]

    slot_map = []
    slot_memory = []
    splitted = sentence.split()

    for idx in range(len(splitted)):
        is_slot = False
        print(splitted[idx])
        for placeholder, filepath in entity_files.items():
            if placeholder in splitted[idx]:
                is_slot = True
                processed_slot = splitted[idx].split(',')

                replacement = get_random_line(filepath)
                if placeholder == 'NAME' and random.random() < 0.5: # assign two names on a 50% chance
                    replacement += " " + get_random_line(filepath)

                slot_map.append(["B-" + placeholder, processed_slot[1], processed_slot[2]])
                slot_memory.append([replacement, placeholder, processed_slot[1], processed_slot[2]])

                # add indices for number of words in replacement
                for i in range(len(replacement.split()) - 1):
                    slot_map.append(["I-" + placeholder, processed_slot[1], processed_slot[2]])

                sentence = re.sub(r'[A-Z_]+,\d+,\d+', replacement, sentence, count=1)
                break

        if not is_slot:
            slot_map.append([0,0,0])

    tokenized_sentence = tokenizer.tokenize(sentence, padding=True, truncation=True, return_tensors="tf")

    return tokenized_sentence, intent, slot_map, slot_memory

def process_conversation(conversation, entity_files):
    data = []
    memory = []

    for sentence in conversation.split('|'):
        tokenized_sentence, intent, slot_map, slot_memory = process_sentence(sentence, entity_files)

        data.append({
            "text": tokenized_sentence,
            "intent": intent,
            "slot_map": slot_map,
            "slot_memory": memory
        })

        memory.extend(slot_memory)
    
    return data

def process_file(input_file, output_file, entity_files):
    data = []
    with open(input_file, 'r') as file:
        conversations = [line.strip() for line in file.readlines()]

    for conversation in conversations:
        data.extend(process_conversation(conversation, entity_files))

    with open(output_file, 'w') as file:
        json.dump(data, file, indent=4)

#### Call Functions

In [28]:
print(process_sentence("add_res:can i make a reservation for TIME,0,0 on DATE,0,0", entity_files))

can
i
make
a
reservation
for
TIME,0,0
on
DATE,0,0
(['can', 'i', 'make', 'a', 'reservation', 'for', 'ten', 'forty', 'pm', 'on', 'the', 'fifth', 'of', 'july'], 'add_res', [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ['B-TIME', '0', '0'], ['I-TIME', '0', '0'], ['I-TIME', '0', '0'], [0, 0, 0], ['B-DATE', '0', '0'], ['I-DATE', '0', '0'], ['I-DATE', '0', '0'], ['I-DATE', '0', '0']], [['ten forty pm', 'TIME', '0', '0'], ['the fifth of july', 'DATE', '0', '0']])
