In [14]:
from typing import List
from copy import deepcopy
from data.dataset import ReimburseGraphDataset, DataAugmentationLevel, DialogNode, NodeType
from environment.goal import GoalPath, VariableValue, Condition
from data.parsers.answerTemplateParser import AnswerTemplateParser

# Load the English reimbursement training graph (original data only, no augmentation).
data_human = ReimburseGraphDataset(
    'en/reimburse/train_graph.json',
    'en/reimburse/train_answers.json',
    True,  # presumably the "synonyms" switch reported in the dataset statistics — TODO confirm
    augmentation=DataAugmentationLevel.NONE,
    augmentation_path=None,
    resource_dir="./resources",
)

===== Dataset Statistics =====
- files:  en/reimburse/train_graph.json en/reimburse/train_answers.json
- synonyms: True
- depth: 20  - degree: 13
- answers: 312
- questions: 279
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  7
- answer limit: 0  - maximum loaded:  9


# Calculate number of hard goals

In [6]:
def expand_path(goal_node: DialogNode, start_node: DialogNode, answerParser: AnswerTemplateParser):
    """
    Enumerate all paths from `start_node` to `goal_node` in breadth-first order.

    Every partial path is a GoalPath carrying the nodes / node keys visited so far,
    the answer chosen at each branching node and the variable constraints collected
    along the way:

    * Variable nodes register a VariableValue for their variable (if not yet present).
    * Logic nodes branch once per answer; each branch works on a deep copy of the
      constraints, adds its own condition and is pruned when `add_condition` /
      `add_default_condition` returns a falsy value.
    * A neighbour whose key is already among the ids visited *before* the current
      node is not expanded again (loop breaker).

    Args:
        goal_node: Node whose `key` terminates a path.
        start_node: Node the exploration starts from.
        answerParser: Parser used to find the variable in a variable node's answer text.

    Returns:
        A list with one GoalPath per path that reaches `goal_node` (empty if none does).

    Raises:
        AssertionError: If a variable node does not have exactly one answer, or a logic
            node refers to a variable that has no constraint entry on the path yet.
        ValueError: If a logic node's text combined with an answer text does not split
            into exactly three whitespace-separated tokens.
    """
    from collections import deque  # local import keeps this cell self-contained

    # FIFO queue of partial paths; deque.popleft() is O(1) whereas list.pop(0) is O(n)
    active_paths = deque([GoalPath(current_node=start_node, visited_nodes=[], visited_ids=set(), chosen_answers={}, constraints={})])
    valid_paths: List[GoalPath] = []

    while active_paths:
        # expand current level
        path = active_paths.popleft()
        current_node = path.current_node

        # per-path copies, so sibling paths never see each other's additions
        next_constraints = path.constraints.copy()
        next_visited_ids = path.visited_ids.union([current_node.key])
        next_visited_nodes = path.visited_nodes + [current_node]

        # expand constraints if current node is variable node
        if current_node.node_type == NodeType.VARIABLE:
            assert len(current_node.answers) == 1, "Should have exactly 1 answer"
            variable = answerParser.find_variable(current_node.answer_by_index(0).text)
            if variable.name not in next_constraints:
                next_constraints[variable.name] = VariableValue(var_name=variable.name, var_type=variable.type)

        if current_node.key == goal_node.key:
            # we reached the goal node -> save path
            valid_paths.append(GoalPath(current_node=current_node,
                                        visited_nodes=next_visited_nodes,
                                        chosen_answers=path.chosen_answers,
                                        constraints=next_constraints,
                                        visited_ids=next_visited_ids))
            continue

        # extend path by visiting all neighbours of current node
        # NOTE(review): the loop breaker checks `path.visited_ids`, which does not yet
        # contain the current node's key, so a node linking to itself is expanded once
        # more before being cut off — confirm whether `next_visited_ids` was intended.
        if current_node.connected_node and current_node.connected_node.key not in path.visited_ids:
            # variable node or info node
            active_paths.append(GoalPath(current_node=current_node.connected_node,
                                         visited_nodes=next_visited_nodes,
                                         chosen_answers=path.chosen_answers,
                                         constraints=next_constraints,
                                         visited_ids=next_visited_ids))
        elif len(current_node.answers) > 0:
            # question node or logic node
            is_logic_node = current_node.node_type == NodeType.LOGIC

            # collect all conditions, find default condition
            conditions: List[Condition] = []
            if is_logic_node:
                for condition in current_node.answers:
                    # node text holds the "{{variable" part, the answer text the "operator value}}" part
                    var_name, op, var_value = f"{current_node.text.replace('{{', '')} {condition.text.replace('}}', '')}".split()
                    assert var_name in next_constraints, f"Logic node for {var_name} on path wihtout preceeding variable node!"
                    conditions.append(Condition(var_name=var_name, op=op,
                                                var_value=var_value.strip().replace('"', ''),
                                                default=var_value == "DEFAULT"))

            # create new paths for each answer
            for answer_idx, answer in enumerate(current_node.answers):
                # visit neighbour (with loop breaker)
                if not answer.connected_node or answer.connected_node.key in path.visited_ids:
                    continue

                final_constraints = next_constraints
                if is_logic_node:
                    # deep copy: adding a condition mutates the VariableValue object
                    final_constraints = deepcopy(next_constraints)
                    condition = conditions[answer_idx]
                    if condition.default:
                        # the default branch is constrained by all non-default conditions of this node
                        compatible = final_constraints[condition.var_name].add_default_condition([(other_cond.op, other_cond.var_value) for other_cond in conditions if not other_cond.default])
                    else:
                        compatible = final_constraints[condition.var_name].add_condition(condition.op, condition.var_value)
                    if not compatible:
                        continue # prune impossible path

                next_chosen_answers = path.chosen_answers.copy()
                next_chosen_answers[current_node.key] = answer
                active_paths.append(GoalPath(current_node=answer.connected_node,
                                             visited_nodes=next_visited_nodes,
                                             chosen_answers=next_chosen_answers,
                                             constraints=final_constraints,
                                             visited_ids=next_visited_ids))

    return valid_paths

In [11]:
# Parser handed to expand_path for looking up variables in answer texts
answerParser = AnswerTemplateParser()

# Path exploration begins at the node the dataset's start node is connected to
start_node = data_human.start_node.connected_node
print(start_node)

DialogNode.QUESTION(key: 16348058621438633, answers: 6, questions: 0)
        - connected_node: None
        - text: What topic do you have questions about? You can either click on an answer from the suggested topics 
        


In [16]:
# A goal (info node) counts as "hard" when every path leading to it contains either
#   a) a template node (its text contains "{{"), or
#   b) a logic node.
# Goals for which expand_path finds no path at all are counted as hard as well,
# because all() over an empty sequence is True.
info_nodes = data_human.nodes_by_type[NodeType.INFO]

hard_goal_nodes = {
    goal_node.key
    for goal_node in info_nodes
    if all(
        any("{{" in node.text or node.node_type == NodeType.LOGIC for node in path.visited_nodes)
        for path in expand_path(goal_node, start_node, answerParser)
    )
}

num_hard_goals = len(hard_goal_nodes)
num_goals = len(info_nodes)

print("Hard goals:", num_hard_goals)
print("Total goals:", num_goals)
print("Hard goal ratio:", num_hard_goals/num_goals)
    

Hard goals: 35
Total goals: 80
Hard goal ratio: 0.4375


# Nodes, Tree Depth, Questions, Answers

In [28]:
# English reimbursement graph, training split, no augmentation
data_human_train_en = ReimburseGraphDataset(
    'en/reimburse/train_graph.json',
    'en/reimburse/train_answers.json',
    True,
    augmentation=DataAugmentationLevel.NONE,
    augmentation_path=None,
    resource_dir="./resources",
)

print("#Nodes:", len(data_human_train_en.node_list))

# number of questions divided by number of info nodes
info_count_train_en = len(data_human_train_en.nodes_by_type[NodeType.INFO])
print("Avg questions / info node:", len(data_human_train_en.question_list) / info_count_train_en)

# total number of entries over all answer_synonyms values, divided by the number of keys
synonyms_train_en = data_human_train_en.answer_synonyms
synonym_total_train_en = sum(len(synonyms_train_en[ans]) for ans in synonyms_train_en)
print("Avg answers / question node:", synonym_total_train_en / len(synonyms_train_en))

===== Dataset Statistics =====
- files:  en/reimburse/train_graph.json en/reimburse/train_answers.json
- synonyms: True
- depth: 20  - degree: 13
- answers: 248
- questions: 279
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  7
- answer limit: 0  - maximum loaded:  9
#Nodes: 123
Avg questions / info node: 3.4875
Avg answers / question node: 3.3972602739726026


In [29]:
# English reimbursement graph, test split, no augmentation
data_human_test_en = ReimburseGraphDataset(
    'en/reimburse/test_graph.json',
    'en/reimburse/test_answers.json',
    True,
    augmentation=DataAugmentationLevel.NONE,
    augmentation_path=None,
    resource_dir="./resources",
)

print("#Nodes:", len(data_human_test_en.node_list))

# number of questions divided by number of info nodes
info_count_test_en = len(data_human_test_en.nodes_by_type[NodeType.INFO])
print("Avg questions / info node:", len(data_human_test_en.question_list) / info_count_test_en)

# total number of entries over all answer_synonyms values, divided by the number of keys
synonyms_test_en = data_human_test_en.answer_synonyms
synonym_total_test_en = sum(len(synonyms_test_en[ans]) for ans in synonyms_test_en)
print("Avg answers / question node:", synonym_total_test_en / len(synonyms_test_en))

===== Dataset Statistics =====
- files:  en/reimburse/test_graph.json en/reimburse/test_answers.json
- synonyms: True
- depth: 20  - degree: 13
- answers: 162
- questions: 173
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  4
- answer limit: 0  - maximum loaded:  4
#Nodes: 123
Avg questions / info node: 2.1625
Avg answers / question node: 2.219178082191781


In [30]:
# German reimbursement graph, training split, no augmentation
data_human_train_de = ReimburseGraphDataset(
    'de/reimburse/train_graph.json',
    'de/reimburse/train_answers.json',
    True,
    augmentation=DataAugmentationLevel.NONE,
    augmentation_path=None,
    resource_dir="./resources",
)

print("#Nodes:", len(data_human_train_de.node_list))

# number of questions divided by number of info nodes
info_count_train_de = len(data_human_train_de.nodes_by_type[NodeType.INFO])
print("Avg questions / info node:", len(data_human_train_de.question_list) / info_count_train_de)

# total number of entries over all answer_synonyms values, divided by the number of keys
synonyms_train_de = data_human_train_de.answer_synonyms
synonym_total_train_de = sum(len(synonyms_train_de[ans]) for ans in synonyms_train_de)
print("Avg answers / question node:", synonym_total_train_de / len(synonyms_train_de))

===== Dataset Statistics =====
- files:  de/reimburse/train_graph.json de/reimburse/train_answers.json
- synonyms: True
- depth: 20  - degree: 13
- answers: 246
- questions: 279
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  7
- answer limit: 0  - maximum loaded:  9
#Nodes: 123
Avg questions / info node: 3.5316455696202533
Avg answers / question node: 3.3698630136986303


In [31]:
# German reimbursement graph, test split, no augmentation
data_human_test_de = ReimburseGraphDataset(
    'de/reimburse/test_graph.json',
    'de/reimburse/test_answers.json',
    True,
    augmentation=DataAugmentationLevel.NONE,
    augmentation_path=None,
    resource_dir="./resources",
)

print("#Nodes:", len(data_human_test_de.node_list))

# number of questions divided by number of info nodes
info_count_test_de = len(data_human_test_de.nodes_by_type[NodeType.INFO])
print("Avg questions / info node:", len(data_human_test_de.question_list) / info_count_test_de)

# total number of entries over all answer_synonyms values, divided by the number of keys
synonyms_test_de = data_human_test_de.answer_synonyms
synonym_total_test_de = sum(len(synonyms_test_de[ans]) for ans in synonyms_test_de)
print("Avg answers / question node:", synonym_total_test_de / len(synonyms_test_de))

===== Dataset Statistics =====
- files:  de/reimburse/test_graph.json de/reimburse/test_answers.json
- synonyms: True
- depth: 20  - degree: 13
- answers: 162
- questions: 173
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  4
- answer limit: 0  - maximum loaded:  4
#Nodes: 123
Avg questions / info node: 2.189873417721519
Avg answers / question node: 2.219178082191781


# Onboard

In [17]:

# GraphDataset is imported in this cell so that it runs on a fresh kernel
# (Restart & Run All) without relying on an import executed by another cell.
from data.dataset import StandardGraphDataset, GraphDataset


# Onboarding graph, training split, no augmentation
data_onboard = GraphDataset('en/onboarding/train_graph.json', 'en/onboarding/train_answers.json', True, DataAugmentationLevel.NONE, augmentation_path=None, resource_dir='./resources')

# total number of entries over all answer_synonyms values (computed once, printed twice below)
onboard_synonyms = data_onboard.answer_synonyms
onboard_answer_total = sum(len(onboard_synonyms[ans]) for ans in onboard_synonyms)

print("#Nodes", len(data_onboard.node_list))
print("#Answers (we don't have synonyms): ", onboard_answer_total)
print("Avg questions / info node:",  len(data_onboard.question_list) / len(data_onboard.nodes_by_type[NodeType.INFO]) )
print("Avg answers / question node:", onboard_answer_total / len(onboard_synonyms) )

===== Dataset Statistics =====
- files:  en/onboarding/train_graph.json en/onboarding/train_answers.json
- synonyms: True
- depth: 12  - degree: 9
- answers: 175
- questions: 141
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  4
- answer limit: 0  - maximum loaded:  4
#Nodes 88
#Answers (we don't have synonyms):  175
Avg questions / info node: 2.389830508474576
Avg answers / question node: 3.0701754385964914


In [16]:
# Check the test graph against the train graph: same statistics, onboarding test split
from data.dataset import StandardGraphDataset, GraphDataset


data_onboard = GraphDataset(
    'en/onboarding/test_graph.json',
    'en/onboarding/test_answers.json',
    True,
    DataAugmentationLevel.NONE,
    augmentation_path=None,
    resource_dir='./resources',
)

print("#Nodes", len(data_onboard.node_list))

# total number of entries over all answer_synonyms values
answer_synonyms = data_onboard.answer_synonyms
print("#Answers (we don't have synonyms): ", sum(len(answer_synonyms[ans]) for ans in answer_synonyms))

info_node_count = len(data_onboard.nodes_by_type[NodeType.INFO])
print("Avg questions / info node:", len(data_onboard.question_list) / info_node_count)
print("Avg answers / question node:", sum(len(answer_synonyms[ans]) for ans in answer_synonyms) / len(answer_synonyms))

===== Dataset Statistics =====
- files:  en/onboarding/test_graph.json en/onboarding/test_answers.json
- synonyms: True
- depth: 12  - degree: 9
- answers: 152
- questions: 117
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  3
- answer limit: 0  - maximum loaded:  4
#Nodes 88
#Answers (we don't have synonyms):  152
Avg questions / info node: 1.9830508474576272
Avg answers / question node: 2.6666666666666665


# DIAGNOSE

In [18]:
# Diagnose graph, training split, no augmentation.
# NOTE(review): the variable is still called `data_onboard` although it holds the diagnose data here.
data_onboard = GraphDataset(
    'en/diagnose/train_graph.json',
    'en/diagnose/train_answers.json',
    True,
    DataAugmentationLevel.NONE,
    augmentation_path=None,
    resource_dir='./resources',
)

print("#Nodes", len(data_onboard.node_list))

# total number of entries over all answer_synonyms values
answer_synonyms = data_onboard.answer_synonyms
print("#Answers (we don't have synonyms): ", sum(len(answer_synonyms[ans]) for ans in answer_synonyms))

info_node_count = len(data_onboard.nodes_by_type[NodeType.INFO])
print("Avg questions / info node:", len(data_onboard.question_list) / info_node_count)
print("Avg answers / question node:", sum(len(answer_synonyms[ans]) for ans in answer_synonyms) / len(answer_synonyms))

===== Dataset Statistics =====
- files:  en/diagnose/train_graph.json en/diagnose/train_answers.json
- synonyms: True
- depth: 9  - degree: 6
- answers: 298
- questions: 219
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  3
- answer limit: 0  - maximum loaded:  3
#Nodes 98
#Answers (we don't have synonyms):  298
Avg questions / info node: 2.92
Avg answers / question node: 2.98


In [19]:
# Diagnose graph, test split, no augmentation.
# NOTE(review): the variable is still called `data_onboard` although it holds the diagnose data here.
data_onboard = GraphDataset(
    'en/diagnose/test_graph.json',
    'en/diagnose/test_answers.json',
    True,
    DataAugmentationLevel.NONE,
    augmentation_path=None,
    resource_dir='./resources',
)

print("#Nodes", len(data_onboard.node_list))

# total number of entries over all answer_synonyms values
answer_synonyms = data_onboard.answer_synonyms
print("#Answers (we don't have synonyms): ", sum(len(answer_synonyms[ans]) for ans in answer_synonyms))

info_node_count = len(data_onboard.nodes_by_type[NodeType.INFO])
print("Avg questions / info node:", len(data_onboard.question_list) / info_node_count)
print("Avg answers / question node:", sum(len(answer_synonyms[ans]) for ans in answer_synonyms) / len(answer_synonyms))

===== Dataset Statistics =====
- files:  en/diagnose/test_graph.json en/diagnose/test_answers.json
- synonyms: True
- depth: 9  - degree: 6
- answers: 298
- questions: 150
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  3
- answer limit: 0  - maximum loaded:  3
#Nodes 98
#Answers (we don't have synonyms):  298
Avg questions / info node: 2.0
Avg answers / question node: 2.98
