# Filter generated data from ChatGPT

1. Generation:
    1. Generated questions data with ChatGPT, using the node text as context
    2. Generated synonyms for the questions from 1)
2. Filtering:
    1. Asked ChatGPT to rate if the question was answerable using the node's context
    2. This notebook: remove the unanswerable questions from the generated data and save the result to a new file

In [1]:
import json

# Load the ChatGPT-generated questions together with the judgement ChatGPT gave
# on whether each question is answerable from its node's text.
judgement_path = "../../../resources/en/reimburse/generated/chatgpt/augmentation_with_judgement.json"
with open(judgement_path, "r") as f:
    data = json.load(f)

# Quick sanity check on how many samples were read.
print(len(data), "samples")

740 samples


In [2]:
import sys
import os 

# Put the directory three levels up on the import path so that the
# `data.dataset` import in a later cell can be resolved.
repo_root = '../../..'
sys.path.append(repo_root)

# Show the resolved working directory; every relative path in this notebook
# is interpreted relative to it.
print(os.path.realpath("."))

/mount/arbeitsdaten41/projekte/asr-2/vaethdk/cts_newcodebase_rollback/conversational-tree-search/generation/reimburse/chatgpt


In [3]:
# Raw judgement strings that matched none of the key phrases below.
judgements = set()

# Key phrases used to interpret ChatGPT's free-text judgement.
# "start" keys are matched against the beginning of the judgement, "end" keys
# against its end, and undecided keys anywhere inside it.
positive_start_keys = ["Yes.", "The answer is yes.", "Yes,", "The given text answers the question", "The answer is:", "The per diem reduction is a percentage."]
positive_end_keys = ["the answer is yes.", "Therefore, the answer is yes, but only if there is a pressing personal reason."]
undecided_keys = ["it does not answer the question with a simple yes or no", "it does not directly answer the question", "The answer is unclear.", "The answer is not explicitly given in the text.",
                  'The given text does not directly answer the question.', "The given text partially answers the question."]
negative_start_keys = ["No.", "The given text does not answer the question", "The given text does not provide a clear answer", "The answer is no.",
                "The given text does not provide an answer to the question.", "The given text does not provide information about", "There is no evidence in the given text to answer the question."]
negative_end_keys = ["Therefore, the answer is no."]


def classify_judgement(judgement):
    """Map a judgement string to the bucket it belongs to.

    The checks run in a fixed order and the first hit wins:
    positive prefix, positive suffix, negative prefix, negative suffix,
    undecided substring.

    Args:
        judgement: the judgement text stored with a sample.

    Returns:
        "positive", "negative" or "undecided", or None when no key phrase
        matches.
    """
    # str.startswith / str.endswith accept a tuple of candidates, which
    # replaces the explicit loop over each key list.
    if judgement.startswith(tuple(positive_start_keys)) or judgement.endswith(tuple(positive_end_keys)):
        return "positive"
    if judgement.startswith(tuple(negative_start_keys)) or judgement.endswith(tuple(negative_end_keys)):
        return "negative"
    if any(undecided_key in judgement for undecided_key in undecided_keys):
        return "undecided"
    return None


filtered = []   # samples judged answerable (kept)
undecided = []  # samples with an inconclusive judgement
remove = []     # samples judged not answerable

# Bucket name -> list that collects the samples of that bucket.
buckets = {"positive": filtered, "negative": remove, "undecided": undecided}

for sample in data.values():
    judgement = sample['judgement']
    verdict = classify_judgement(judgement)
    if verdict is None:
        # Unknown phrasing: record and print it so the key lists can be extended.
        judgements.add(judgement)
        print(sample)
    else:
        buckets[verdict].append(sample)


print("Unmatched judgmenent responses:", judgements)
print(len(judgements))
print("Positive matches:", len(filtered))
print("Negative matches:", len(remove))
print("Undecided:", len(undecided))

Unmatched judgmenent responses: set()
0
Positive matches: 363
Negative matches: 366
Undecided: 11


In [4]:
from data.dataset import ReimburseGraphDataset, DataAugmentationLevel, NodeType
# Load the human-authored training graph with augmentation switched off; the
# node lookup of this dataset is used later to attach node text and type.
human_data_train = ReimburseGraphDataset(
    'en/reimburse/train_graph.json',
    'en/reimburse/train_answers.json',
    False,  # NOTE(review): presumably the "use synonyms" switch — confirm against ReimburseGraphDataset
    augmentation=DataAugmentationLevel.NONE,
    resource_dir="../../../resources",
    language="en",
)

- not using synonyms
===== Dataset Statistics =====
- files:  en/reimburse/train_graph.json en/reimburse/train_answers.json
- synonyms: False
- depth: 20  - degree: 13
- answers: 73
- questions: 279
- loaded original data: True
- loaded generated data: False
- question limit: 0  - maximum loaded:  7
- answer limit: 0  - maximum loaded:  1


In [7]:
# Add the synonyms of every accepted question and format all entries to match
# the data loader.
import time


def _unique_key(taken):
    """Return a digit-only, timestamp-based key that is not yet in `taken`.

    The key is `time.time()` with the decimal point removed. Two calls inside
    the same clock tick would yield the same string and the later entry would
    silently overwrite the earlier one, so the candidate is incremented until
    it is unused.

    Args:
        taken: container of keys already in use (membership is tested with `in`).

    Returns:
        A key string that is not contained in `taken`.
    """
    key = str(time.time()).replace(".", "")
    while key in taken:
        key = str(int(key) + 1)
    return key


filtered_with_synonyms = {}
for sample in filtered:
    node = human_data_train.nodes_by_key[sample['dialog_node_key']]
    if node.node_type != NodeType.INFO:
        # Only questions attached to INFO nodes are exported.
        print("SKIPPING NODE TYPE", node.node_type.value)
        continue
    # The accepted question itself comes first, followed by each of its
    # synonyms; every text becomes its own entry with its own key.
    for text in [sample['text'], *sample['synonyms']]:
        key = _unique_key(filtered_with_synonyms)
        filtered_with_synonyms[key] = {
            'key': key,
            'dialog_node_key': sample['dialog_node_key'],
            'text': text,
            'node_text': node.text,
            'node_type': node.node_type.value
        }

print("Final dataset size", len(filtered_with_synonyms))

Final dataset size 2178


In [8]:
# Persist the filtered questions (including synonyms) as a single JSON object.
output_path = "../../../resources/en/reimburse/generated/chatgpt/train_questions_filtered_synonyms.json"
with open(output_path, "w") as f:
    json.dump(filtered_with_synonyms, f)

In [2]:
# Lowercase answer synonym keys
import json

# The file is rewritten in place: it is read, its keys are lowercased, and the
# result is written back to the same path.
answers_path = "../../../resources/en/reimburse/generated/chatgpt/train_answers.json"

answer_synonyms = {}
with open(answers_path, "r") as f:
    # NOTE: this rebinds `data`, the name the first cell uses for the judged
    # questions; re-run that cell before re-running the filtering cell.
    data = json.load(f)

for syn_key, synonyms in data.items():
    lowered_key = syn_key.lower()
    if lowered_key in answer_synonyms:
        # Two keys that differ only in case end up on the same lowercased key.
        # Merge their synonym lists (skipping duplicates) instead of letting
        # the later list silently replace the earlier one.
        existing = answer_synonyms[lowered_key]
        answer_synonyms[lowered_key] = existing + [s for s in synonyms if s not in existing]
    else:
        answer_synonyms[lowered_key] = synonyms

print(answer_synonyms)

with open(answers_path, "w") as f:
    json.dump(answer_synonyms, f)

{'business trip': ['Work-related journey', 'Professional excursion', 'Corporate travel', 'Job-related expedition', 'Business excursion', 'Work trip', 'Company travel', 'Career-related voyage', 'Occupational journey', 'Business journey'], 'intracity business trip': ['A business trip within the city limits', 'A work-related journey within the city', 'A local business excursion', 'A professional visit within the city', 'A business expedition within the urban area', 'A work-related travel within the city boundaries', 'A business tour within the metropolitan area', 'A professional trip within the city limits', 'A local work-related excursion', 'A business visit within the city precincts'], 'what is the difference between an intracity business trip and a business trip?': ['How does an intracity business trip differ from a regular business trip?', 'What sets apart an intracity business trip from a typical business trip?', 'In what ways are intracity business trips distinct from regular busine