## Synthetic Data Generation with GPT

In [2]:
import os
from utils.AzureAdapter import AzureAdapter
from dotenv import load_dotenv

# Load environment variables via python-dotenv before reading the Azure settings.
load_dotenv()

# Azure connection settings; each one is None when its variable is not set.
api_key, api_endpoint, api_version = map(
    os.environ.get,
    ("AZURE_API_KEY", "AZURE_API_ENDPOINT", "AZURE_API_VERSION"),
)

# Deployment name handed to llm.call_model(...) in the generation cells.
deployment_name = "gpt-4o"

llm = AzureAdapter(
    api_key=api_key,
    api_endpoint=api_endpoint,
    api_version=api_version,
)

In [2]:
import os
import json
import time
from random import randint
from utils.save_and_load_json import save_json

def parse_and_save(llm_response, folder_path="", save_data=True):
    """
    Parse an LLM JSON response into a list of puzzle dicts, tag each one with a
    random id, and optionally write every puzzle to its own JSON file.

    Args:
        llm_response: JSON text returned by the model. Accepted shapes are a list
            of objects, an object holding a 'puzzles' or 'questions' list, or a
            single object (treated as a one-element list).
        folder_path: Directory the per-puzzle files are written to. An empty
            string means the current working directory.
        save_data: When False, nothing is written to disk.

    Returns:
        The list of puzzle dicts, each extended with matching 'id' and 'sel_idx'.

    Raises:
        json.JSONDecodeError: If llm_response is not valid JSON.
        TypeError: If the parsed payload is not a list of JSON objects.
    """
    seed = int(time.time())

    # Parse the response from the llm
    data = json.loads(llm_response)

    # Unwrap the known envelope keys; a bare object counts as a single puzzle
    if isinstance(data, dict):
        if 'puzzles' in data:
            data = data['puzzles']
        elif 'questions' in data:
            data = data['questions']
        else:
            data = [data]

    # Validate up front so a malformed payload never leaves a half-written batch
    if not isinstance(data, list) or not all(isinstance(puzzle, dict) for puzzle in data):
        raise TypeError(
            f"Expected a list of JSON objects, got {type(data).__name__}"
        )

    # Ids drawn so far in this call; redraw on collision so neither ids nor
    # file names repeat within one batch
    used_ids = set()

    # Save each puzzle as own json
    for puzzle in data:
        rand = randint(0, 1000000)
        while rand in used_ids:
            rand = randint(0, 1000000)
        used_ids.add(rand)

        puzzle['id'] = rand
        puzzle['sel_idx'] = rand
        if save_data:
            # os.path.join keeps an empty folder_path relative instead of
            # producing a path rooted at '/'
            save_json(data=puzzle, file_path=os.path.join(folder_path, f'{seed}_{rand}.json'))

    print((type(data), len(data)))
    return data

def has_enough_json_files(folder_path, target_count):
    """
    Report whether a directory holds at least `target_count` JSON files.

    Entries are matched by a case-insensitive '.json' suffix on their name;
    subdirectories are not searched.

    Raises:
        ValueError: If `folder_path` does not exist or is not a directory.
    """
    if os.path.isdir(folder_path):
        json_names = [
            entry for entry in os.listdir(folder_path)
            if entry.lower().endswith('.json')
        ]
        return len(json_names) >= target_count

    raise ValueError(f"The folder path '{folder_path}' does not exist or is not a directory")

#### Generating puzzles with one LLM call

In [6]:
from prompts.prompts import entire_puzzle_generation_prompt

folder_path = 'data/synthetic/raw/gpt_one_call'
TARGET_PUZZLE_COUNT = 300
# Stop instead of looping forever when the service keeps failing (bad key, outage, ...)
MAX_CONSECUTIVE_FAILURES = 5

# has_enough_json_files raises ValueError for a missing directory, so create it up front
os.makedirs(folder_path, exist_ok=True)

prompt = 'Generate 10 new puzzles'
consecutive_failures = 0

while not has_enough_json_files(folder_path, target_count=TARGET_PUZZLE_COUNT):
    try:
        res = llm.call_model(prompt=prompt, system_prompt=entire_puzzle_generation_prompt, deployment_name=deployment_name)
        data = parse_and_save(folder_path=folder_path, llm_response=res)
        print(f'{len(data)} successfully generated')
        consecutive_failures = 0
    except Exception as ex:
        # A single bad response (e.g. truncated JSON) is expected now and then; skip it
        print('Skipping generation!')
        print(ex)
        consecutive_failures += 1
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            print(f'Aborting after {consecutive_failures} consecutive failures.')
            break

(<class 'list'>, 1)
1 successfully generated
(<class 'list'>, 2)
2 successfully generated
Skipping generation!
Unterminated string starting at: line 240 column 19 (char 14164)
(<class 'list'>, 9)
9 successfully generated
Skipping generation!
Unterminated string starting at: line 228 column 33 (char 19599)
(<class 'list'>, 2)
2 successfully generated
(<class 'list'>, 2)
2 successfully generated
(<class 'list'>, 2)
2 successfully generated
(<class 'list'>, 5)
5 successfully generated
(<class 'list'>, 1)
1 successfully generated
(<class 'list'>, 5)
5 successfully generated


#### Creating puzzles in two LLM calls

In [10]:
from prompts.prompts import question_generation_prompt, solution_generation_prompt
import json

n = 1
folder_path = 'data/synthetic/raw/gpt_two_calls'
TARGET_PUZZLE_COUNT = 265
# How many times the same questions are re-sent for solving before being discarded
MAX_SOLUTION_RETRIES = 3
# Stop instead of looping forever when no puzzle can be produced (bad key, outage, ...)
MAX_CONSECUTIVE_FAILURES = 10

questions = None
retry = False            # True while the current questions are waiting for another solve attempt
retry_count = 0          # failed solve attempts for the current questions
consecutive_failures = 0 # failed LLM calls since the last saved solution

# Ensure folder exists: has_enough_json_files raises ValueError for a missing directory
os.makedirs(folder_path, exist_ok=True)

while not has_enough_json_files(folder_path=folder_path, target_count=TARGET_PUZZLE_COUNT):
    if not retry:
        try:
            # Create puzzle questions
            prompt_questions = "Generate {} new puzzle questions".format(n)
            res_questions = llm.call_model(prompt=prompt_questions, system_prompt=question_generation_prompt, deployment_name=deployment_name)
            questions = parse_and_save(llm_response=res_questions, save_data=False)
            print(f'{len(questions)} questions successfully generated')
            retry_count = 0
        except Exception as ex:
            print('Skipping question generation!')
            print(ex)
            questions = None
            retry_count = 0
            consecutive_failures += 1

    if questions:
        try:
            formatted_questions = json.dumps(questions, indent=2)
            # Use the number of questions actually returned; the model may not honour n
            prompt_solutions = f"Solve all {len(questions)} logical puzzles:\n{formatted_questions}"

            res_solutions = llm.call_model(prompt=prompt_solutions, system_prompt=solution_generation_prompt, deployment_name=deployment_name)
            solutions = parse_and_save(folder_path=folder_path, llm_response=res_solutions, save_data=True)
            print(f'{len(solutions)} solutions successfully generated')
            retry = False
            retry_count = 0
            consecutive_failures = 0
        except Exception as ex:
            print('Retrying solution generation!')
            print(ex)
            retry = True
            retry_count += 1
            consecutive_failures += 1
            if retry_count > MAX_SOLUTION_RETRIES:
                print("Too many retries, generating new questions")
                retry = False
                questions = None
                retry_count = 0

    if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
        print(f'Aborting after {consecutive_failures} consecutive failures.')
        break

(<class 'list'>, 1)
1 questions successfully generated
(<class 'list'>, 1)
1 solutions successfully generated
(<class 'list'>, 1)
1 questions successfully generated
(<class 'list'>, 1)
1 solutions successfully generated
(<class 'list'>, 1)
1 questions successfully generated
(<class 'list'>, 1)
1 solutions successfully generated
(<class 'list'>, 1)
1 questions successfully generated
(<class 'list'>, 1)
1 solutions successfully generated
(<class 'list'>, 1)
1 questions successfully generated
(<class 'list'>, 1)
1 solutions successfully generated
(<class 'list'>, 1)
1 questions successfully generated
(<class 'list'>, 1)
1 solutions successfully generated
(<class 'list'>, 1)
1 questions successfully generated
(<class 'list'>, 1)
1 solutions successfully generated
(<class 'list'>, 1)
1 questions successfully generated
(<class 'list'>, 1)
1 solutions successfully generated
(<class 'list'>, 1)
1 questions successfully generated
(<class 'list'>, 1)
1 solutions successfully generated
(<class 'l

#### Puzzle verification

##### Duplicates
LLMs sometimes generate redundant puzzles, so we need to filter out duplicates. We can use two different approaches:
1. Use an LLM as a classifier to assess, based on semantics, whether there are duplicates.
2. Use a syntactic assessment via the Levenshtein distance.

##### Levenshtein Distance

In [25]:
from utils.save_and_load_json import load_all_json_files

# Load the raw two-call puzzles; the cells below iterate `data` and call .get() on
# each item, so this is presumably a list of dicts (one per JSON file) — TODO confirm
data = load_all_json_files(folder_path='data/synthetic/raw/gpt_two_calls')

Loaded 20 JSON files from ./data/synthetic/gpt_two_calls


In [22]:
# Levenshtein distance
from utils.duplicate_management import find_similar_items

# Puzzle fields handed to the similarity check
keys = [
    'question',
    'question_parsing',
    'answer',
    'cot',
    'cot_parsing',
]

# Whatever find_similar_items reports for these fields at threshold 0.5 is printed as-is
dup = find_similar_items(data, keys_to_compare=keys, threshold=0.5)
print(dup)

[]


In [16]:
# Look up two puzzles by id; next(..., None) yields None when no item matches
item_1 = next(filter(lambda item: item.get('id') == 669342, data), None)
item_2 = next(filter(lambda item: item.get('id') == 129199, data), None)

print(item_1, item_2, sep='\n')

{'question': 'There are five students: A, B, C, D, and E who must be assigned to one of two study groups, X or Y. The following conditions apply: 1) If A is in group X, then B must be in group Y. 2) C and D must be in different groups. 3) If E is in group Y, then C must also be in group Y. 4) A and D cannot be in the same group. Question: If C is in group X, which of the following must be true? a) A is in group X. b) B is in group Y. c) E is in group Y. d) D is in group Y.', 'question_parsing': ['Entities: Students A, B, C, D, E.', 'Initial setup: Assign each to group X or Y.', 'Rule 1: If A is in X, then B is in Y.', 'Rule 2: C and D must be in different groups.', 'Rule 3: If E is in Y, then C is in Y.', 'Rule 4: A and D cannot be in the same group.', 'Specific scenario: C is in group X.'], 'answer': 'd', 'cot': "Given C is in group X, D must be in group Y (Rule 2). Since A and D cannot be in the same group (Rule 4), A must be in group X. Rule 1 does not force a contradiction here sin

##### Language Model Classifier

Since the syntactic filter indicates that there are no redundancies, we will not implement a semantic duplicate finder via a language model.

### Verify structure of generated json files

In [3]:
from utils.save_and_load_json import load_all_json_files
from utils.structure_verification import verify_all_gpt_items

# one gpt call: check the structure of the raw puzzles and store the valid ones
folder_path = './data/synthetic/raw/gpt_one_call'
data_one_gpt_call = load_all_json_files(folder_path=folder_path)

invalid_items, valid_items = verify_all_gpt_items(data_one_gpt_call)
print(len(valid_items))

if invalid_items:
    # Each entry pairs an item index with the problems reported for that item
    for idx, errs in invalid_items:
        print(f"Item at index {idx} has errors:")
        for e in errs:
            print("  - " + str(e))
else:
    print("All items match the required structure.")

# Only the items that passed verification are written to the processed file
save_json(data=valid_items, file_path='./data/synthetic/processed/gpt_one_call/gpt_one_call_processed.json')

Loaded 301 JSON files from ./data/synthetic/raw/gpt_one_call
Found 43 wrongly formatted entries.
258
Item at index 7 has errors:
  -   cot_parsing[0] missing keys: {'Verification'}
  -   cot_parsing[0] has unexpected keys: {'verification'}
  -   cot_parsing[1] missing keys: {'Verification'}
  -   cot_parsing[1] has unexpected keys: {'verification'}
  -   cot_parsing[2] missing keys: {'Verification'}
  -   cot_parsing[2] has unexpected keys: {'verification'}
Item at index 15 has errors:
  - Missing keys at top level: {'cot', 'question_parsing', 'cot_parsing', 'answer', 'question'}
  - Unexpected extra keys at top level: {'message'}
Item at index 19 has errors:
  - Missing keys at top level: {'cot', 'question_parsing', 'cot_parsing', 'answer', 'question'}
  - Unexpected extra keys at top level: {'error'}
Item at index 23 has errors:
  - Missing keys at top level: {'cot', 'cot_parsing', 'answer'}
Item at index 34 has errors:
  -   cot_parsing[0] missing keys: {'Verification'}
  -   cot_pa

True

In [4]:
# two gpt calls: check the structure of the raw puzzles and store the valid ones
folder_path = './data/synthetic/raw/gpt_two_calls'
# Held under its own name so the one-call dataset in `data_one_gpt_call` is not overwritten
data_two_gpt_calls = load_all_json_files(folder_path=folder_path)
print(type(data_two_gpt_calls[0]))

invalid_items, valid_items = verify_all_gpt_items(data_two_gpt_calls)
print(len(valid_items))
if not invalid_items:
    print("All items match the required structure.")
else:
    # Each entry pairs an item index with the problems reported for that item
    for idx, errs in invalid_items:
        print(f"Item at index {idx} has errors:")
        for e in errs:
            print("  -", e)

# Only the items that passed verification are written to the processed file
save_json(data=valid_items, file_path='./data/synthetic/processed/gpt_two_calls/gpt_two_calls_processed.json')

Loaded 265 JSON files from ./data/synthetic/raw/gpt_two_calls
<class 'dict'>
Found 11 wrongly formatted entries.
254
Item at index 12 has errors:
  -   cot_parsing[0] missing keys: {'Verification'}
  -   cot_parsing[0] has unexpected keys: {'verification'}
  -   cot_parsing[1] missing keys: {'Verification'}
  -   cot_parsing[1] has unexpected keys: {'verification'}
  -   cot_parsing[2] missing keys: {'Verification'}
  -   cot_parsing[2] has unexpected keys: {'verification'}
Item at index 32 has errors:
  - Missing keys at top level: {'cot', 'question_parsing', 'cot_parsing', 'answer', 'question'}
  - Unexpected extra keys at top level: {'questions'}
Item at index 35 has errors:
  -   question_parsing[0] should be a str, got 'dict'
  -   question_parsing[1] should be a str, got 'dict'
  -   question_parsing[2] should be a str, got 'dict'
  -   question_parsing[3] should be a str, got 'dict'
  -   question_parsing[4] should be a str, got 'dict'
  -   question_parsing[5] should be a str, 

True