## Synthetic Data Generation with GPT

In [1]:
import os
from utils.AzureAdapter import AzureAdapter
from dotenv import load_dotenv

# Load variables through python-dotenv before reading the Azure settings.
load_dotenv()

# Connection settings come from the environment; an unset variable yields None.
api_key = os.environ.get("AZURE_API_KEY")
api_endpoint = os.environ.get("AZURE_API_ENDPOINT")
api_version = os.environ.get("AZURE_API_VERSION")

# Deployment name handed to the model calls in this notebook.
deployment_name = "gpt-4o"

# Shared client used by all generation cells.
llm = AzureAdapter(
    api_version=api_version,
    api_endpoint=api_endpoint,
    api_key=api_key,
)

In [2]:
import json
import time
from random import randint
from utils.save_json import save_json

def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Handles both a bare opening fence and one carrying a language tag
    (for example ```json). Text that does not start with a fence is
    returned unchanged.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return text
    # Drop the whole opening fence line, including an optional language tag.
    _, _, body = stripped.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body


def parse_and_save(llm_response, folder_path="", save_data=True):
    """Parse an LLM JSON response into a list of puzzles and optionally save them.

    The response may be a JSON list of puzzles, a JSON object with a
    ``'puzzles'`` key holding that list, or a single puzzle object (which is
    wrapped in a one-element list). Each puzzle receives the same random
    integer under both ``'id'`` and ``'sel_idx'``.

    Args:
        llm_response: JSON text returned by the model. A surrounding Markdown
            code fence is tolerated and removed before parsing.
        folder_path: Directory for the per-puzzle JSON files. When empty, files
            are written relative to the current working directory.
        save_data: When True, each puzzle is written through ``save_json`` to
            ``<folder_path>/<timestamp>_<id>.json``.

    Returns:
        The list of puzzle dicts, with ``'id'`` and ``'sel_idx'`` set.

    Raises:
        json.JSONDecodeError: If the response is not valid JSON.
    """
    # One timestamp per call, shared by every file name written by this call.
    timestamp = int(time.time())

    # Parse the response from the llm
    if isinstance(llm_response, str):
        llm_response = _strip_code_fence(llm_response)
    data = json.loads(llm_response)

    # Normalise the payload to a list of puzzles
    if isinstance(data, dict) and 'puzzles' in data:
        data = data['puzzles']

    if isinstance(data, dict):
        data = [data]

    # Save each puzzle as own json
    used_ids = set()
    for puzzle in data:
        # Re-draw on collision: a repeated id within one call would reuse the
        # same file name and silently overwrite the earlier puzzle.
        rand = randint(0, 1000000)
        while rand in used_ids:
            rand = randint(0, 1000000)
        used_ids.add(rand)

        puzzle['id'] = rand
        puzzle['sel_idx'] = rand
        if save_data:
            file_name = f'{timestamp}_{rand}.json'
            # An empty folder_path must not produce a path rooted at "/".
            file_path = f'{folder_path}/{file_name}' if folder_path else file_name
            save_json(data=puzzle, file_path=file_path)

    print((type(data), len(data)))
    return data

#### Generating puzzles with one LLM call

In [3]:
from prompts.prompts import entire_puzzle_generation_prompt

# User prompt for the single-call variant; the imported system prompt carries the detailed instructions.
prompt = 'Generate 10 new puzzles'
res = llm.call_model(prompt=prompt, system_prompt=entire_puzzle_generation_prompt, deployment_name=deployment_name)

In [6]:
# Parse the single-call response; with the default save_data=True each puzzle is also written to disk.
data = parse_and_save(
    llm_response=res,
    folder_path='./data/synthetic/gpt_one_call',
)

(<class 'list'>, 1)


#### Creating puzzles in two LLM calls

In [3]:
from prompts.prompts import question_generation_prompt, solution_generation_prompt

# Step 1: create puzzle questions only; they are parsed but not written to disk.
prompt_questions = "Generate 10 new puzzle questions"
res_questions = llm.call_model(prompt=prompt_questions, system_prompt=question_generation_prompt, deployment_name=deployment_name)
questions = parse_and_save(llm_response=res_questions, save_data=False)

# Step 2: send the questions back as JSON. Formatting the list directly would
# embed a Python repr (single quotes, True/None) instead of valid JSON.
questions_json = json.dumps(questions, indent=2, ensure_ascii=False)
prompt_solutions = "Solve the following logical puzzles:\n {}".format(questions_json)
res_solutions = llm.call_model(prompt=prompt_solutions, system_prompt=solution_generation_prompt, deployment_name=deployment_name)
solutions = parse_and_save(folder_path='./data/synthetic/gpt_two_calls', llm_response=res_solutions, save_data=True)

(<class 'list'>, 1)
(<class 'list'>, 1)


#### Puzzle verification

##### Duplicates
LLMs often generate redundant puzzles, so we need to filter out duplicates. We can use two different approaches here:
1. We can use an LLM as a classifier to assess, based on semantics, whether there are duplicates.
2. We can use a syntactic assessment via the Levenshtein distance.

##### Quality