In [1]:
import json
import os
import re
import time
from typing import Dict

from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_experimental.tabular_synthetic_data.base import (
    SyntheticDataGenerator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_ollama import OllamaLLM, ChatOllama
from pydantic import BaseModel

In [2]:
# The "answer" half of a generated record: which function the query maps to
# and the arguments it should be called with.
class Answer(BaseModel):
    # Name of the target function; every seed example in this notebook uses
    # "no_such_function".
    function: str
    # Argument name -> argument value, both strings; empty in the seed examples.
    arguments: Dict[str, str]


# One synthetic record: a natural-language query paired with its expected
# function call. Handed to SyntheticDataGenerator as ``output_schema``.
class func_calls(BaseModel):
    # The natural-language request.
    query: str
    # The function call the request should resolve to.
    answer: Answer

In [3]:
# OllamaLLM wrapper for the llama3.1 model; passed as ``llm`` to the synthetic
# data generator in the generation cell.
llm = OllamaLLM(model="llama3.1", temperature=0.7)

In [50]:
# Seed examples for the few-shot prompt. Each entry holds one JSON record as a
# string under the "example" key, matching the "{example}" placeholder of the
# per-example template used in the generation cell.
# NOTE(review): braces are doubled, presumably so they survive a later
# str.format-style pass inside the prompt template -- verify before changing.
examples = [
    {"example": """{{"query": "What is your name", "answer": {{"function": "no_such_function", "arguments": {{}}}}}}"""},
    {"example": """{{"query": "Where are you from", "answer": {{"function": "no_such_function", "arguments": {{}}}}}}"""},
    {"example": """{{"query": "Change the channel to 5", "answer": {{"function": "no_such_function", "arguments": {{}}}}}}"""},
    {"example": """{{"query": "What is 5 divided by 7", "answer": {{"function": "no_such_function", "arguments": {{}}}}}}"""},
    {"example": """{{"query": "What phase is the moon going to be on 10/29/24", "answer": {{"function": "no_such_function", "arguments": {{}}}}}}"""},
    {"example": """{{"query": "When is the next solar eclipse", "answer": {{"function": "no_such_function", "arguments": {{}}}}}}"""},
    {"example": """{{"query": "How many calories should I eat to loose 1/2lbs every two weeks as a man who weights 170lbs", "answer": {{"function": "no_such_function", "arguments": {{}}}}}}"""}
]

In [66]:
def fix_json_quotes(json_str):
    """Best-effort repair of common quoting mistakes in an almost-JSON string.

    Two independent fixes are applied, in order:

    1. A double-quoted alphabetic word that sits *inside running text*
       (e.g. ``"Turn "up" the volume"``) gets its quotes backslash-escaped so
       it no longer terminates the surrounding JSON string.
    2. A single letter wrapped in single quotes (``'a'``) is re-quoted with
       double quotes.

    Args:
        json_str (str): Text that failed to parse as JSON.

    Returns:
        str: The repaired text. It is not guaranteed to be valid JSON; callers
        must parse it again.
    """
    # Only escape a quoted word when it is preceded by "<word char><space>" and
    # followed by "<space><word char>" or sentence punctuation. Structural
    # tokens (keys and whole values) are preceded by `{`, `,`, `:` or `[`, so
    # they never match. Escaping them -- as an unconditional pattern would --
    # corrupts every key such as "query" and makes the repair always fail.
    # The replacement template `\\"` yields exactly one backslash before the
    # quote, which is the JSON escape for an embedded double quote.
    json_str = re.sub(
        r'(?<=\w\s)"([a-zA-Z]+)"(?=\s\w|[.!?])',
        r'\\"\1\\"',
        json_str,
    )
    # Re-quote single-letter tokens written with single quotes.
    json_str = re.sub(r"'([a-zA-Z])'", r'"\1"', json_str)

    return json_str

def validate_and_fix_json(json_str):
    """Parse ``json_str`` as JSON, retrying once after a quote repair.

    The text is parsed as-is first. If that raises ``json.JSONDecodeError`` it
    is passed through ``fix_json_quotes`` and parsed a second time.

    Args:
        json_str (str): Candidate JSON text.

    Returns:
        The decoded JSON value, or ``None`` when both attempts fail (a
        diagnostic line is printed in that case).
    """
    # First attempt: the text may already be valid.
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        pass

    # Second attempt: parse the repaired text.
    repaired = fix_json_quotes(json_str)
    try:
        return json.loads(repaired)
    except json.JSONDecodeError as err:
        print(f"Failed to fix JSON: {err}")
    return None

In [70]:
# Pass-through template: every few-shot example is already a complete string,
# so the per-example prompt just echoes it. Built once because it is identical
# for every round.
OPENAI_TEMPLATE = PromptTemplate.from_template(template="{example}")


def generate_synthetic_data(runs=1):
    """
    Generates synthetic data by invoking the synthetic_data_generator with specified parameters.

    The module-level ``synthetic_data_generator`` is looked up at call time, so
    it must be bound before this function is called.

    Args:
        runs (int, optional): The number of times to run the data generation process. Defaults to 1 as anything more currently breaks the pipeline.
    Returns:
        list: A list of synthetic results generated by the synthetic_data_generator.
        With a freshly built generator this is expected to hold ``runs`` items -- TODO confirm.
    Example:
        synthetic_results = generate_synthetic_data(runs=1)
    """
    start_time = time.time()
    synthetic_results = synthetic_data_generator.generate(
        subject = "natural language queries for API functions that take 1-4 arguments. Report the function as no_such_function",
        extra = "the arguments must be chosen at random. Make the entire query somewhat unique. Don't report the arguments. Don't repeat the same type of question. Don't make chit-chat and don't have an introduction. Generate 70 examples",
        runs=runs,
    )
    end_time = time.time()

    print(f"It took {end_time - start_time:.2f}seconds to generate data")
    return synthetic_results


# Create the dump directory up front. Without this, a missing directory only
# surfaces as FileNotFoundError after the first (roughly one-minute) LLM call.
os.makedirs('dump_json', exist_ok=True)

# One generation round per output file: failed_10.json .. failed_20.json.
for i in range(10,21):
    # NOTE(review): the few-shot template and the generator are deliberately
    # rebuilt every round. The generator presumably keeps state between
    # generate() calls (only data[0] is read below), so reusing one instance
    # could return stale results -- confirm before hoisting these.
    prompt_template = FewShotPromptTemplate(
        prefix=SYNTHETIC_FEW_SHOT_PREFIX,
        examples=examples,
        suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
        input_variables=["subject", "extra"],
        example_prompt=OPENAI_TEMPLATE,
    )

    synthetic_data_generator = SyntheticDataGenerator(template=prompt_template, llm=llm, output_schema=func_calls)

    data = generate_synthetic_data(runs=1)
    # The model output is treated as JSON records separated by blank lines.
    json_strings = data[0].split('\n\n')

    parsed_json_objects = []
    for json_str in json_strings:
        fixed_json = validate_and_fix_json(json_str)
        # Truthiness check: unparseable chunks (None) are skipped, and so are
        # chunks that parse to an empty/falsy value.
        if fixed_json:
            parsed_json_objects.append(fixed_json)
        else:
            print(f"Skipping invalid JSON string: {json_str}")

    combined_json = json.dumps({"queries": parsed_json_objects}, indent=4)

    with open(f'dump_json/failed_{i}.json', 'w') as json_file:
        json_file.write(combined_json)
    print(f"Saved to dump_json/failed_{i}.json")

It took 56.50seconds to generate data
Saved to dump_json/failed_10.json
It took 64.94seconds to generate data
Saved to dump_json/failed_11.json
It took 47.83seconds to generate data
Saved to dump_json/failed_12.json
It took 45.25seconds to generate data
Saved to dump_json/failed_13.json
It took 42.97seconds to generate data
Saved to dump_json/failed_14.json
It took 53.91seconds to generate data
Saved to dump_json/failed_15.json
It took 59.65seconds to generate data
Saved to dump_json/failed_16.json
It took 48.48seconds to generate data
Saved to dump_json/failed_17.json
It took 54.00seconds to generate data
Saved to dump_json/failed_18.json
It took 64.25seconds to generate data
Saved to dump_json/failed_19.json
It took 52.91seconds to generate data
Saved to dump_json/failed_20.json


In [71]:
###############################################
# Combines multiple JSON files into a single  #
# JSON file with unique queries               #
# Written by ChatGPT                          #
###############################################

import json
import os
from glob import glob

# Directory scanned for the per-round JSON dumps (every *.json file in it is read)
json_dir = 'dump_json'

# Canonical string form (json.dumps with sort_keys=True) of every query object
# kept so far; used to detect duplicates
unique_queries = set()

# De-duplicated query objects in first-seen order; written out under the
# "queries" key of the combined file
final_queries = []


# Function to process each JSON file
def process_json_file(file_path):
    """Merge the queries of one JSON dump into the module-level accumulators.

    Reads ``file_path``, walks its top-level ``"queries"`` list (treated as
    empty when the key is absent) and appends every entry not seen before to
    ``final_queries``. Two entries count as the same when their key-sorted JSON
    serialisations are equal; those serialisations are recorded in
    ``unique_queries``.

    Args:
        file_path (str): Path of the JSON file to read.
    """
    with open(file_path, 'r') as handle:
        payload = json.load(handle)

    for entry in payload.get("queries", []):
        # Key-sorted serialisation makes the comparison independent of key order.
        fingerprint = json.dumps(entry, sort_keys=True)
        if fingerprint in unique_queries:
            continue
        unique_queries.add(fingerprint)
        final_queries.append(entry)


# Read all JSON files in a stable (lexicographic) order. glob() makes no
# ordering guarantee, and because the first occurrence of a duplicate wins,
# an arbitrary order would make the combined file differ between runs.
for json_file in sorted(glob(os.path.join(json_dir, '*.json'))):
    process_json_file(json_file)

# Output the final result to a new file. It is written to the working
# directory, not to json_dir, so a re-run does not read it back as input.
output_data = {"queries": final_queries}
output_file = 'failed_queries.json'
with open(output_file, 'w') as f:
    json.dump(output_data, f, indent=4)

print(f'Unique queries written to {output_file}')

Unique queries written to failed_queries.json
