### GraphQL Query Generator

In [80]:
#GraphQL Query Generator
import random
import pandas as pd

# Set display options to show all rows and columns
# (passing None lifts the corresponding pandas display limit).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
import nltk
from nltk.corpus import wordnet as wn
# Download the WordNet data (and the omw-1.4 package) so that `wn.all_synsets`
# can be used later in this file to build the noun vocabulary.
nltk.download('omw-1.4')
nltk.download('wordnet')

def _weighted_choice(weighted_items):
    """Return the value of one ``(value, weight)`` pair, drawn in proportion to its weight."""
    weights = [weight for _, weight in weighted_items]
    value, _ = random.choices(weighted_items, weights=weights)[0]
    return value


def _happens(probability):
    """Return True with a ``probability`` percent chance (0 never, 100 always)."""
    return random.randint(1, 100) <= probability


def _random_condition(input_elements):
    """Build one ``prop : { operator : value }`` filter condition (no outer braces)."""
    prop = random.choice(input_elements["properties"])
    operator = _weighted_choice(input_elements["comparison_operators"])
    if operator == "_like":
        needle = random.choice(input_elements["like_arguments"])
        value = f'"%{needle}%"'
    else:
        value = round(random.uniform(0, 1000))
    return f"{prop} : {{ {operator} : {value} }}"


def _random_where(input_elements):
    """Build a complete, brace-balanced ``where : { ... }`` argument."""
    if not _happens(input_elements["logical_operator_probability"]):
        return f"where : {{ {_random_condition(input_elements)} }}"

    # A logical operator always combines exactly two conditions. Both `_and`
    # and `_or` are emitted in list form so that two conditions on the same
    # property never collide as duplicate keys of one object.
    logical_operator = _weighted_choice(input_elements["logical_operators"])
    first = _random_condition(input_elements)
    second = _random_condition(input_elements)
    return f"where : {{ {logical_operator} : [ {{ {first} }} , {{ {second} }} ] }}"


def _random_arguments(input_elements):
    """Return the list of argument strings (possibly empty) for the root entity."""
    properties = input_elements["properties"]
    arguments = []

    if _happens(input_elements["limit_probability"]):
        arguments.append(f"limit : {random.randint(1, 20)}")

    if _happens(input_elements["order_by_probability"]):
        prop = random.choice(properties)
        order = _weighted_choice(input_elements["orderings"])
        arguments.append(f"order_by : {{ {prop} : {order} }}")

    if _happens(input_elements["filter_probability"]):
        arguments.append(_random_where(input_elements))

    if _happens(input_elements["distinct_on_probability"]):
        arguments.append(f"distinct_on : {random.choice(properties)}")

    return arguments


def _random_selection(input_elements):
    """Return the list of selection-set entries: aggregators or (nested) properties."""
    properties = input_elements["properties"]

    if _happens(input_elements["aggregator_probability"]):
        aggregators = input_elements["aggregators"]
        entries = []
        for _ in range(random.randint(1, len(aggregators))):
            aggregator = _weighted_choice(aggregators)
            prop = random.choice(properties)
            entries.append(f"{aggregator} {{ {prop} }}")
        return entries

    entries = []
    # The weighted value is an upper bound; the actual count is uniform in [1, bound].
    max_properties = _weighted_choice(input_elements["property_weights"])
    for _ in range(random.randint(1, max_properties)):
        prop = random.choice(properties)
        entry = prop
        if _happens(input_elements["nested_probability"]):
            # A property is never nested inside itself. If there is no other
            # property to pick from, fall back to a plain (un-nested) field
            # instead of failing on an empty choice.
            candidates = [p for p in properties if p != prop]
            if candidates:
                max_nested = _weighted_choice(input_elements["nested_property_weights"])
                nested = [
                    random.choice(candidates)
                    for _ in range(random.randint(1, max_nested))
                ]
                entry = f"{prop} {{ {' '.join(nested)} }}"
        entries.append(entry)
    return entries


def generate_graphql_query(input_elements, min_target_length=18,
                           max_target_length=137, length_tolerance=10):
    """Generate one random, space-tokenised GraphQL query string.

    Candidate queries are generated repeatedly until one passes the length
    check: a target length is drawn uniformly from
    ``[min_target_length, max_target_length]`` and the candidate is accepted
    when its length is within ``length_tolerance`` characters of that target.
    The loop has no attempt limit, so it does not terminate if the vocabulary
    in ``input_elements`` cannot produce a query of an acceptable length.

    Args:
        input_elements: dict with the following keys.
            ``entities``, ``properties``, ``like_arguments``: lists of names.
            ``*_probability`` (``argument``, ``limit``, ``order_by``,
            ``filter``, ``logical_operator``, ``distinct_on``, ``aggregator``,
            ``nested``): percentage chance (0-100) of emitting that feature.
            ``orderings``, ``logical_operators``, ``comparison_operators``,
            ``aggregators``: lists of ``(name, weight)`` pairs.
            ``property_weights``, ``nested_property_weights``: lists of
            ``(max_count, weight)`` pairs.
        min_target_length: lower bound of the randomly drawn target length.
        max_target_length: upper bound of the randomly drawn target length.
        length_tolerance: allowed distance between query length and target.

    Returns:
        The generated query, e.g. ``query { user ( limit : 5 ) { id } }``.
        Arguments are separated by `` , `` and parentheses are only emitted
        when at least one argument was actually chosen.
    """
    # TODO: control the length distribution via weights.
    while True:
        entity = random.choice(input_elements["entities"])
        head = f"query {{ {entity}"

        if _happens(input_elements["argument_probability"]):
            arguments = _random_arguments(input_elements)
            if arguments:  # never emit an empty "( )" argument list
                head += f" ( {' , '.join(arguments)} )"

        selection = " ".join(_random_selection(input_elements))
        query = f"{head} {{ {selection} }} }}"

        target_length = random.randint(min_target_length, max_target_length)
        if abs(len(query) - target_length) <= length_tolerance:
            return query


# Build the vocabularies the generator samples from.
# Only `omw-1.4` and `wordnet` are downloaded at the top of this file, so fetch
# the `words` corpus here; without it the lookup below raises LookupError on a
# machine that has never downloaded that corpus.
nltk.download('words')
words_list = [word.lower() for word in nltk.corpus.words.words()]
short_words_list = [word for word in words_list if len(word) <= 5]
# Take the part of each noun synset name before the first '.', de-duplicated.
# Sorted so the list order does not depend on per-process string hashing.
nouns = sorted({x.name().split('.', 1)[0] for x in wn.all_synsets('n')})

# Example input elements.
# Probabilities are percentages (compared against random.randint(1, 100));
# "(value, weight)" lists are sampled in proportion to their weights.
input_elements = {
    # Vocabularies, sampled with replacement from the WordNet nouns.
    "entities": random.choices(nouns, k=50000),
    "like_arguments": random.choices(nouns, k=10000),
    "properties": random.choices(nouns, k=10000),

    # (upper bound on the number of returned properties, weight)
    "property_weights": [(1, 50), (2, 37), (3, 15), (4, 3), (5, 0)],
    # (upper bound on the number of nested properties, weight)
    "nested_property_weights": [(1, 100), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)],

    "nested_probability": 40,

    "argument_probability": 83,

    "filter_probability": 83,

    "order_by_probability": 39,

    "limit_probability": 24,

    "distinct_on_probability": 8,

    "orderings": [("asc", 10), ("desc", 17)],

    "logical_operator_probability": 45,

    "logical_operators": [("_and", 50), ("_or", 50)],

    "aggregators": [("min", 2.2), ("max", 5.23), ("sum", 2.70), ("avg", 6.5)],

    "aggregator_probability": 9,

    # All comparison operators carry a leading underscore; "lte"/"gte" were
    # previously missing it, unlike every other operator in this list.
    "comparison_operators": [
        ("_eq", 48),
        ("_gt", 6.71),
        ("_lt", 3.2),
        ("_lte", 0.9),
        ("_gte", 0.7),
        ("_neq", 2.88),
        ("_like", 2.16),
    ],
}

# Generate a random GraphQL query

#Create a dataframe of random GraphQL queries

def create_random_queries(input_elements, num_queries):
    """Generate ``num_queries`` random GraphQL queries and return them as a DataFrame.

    The returned frame has a single column, ``query``, holding one generated
    query string per row.
    """
    generated = [generate_graphql_query(input_elements) for _ in range(num_queries)]
    return pd.DataFrame({'query': generated})

# Quick smoke test (disabled):
# create_random_queries(input_elements, 10)

# Build the synthetic dataset of 4500 queries.
synthetic_queries_df = create_random_queries(input_elements, 4500)

synthetic_queries_df.head()

# Optional CSV export (disabled):
# synthetic_queries_df.to_csv('synthetic.csv', index=False)

import json
import os

# One {"query": ...} dictionary per DataFrame row.
query_list = synthetic_queries_df.to_dict(orient='records')

# Destination of the JSON export; its parent directory is created on demand.
file_path = './SPEGQL-dataset/dataset/vanilla_error_mirror_4500.json'
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Serialise first, then write the 2-space-indented JSON text in one go.
with open(file_path, 'w') as f:
    f.write(json.dumps(query_list, indent=2))


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jakobtolstrup/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jakobtolstrup/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
