## Entities

In [24]:
import pandas as pd
from itertools import product

entities = ["singer", "teacher", "town"]
question_templates = ["Give me {}", "What is {}", "Tell me about {}", "What do you know about {}"]


def create_simple_queries_and_questions(entities, question_templates):
    queries = []
    questions = []

    for entity, template in product(entities, question_templates):
        question = template.format(entity)
        query = f"query {{ {entity} }}"
        queries.append(query)
        questions.append(question)

    data = {'Question': questions, 'Query': queries}
    df = pd.DataFrame(data)

    return df

df = create_simple_queries_and_questions(entities, question_templates)
print(df)



                          Question              Query
0                   Give me singer   query { singer }
1                   What is singer   query { singer }
2             Tell me about singer   query { singer }
3    What do you know about singer   query { singer }
4                  Give me teacher  query { teacher }
5                  What is teacher  query { teacher }
6            Tell me about teacher  query { teacher }
7   What do you know about teacher  query { teacher }
8                     Give me town     query { town }
9                     What is town     query { town }
10              Tell me about town     query { town }
11     What do you know about town     query { town }


## Entities, and properties

In [26]:
import pandas as pd
from itertools import product

entities = ["singer", "teacher", "town"]
properties = ["children", "friend", "rival"]
question_templates = ["Give me {}", "What is {}", "Tell me about {}", "What do you know about {}"]

def create_simple_queries_and_questions(entities, properties, question_templates):
    queries = []
    questions = []

    all_properties = properties + [entity for entity in entities if entity not in properties]

    for entity, property_, template in product(entities, all_properties, question_templates):
        if entity != property_:  # Exclude cases where entity is used as its own property
            question_entity = f"the {entity}'s {property_}"
            question = template.format(question_entity)
            query = f"query {entity} {{ {property_} }}"
            queries.append(query)
            questions.append(question)

    data = {'Question': questions, 'Query': queries}
    df = pd.DataFrame(data)

    return df

df = create_simple_queries_and_questions(entities, properties, question_templates)
print(df)


                                         Question                       Query
0                   Give me the singer's children   query singer { children }
1                   What is the singer's children   query singer { children }
2             Tell me about the singer's children   query singer { children }
3    What do you know about the singer's children   query singer { children }
4                     Give me the singer's friend     query singer { friend }
5                     What is the singer's friend     query singer { friend }
6               Tell me about the singer's friend     query singer { friend }
7      What do you know about the singer's friend     query singer { friend }
8                      Give me the singer's rival      query singer { rival }
9                      What is the singer's rival      query singer { rival }
10               Tell me about the singer's rival      query singer { rival }
11      What do you know about the singer's rival      query sin

## Entities, nested properties

In [27]:
import pandas as pd
from itertools import product

entities = ["singer", "teacher", "town"]
properties = ["children", "friend", "rival"]
question_templates = ["Give me {}", "What is {}", "Tell me about {}", "What do you know about {}"]
nested_properties = ["age", "profession", "hobby", "salary", "height", "weight", "hair_color", "eye_color"]

def create_simple_queries_and_questions(entities, properties, question_templates, nested_properties):
    queries = []
    questions = []

    all_properties = properties + [entity for entity in entities if entity not in properties]

    for entity, property_, template in product(entities, all_properties, question_templates):
        if entity != property_:
            for nested_property in nested_properties:
                question_entity = f"the {entity}'s {property_}'s {nested_property}"
                question = template.format(question_entity)
                query = f"query {entity} {{ {property_} {{ {nested_property} }} }}"
                queries.append(query)
                questions.append(question)

    data = {'Question': questions, 'Query': queries}
    df = pd.DataFrame(data)

    return df

df = create_simple_queries_and_questions(entities, properties, question_templates, nested_properties)
print(df)


                                              Question  \
0                  Give me the singer's children's age   
1           Give me the singer's children's profession   
2                Give me the singer's children's hobby   
3               Give me the singer's children's salary   
4               Give me the singer's children's height   
..                                                 ...   
475  What do you know about the town's teacher's sa...   
476  What do you know about the town's teacher's he...   
477  What do you know about the town's teacher's we...   
478  What do you know about the town's teacher's ha...   
479  What do you know about the town's teacher's ey...   

                                        Query  
0           query singer { children { age } }  
1    query singer { children { profession } }  
2         query singer { children { hobby } }  
3        query singer { children { salary } }  
4        query singer { children { height } }  
..             

### Entities, filtering on properties

In [35]:
import pandas as pd
from itertools import product

# Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

entities = ["singer", "teacher", "town"]
properties = ["children", "friend", "rival"]
simple_logical_operators = ["_gt", "_lt", "_eq", "_neq"]
question_templates = ["What are the {} where {}", "What is the {} that {}", "Tell me about {} for the {}", "What do you know about {} that has {}", "Give me {} in which {}"]

def create_simple_queries_and_questions(entities, properties, simple_logical_operators, question_templates):
    queries = []
    questions = []

    for entity, property_, operator, template in product(entities, properties, simple_logical_operators, question_templates):
        condition_placeholder = f"{property_} : {{ {operator}: <value> }}"
        question = template.format(entity, condition_placeholder)
        query = f"query {{ {entity} (where : {{ {condition_placeholder} }}) }}"
        queries.append(query)
        questions.append(question)

    data = {'Question': questions, 'Query': queries}
    df = pd.DataFrame(data)

    return df

df = create_simple_queries_and_questions(entities, properties, simple_logical_operators, question_templates)

df


Unnamed: 0,Question,Query
0,What are the singer where children : { _gt: <v...,query { singer (where : { children : { _gt: <v...
1,What is the singer that children : { _gt: <val...,query { singer (where : { children : { _gt: <v...
2,Tell me about singer for the children : { _gt:...,query { singer (where : { children : { _gt: <v...
3,What do you know about singer that has childre...,query { singer (where : { children : { _gt: <v...
4,Give me singer in which children : { _gt: <val...,query { singer (where : { children : { _gt: <v...
5,What are the singer where children : { _lt: <v...,query { singer (where : { children : { _lt: <v...
6,What is the singer that children : { _lt: <val...,query { singer (where : { children : { _lt: <v...
7,Tell me about singer for the children : { _lt:...,query { singer (where : { children : { _lt: <v...
8,What do you know about singer that has childre...,query { singer (where : { children : { _lt: <v...
9,Give me singer in which children : { _lt: <val...,query { singer (where : { children : { _lt: <v...


In [25]:
import random

def query_to_natural_language(query, input_elements):
    question_templates = ["Give me {}", "What is the {}", "Tell me about {}", "What can you tell me about {}"]

    # Aggregators
    aggregator_map = {
        "min": "minimum",
        "max": "maximum",
        "sum": "sum",
        "avg": "average"
    }

    # Logical operators
    logical_operator_map = {
        "_and": "and",
        "_or": "or",
        "_not": "not"
    }

    # Comparison operators
    comparison_operator_map = {
        "_eq": "equal to",
        "_gt": "greater than",
        "_lt": "less than",
        "_neq": "not equal to",
        "_like": "like"
    }

    # Orderings
    order_map = {
        "asc": "ascending",
        "desc": "descending"
    }

    # Split the query string into tokens
    tokens = query.split()

    question_parts = []

    # Iterate through tokens and build the natural language question
    i = 1  # Start with the second token, skipping the first one ("query")
    while i < len(tokens):
        token = tokens[i]

        if token in input_elements["entities"]:
            question_parts.append(token)
        elif token in input_elements["properties"]:
            question_parts.append(token)
        elif token in aggregator_map:
            question_parts.append(aggregator_map[token])
        elif token in logical_operator_map:
            question_parts.append(logical_operator_map[token])
        elif token in comparison_operator_map:
            question_parts.append(comparison_operator_map[token])
        elif token in order_map:
            question_parts.append(order_map[token])
        else:
            question_parts.append(token)

        i += 1

    # Make the question more readable by replacing underscores with spaces
    question = " ".join(question_parts).replace("_", " ")

    # Choose a random question template
    template = random.choice(question_templates)

    # Format the question using the chosen template
    formatted_question = template.format(question)

    # Add a question mark at the end
    formatted_question += "?"

    return formatted_question


In [96]:
#GraphQL Query Generator
import random
import pandas as pd

# Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

def generate_graphql_query(input_elements):
    while True: # We keep generating queries until we've reached the desired query lenght
        query = "query { " 

        entity = random.choice(input_elements["entities"]) #We pick a random entity from the list of entities
        query += f"{entity}"

        arguments_applied = False 

        # If less or eq to argument_probability, then true => apply random arguments
        if random.randint(1, 100) <= input_elements["argument_probability"]: 

            # Apply limit first
            if random.randint(1, 100) <= input_elements["limit_probability"]:
                if not arguments_applied:
                    query += " ("
                    arguments_applied = True
                query += f"limit : {random.randint(1, 20)} , "

            if random.randint(1, 100) <= input_elements["order_by_probability"]:
                if not arguments_applied:
                    query += " ("
                    arguments_applied = True
                prop = random.choice(input_elements["properties"])
                order, _ = random.choices(input_elements["orderings"], [weight for _, weight in input_elements["orderings"]])[0]
                query += f"order_by : {{ {prop} : {order} }} "

            # Choose random query modifiers, we iterate through them and apply them to the query
                # random.choices takes a list of elements and a list of weights, and returns a list of elements
                # list of weights is generated by taking the second element of each tuple in the list of query modifiers
                # _ is used as a placeholder for the first element of the tuple, which is the query modifier itself, which we don't need
                # arg, _ here we unpack the tuple into two variables, arg (the modifer) and _ (the weight which we don't need)
            # Apply random filter conditions
            if random.randint(1, 100) <= input_elements["filter_probability"]:
                if not arguments_applied:
                    query += " ("
                    arguments_applied = True
                query += "where : { "
                logical_operator, _ = random.choices(input_elements["logical_operators"], [weight for _, weight in input_elements["logical_operators"]])[0]
                query += f"{logical_operator} : "

                if logical_operator == "_or":
                    query += "[ "

                prop = random.choice(input_elements["properties"])
                comparison_operator, _ = random.choices(input_elements["comparison_operators"], [weight for _, weight in input_elements["comparison_operators"]])[0]
                if comparison_operator == "_like":
                    value = '"' + f'%{random.choice(input_elements["like_arguments"])}%' + '"'
                else:
                    value = round(random.uniform(0, 1000))
                query += f"{{ {prop} : {{ {comparison_operator} : {value} }} }}"

                if logical_operator == "_or":
                    query += " , "

                if logical_operator == "_and":
                    query += " , "

                prop = random.choice(input_elements["properties"])
                comparison_operator, _ = random.choices(input_elements["comparison_operators"], [weight for _, weight in input_elements["comparison_operators"]])[0]
                if comparison_operator == "_like":
                    value = '"' + f'%{random.choice(input_elements["like_arguments"])}%' + '"'
                else:
                    value = round(random.uniform(0, 1000))
                query += f"{{ {prop} : {{ {comparison_operator} : {value} }} }}"

                if logical_operator == "_or":
                    query += " ] "

                query += " } "

            if random.randint(1, 100) <= input_elements["distinct_on_probability"]:
                    if not arguments_applied:
                        query += " ("
                        arguments_applied = True
                    prop = random.choice(input_elements["properties"])
                    if query.endswith("} "):
                        query += f", distinct_on : {prop} "
                    else:
                        query += f" distinct_on : {prop} "

        if arguments_applied:
            query += ")"

        query += " { "

        # Select random properties and aggregators to be returned
        if random.randint(1, 100) <= input_elements["aggregator_probability"]:
            num_aggregators = random.randint(1, len(input_elements["aggregators"]))
            for _ in range(num_aggregators):
                aggregator, weight = random.choices(input_elements["aggregators"], weights=[w for _, w in input_elements["aggregators"]])[0]
                prop = random.choice(input_elements["properties"])
                query += f"{aggregator} {{ {prop} }} "
        else:
            # Choose a range of properties based on the property_weights
            property_range, _ = random.choices(input_elements["property_weights"], [weight for _, weight in input_elements["property_weights"]])[0]
            num_properties = random.randint(1, property_range)
            for _ in range(num_properties):
                prop = random.choice(input_elements["properties"])
                query += f"{prop} "
                if random.randint(1, 100) <= input_elements["nested_probability"]:
                    query += "{ "
                    # Choose a range of nested properties based on the nested_property_weights 
                    # Should we also enable the nested queries to have nested queries?
                    nested_property_range, _ = random.choices(input_elements["nested_property_weights"], [weight for _, weight in input_elements["nested_property_weights"]])[0]
                    nested_properties = [p for p in input_elements["properties"] if p != prop]
                    for _ in range(random.randint(1, nested_property_range)):
                        nested_prop = random.choice(nested_properties)
                        query += f"{nested_prop} "
                    query += "} "

        query += "} }"

        # To do: Control via weights 
        if 80 <= len(query) <= 140:
            print(query)
            return query

# Example input elements
input_elements = {  
    "entities" : ["students", "cars_data", "matches", "car_names", "professionals", "departments", "countries", "books", "customers", "products", "orders", "employees", "schools", "events", "movies", "organizations", "hotels", "recipes", "songs", "athletes", "animals", "universities", "airports", "restaurants", "cities", "websites", "landmarks", "languages", "planets", "celebrities", "sports_teams", "paintings", "musicians", "festivals", "monuments", "diseases"],
    "like_arguments" : ["car", "name", "street", "dep", "man", "department", "cat", "dog", "book", "recipe", "movie", "song", "artist", "athlete", "hotel", "customer", "order", "employee", "school", "event", "country", "organization", "university", "restaurant", "city", "website", "landmark", "language", "planet", "monument", "disease", "airport", "sport", "painting", "musician", "festival"],

    "properties" : ["date_first_registered", "first_name", "middle_name", "last_name", "accelerate", "car_name", "model", "mpg", "cylinders", "winner_name", "loser_name", "minutes", "role_code", "street", "city", "state", "department_name", "department_description", "population", "author", "price", "quantity", "category", "rating", "director", "genre", "release_date", "ISBN", "publisher", "duration", "artist", "album", "track_number", "lyrics", "actor", "species", "habitat", "scientific_name", "enrollment", "mascot", "principal", "event_date", "venue", "cuisine", "chef", "address", "cuisine_type", "website_url"],
    "property_weights": [(1, 100), (2, 0), (3, 0), (4, 0), (5, 0)],
    "nested_property_weights": [(1, 100), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)],
    "nested_probability": 100,
    
    "argument_probability": 100,

    "filters": ["where"],
    "filter_probability": 77,

    "order_by_probability": 50,

    "limit_probability": 100,

    "distinct_on_probability": 20,

    "query_modifiers_probability": 60,

    "orderings": [("asc", 50), ("desc", 50)],
    "logical_operators": [("_and", 50), ("_or", 40), ("_not", 10)],

    "aggregators": [("min", 25), ("max", 25), ("sum", 25), ("avg", 25)], # Missing count
    "aggregator_probability": 25,

    "comparison_operators": [("_eq", 30), ("_gt", 20), ("_lt", 20), ("_neq", 20), ("_like", 10)],
}

# Generate a random GraphQL query

#Create a dataframe of random GraphQL queries

def create_random_queries(input_elements, num_queries):
    queries = []
    for _ in range(num_queries):
        query = generate_graphql_query(input_elements)
        queries.append(query)

    data = {'query': queries}
    df = pd.DataFrame(data)

    return df

synthetic_queries_df = create_random_queries(input_elements, 300)

synthetic_queries_df.head()

# Save the synthetic queries to a csv file

synthetic_queries_df.to_csv('synthetic.csv', index=False)

import json
import os

# Convert DataFrame to a list of dictionaries
query_list = synthetic_queries_df.to_dict(orient='records')

# Define the file path
file_path = './SPEGQL-dataset/dataset/synthetic.json'

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Save the list as a JSON file with consistent formatting
with open(file_path, 'w') as f:
    json.dump(query_list, f, indent=2)


query { monuments (limit : 8 , where : { _or : [ { city : { _neq : 690 } } , { street : { _gt : 106 } } ]  } ) { chef { enrollment } } }
query { orders (limit : 16 , where : { _and : { venue : { _eq : 309 } } , { track_number : { _lt : 912 } } } ) { rating { website_url } } }
query { monuments (limit : 12 , where : { _and : { species : { _lt : 422 } } , { principal : { _gt : 357 } } } ) { last_name { venue } } }
query { planets (limit : 14 , order_by : { department_description : asc } ) { venue { category } } }
query { recipes (limit : 10 , where : { _and : { release_date : { _neq : 48 } } , { publisher : { _lt : 844 } } } ) { quantity { city } } }
query { professionals (limit : 10 , where : { _or : [ { city : { _neq : 12 } } , { genre : { _neq : 195 } } ]  } ) { principal { price } } }
query { countries (limit : 17 , order_by : { address : desc } ) { mascot { city } } }
query { animals (limit : 19 , where : { _and : { website_url : { _lt : 891 } } , { actor : { _lt : 440 } } } ) { dir

In [20]:
import random

def generate_graphql_query(input_elements, min_length=80, max_length=120):
    def generate_random_query(input_elements):
        entity = random.choice(input_elements["entities"])
        property_ = random.choice(input_elements["properties"])

        query_parts = []

        if random.random() < 0.5:
            filter_ = random.choice(input_elements["filters"])
            logical_operator = random.choice(input_elements["logical_operators"])
            comparison_operator = random.choice(input_elements["comparison_operators"])
            query_parts.append(f'{filter_}: {{ {logical_operator}: {{ {property_}: {{ {comparison_operator}: "value" }} }} }}')

        if random.random() < 0.5:
            aggregator = random.choice(input_elements["aggregators"])
            query_parts.append(f'{aggregator}: {{ {property_} }}')

        if random.random() < 0.5:
            ordering = random.choice(input_elements["orderings"])
            query_parts.append(f'order_by: {{ {property_}: {ordering} }}')

        query_body = ", ".join(query_parts)
        return f'query {{ {entity} ({query_body}) {{ {property_} }} }}'

    query = generate_random_query(input_elements)
    while len(query) < min_length or len(query) > max_length:
        query = generate_random_query(input_elements)

    return query

# Example input elements
input_elements = {
    "entities": ["countrylanguage_aggregate", "country_aggregate", "cartoon", "country", "people"],
    "properties": ["isofficial", "country", "indepyear", "region", "tv_channel", "series_name", "name", "surfacearea", "continent", "population", "lifeexpectancy", "poker_players_aggregate", "final_table_made", "title"],
    "nested_properties": ["aggregate", "count", "sum", "avg", "min", "max"],
    "arguments": ["order_by", "limit", "distinct_on"],
    "logical_operators": ["_and", "_or", "_not"],
    "comparison_operators": ["_eq", "_neq", "_gt", "_lt", "_like"],
    "aggregators": ["aggregate"],
    "orderings": ["asc", "desc"],
    "filters": ["where"],
}

# Example usage
for _ in range(30):
    print(generate_graphql_query(input_elements))
    print("\n")

query { country (where: { _and: { region: { _neq: "value" } } }, aggregate: { region }) { region } }


query { cartoon (where: { _or: { name: { _neq: "value" } } }, aggregate: { name }, order_by: { name: asc }) { name } }


query { people (where: { _not: { population: { _like: "value" } } }) { population } }


query { country (where: { _or: { isofficial: { _gt: "value" } } }, aggregate: { isofficial }) { isofficial } }


query { country_aggregate (where: { _or: { final_table_made: { _like: "value" } } }) { final_table_made } }


query { cartoon (where: { _and: { indepyear: { _gt: "value" } } }) { indepyear } }


query { country (where: { _or: { indepyear: { _lt: "value" } } }) { indepyear } }


query { country_aggregate (where: { _or: { tv_channel: { _eq: "value" } } }, aggregate: { tv_channel }) { tv_channel } }


query { countrylanguage_aggregate (order_by: { lifeexpectancy: desc }) { lifeexpectancy } }


query { country (where: { _not: { series_name: { _like: "value" } } }, aggregat