## Entities

In [24]:
import pandas as pd
from itertools import product

# Entity names to ask about, and the question phrasings to combine them with.
# Each template has exactly one `{}` slot, which receives the entity name.
entities = ["singer", "teacher", "town"]
question_templates = ["Give me {}", "What is {}", "Tell me about {}", "What do you know about {}"]


def create_simple_queries_and_questions(entities, question_templates):
    """Pair every entity with every question template.

    Rows are produced in entity-major order (all templates for the first
    entity, then all templates for the second, ...).

    Args:
        entities: entity names to ask about.
        question_templates: format strings with a single `{}` slot that is
            filled with the entity name.

    Returns:
        A DataFrame with a `Question` column (the filled-in template) and a
        `Query` column (the matching `query { <entity> }` string).
    """
    combinations = list(product(entities, question_templates))

    return pd.DataFrame({
        'Question': [phrasing.format(name) for name, phrasing in combinations],
        'Query': [f"query {{ {name} }}" for name, _ in combinations],
    })

# Build the table for every entity/template pair and print it as plain text.
df = create_simple_queries_and_questions(entities, question_templates)
print(df)



                          Question              Query
0                   Give me singer   query { singer }
1                   What is singer   query { singer }
2             Tell me about singer   query { singer }
3    What do you know about singer   query { singer }
4                  Give me teacher  query { teacher }
5                  What is teacher  query { teacher }
6            Tell me about teacher  query { teacher }
7   What do you know about teacher  query { teacher }
8                     Give me town     query { town }
9                     What is town     query { town }
10              Tell me about town     query { town }
11     What do you know about town     query { town }


## Entities and properties

In [26]:
import pandas as pd
from itertools import product

# Entities, the properties that can be requested on them, and the question
# phrasings (one `{}` slot each, filled with "the <entity>'s <property>").
entities = ["singer", "teacher", "town"]
properties = ["children", "friend", "rival"]
question_templates = ["Give me {}", "What is {}", "Tell me about {}", "What do you know about {}"]

def create_simple_queries_and_questions(entities, properties, question_templates):
    """Build question/query pairs that ask for one property of one entity.

    Besides the given properties, every entity can also be requested as a
    property of another entity; an entity is never paired with itself.

    Args:
        entities: entity names.
        properties: list of property names.
        question_templates: format strings with a single `{}` slot, filled
            with "the <entity>'s <property>".

    Returns:
        A DataFrame with `Question` and `Query` columns, where the query has
        the form `query <entity> { <property> }`.
    """
    # Entities that are not already listed as properties are appended after them.
    entity_properties = [name for name in entities if name not in properties]
    candidate_properties = properties + entity_properties

    combinations = [
        (owner, field, phrasing)
        for owner, field, phrasing in product(entities, candidate_properties, question_templates)
        if owner != field  # an entity is not its own property
    ]

    return pd.DataFrame({
        'Question': [
            phrasing.format(f"the {owner}'s {field}")
            for owner, field, phrasing in combinations
        ],
        'Query': [f"query {owner} {{ {field} }}" for owner, field, _ in combinations],
    })

# Build the entity/property table and print it as plain text.
df = create_simple_queries_and_questions(entities, properties, question_templates)
print(df)


                                         Question                       Query
0                   Give me the singer's children   query singer { children }
1                   What is the singer's children   query singer { children }
2             Tell me about the singer's children   query singer { children }
3    What do you know about the singer's children   query singer { children }
4                     Give me the singer's friend     query singer { friend }
5                     What is the singer's friend     query singer { friend }
6               Tell me about the singer's friend     query singer { friend }
7      What do you know about the singer's friend     query singer { friend }
8                      Give me the singer's rival      query singer { rival }
9                      What is the singer's rival      query singer { rival }
10               Tell me about the singer's rival      query singer { rival }
11      What do you know about the singer's rival      query sin

## Entities, nested properties

In [27]:
import pandas as pd
from itertools import product

# Entities, their first-level properties, the question phrasings (one `{}` slot
# each) and the second-level properties requested inside each first-level one.
entities = ["singer", "teacher", "town"]
properties = ["children", "friend", "rival"]
question_templates = ["Give me {}", "What is {}", "Tell me about {}", "What do you know about {}"]
nested_properties = ["age", "profession", "hobby", "salary", "height", "weight", "hair_color", "eye_color"]

def create_simple_queries_and_questions(entities, properties, question_templates, nested_properties):
    """Build question/query pairs that ask for a nested property.

    For every (entity, property, template) combination where the property is
    not the entity itself, one row is produced per nested property.

    Args:
        entities: entity names.
        properties: list of first-level property names; entities not listed
            here are also usable as first-level properties.
        question_templates: format strings with a single `{}` slot, filled
            with "the <entity>'s <property>'s <nested property>".
        nested_properties: second-level property names.

    Returns:
        A DataFrame with `Question` and `Query` columns, where the query has
        the form `query <entity> { <property> { <nested property> } }`.
    """
    candidate_properties = properties + [name for name in entities if name not in properties]

    # The nested property varies fastest, then the template, property and entity.
    combinations = [
        (owner, field, phrasing, inner)
        for owner, field, phrasing in product(entities, candidate_properties, question_templates)
        if owner != field
        for inner in nested_properties
    ]

    return pd.DataFrame({
        'Question': [
            phrasing.format(f"the {owner}'s {field}'s {inner}")
            for owner, field, phrasing, inner in combinations
        ],
        'Query': [
            f"query {owner} {{ {field} {{ {inner} }} }}"
            for owner, field, _, inner in combinations
        ],
    })

# Build the nested-property table and print it (pandas truncates the long output).
df = create_simple_queries_and_questions(entities, properties, question_templates, nested_properties)
print(df)


                                              Question  \
0                  Give me the singer's children's age   
1           Give me the singer's children's profession   
2                Give me the singer's children's hobby   
3               Give me the singer's children's salary   
4               Give me the singer's children's height   
..                                                 ...   
475  What do you know about the town's teacher's sa...   
476  What do you know about the town's teacher's he...   
477  What do you know about the town's teacher's we...   
478  What do you know about the town's teacher's ha...   
479  What do you know about the town's teacher's ey...   

                                        Query  
0           query singer { children { age } }  
1    query singer { children { profession } }  
2         query singer { children { hobby } }  
3        query singer { children { salary } }  
4        query singer { children { height } }  
..             

### Entities, filtering on properties

In [35]:
import pandas as pd
from itertools import product

# Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

# Entities, the properties to filter on, the comparison operators to apply and
# the question phrasings. Each template has two `{}` slots: the first receives
# the entity name, the second the filter condition.
entities = ["singer", "teacher", "town"]
properties = ["children", "friend", "rival"]
simple_logical_operators = ["_gt", "_lt", "_eq", "_neq"]
question_templates = ["What are the {} where {}", "What is the {} that {}", "Tell me about {} for the {}", "What do you know about {} that has {}", "Give me {} in which {}"]

def create_simple_queries_and_questions(entities, properties, simple_logical_operators, question_templates):
    """Build question/query pairs that filter an entity on one property.

    The condition is rendered as `<property> : { <operator>: <value> }` with a
    literal `<value>` placeholder and is inserted verbatim into both the
    question and the query.

    Args:
        entities: entity names.
        properties: property names to filter on.
        simple_logical_operators: comparison operators such as `_gt`.
        question_templates: format strings with two `{}` slots: entity name
            first, condition second.

    Returns:
        A DataFrame with `Question` and `Query` columns, where the query has
        the form `query { <entity> (where : { <condition> }) }`.
    """
    asked = []
    generated = []

    combinations = product(entities, properties, simple_logical_operators, question_templates)
    for name, field, comparison, phrasing in combinations:
        condition = f"{field} : {{ {comparison}: <value> }}"
        asked.append(phrasing.format(name, condition))
        generated.append(f"query {{ {name} (where : {{ {condition} }}) }}")

    return pd.DataFrame({'Question': asked, 'Query': generated})

# Build the filter table; the bare `df` on the last line renders it as the cell output.
df = create_simple_queries_and_questions(entities, properties, simple_logical_operators, question_templates)

df


Unnamed: 0,Question,Query
0,What are the singer where children : { _gt: <v...,query { singer (where : { children : { _gt: <v...
1,What is the singer that children : { _gt: <val...,query { singer (where : { children : { _gt: <v...
2,Tell me about singer for the children : { _gt:...,query { singer (where : { children : { _gt: <v...
3,What do you know about singer that has childre...,query { singer (where : { children : { _gt: <v...
4,Give me singer in which children : { _gt: <val...,query { singer (where : { children : { _gt: <v...
5,What are the singer where children : { _lt: <v...,query { singer (where : { children : { _lt: <v...
6,What is the singer that children : { _lt: <val...,query { singer (where : { children : { _lt: <v...
7,Tell me about singer for the children : { _lt:...,query { singer (where : { children : { _lt: <v...
8,What do you know about singer that has childre...,query { singer (where : { children : { _lt: <v...
9,Give me singer in which children : { _lt: <val...,query { singer (where : { children : { _lt: <v...


In [25]:
import random

def query_to_natural_language(query, input_elements):
    """Turn a space-tokenised query string into a rough English question.

    The first token (normally "query") is dropped. Tokens that are listed as
    entities or properties are kept verbatim, known keywords (aggregators,
    logical and comparison operators, orderings) are replaced by their English
    wording, and every other token (braces, values, ...) passes through
    unchanged. Underscores in the joined text become spaces, the text is put
    into a randomly chosen question template and a "?" is appended.

    Args:
        query: query string whose tokens are separated by whitespace.
        input_elements: dict providing the "entities" and "properties"
            collections used for the verbatim check.

    Returns:
        The generated question as a string.
    """
    # Keyword -> English wording.
    spoken_form = {
        # aggregators
        "min": "minimum",
        "max": "maximum",
        "sum": "sum",
        "avg": "average",
        # logical operators
        "_and": "and",
        "_or": "or",
        "_not": "not",
        # comparison operators
        "_eq": "equal to",
        "_gt": "greater than",
        "_lt": "less than",
        "_neq": "not equal to",
        "_like": "like",
        # orderings
        "asc": "ascending",
        "desc": "descending",
    }

    words = []
    for token in query.split()[1:]:
        # Schema names win over keywords: an entity or property that happens
        # to be called e.g. "min" is not translated.
        is_schema_name = (
            token in input_elements["entities"]
            or token in input_elements["properties"]
        )
        words.append(token if is_schema_name else spoken_form.get(token, token))

    phrase = " ".join(words).replace("_", " ")

    opener = random.choice(
        ["Give me {}", "What is the {}", "Tell me about {}", "What can you tell me about {}"]
    )
    return opener.format(phrase) + "?"


### GraphQL Query Generator

In [80]:
#GraphQL Query Generator
import random
import pandas as pd

# Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
import nltk
from nltk.corpus import wordnet as wn
nltk.download('omw-1.4')
nltk.download('wordnet')

def generate_graphql_query(input_elements):
    """Generate one random, space-tokenised GraphQL query string.

    Candidate queries are built repeatedly until one passes a randomised
    length check: a target length is drawn uniformly from 18..137 and the
    candidate is accepted when its length is within 10 characters of it.
    The loop therefore never ends if no candidate can be short enough.

    Args:
        input_elements: dict with
            - "entities", "properties", "like_arguments": lists of names;
            - "argument_probability", "limit_probability",
              "order_by_probability", "filter_probability",
              "logical_operator_probability", "distinct_on_probability",
              "aggregator_probability", "nested_probability": thresholds
              compared against ``random.randint(1, 100)``, i.e. percentages;
            - "orderings", "logical_operators", "comparison_operators",
              "aggregators", "property_weights", "nested_property_weights":
              lists of ``(value, weight)`` pairs used for weighted draws.

    Returns:
        The accepted query string. Braces, brackets and parentheses in it
        are always balanced.
    """
    # Bounds of the random target length and the accepted deviation from it.
    min_target_length, max_target_length, length_tolerance = 18, 137, 10

    def weighted_choice(weighted_pairs):
        """Draw one value from a list of (value, weight) pairs."""
        value, _ = random.choices(weighted_pairs, [weight for _, weight in weighted_pairs])[0]
        return value

    def happens(probability_key):
        """Roll a 1..100 die against the percentage stored under the key."""
        return random.randint(1, 100) <= input_elements[probability_key]

    def random_condition():
        """Build one `{ prop : { operator : value } }` filter clause."""
        prop = random.choice(input_elements["properties"])
        comparison_operator = weighted_choice(input_elements["comparison_operators"])
        if comparison_operator == "_like":
            value = '"' + f'%{random.choice(input_elements["like_arguments"])}%' + '"'
        else:
            value = round(random.uniform(0, 1000))
        return f"{{ {prop} : {{ {comparison_operator} : {value} }} }}"

    while True:  # Keep generating until a candidate passes the length check
        entity = random.choice(input_elements["entities"])
        query = f"query {{ {entity}"

        arguments_applied = happens("argument_probability")
        if arguments_applied:
            query += " ( "

            # Limit comes first
            if happens("limit_probability"):
                query += f"limit : {random.randint(1, 20)} , "

            if happens("order_by_probability"):
                prop = random.choice(input_elements["properties"])
                order = weighted_choice(input_elements["orderings"])
                query += f"order_by : {{ {prop} : {order} }} "

            if happens("filter_probability"):
                if happens("logical_operator_probability"):
                    # Two clauses combined by a logical operator.
                    logical_operator = weighted_choice(input_elements["logical_operators"])
                    combine_as_list = logical_operator == "_or"

                    query += f"where : {{ {logical_operator} : {{ "
                    if combine_as_list:
                        query += "[ "
                    query += random_condition()
                    if logical_operator in ("_or", "_and"):
                        query += " , "
                    query += random_condition()
                    if combine_as_list:
                        query += " ] "
                    query += " } }"
                else:
                    # A single clause is already a complete object. Wrapping it
                    # in a second, never-closed `{` (as was done before) left
                    # the braces of the whole query unbalanced.
                    query += f"where : {random_condition()}"

            if happens("distinct_on_probability"):
                prop = random.choice(input_elements["properties"])
                query += f", distinct_on : {prop} "

            query += " )"

        query += " { "

        # Selection set: either aggregators or plain (optionally nested) properties
        if happens("aggregator_probability"):
            num_aggregators = random.randint(1, len(input_elements["aggregators"]))
            for _ in range(num_aggregators):
                aggregator = weighted_choice(input_elements["aggregators"])
                prop = random.choice(input_elements["properties"])
                query += f"{aggregator} {{ {prop} }} "
        else:
            # The weighted draw gives the upper bound for the number of properties
            max_properties = weighted_choice(input_elements["property_weights"])
            for _ in range(random.randint(1, max_properties)):
                prop = random.choice(input_elements["properties"])
                query += f"{prop} "
                if happens("nested_probability"):
                    nested_candidates = [p for p in input_elements["properties"] if p != prop]
                    if not nested_candidates:
                        # Nothing other than `prop` itself to nest; leave it flat
                        # instead of crashing on an empty choice.
                        continue
                    max_nested = weighted_choice(input_elements["nested_property_weights"])
                    query += "{ "
                    for _ in range(random.randint(1, max_nested)):
                        query += f"{random.choice(nested_candidates)} "
                    query += "} "

        query += "} }"

        target_length = random.randint(min_target_length, max_target_length)
        if abs(len(query) - target_length) <= length_tolerance:
            return query


# The `words` corpus is a separate NLTK package; the wordnet downloads at the
# top of this cell do not provide it, so fetch it before reading the corpus.
nltk.download('words')
words_list = [word.lower() for word in nltk.corpus.words.words()]
short_words_list = [word for word in words_list if len(word) <= 5]

# First lemma name of every WordNet noun synset, de-duplicated. Sorting gives
# a stable order: the iteration order of a set of strings changes between
# interpreter sessions, which would make seeded sampling from `nouns`
# irreproducible.
nouns = sorted({x.name().split('.', 1)[0] for x in wn.all_synsets('n')})

# Example input elements
# Generator configuration.
# - "*_probability" values are percentages: generate_graphql_query compares
#   them against random.randint(1, 100).
# - Lists of (value, weight) pairs are sampled with random.choices using the
#   second element as the weight.
input_elements = {
    # Names are sampled with replacement from the WordNet nouns.
    "entities" : random.choices(nouns, k=50000),
    "like_arguments" : random.choices(nouns, k=10000),
    "properties" : random.choices(nouns, k=10000),

    # (upper bound on the number of selected properties, weight)
    "property_weights": [(1, 50), (2, 37), (3, 15), (4, 3), (5, 0)],
    # (upper bound on the number of nested properties, weight)
    "nested_property_weights": [(1, 100), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)],

    "nested_probability": 40,

    "argument_probability": 83,

    "filter_probability": 83,

    "order_by_probability": 39,

    "limit_probability": 24,

    "distinct_on_probability": 8,

    "orderings": [("asc", 10), ("desc", 17)],

    "logical_operator_probability": 45,

    "logical_operators": [("_and", 50), ("_or", 50)],

    "aggregators": [("min", 2.2), ("max", 5.23), ("sum", 2.70), ("avg", 6.5)],

    "aggregator_probability": 9,

    # "_lte"/"_gte" were previously spelled without the leading underscore
    # that every other comparison operator in this list carries.
    "comparison_operators": [("_eq", 48), ("_gt", 6.71), ("_lt", 3.2), ("_lte", 0.9), ("_gte", 0.7), ("_neq", 2.88), ("_like", 2.16)],
}

# Generate a random GraphQL query

#Create a dataframe of random GraphQL queries

def create_random_queries(input_elements, num_queries):
    """Generate `num_queries` random queries and collect them in a DataFrame.

    Args:
        input_elements: configuration passed unchanged to
            `generate_graphql_query` for every query.
        num_queries: number of queries to generate.

    Returns:
        A DataFrame with a single `query` column, one generated query per row.
    """
    generated = [generate_graphql_query(input_elements) for _ in range(num_queries)]
    return pd.DataFrame({'query': generated})

# create_random_queries(input_elements, 10)

# Generate the synthetic dataset: 4500 random queries in a one-column DataFrame.
synthetic_queries_df = create_random_queries(input_elements, 4500)

# NOTE(review): this is not the last expression of the cell, so its result is
# not displayed; move it to the end of the cell (or to its own cell) to get a preview.
synthetic_queries_df.head()

# Optional CSV export of the synthetic queries (currently disabled)

# synthetic_queries_df.to_csv('synthetic.csv', index=False)

import json
import os

# Convert the DataFrame to a list of {'query': ...} dictionaries, one per row
query_list = synthetic_queries_df.to_dict(orient='records')

# Output location, relative to the notebook's working directory
file_path = './SPEGQL-dataset/dataset/vanilla_error_mirror_4500.json'

# Create the target directory if it doesn't exist
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Save the list as indented JSON; an existing file at this path is overwritten
with open(file_path, 'w') as f:
    json.dump(query_list, f, indent=2)


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jakobtolstrup/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jakobtolstrup/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### GraphQL Query Generator with Schema

In [54]:
#GraphQL Query Generator
import random
import pandas as pd
import json
import random


# Set display options to show all rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)

def generate_graphql_query(input_elements, schema_types=None,
                           schema_path='SPEGQL-dataset/Schemas/student_1/schema_types.json'):
    """Generate one random, space-tokenised GraphQL query for a schema entity.

    The entity and the properties used in arguments and in the selection set
    are taken from the schema; nested properties are still taken from
    ``input_elements["properties"]``. Candidate queries are built repeatedly
    until one passes a randomised length check: a target length is drawn
    uniformly from 18..137 and the candidate is accepted when its length is
    within 10 characters of it.

    Args:
        input_elements: dict with
            - "properties", "like_arguments": lists of names;
            - "argument_probability", "limit_probability",
              "order_by_probability", "filter_probability",
              "logical_operator_probability", "distinct_on_probability",
              "aggregator_probability", "nested_probability": thresholds
              compared against ``random.randint(1, 100)``, i.e. percentages;
            - "orderings", "logical_operators", "comparison_operators",
              "aggregators", "property_weights", "nested_property_weights":
              lists of ``(value, weight)`` pairs used for weighted draws.
        schema_types: optional pre-loaded schema, a list of dicts with a
            "name" and a "fields" mapping keyed by field name. Pass it when
            generating many queries so the schema file is not re-read on
            every call.
        schema_path: JSON file to load the schema from when `schema_types`
            is not given.

    Returns:
        The accepted query string.

    Raises:
        ValueError: if the schema has no entity with at least one field.
    """
    if schema_types is None:
        with open(schema_path) as json_file:
            schema_types = json.load(json_file)

    # An entity without fields offers nothing to select or filter on.
    queryable_types = [entity for entity in schema_types if entity.get("fields")]
    if not queryable_types:
        raise ValueError("schema_types contains no entity with at least one field")

    # Bounds of the random target length and the accepted deviation from it.
    min_target_length, max_target_length, length_tolerance = 18, 137, 10

    def weighted_choice(weighted_pairs):
        """Draw one value from a list of (value, weight) pairs."""
        value, _ = random.choices(weighted_pairs, [weight for _, weight in weighted_pairs])[0]
        return value

    def happens(probability_key):
        """Roll a 1..100 die against the percentage stored under the key."""
        return random.randint(1, 100) <= input_elements[probability_key]

    def random_condition(property_names):
        """Build one `{ prop : { operator : value } }` filter clause."""
        prop = random.choice(property_names)
        comparison_operator = weighted_choice(input_elements["comparison_operators"])
        if comparison_operator == "_like":
            value = '"' + f'%{random.choice(input_elements["like_arguments"])}%' + '"'
        else:
            value = round(random.uniform(0, 1000))
        return f"{{ {prop} : {{ {comparison_operator} : {value} }} }}"

    while True:  # Keep generating until a candidate passes the length check
        # Select a random entity from the schema types
        entity = random.choice(queryable_types)
        property_names = list(entity["fields"])
        query = f"query {{ {entity['name']}"

        arguments_applied = happens("argument_probability")
        if arguments_applied:
            query += " ( "

            # Limit comes first
            if happens("limit_probability"):
                query += f"limit : {random.randint(1, 20)} , "

            if happens("order_by_probability"):
                prop = random.choice(property_names)
                order = weighted_choice(input_elements["orderings"])
                query += f"order_by : {{ {prop} : {order} }} "

            if happens("filter_probability"):
                query += "where : "
                if happens("logical_operator_probability"):
                    # NOTE(review): this emits `where : _and : { ... }`, i.e. the
                    # logical operator is not wrapped in an object of its own.
                    # Kept unchanged; confirm that this is the intended format.
                    logical_operator = weighted_choice(input_elements["logical_operators"])
                    combine_as_list = logical_operator == "_or"

                    query += f"{logical_operator} : {{ "
                    if combine_as_list:
                        query += "[ "
                    query += random_condition(property_names)
                    if logical_operator in ("_or", "_and"):
                        query += " , "
                    query += random_condition(property_names)
                    if combine_as_list:
                        query += " ] "
                    query += " } "
                else:
                    query += random_condition(property_names)

            if happens("distinct_on_probability"):
                prop = random.choice(property_names)
                if query.endswith("} "):
                    query += f", distinct_on : {prop} "
                else:
                    query += f" distinct_on : {prop} "

            # Keep `)` a token of its own: a single where clause ends in `}`
            # without a trailing space and used to produce the glued token `})`.
            if not query.endswith(" "):
                query += " "
            query += ")"

        query += " { "

        # Selection set: either aggregators or plain (optionally nested) properties
        if happens("aggregator_probability"):
            num_aggregators = random.randint(1, len(input_elements["aggregators"]))
            for _ in range(num_aggregators):
                aggregator = weighted_choice(input_elements["aggregators"])
                prop = random.choice(property_names)
                query += f"{aggregator} {{ {prop} }} "
        else:
            # The weighted draw gives the upper bound for the number of properties
            max_properties = weighted_choice(input_elements["property_weights"])
            for _ in range(random.randint(1, max_properties)):
                prop = random.choice(property_names)
                query += f"{prop} "
                if happens("nested_probability"):
                    # NOTE(review): nested names come from the generic property
                    # list, not from the schema - confirm this is intended.
                    nested_candidates = [p for p in input_elements["properties"] if p != prop]
                    if not nested_candidates:
                        # Nothing to nest; leave the property flat instead of
                        # crashing on an empty choice.
                        continue
                    max_nested = weighted_choice(input_elements["nested_property_weights"])
                    query += "{ "
                    for _ in range(random.randint(1, max_nested)):
                        query += f"{random.choice(nested_candidates)} "
                    query += "} "

        query += "} }"

        target_length = random.randint(min_target_length, max_target_length)
        if abs(len(query) - target_length) <= length_tolerance:
            return query

        

        # It occurred to me that

# Example input elements
# Generator configuration.
# - "*_probability" values are percentages: generate_graphql_query compares
#   them against random.randint(1, 100).
# - Lists of (value, weight) pairs are sampled with random.choices using the
#   second element as the weight.
# NOTE(review): "entities" and "like_arguments" contain repeated names (e.g.
# "languages", "planet"), which makes those names more likely to be drawn,
# and "check-in_time" / "check-out_time" contain a hyphen - confirm both.
input_elements = {
    "entities" : ["students", "cars_data", "matches", "car_names", "professionals", "departments", "countries", "books", "customers", "products", "orders", "employees", "schools", "events", "movies", "organizations", "hotels", "recipes", "songs", "athletes", "animals", "universities", "airports", "restaurants", "cities", "websites", "landmarks", "languages", "planets", "celebrities", "sports_teams", "paintings", "musicians", "festivals", "monuments", "diseases", "inventions", "politicians", "volcanoes", "languages", "foods", "celestial_bodies", "mountains", "video_games", "religions", "social_media_platforms", "tools", "artifacts", "plants", "constellations", "ancient_civilizations", "comedians", "vehicles", "musical_instruments", "weather_phenomena", "radio_stations", "board_games", "cartoon_characters", "sports", "dance_styles", "beverages", "bacteria", "superheroes", "singers", "mathematical_concepts", "periodic_elements", "mobile_apps", "oceans", "historical_events", "gemstones", "dancers", "currencies"],
    "like_arguments" : ["car", "name", "street", "dep", "man", "department", "cat", "dog", "book", "recipe", "movie", "song", "artist", "athlete", "hotel", "customer", "order", "employee", "school", "event", "country", "organization", "university", "restaurant", "city", "website", "landmark", "language", "planet", "monument", "disease", "airport", "sport", "painting", "musician", "festival", "game", "fruit", "vegetable", "mineral", "element", "mammal", "bird", "reptile", "insect", "amphibian", "fish", "planet", "constellation", "singer", "actor", "actress", "comedian", "politician", "author", "scientist", "philosopher", "inventor", "historical_figure", "athlete", "mythical_creature", "holiday", "continent", "island", "river", "mountain_range", "desert", "forest", "weather_condition", "instrument", "dance", "ballet", "opera", "painter", "architect", "author", "poet", "comic_book_character", "cartoon", "superhero", "villain", "martial_art", "board_game", "card_game", "video_game", "mobile_phone", "computer", "television", "camera", "appliance"],
    "properties" : ["date_first_registered", "first_name", "middle_name", "last_name", "accelerate", "car_name", "model", "mpg", "cylinders", "winner_name", "loser_name", "minutes", "role_code", "street", "city", "state", "department_name", "department_description", "population", "author", "price", "quantity", "category", "rating", "director", "genre", "release_date", "ISBN", "publisher", "duration", "artist", "album", "track_number", "lyrics", "actor", "species", "habitat", "scientific_name", "enrollment", "mascot", "principal", "event_date", "venue", "cuisine", "chef", "address", "cuisine_type", "website_url", "temperature", "humidity", "pressure", "wind_speed", "precipitation", "season", "water_source", "size", "color", "material", "flavor", "texture", "age", "income", "education_level", "employment_status", "marital_status", "religion", "political_affiliation", "language_spoken", "number_of_rooms", "amenities", "stars", "room_type", "check-in_time", "check-out_time", "number_of_guests", "business_name", "business_type", "owner_name", "location", "number_of_employees", "revenue", "stock_price", "market_cap", "earnings_per_share"],
    # (upper bound on the number of selected properties, weight)
    "property_weights": [(1, 50), (2, 20), (3, 15), (4, 15), (5, 0)],
    # (upper bound on the number of nested properties, weight)
    "nested_property_weights": [(1, 100), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)],

    "nested_probability": 70,

    "argument_probability": 83,

    "filter_probability": 75,

    "order_by_probability": 38,

    "limit_probability": 14,

    "distinct_on_probability": 6,

    "orderings": [("asc", 10), ("desc", 17)],

    "logical_operator_probability": 45,
    "logical_operators": [("_and", 50), ("_or", 50)],


    "aggregators": [("min", 6.74), ("max", 4.4), ("sum", 4.49), ("avg", 7.8)],

    # NOTE(review): the roll is an integer from 1 to 100, so a value below 1
    # means aggregators are never selected - confirm that this is intended.
    "aggregator_probability": 0.01,

    # "_lte"/"_gte" were previously spelled without the leading underscore
    # that every other comparison operator in this list carries.
    "comparison_operators": [("_eq", 48), ("_gt", 6.71), ("_lt", 3.2), ("_lte", 0.9), ("_gte", 0.7), ("_neq", 2.88), ("_like", 2.16)],
}

# Generate a random GraphQL query

#Create a dataframe of random GraphQL queries



# Smoke test: generate a single random query.
# NOTE(review): the returned string is discarded and, because this is not the
# last expression of the cell, it is not displayed either.

generate_graphql_query(input_elements)


def create_random_queries(input_elements, num_queries):
    """Build a DataFrame of randomly generated GraphQL queries.

    Args:
        input_elements: generator configuration, handed unchanged to
            `generate_graphql_query` on every call.
        num_queries: how many queries to generate.

    Returns:
        A DataFrame with one `query` column and `num_queries` rows.
    """
    return pd.DataFrame({
        'query': [generate_graphql_query(input_elements) for _ in range(num_queries)],
    })

# Generate a one-row sample table; with only comments after it, this is the
# cell's last expression, so the table is rendered as the cell output.
create_random_queries(input_elements, 1)

# synthetic_queries_df = create_random_queries(input_elements, 1500)

# synthetic_queries_df.head()

# # Save the synthetic queries to a csv file

# synthetic_queries_df.to_csv('vanilla_error_mirror.csv', index=False)

# import json
# import os

# # Convert DataFrame to a list of dictionaries
# query_list = synthetic_queries_df.to_dict(orient='records')

# # Define the file path
# file_path = './SPEGQL-dataset/dataset/synthetic_mirror_4500.json'

# # Create the directory if it doesn't exist
# os.makedirs(os.path.dirname(file_path), exist_ok=True)

# # Save the list as a JSON file with consistent formatting
# with open(file_path, 'w') as f:
#     json.dump(query_list, f, indent=2)    


Unnamed: 0,query
0,query { list ( where : _or : { [ { teacher : {...


### Skeleton Query

In [200]:
import json
import random
import re

#    Generate Skeleton Query

def _roll(probability):
    """Return True when one random.randint(1, 100) draw is <= `probability`."""
    return random.randint(1, 100) <= probability


def _pick_weighted(weighted_options):
    """Pick the value of one (value, weight) pair, using the weights as odds."""
    weights = [weight for _, weight in weighted_options]
    value, _ = random.choices(weighted_options, weights)[0]
    return value


def _skeleton_condition(input_elements):
    """Build one `{ arg_property : { <operator> : arg_value } }` filter condition."""
    comparison_operator = _pick_weighted(input_elements["comparison_operators"])
    return f"{{ arg_property : {{ {comparison_operator} : arg_value }} }}"


def generate_skeleton_graphql_query(input_elements):
    """Generate a random "skeleton" GraphQL query made of placeholders.

    The query uses the fixed placeholder names `entity`, `arg_limit`,
    `arg_property`, `arg_value` and `arg_nested_property` instead of real
    schema names. Which parts appear is decided by random draws against the
    percentages and weights in `input_elements`.

    Args:
        input_elements: Configuration dict. Keys read here:
            "argument_probability", "limit_probability",
            "order_by_probability", "filter_probability",
            "logical_operator_probability", "distinct_on_probability",
            "aggregator_probability", "nested_probability" (percentages
            compared against a 1..100 draw), and the weighted lists
            "orderings", "logical_operators", "comparison_operators",
            "aggregators", "property_weights", "nested_property_weights"
            (lists of (value, weight) pairs).

    Returns:
        The generated query as a single space-separated string.
    """
    query = "query { entity"

    if _roll(input_elements["argument_probability"]):
        # Collect the argument text separately so the parentheses can be
        # dropped when no individual argument was drawn; `entity ( )` is not
        # valid GraphQL.
        arguments = ""

        if _roll(input_elements["limit_probability"]):
            arguments += "limit : arg_limit , "

        if _roll(input_elements["order_by_probability"]):
            order = _pick_weighted(input_elements["orderings"])
            arguments += f"order_by : {{ arg_property : {order} }} "

        if _roll(input_elements["filter_probability"]):
            arguments += "where : "

            if _roll(input_elements["logical_operator_probability"]):
                logical_operator = _pick_weighted(input_elements["logical_operators"])
                first_condition = _skeleton_condition(input_elements)
                second_condition = _skeleton_condition(input_elements)
                # The operator is a key inside the `where` object and takes a
                # list of conditions: where : { _or : [ {...} , {...} ] }
                arguments += (
                    f"{{ {logical_operator} : [ {first_condition} , {second_condition} ] }} "
                )
            else:
                arguments += _skeleton_condition(input_elements)

        if _roll(input_elements["distinct_on_probability"]):
            # Separator convention kept from the original implementation: a
            # comma is only added when the text so far ends with "} ".
            if arguments.endswith("} "):
                arguments += ", distinct_on : arg_property "
            else:
                arguments += " distinct_on : arg_property "

        if arguments:
            query += " ( " + arguments + ")"

    query += " { "

    if _roll(input_elements["aggregator_probability"]):
        num_aggregators = random.randint(1, len(input_elements["aggregators"]))
        for _ in range(num_aggregators):
            aggregator = _pick_weighted(input_elements["aggregators"])
            query += f"{aggregator} {{ arg_property }} "
    else:
        property_range = _pick_weighted(input_elements["property_weights"])
        num_properties = random.randint(1, property_range)
        for _ in range(num_properties):
            query += "arg_property "
            if _roll(input_elements["nested_probability"]):
                query += "{ "
                nested_property_range = _pick_weighted(input_elements["nested_property_weights"])
                for _ in range(random.randint(1, nested_property_range)):
                    query += "arg_nested_property "
                query += "} "

    query += "} }"

    return query

# Configuration for generate_skeleton_graphql_query.
# "*_probability" values are percentages compared against a 1..100 draw;
# the weighted lists hold (value, weight) pairs used with random.choices.
input_elements = {
    "entities" : ["students", "cars_data", "matches", "car_names", "professionals", "departments", "countries", "books", "customers", "products", "orders", "employees", "schools", "events", "movies", "organizations", "hotels", "recipes", "songs", "athletes", "animals", "universities", "airports", "restaurants", "cities", "websites", "landmarks", "languages", "planets", "celebrities", "sports_teams", "paintings", "musicians", "festivals", "monuments", "diseases", "inventions", "politicians", "volcanoes", "languages", "foods", "celestial_bodies", "mountains", "video_games", "religions", "social_media_platforms", "tools", "artifacts", "plants", "constellations", "ancient_civilizations", "comedians", "vehicles", "musical_instruments", "weather_phenomena", "radio_stations", "board_games", "cartoon_characters", "sports", "dance_styles", "beverages", "bacteria", "superheroes", "singers", "mathematical_concepts", "periodic_elements", "mobile_apps", "oceans", "historical_events", "gemstones", "dancers", "currencies"],
    "like_arguments" : ["car", "name", "street", "dep", "man", "department", "cat", "dog", "book", "recipe", "movie", "song", "artist", "athlete", "hotel", "customer", "order", "employee", "school", "event", "country", "organization", "university", "restaurant", "city", "website", "landmark", "language", "planet", "monument", "disease", "airport", "sport", "painting", "musician", "festival", "game", "fruit", "vegetable", "mineral", "element", "mammal", "bird", "reptile", "insect", "amphibian", "fish", "planet", "constellation", "singer", "actor", "actress", "comedian", "politician", "author", "scientist", "philosopher", "inventor", "historical_figure", "athlete", "mythical_creature", "holiday", "continent", "island", "river", "mountain_range", "desert", "forest", "weather_condition", "instrument", "dance", "ballet", "opera", "painter", "architect", "author", "poet", "comic_book_character", "cartoon", "superhero", "villain", "martial_art", "board_game", "card_game", "video_game", "mobile_phone", "computer", "television", "camera", "appliance"],
    # GraphQL names may only contain letters, digits and underscores, so the
    # check-in/check-out properties are spelled with underscores.
    "properties" : ["date_first_registered", "first_name", "middle_name", "last_name", "accelerate", "car_name", "model", "mpg", "cylinders", "winner_name", "loser_name", "minutes", "role_code", "street", "city", "state", "department_name", "department_description", "population", "author", "price", "quantity", "category", "rating", "director", "genre", "release_date", "ISBN", "publisher", "duration", "artist", "album", "track_number", "lyrics", "actor", "species", "habitat", "scientific_name", "enrollment", "mascot", "principal", "event_date", "venue", "cuisine", "chef", "address", "cuisine_type", "website_url", "temperature", "humidity", "pressure", "wind_speed", "precipitation", "season", "water_source", "size", "color", "material", "flavor", "texture", "age", "income", "education_level", "employment_status", "marital_status", "religion", "political_affiliation", "language_spoken", "number_of_rooms", "amenities", "stars", "room_type", "check_in_time", "check_out_time", "number_of_guests", "business_name", "business_type", "owner_name", "location", "number_of_employees", "revenue", "stock_price", "market_cap", "earnings_per_share"],
    "property_weights": [(1, 100), (2, 0), (3, 0), (4, 0), (5, 0)],
    "nested_property_weights": [(1, 100), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0)],

    "nested_probability": 100,

    "argument_probability": 100,

    "filter_probability": 100,

    "order_by_probability": 100,

    "limit_probability": 100,

    "distinct_on_probability": 100,

    "orderings": [("asc", 40), ("desc", 60)],

    "logical_operators": [("_and", 70), ("_or", 30)],
    "logical_operator_probability":0,

    "aggregators": [("min", 6.74), ("max", 4.4), ("sum", 4.49), ("avg", 7.8)],

    "aggregator_probability": 5,

    # Every comparison operator carries the leading underscore
    # ("_lte"/"_gte" were previously spelled "lte"/"gte").
    "comparison_operators": [("_eq", 20), ("_gt", 11), ("_lt", 3), ("_lte", 2.24), ("_gte", 1.12), ("_neq", 9), ("_like", 7)],
}

# Generate one skeleton query from the configuration above.
generate_skeleton_graphql_query(input_elements=input_elements)




'query { entity ( limit : arg_limit , order_by : { arg_property : desc } where : { arg_property : { _eq : arg_value } } distinct_on : arg_property ) { arg_property { arg_nested_property } } }'

In [None]:

def extract_fields(query):
    """Return the field names of the selection set that `query` ends in.

    The selection set is located by walking back from the end of the query
    past the trailing closing braces, then finding the opening brace that
    encloses the last remaining token. Only names directly inside that
    selection set are returned; nested `{ ... }` sub-selections and
    `( ... )` argument lists are skipped, so no brace tokens or nested names
    leak into the result.

    Args:
        query: GraphQL query text.

    Returns:
        A list of field names, or None when the query contains no opening
        brace or the selection set is empty.
    """
    # Skip the closing braces / whitespace that terminate the query.
    pos = len(query) - 1
    while pos >= 0 and (query[pos] == "}" or query[pos].isspace()):
        pos -= 1

    # Walk backwards to the opening brace enclosing position `pos`,
    # stepping over any balanced nested blocks on the way.
    start = -1
    depth = 0
    for index in range(pos, -1, -1):
        char = query[index]
        if char == "}":
            depth += 1
        elif char == "{":
            if depth == 0:
                start = index
                break
            depth -= 1
    if start == -1:
        return None

    # Collect names at nesting level 0 of that selection set.
    fields = []
    token = ""
    nesting = 0
    for char in query[start + 1:]:
        if char in "{(":
            nesting += 1
        elif char in "})":
            if nesting == 0:
                break  # end of the selection set
            nesting -= 1
        elif nesting == 0 and not char.isspace() and char != ",":
            token += char
            continue
        # Any delimiter (or nested content) ends the current name.
        if token:
            fields.append(token)
            token = ""
    if token:
        fields.append(token)

    return fields or None

# query = "query { policies ( where : { customer : { customer_details : { _eq : \"Dayana Robel\" } } } ) { policy_type_code } }"
# fields = extract_fields(query)
# print(fields)

# Example: a selection set that contains a nested selection (`building_address { hey }`).
query2 = "query { apartment_buildings ( where : { building_manager : { _eq : \"Brenden\" } } ) { building_address { hey } building_phone } }"
fields2 = extract_fields(query2)
print(fields2)


['enrollment', 'primary_conference']


In [190]:
def extract_subquery(query: str) -> str:
    """Return the tail of `query` starting at its last opening brace.

    Args:
        query: GraphQL query text.

    Returns:
        The substring from the last "{" to the end of the query, or an empty
        string when the query contains no "{" (previously the -1 returned by
        rfind made the slice yield the query's last character).
    """
    # The former reverse brace-counting loop never influenced the result and
    # has been removed.
    last_open = query.rfind("{")
    if last_open == -1:
        return ""
    return query[last_open:]

# Example: an aggregate query with an `_or` filter; prints the text from the last "{" onward.
query = "query { station_aggregate ( where : { _or : [ { location : { _eq : \"London\" } } , { location : { _eq : \"Glasgow\" } } ] } ) { aggregate { avg { total_passengers } max { total_passengers } } } }"
result = extract_subquery(query)
print(result)

{ total_passengers } } } }


In [None]:
# Run the softmax on this array [27, 14, 17, 29, 30, 9, 5]

# First, we need to calculate the sum of the exponentials of each element



## SCHEMA

In [210]:
import os

def find_shortest_schema_folder(
    base_dir='/Users/jakobtolstrup/Desktop/Thesis/myvenv/T5-Carrera/SPEGQL-dataset/Schemas',
):
    """Find the sub-folder of `base_dir` whose schema.json has the fewest characters.

    Args:
        base_dir: Directory containing one folder per schema. Defaults to the
            location originally hard-coded in this notebook, so existing
            no-argument calls keep working.

    Returns:
        The name of the folder with the shortest schema.json, or None when no
        sub-folder contains a schema.json file.
    """
    shortest_length = float('inf')
    shortest_folder = None

    # Sorted so that ties are resolved the same way on every run
    # (os.listdir returns entries in arbitrary order).
    for folder in sorted(os.listdir(base_dir)):
        schema_path = os.path.join(base_dir, folder, 'schema.json')
        # isfile is False for plain files in base_dir and for folders
        # without a schema.json, so both are skipped.
        if not os.path.isfile(schema_path):
            continue
        with open(schema_path, 'r') as schema_file:
            schema_length = len(schema_file.read())
        if schema_length < shortest_length:
            shortest_length = schema_length
            shortest_folder = folder

    return shortest_folder

# Usage example: report which schema folder holds the smallest schema.json
shortest_folder = find_shortest_schema_folder()
print(f"The folder with the shortest schema.json file is: {shortest_folder}")


The folder with the shortest schema.json file is: student_1


In [222]:
import json
import os

def remove_directives(schema):
    """Return a copy of `schema` with every "directives" key removed.

    Dicts and lists are walked recursively; any other value is returned
    unchanged.
    """
    if isinstance(schema, list):
        return [remove_directives(entry) for entry in schema]
    if not isinstance(schema, dict):
        return schema

    pruned = {}
    for key, value in schema.items():
        if key == "directives":
            continue
        pruned[key] = remove_directives(value)
    return pruned

def remove_descriptions(schema):
    """Return a copy of `schema` with every "description" key removed.

    Dicts and lists are walked recursively; any other value is returned
    unchanged.
    """
    if isinstance(schema, list):
        return [remove_descriptions(entry) for entry in schema]
    if not isinstance(schema, dict):
        return schema

    pruned = {}
    for key, value in schema.items():
        if key == "description":
            continue
        pruned[key] = remove_descriptions(value)
    return pruned

def remove_null_fields(schema):
    """Return a copy of `schema` without None dict values or None list items.

    Dicts and lists are walked recursively; any other value is returned
    unchanged.
    """
    if isinstance(schema, list):
        kept_items = []
        for entry in schema:
            if entry is None:
                continue
            kept_items.append(remove_null_fields(entry))
        return kept_items
    if not isinstance(schema, dict):
        return schema

    kept = {}
    for key, value in schema.items():
        if value is None:
            continue
        kept[key] = remove_null_fields(value)
    return kept

def remove_deprecated_fields(schema):
    """Return a copy of `schema` without `"isDeprecated": False` entries.

    Despite the name, only the "isDeprecated" key itself is dropped, and only
    when its value is exactly False; entries flagged with any other value are
    kept. Dicts and lists are walked recursively; any other value is returned
    unchanged.
    """
    if isinstance(schema, list):
        return [remove_deprecated_fields(entry) for entry in schema]
    if not isinstance(schema, dict):
        return schema

    kept = {}
    for key, value in schema.items():
        if key == "isDeprecated" and value is False:
            continue
        kept[key] = remove_deprecated_fields(value)
    return kept

def remove_empty_array_elements(schema):
    """Return a copy of `schema` without None values or empty lists.

    Applies to both dict values and list items. The emptiness check runs on
    each value before it is processed recursively, so a list that only
    becomes empty through this pruning is kept as an empty list.
    """
    def _is_dropped(value):
        # None and empty lists are removed; everything else is kept.
        return value is None or (isinstance(value, list) and not value)

    if isinstance(schema, list):
        return [
            remove_empty_array_elements(entry)
            for entry in schema
            if not _is_dropped(entry)
        ]
    if not isinstance(schema, dict):
        return schema

    kept = {}
    for key, value in schema.items():
        if _is_dropped(value):
            continue
        kept[key] = remove_empty_array_elements(value)
    return kept

def simplify_schema(
    schema_name,
    schemas_dir="/Users/jakobtolstrup/Desktop/Thesis/myvenv/T5-Carrera/SPEGQL-dataset/Schemas",
):
    """Write a boiled-down copy of a schema.json next to the original.

    Reads `<schemas_dir>/<schema_name>/schema.json`, keeps only
    `__schema.types`, then strips null values, `"isDeprecated": False`
    flags, empty lists, descriptions and directives, and saves the result as
    `schema_boiled_down.json` in the same folder.

    Args:
        schema_name: Name of the schema folder inside `schemas_dir`.
        schemas_dir: Directory holding one folder per schema. Defaults to the
            location originally hard-coded in this notebook.

    Raises:
        KeyError: If the loaded JSON has no `__schema` / `types` entry.
    """
    schema_path = os.path.join(schemas_dir, schema_name, "schema.json")
    output_path = os.path.join(os.path.dirname(schema_path), "schema_boiled_down.json")

    with open(schema_path, "r") as file:
        schema = json.load(file)

    # Keep only the type definitions.
    simplified_schema = {
        "__schema": {
            "types": schema["__schema"]["types"]
        }
    }

    # Run the clean-up passes on the types-only projection. Previously the
    # projection was overwritten by a pass over the full schema, so it never
    # reached the output.
    simplified_schema = remove_null_fields(simplified_schema)
    simplified_schema = remove_deprecated_fields(simplified_schema)
    simplified_schema = remove_empty_array_elements(simplified_schema)
    simplified_schema = remove_descriptions(simplified_schema)
    simplified_schema = remove_directives(simplified_schema)

    with open(output_path, "w") as output_file:
        json.dump(simplified_schema, output_file, indent=2)

    print(f"The simplified schema has been saved as: {output_path}")


# Example usage: write schema_boiled_down.json for the "student_1" schema folder
schema_name = "student_1"
simplify_schema(schema_name)


The simplified schema has been saved as: /Users/jakobtolstrup/Desktop/Thesis/myvenv/T5-Carrera/SPEGQL-dataset/Schemas/student_1/schema_boiled_down.json


In [156]:
import json
import re

def extract_graphql_types(schema_file_path, output_file_path):
    """Extract object type definitions from a GraphQL SDL file into JSON.

    Every `type Name { ... }` block whose name does not contain '_root' is
    written as {"name": ..., "fields": {field_name: type_name}}. Non-null
    markers (`!`) are dropped; list types are recorded in brackets
    (e.g. "[book]"). The scalar name `bigint` is reported as `Int`.

    Errors are reported with a printed message instead of being raised.

    Args:
        schema_file_path: Path of the GraphQL schema (SDL) file to read.
        output_file_path: Path of the JSON file to write.
    """
    try:
        # Open and read the schema file
        with open(schema_file_path, 'r') as schema_file:
            schema = schema_file.read()

        # Replace the 'bigint' scalar with 'Int'. Word boundaries keep longer
        # identifiers such as 'bigint_comparison_exp' intact.
        schema = re.sub(r'\bbigint\b', 'Int', schema)

        # Find all object type definitions. The leading \b stops 'type' from
        # matching as the tail of another word; an optional `implements`
        # clause is skipped.
        type_defs = re.findall(
            r'\btype\s+(\w+)(?:\s+implements[^{]*)?\s*{([^}]+)}', schema
        )

        # Format definitions into a JSON structure
        types_json = []
        for name, fields in type_defs:
            # Skip types with '_root' in their name
            if '_root' in name:
                continue

            # Drop field argument lists first; otherwise argument names such
            # as `limit: Int` would be recorded as fields of the type.
            fields_without_args = re.sub(r'\([^)]*\)', '', fields)

            # Extract field names and their types. Leading '[' characters are
            # captured so list-typed fields are kept instead of being skipped.
            fields_dict = {}
            for field_name, brackets, type_name in re.findall(
                r'(\w+):\s+(\[*)(\w+)', fields_without_args
            ):
                fields_dict[field_name] = brackets + type_name + ']' * len(brackets)

            types_json.append({
                'name': name,
                'fields': fields_dict
            })

        # Write JSON structure to output file
        with open(output_file_path, 'w') as output_file:
            json.dump(types_json, output_file, indent=4)

        print(f"Types successfully extracted to {output_file_path}")

    except Exception as e:
        print(f"Error extracting types: {e}")


In [155]:
# Print the installed graphql package version.
import graphql
print(graphql.__version__)

# Name of the schema folder to process.
schmema = 'academic'

# Read the schema's SDL file and write the extracted types next to it as schema_types.json.
extract_graphql_types(f'SPEGQL-dataset/Schemas/{schmema}/schema.graphql' , f'SPEGQL-dataset/Schemas/{schmema}/schema_types.json')


3.2.3
Types successfully extracted to SPEGQL-dataset/Schemas/academic/schema_types.json
