## Error Analysis

### Nesting Level

In [3]:
import re

# Function to calculate the nesting level of a query
def nesting_level(query):
    query = re.sub(r'\s+', '', query)  # Remove whitespace
    max_nesting = 0
    current_nesting = 0

    for char in query:
        if char == '{':
            current_nesting += 1
            max_nesting = max(max_nesting, current_nesting)
        elif char == '}':
            current_nesting -= 1

    return max_nesting

## Num Args

In [4]:
import re

def count_graphql_arguments(query: str) -> int:
    # Removing white spaces and escape characters
    query = re.sub(r'\s|\\', '', query)
    
    # Regular expression to find arguments in the query
    pattern = re.compile(r'(\{[^\{]*\})')
    matches = pattern.findall(query)

    # Counting the number of arguments
    count = 0
    for match in matches:
        count += match.count(':')
    return count

### Calculate Schema Complexity

In [39]:
weight_types = 1
weight_fields = 1
weight_input_objects = 1
weight_relationships = 2
weight_arguments = 1

def analyze_schema_complexity(schema_json):
    types_count = 0
    fields_count = 0
    input_objects_count = 0
    relationships_count = 0
    arguments_count = 0

    for type_ in schema_json["__schema"]["types"]:
        types_count += 1

        if type_["kind"] == "INPUT_OBJECT":
            input_objects_count += 1

        if "fields" in type_ and type_["fields"] is not None:
            for field in type_["fields"]:
                fields_count += 1

                if "args" in field:
                    arguments_count += len(field["args"])
                    relationships_count += 1

    complexity_score = (
    (types_count * weight_types)
    + (fields_count * weight_fields)
    + (input_objects_count * weight_input_objects)
    + (relationships_count * weight_relationships)
    + (arguments_count * weight_arguments)
)

    schema_length = len(json.dumps(schema_json))

    return complexity_score, types_count, fields_count, input_objects_count, relationships_count, arguments_count, schema_length

# Load your GraphQL schema JSON file
import json

with open("SPEGQL-dataset/Schemas/activity_1/schema.json", "r") as file:
    schema_json = json.load(file)

complexity_score, types_count, fields_count, input_objects_count, relationships_count, arguments_count, schema_length = analyze_schema_complexity(schema_json)

## Call the function on all schemas and add the counts and complexity score to the DataFrame
for schema_folder in schema_folders:
    if os.path.isdir(os.path.join(schemas_folder, schema_folder)):
        with open(os.path.join(schemas_folder, schema_folder, 'schema.json'), 'r') as schema_file:
            schema_json = json.load(schema_file)
            complexity_score, types_count, fields_count, input_objects_count, relationships_count, arguments_count, schema_length = analyze_schema_complexity(schema_json)
            df.loc[df['schemaId'] == schema_folder, 'schema_total_complexity'] = complexity_score
            df.loc[df['schemaId'] == schema_folder, 'schema_types_count'] = types_count
            df.loc[df['schemaId'] == schema_folder, 'schema_fields_count'] = fields_count
            df.loc[df['schemaId'] == schema_folder, 'schema_input_objects_count'] = input_objects_count
            df.loc[df['schemaId'] == schema_folder, 'schema_relationships_count'] = relationships_count
            df.loc[df['schemaId'] == schema_folder, 'schema_arguments_count'] = arguments_count
df.head()

NameError: name 'schema_folders' is not defined

## Count Query Components Function

In [8]:
import pandas as pd

def count_query_components(df):

    # Define component types
    arguments = ["\\("]
    filters = ["where"]
    query_modifiers = ["order_by", "limit", "distinct_on"]
    logical_operators = ["_and", "_or", "_not"]
    comparison_operators = ["_eq", "_neq", "_lt", "_gt", "_lte", "_gte", "_like"]
    aggregators = ["min", "max", "sum", "avg"]
    orderings = ["asc", "desc"]

    # Create a dictionary of the component types
    component_types = {
        "arguments": arguments,
        "filters": filters,
        "query_modifiers": query_modifiers,
        "logical_operators": logical_operators,
        "comparison_operators": comparison_operators,
        "aggregators": aggregators,
        "orderings": orderings
    }

    # Create an empty dataframe to store the results, initalized with the columns we want
    results_df = pd.DataFrame(columns=["Component_Type", "Component", "Value"])

    # Iterate through the component types and count the number of times each component appears in the query
    for component_type, components in component_types.items(): # .items() returns a tuple of the key and value, in this case the key is the component type and the value is the list of components
        # We initalize components_counts. This is a dictionary where the key is the component and the value is the number of times that component appears in the query
        component_counts = {component: 0 for component in components} 
        total_components = 0

        # Iterate through each component and count the number of times it appears in the query
        for component in components:
            component_count = df['query'].str.count(component).sum() # .str.count() counts the number of times a string appears in a column, we sum this to get the total number of times the component appears in the column
            component_counts[component] = component_count # Add the component count to the component_counts dictionary
            total_components += component_count # Add the component count to the total_components count

            component_percent = (component_count / len(df)) * 100 # Calculate the percentage of queries that contain the component

            new_row = pd.DataFrame({
                "Component_Type": [f"{component_type} relative"],
                "Component": [component],
                "Value": [component_percent]
            })

            results_df = pd.concat([results_df, new_row], ignore_index=True)

        total_percent_components = (total_components / len(df)) * 100

        new_row = pd.DataFrame({
            "Component_Type": [f"{component_type} relative (total)"],
            "Component": [""],
            "Value": [total_percent_components]
        })

        results_df = pd.concat([results_df, new_row], ignore_index=True)

    # Calculate the average nesting level
    avg_nesting = df['query'].apply(nesting_level).mean()

    # Add the average nesting level to the results dataframe
    new_row = pd.DataFrame({
        "Component_Type": ["avg_nesting_level"],
        "Component": [""],
        "Value": [avg_nesting]
    })

    results_df = pd.concat([results_df, new_row], ignore_index=True)

    # Calculate the average number of GraphQL arguments
    avg_args = df['query'].apply(count_graphql_arguments).mean()

    # Add the average number of GraphQL arguments to the results dataframe
    new_row = pd.DataFrame({
        "Component_Type": ["avg_graphql_arguments"],
        "Component": [""],
        "Value": [avg_args]
    })

    # Calculate the length of each query
    df['query_length'] = df['query'].str.len()

    # Calculate max, min and avg query length
    max_query_length = df['query_length'].max()
    min_query_length = df['query_length'].min()
    avg_query_length = df['query_length'].mean()

    # Add max query length to the results dataframe
    new_row = pd.DataFrame({
        "Component_Type": ["max_query_length"],
        "Component": [""],
        "Value": [max_query_length]
    })
    results_df = pd.concat([results_df, new_row], ignore_index=True)

    # Add min query length to the results dataframe
    new_row = pd.DataFrame({
        "Component_Type": ["min_query_length"],
        "Component": [""],
        "Value": [min_query_length]
    })
    results_df = pd.concat([results_df, new_row], ignore_index=True)

    # Add avg query length to the results dataframe
    new_row = pd.DataFrame({
        "Component_Type": ["avg_query_length"],
        "Component": [""],
        "Value": [avg_query_length]
    })

    results_df = pd.concat([results_df, new_row], ignore_index=True)

    if 'question_length' in df.columns:
        # Calculate max, min and avg question length
        max_question_length = df['question_length'].max()
        min_question_length = df['question_length'].min()
        avg_question_length = df['question_length'].mean()

        # Add max question length to the results dataframe
        new_row = pd.DataFrame({
            "Component_Type": ["max_question_length"],
            "Component": [""],
            "Value": [max_question_length]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)

        # Add min question length to the results dataframe
        new_row = pd.DataFrame({
            "Component_Type": ["min_question_length"],
            "Component": [""],
            "Value": [min_question_length]
        })
        results_df = pd.concat([results_df, new_row], ignore_index=True)

        # Add avg question length to the results dataframe
        new_row = pd.DataFrame({
            "Component_Type": ["avg_question_length"],
            "Component": [""],
            "Value": [avg_question_length]
        })

    results_df = pd.concat([results_df, new_row], ignore_index=True)

    return results_df
