In [1]:
import re
import json

### Regex patterns for parsing sparql queries

In [2]:
where_pattern = re.compile(r"WHERE\s+{", re.I)
no_where_pattern = re.compile(r"SELECT\s+.*?(?!\bWHERE\b)\s*{", re.I)

# Regex pattern to match triples (subject predicate object .) and VALUES clauses
triple_pattern = re.compile(r"""
    (\?\w+|\<[^>]*\>|wd:[^\s]+)\s+   # Subject: variable, URI, or prefixed name
    \(?(\?\w+|\<[^>]*\>|wdt:[^\s]+\*?|p[\w]*:[^\s]+\*?|rdfs:[^\s]+|schema:[^\s]+|\w+:[^\s]+\*?\s*\/\s*\w+:[^\s]+\*?)\)?\s+  # Predicate: variable, URI, or prefixed name
    (\?\w+|\<[^>]*\>|"[^"]*"|wd:[^\s]+|\b\d+|\[\])\s*\.?\s*  # Object: variable, URI, literal, prefixed name, or number
    """, re.VERBOSE)

semicolon_pattern = re.compile(r"""
    \n\s+(\?\w+|\<[^>]*\>|wd:[^\s]+).*? # Subject
    \;\s+ # semicolon
    \(?(\?\w+|\<[^>]*\>|wdt:[^\s]+\*?|p[\w]*:[^\s]+\*?|rdfs:[^\s]+|schema:[^\s]+|\w+:[^\s]+\*?\s*\/\s*\w+:[^\s]+\*?)\)?\s+ # Predicate
    (\?\w+|\<[^>]*\>|"[^"]*"|wd:[^\s]+|\b\d+|\[\])\s*\.?\s* # Object
    """, re.VERBOSE)

# VALUES clause with variable and values
values_pattern = re.compile(r"VALUES\s+\?\w+\s*{\s*([^}]*)\s*}", re.I)

# FILTER clause with expression inside parentheses
filter_pattern = re.compile(r"FILTER\s*\(", re.I)

# WITH clause with subquery inside brackets
with_pattern = re.compile(r"WITH\s*\{(.*?)\}\s*as", re.I | re.DOTALL)

# MINUS clause with subquery inside brackets
minus_pattern = re.compile(r"MINUS\s*{\s*([^}]*)\s*}", re.I | re.DOTALL)

# OPTIONAL clause with subquery inside brackets
optional_pattern = re.compile(r"OPTIONAL\s*{\s*([^}]*)\s*}", re.I | re.DOTALL)

# limit, group by, and order by regexes for counting clauses
limit_pattern = re.compile(r"LIMIT\s*\d+", re.I)
having_pattern = re.compile(r"HAVING\s*\(", re.I)
group_by_pattern = re.compile(r"GROUP\s+BY\s*", re.I)
order_by_pattern = re.compile(r"ORDER\s+BY\s*", re.I)

# Extract variables and literals from FILTER expressions
variable_pattern = re.compile(r'\?\w+')
literal_pattern = re.compile(r'"[^"]*"|\b\d+(?!\w)')

### Helper functions for parsing a sparql query

In [3]:
def extract_where_content(sparql_query):
    # Find the starting point of the WHERE clause
    wheres = re.findall(where_pattern, sparql_query)
    if len(wheres) == 0:
        return None
    
    where_start = sparql_query.find(wheres[0])

    # Initialize stack and index
    stack = ['{']
    index = where_start + len(wheres[0])
    content_start = index

    # Traverse the query to find the matching closing brace
    while index < len(sparql_query):
        char = sparql_query[index]
        if char == '{':
            stack.append('{')
        elif char == '}':
            if stack:
                stack.pop()
                if not stack:
                    # Matching closing brace found
                    content_end = index
                    return sparql_query[content_start:content_end]
        index += 1

    return None

def extract_filter_content(text):
    filter_texts = []

    # Find the starting point of the WHERE clause
    filters = re.findall(filter_pattern, text)
    for filter in filters:
        filter_start = text.find(filter)
        if filter_start == -1:
            continue

        # Initialize stack and index
        stack = ['(']
        index = filter_start + len(filter)
        content_start = index

        # Traverse the query to find the matching closing brace
        while index < len(text):
            char = text[index]
            if char == '(':
                stack.append('(')
            elif char == ')':
                if stack:
                    stack.pop()
                    if not stack:
                        # Matching closing brace found
                        content_end = index
                        filter_texts.append(text[content_start:content_end])
            index += 1

    return filter_texts

def parse_content(content):
    clauses = 1 # if we're in this function, there's at least a SELECT or MINUS that we can assume
    relations = set()
    subjects = set()
    predicates = set()
    objects = set()
    literals = set()

    # Match triples in the content
    for triple in triple_pattern.findall(content):
        clauses += 1
        relations.add(triple)
        subjects.add(triple[0])
        predicates.add(triple[1])
        if triple[2].startswith('"') and triple[2].endswith('"'):
            literals.add(triple[2])
        elif triple[2].isdigit():
            literals.add(triple[2])
        else:
            objects.add(triple[2])
    
    for semicolon in semicolon_pattern.findall(content):
        clauses += 1
        relations.add(semicolon)
        subjects.add(semicolon[0])
        predicates.add(semicolon[1])
        if semicolon[2].startswith('"') and semicolon[2].endswith('"'):
            literals.add(semicolon[2])
        elif semicolon[2].isdigit():
            literals.add(semicolon[2])
        else:
            objects.add(semicolon[2])

    # Match VALUES clauses in the content
    for values_match in values_pattern.findall(content):
        clauses += 1
        for value in values_match.split():
            value = value.strip()
            if value.startswith('wd:'):
                objects.add(value)
            elif value.startswith('"') and value.endswith('"'):
                literals.add(value)
            elif value.isdigit():
                literals.add(value)

    # Match FILTER clauses in the content
    for filter_match in extract_filter_content(content):
        clauses += 1
        variables = variable_pattern.findall(filter_match)
        for var in variables:
            objects.add(var)

        literal_matches = literal_pattern.findall(filter_match)
        for lit in literal_matches:
            literals.add(lit)

    return clauses, relations, subjects, predicates, objects, literals

def calculate_content(sparql_query):
    total_clauses = 0
    total_relations = set()
    total_subjects = set()
    total_predicates = set()
    total_objects = set()
    total_literals = set()

    no_where = re.findall(no_where_pattern, sparql_query)
    for nw in no_where:
        sparql_query = sparql_query.replace(nw, f"{nw[:-1]}WHERE {{")
    
    sparql_query = re.sub(r';\s+', '; ', sparql_query)

    # Extract the content inside the WHERE clause
    where_content = extract_where_content(sparql_query)
    if where_content:
        # if there's a subquery within the where content
        subquery_content = extract_where_content(where_content)
        if subquery_content:
            # replace subquery content with nothing and calculate content for it
            where_content = where_content.replace(subquery_content, '')
            clauses, relations, subjects, predicates, objects, literals = parse_content(subquery_content)
            total_clauses += clauses
            total_relations.update(relations)
            total_subjects.update(subjects)
            total_predicates.update(predicates)
            total_objects.update(objects)
            total_literals.update(literals)

        # Parse the main WHERE content
        clauses, relations, subjects, predicates, objects, literals = parse_content(where_content)
        total_clauses += clauses
        total_relations.update(relations)
        total_subjects.update(subjects)
        total_predicates.update(predicates)
        total_objects.update(objects)
        total_literals.update(literals)

        # Handle MINUS clauses
        minus_queries = re.findall(minus_pattern, where_content)
        for minus_query in minus_queries:
            sub_clauses, sub_relations, sub_subjects, sub_predicates, sub_objects, sub_literals = parse_content(minus_query)
            total_clauses += sub_clauses
            total_relations.update(sub_relations)
            total_subjects.update(sub_subjects)
            total_predicates.update(sub_predicates)
            total_objects.update(sub_objects)
            total_literals.update(sub_literals)
        
        # Handle MINUS clauses
        optional_queries = re.findall(optional_pattern, where_content)
        for optional_query in optional_queries:
            sub_clauses, sub_relations, sub_subjects, sub_predicates, sub_objects, sub_literals = parse_content(optional_query)
            total_clauses += sub_clauses
            total_relations.update(sub_relations)
            total_subjects.update(sub_subjects)
            total_predicates.update(sub_predicates)
            total_objects.update(sub_objects)
            total_literals.update(sub_literals)
        
    total_clauses += len(re.findall(limit_pattern, sparql_query))
    total_clauses += len(re.findall(group_by_pattern, sparql_query))
    total_clauses += len(re.findall(order_by_pattern, sparql_query))
    total_clauses += len(re.findall(having_pattern, sparql_query))
    return total_clauses, total_relations, total_subjects, total_predicates, total_objects, total_literals

def regex_parse(sparql_query):
    total_clauses = 0
    total_relations = set()
    total_subjects = set()
    total_predicates = set()
    total_objects = set()
    total_literals = set()

    # for each with clause
    with_content = re.findall(with_pattern, sparql_query)
    if with_content:
        for w in with_content:
            sparql_query = sparql_query.replace(w, '')
            clauses, relations, subjects, predicates, objects, literals = calculate_content(w)
            # print(calculate_content(w))
            total_clauses += clauses
            total_relations.update(relations)
            total_subjects.update(subjects)
            total_predicates.update(predicates)
            total_objects.update(objects)
            total_literals.update(literals)

    clauses, relations, subjects, predicates, objects, literals = calculate_content(sparql_query)
    # print(calculate_content(sparql_query))
    total_clauses += clauses
    total_relations.update(relations)
    total_subjects.update(subjects)
    total_predicates.update(predicates)
    total_objects.update(objects)
    total_literals.update(literals)
    
    # print("relations: ", total_relations)
    # print("subjects: ", total_subjects)
    # print("predicates: ", total_predicates)
    # print("objects: ", total_objects)
    # print("literals: ", total_literals)
    return {
        "clauses": total_clauses,
        "relations": len(total_relations),
        "subjects": len(total_subjects),
        "predicates": len(total_predicates),
        "objects": len(total_objects),
        "literals": len(total_literals)
    }

### Sample queries with increasing complexity to validate helper functions
We're counting the following features of a sparql query:
- `clauses`: the total number of clauses in the sparql query, defined as:
    - `SELECT ... WHERE { ... }` clauses
    - triples (of the form `subject predicate object`)
    - constraining subqueries (e.g. `MINUS { ... }`, `OPTIONAL { ... }`, `VALUES { ... }`, `FILTER(...)`)
    - `GROUP BY`, `ORDER BY`, and `LIMIT` clauses
- `relations`: the total number of triples (`subject predicate object`) in the sparql query
- `subjects`: the total number of subjects (e.g. `?person`, `?item`, `?statement`) in the sparql query
- `predicates`: the total number of predicates (e.g. `wdt:Pwww`, `p:Pxxx`, `pq:Pyyy`, `schema:zzz`) in the sparql query, *excluding wikibase*
- `objects`: the total number of objects (e.g. `wd:Qaaa` or `?item`) in the sparql query
- `literals`: the total number of literals (e.g. `"string"` or `123`) in the sparql query

In [5]:
# normal, simple query, no filters or minuses or values
sq0 = '''
#olympic gold medalist born in Maryland
SELECT DISTINCT ?person ?personLabel WHERE {
  ?person p:P1344 ?statement .
  ?statement pq:P166 wd:Q15243387 .
  ?person wdt:P19 ?birthplace .
  ?birthplace wdt:P131* wd:Q1391
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
'''

result = regex_parse(sq0)
print(result)

{'clauses': 5, 'relations': 4, 'subjects': 3, 'predicates': 4, 'objects': 4, 'literals': 0}


In [5]:
# one step up — query with VALUES and FILTER
sq1 = """
SELECT ?item ?itemLabel ?image WHERE {
  ?item wdt:P31 wd:Q5 .
  ?item wdt:P18 ?image .
  VALUES ?item {wd:Q2104}
  FILTER(CONTAINS(?image, "1958"))
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en,de,fr".
  }
}
LIMIT 1000
"""

result = regex_parse(sq1)
print(result)

{'clauses': 6, 'relations': 2, 'subjects': 1, 'predicates': 2, 'objects': 3, 'literals': 1}


In [6]:
sq2 = """
#Scientific authors known to Wikidata, who do not have an ORCID iD listed there
SELECT ?author ?authorLabel ?instit ?institLabel ?count WHERE {
  {
    SELECT ?author (COUNT(DISTINCT ?publication) AS ?count) WHERE {
      ?publication wdt:P31 wd:Q13442814 .
      ?publication wdt:P50 ?author .
      MINUS { ?author wdt:P496 [] } .
      MINUS {
        ?author wdt:P570 ?dod .
        FILTER((YEAR(?dod)) < 2012)
      } .
    } GROUP BY ?author
  } .
  MINUS {
    ?author p:P108/ps:P108 ?instit1;
            p:P108/ps:P108 ?instit2 .
    FILTER(!SAMETERM(?instit1, ?instit2))
  } .
  ?author wdt:P108 ?instit .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" } .
}
ORDER BY DESC(?count)
"""

result = regex_parse(sq2)
print(result)

{'clauses': 16, 'relations': 7, 'subjects': 2, 'predicates': 6, 'objects': 7, 'literals': 1}


In [7]:
# query using WITH syntax
sq3 = '''
SELECT ?film ?filmLabel ?count
WITH
{
  SELECT ?film (COUNT(?wikipage) AS ?count)
  WHERE
  {
    hint:Query hint:optimizer "None" .
    ?film wdt:P31 wd:Q11424 .
    ?wikipage schema:about ?film .
    ?wikipage schema:isPartOf/wikibase:wikiGroup "wikipedia" .
  }
  GROUP BY ?film HAVING (?count > 50)
} AS %get_films
WHERE
{
  INCLUDE %get_films
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
ORDER BY DESC(?count)
'''

result = regex_parse(sq3)
print(result)

{'clauses': 8, 'relations': 3, 'subjects': 2, 'predicates': 3, 'objects': 2, 'literals': 1}


In [8]:
# query using the `OPTIONAL` syntax
sq4 = '''
SELECT ?item ?area ?population ?pit
WHERE
{
  ?item wdt:P31 wd:Q16739079 .
  hint:Prior hint:rangeSafe true .
  optional {  ?item p:P1082 [ ps:P1082 ?population; 
                              pq:P585 ?pit; 
                              wikibase:rank wikibase:PreferredRank ] 
           }
  optional {  ?item wdt:P2046 ?a . 
              BIND(REPLACE(STR(?a),"\\.",",") AS ?area) 
           }
}
'''

result = regex_parse(sq4)
print(result)

{'clauses': 6, 'relations': 2, 'subjects': 1, 'predicates': 2, 'objects': 2, 'literals': 0}


In [9]:
# more complex query using a multi-subquery syntax
sq5 = '''
#defaultView:BarChart
SELECT  (STR(?claims) as ?no_of_incoming_links) (COUNT(*) as ?no_of_rivers)  WITH {
  SELECT distinct ?river WHERE {
  #  VALUES ?river {wd:Q19721}
   ?river wdt:P131/wdt:P131 wd:Q22 . 
    } } as %i
WITH {
  SELECT distinct ?river WHERE {
    INCLUDE %i
    ?river wdt:P31/wdt:P279* wd:Q55659167. hint:Prior hint:gearing "forward".
    } } as %j
WITH {
  SELECT ?river (COUNT(*) as ?items) (count(distinct ?item) as ?claims) WHERE {
    INCLUDE %j
    OPTIONAL { ?item ?predicate ?river .
    ?property wikibase:directClaim ?predicate . } 
    } group by ?river } as %k
WHERE
{
  INCLUDE %k
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } 
} GROUP BY ?claims ORDER BY DESC(?claims)
'''

result = regex_parse(sq5)
print(result)

{'clauses': 12, 'relations': 3, 'subjects': 2, 'predicates': 3, 'objects': 4, 'literals': 0}


In [10]:
sq6 = """
SELECT ?item ?itemLabel ?prop ?propLabel ?value WHERE {
  VALUES ?pred { wdt:P1628 wdt:P2235 } .
  ?item ?pred ?value .
  ?prop wikibase:directClaim ?pred .
  FILTER( STRSTARTS( STR(?value), "http://www.w3.org/2006/vcard/ns" ) ) .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" } .
}
"""

print(regex_parse(sq6))

{'clauses': 4, 'relations': 1, 'subjects': 1, 'predicates': 1, 'objects': 1, 'literals': 1}


In [4]:
with open('/home/oval/wikidata-dataset/new_dataset/dev_cleaned.json', "r") as f:
    d = json.load(f)
print(len(d))

with open('/home/oval/wikidata-dataset/new_dataset/test_0615_4am_cleaned.json', "r") as f:
    temp=json.load(f)
    print(len(temp))
    d += temp


155
168


In [6]:
# for i, convo in enumerate(d):
from collections import defaultdict

meta_statistics = defaultdict(int)

print(len(d))

for i, convo in enumerate(d):
    # query = convo['query']['sparql']
    query = convo['sparql']
    # if i % 250 == 0:
    #     print(i)
    out = regex_parse(query)
    meta_statistics['clauses'] += out['clauses']
    meta_statistics['relations'] += out['relations']
    meta_statistics['subjects'] += out['subjects']
    meta_statistics['predicates'] += out['predicates']
    meta_statistics['objects'] += out['objects']
    meta_statistics['literals'] += out['literals']
    
for key in ['clauses', 'relations', 'subjects', 'predicates', 'objects', 'literals']:
    meta_statistics[key] = meta_statistics[key] / len(d)
    
    
print(meta_statistics)

323
defaultdict(<class 'int'>, {'clauses': 8.959752321981425, 'relations': 4.0588235294117645, 'subjects': 1.758513931888545, 'predicates': 3.5789473684210527, 'objects': 4.563467492260062, 'literals': 0.47678018575851394})
