In [None]:
import json
import urllib.error
import urllib.parse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from rdflib.plugins.sparql.parser import parseQuery
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

### Saving wikidata5m.txt as a proper Turtle file

In [None]:
# Convert the tab-separated wikidata5m dump (subject \t predicate \t object per
# line) into one "<s> <p> <o> ." statement per line, and record an integer id
# for every distinct term in a JSON side file.
wikidata5m_path = '../Datasets/wikidata/wikidata5m.txt'

# Count the lines up front so tqdm can show a total. The context manager makes
# sure the handle used for counting is closed again.
with open(wikidata5m_path, 'r') as count_file:
    num_lines = sum(1 for _ in count_file)

# Ids are assigned in order of first appearance, starting at 1. Subjects,
# predicates and objects all draw from the same counter.
entity_to_id = {}
entity_id_counter = 1

with open(wikidata5m_path) as input_file, \
     open("wikidata.ttl", "w") as output_file, \
     open("mapping_wikidata.json", "w") as mapping_file:
    for line in tqdm(input_file, total=num_lines):
        parts = line.strip().split("\t")
        # A line with fewer than three fields raises IndexError here.
        subject, predicate, obj = parts[0], parts[1], parts[2]

        # Register every term that has not been seen before.
        for term in (subject, predicate, obj):
            if term not in entity_to_id:
                entity_to_id[term] = entity_id_counter
                entity_id_counter += 1

        # The written statement uses the original Wikidata identifiers; the
        # integer ids only go into the mapping file.
        s = "<http://www.wikidata.org/entity/" + subject + ">"
        p = "<http://www.wikidata.org/prop/direct/" + predicate + ">"
        o = "<http://www.wikidata.org/entity/" + obj + ">"
        output_file.write(f"{s} {p} {o} .\n")

    # Write the mapping dictionary to the mapping file as JSON.
    json.dump({"description": "Mapping of original entity names to unique ids. Entities and Predicates share the id set, i.e. there does not exist the same id for an entity and a predicate", "entity_to_id": entity_to_id}, mapping_file)

## Wikidata Turtle ID Mapping for LMKG

In [None]:
# Build two independent, zero-based id spaces from the graph file: one for
# terms in subject/object position and one for terms in predicate position.
graph_path = "../Datasets/wikidata/graph/wikidata.nt"

entity_mapping = {}
predicate_mapping = {}
entity_counter = 0
predicate_counter = 0

# Count lines for the progress bar; the handle is closed when the block exits.
with open(graph_path, "r") as f:
    num_lines = sum(1 for _ in f)

with open(graph_path, "r") as f:
    for line in tqdm(f, total=num_lines):
        # Split on single spaces and drop the last field (the terminating "."
        # plus newline). Assumes no term contains a space -- TODO confirm.
        triple = line.split(" ")[:-1]

        # Subject first, then object, so ids follow first appearance.
        for term in (triple[0], triple[2]):
            if term not in entity_mapping:
                entity_mapping[term] = entity_counter
                entity_counter += 1

        if triple[1] not in predicate_mapping:
            predicate_mapping[triple[1]] = predicate_counter
            predicate_counter += 1


In [None]:
# Display how many distinct predicates were registered in predicate_mapping.
predicate_counter

In [None]:
# Persist both id mappings as JSON next to the dataset.
with open('../Datasets/wikidata/entity_mapping.json', 'w') as entity_out:
    json.dump(entity_mapping, fp=entity_out)

with open('../Datasets/wikidata/predicate_mapping.json', 'w') as predicate_out:
    json.dump(predicate_mapping, fp=predicate_out)

### Transforming and executing real queries

In [None]:
# Alternative input, currently disabled:
#queries = pd.read_csv('../Datasets/wikidata/I1_status2xx_userData_Joined.tsv', sep='\t')

# Load one chunk of the query log: file line 0 (the header) is kept, lines
# 1..10000000 are skipped, and at most the next 10000000 rows are read.
queries2 = pd.read_csv(
    '/home/tim/Downloads/2018-02-26_2018-03-25_all.tsv.gz',
    sep='\t',
    skiprows=range(1, 10000000 + 1),
    nrows=10000000,
)

In [None]:
# Number of rows actually read into queries2.
len(queries2)

In [None]:
# Append the new chunk to the rows already held in `queries`.
# NOTE(review): nothing in the cells above assigns `queries` (the only read
# into that name is commented out), so on a fresh kernel this raises
# NameError. Re-running the cell also appends queries2 again.
queries = pd.concat([queries, queries2], ignore_index=True)

In [None]:
# Use only the freshly loaded chunk as the working set; this replaces whatever
# `queries` referred to before.
queries = queries2

In [None]:
# Display the working set of logged queries.
queries

# Transforming Queries to GNCE Format 

In [None]:
# Queries containing any of these substrings are skipped before parsing.
string_list = ['FILTER', "UNION", "OPTIONAL", "SERVICE", "COUNT", '"', "BIND", "DISTINCT", "GROUP", "LIMIT",
              "VALUES", "ASKWHERE", "ASK"]
endpoint_url = "http://localhost:8907/sparql"
sparql = SPARQLWrapper(endpoint_url)
# The return format does not change between queries, so set it once.
sparql.setReturnFormat(JSON)

# Each kept query becomes a dict with the keys
#   "query"   - decoded query text,
#   "triples" - list of [subject, predicate, object, ...] term lists,
#   "x"       - the non-variable terms with angle brackets removed,
#   "y"       - number of result bindings returned by the endpoint.
new_queries = []
for q in tqdm(queries["anonymizedQuery"]):
    q = urllib.parse.unquote_plus(q).replace("\n", "")

    if any(s in q for s in string_list):
        continue
    # 'VALUES' is already covered by string_list above.
    if 'Describe' in q or 'MINUS' in q:
        continue

    q = {"query": q}

    # Take the text between the first '{' and the following '}'.
    try:
        group = q['query'].split('{')[1].split('}')[0]
    except IndexError:
        # No '{' in the query: report it and move on instead of continuing
        # with the triples left over from a previous iteration.
        print(q)
        continue

    # Patterns are separated by " . ", terms inside a pattern by two spaces.
    triples = [t.replace(" * ", "  ").split('  ') for t in group.split(" . ")]
    for t in triples:
        if "" in t:
            t.remove("")  # removes only the first empty field
        t[:] = [term.replace(" .", "").replace(" ", "") for term in t]
    q['triples'] = triples

    # Reject queries with incomplete patterns or ';' shorthand, and collect
    # the bound (non-variable) terms in subject, object, predicate order.
    skip = False
    entities = []
    for t in triples:
        if len(t) < 3 or any(";" in term for term in t):
            skip = True
            break
        for term in (t[0], t[2], t[1]):
            if "?" not in term:
                entities.append(term.replace("<", "").replace(">", ""))
    if skip:
        continue
    q["x"] = entities

    # Evaluate the query over the endpoint.
    try:
        sparql.setQuery(q['query'])
        results = sparql.query().convert()
        cardinality = len(results["results"]["bindings"])
    except urllib.error.URLError:
        # Endpoint unreachable: wait for the operator, then drop this query.
        input("Restart Virtuoso")
        continue
    except Exception:
        # Best effort: any other failure (bad query, unexpected response
        # shape, ...) just drops the query. KeyboardInterrupt still propagates.
        continue

    # Only queries with a non-empty result are kept.
    if cardinality > 0:
        q['y'] = cardinality
        new_queries.append(q)



In [None]:
# Save the kept queries as JSON in the current working directory.
with open("Wikidata_user_queries8.json", "w") as out_file:
    json.dump(new_queries, fp=out_file)

In [None]:
# Number of queries collected in new_queries.
len(new_queries)

### Final cleaning: remove queries whose entities do not occur in the graph file

In [None]:
# Load the previously joined query set from disk.
with open("/home/tim/Datasets/wikidata/user/Joined_Queries.json", "r") as joined_file:
    queries = json.load(fp=joined_file)

In [None]:
# Reload the entity and predicate mappings from their JSON files.
with open("/home/tim/Datasets/wikidata/entity_mapping.json", "r") as entity_in:
    entity_mapping = json.load(fp=entity_in)

with open("/home/tim/Datasets/wikidata/predicate_mapping.json", "r") as predicate_in:
    predicate_mapping = json.load(fp=predicate_in)

In [None]:
# Mapping of entity and predicate ids to separate ids:
with open("/home/tim/Datasets/wikidata/id_to_id_mapping.json", "r") as f:
    id_to_id_mapping = json.load(f)
    
with open("/home/tim/Datasets/wikidata/id_to_id_mapping_predicate.json", "r") as f:
    id_to_id_mapping_predicate = json.load(f)

In [None]:

# Loading Prone labels: one label per line.
with open("/home/tim/LSS/wikidata_mapping.txt", "r") as file:
    lines = file.readlines()  # Read all lines of the file and store them in a list

# Strip surrounding whitespace (including the trailing newline) from each line.
cleaned_lines = [line.strip() for line in lines]
# Set copy of the labels so the membership test inside the loop is O(1)
# instead of a scan over the whole list for every class triple.
known_labels = set(cleaned_lines)

class_assignment_predicate = '<http://www.wikidata.org/prop/direct/P31>'

# Keep a query only if every bound term of every triple pattern is known:
#   * bound subjects/objects must be in entity_mapping (checked only when the
#     mapping is non-empty),
#   * bound predicates must be in predicate_mapping (same condition),
#   * for class-assignment (P31) patterns with a bound object, the object's id
#     from id_to_id_mapping must appear among the loaded labels.
new_queries = []
for q in queries:
    add = True
    for tp in q['triples']:
        if entity_mapping:
            if "?" not in tp[0] and tp[0] not in entity_mapping:
                print("Skipping")
                add = False
                break
            if "?" not in tp[2] and tp[2] not in entity_mapping:
                print("Skipping")
                add = False
                break
        if predicate_mapping:
            if "?" not in tp[1] and tp[1] not in predicate_mapping:
                add = False
                break
        # Cleaning away queries that have a triple with class assignment that
        # is not in graph.
        # NOTE(review): id_to_id_mapping[...] raises KeyError if the object is
        # missing from that mapping -- confirm it covers all of entity_mapping.
        if tp[1] == class_assignment_predicate and "?" not in tp[2]:
            class_label = str(id_to_id_mapping[tp[2]])
            if class_label not in known_labels:
                print(tp[2])
                print(class_label)
                print("DontKnow that class")
                add = False
                break
    if add:
        new_queries.append(q)


## Cleaning Queries with Unknown Class:



In [None]:
# Number of queries currently held in new_queries.
len(new_queries)

In [None]:
# Write the filtered queries to Joined_Queries.json.
# NOTE(review): this is the same path the queries were loaded from earlier in
# the notebook, so the input file is overwritten in place.
with open("/home/tim/Datasets/wikidata/user/Joined_Queries.json", "w") as joined_out:
    json.dump(new_queries, fp=joined_out)

In [None]:
new_queries = []
for q in queries:
    skip = False
    if 'Describe' in q['query'] or 'VALUES' in q['query'] or 'MINUS' in q['query']:
        continue
    try:
        triples = [t.replace(" * ", "  ").split('  ') for t in q['query'].split('{')[1].split('}')[0].split(" . ")]
    except:
        print(q)
    #for q in queries:
    for t in  triples:
        if "" in t:
            t.remove("")
        if " * " in t:
            t.replace(" * ", "  ")
        for i in range(len(t)):
            t[i] = t[i].replace(" .", "")
            t[i] = t[i].replace(" ", "")
        q['triples'] = triples
    entities = []
    for t in triples:
        if len(t) < 3:
            skip =True
            break
        for e in t:
            if ";" in e:
                skip=True
                break

        if not "?" in t[0]:
            entities.append(t[0].replace("<","").replace(">","").replace(" .", ""))
        if not "?" in t[2]:
            entities.append(t[2].replace("<","").replace(">","").replace(" .", ""))
        if not "?" in t[1]:
            entities.append(t[1].replace("<","").replace(">","").replace(" .", ""))
    q["x"] = entities
    if not skip:
        new_queries.append(q)



In [None]:
# Load the cleaned user queries from the JSON file in the working directory.
with open('Cleaned_wikidata_User_queries2.json', 'r') as cleaned_file:
    queries = json.load(fp=cleaned_file)

In [None]:
new_queries = []
for q in queries:
    skip = False
    if 'Describe' in q['query'] or 'VALUES' in q['query'] or 'MINUS' in q['query']:
        continue
    try:
        triples = [t.replace(" * ", "  ").split('  ') for t in q['query'].split('{')[1].split('}')[0].split(" . ")]
    except:
        print(q)
    #for q in queries:
    for t in  triples:
        if "" in t:
            t.remove("")
        if " * " in t:
            t.replace(" * ", "  ")
        for i in range(len(t)):
            t[i] = t[i].replace(" .", "")
            t[i] = t[i].replace(" ", "")
        q['triples'] = triples
    entities = []
    for t in triples:
        if len(t) < 3:
            skip =True
            break
        for e in t:
            if ";" in e:
                skip=True
                break

        if not "?" in t[0]:
            entities.append(t[0].replace("<","").replace(">","").replace(" .", ""))
        if not "?" in t[2]:
            entities.append(t[2].replace("<","").replace(">","").replace(" .", ""))
        if not "?" in t[1]:
            entities.append(t[1].replace("<","").replace(">","").replace(" .", ""))
    q["x"] = entities
    if not skip:
        new_queries.append(q)



In [None]:
# Sanity check: print a query once for every bound term that still contains ';'.
for query in new_queries:
    offending_terms = (term for term in query["x"] if ";" in term)
    for _ in offending_terms:
        print(query)

In [None]:
# Display the parsed queries.
# NOTE(review): this renders the entire list; consider showing a slice only.
new_queries

In [None]:
# Save the parsed queries as JSON in the current working directory.
with open("Wikidata_user_queries.json", "w") as out_file:
    json.dump(new_queries, fp=out_file)

In [None]:
# Histogram of the cardinalities strictly between 1 and `limit`, with a
# logarithmic y axis.
# NOTE(review): `cardinalities` is not assigned in any cell visible here --
# confirm where it comes from before relying on Run All.
limit = 1000000000
plt.hist([c for c in cardinalities if 1 < c < limit])
plt.yscale("log")
