In [1]:
import multiprocessing
import random
import time
from multiprocessing import Manager, Process, Queue

import numpy as np
import pandas as pd
import spacy
from rdflib import Graph
from spacy.tokens import Doc
from SPARQLWrapper import SPARQLWrapper, BASIC
from tqdm import tqdm

def empty_vector(x):
    """Tell whether at least one element of ``x`` carries an all-zero vector.

    Args:
        x: Iterable of objects exposing a ``vector`` attribute that is a
            sequence of numbers (presumably spaCy tokens -- confirm with callers).

    Returns:
        True as soon as one element's vector has no non-zero component
        (a zero-length vector counts as empty), False otherwise.
    """
    for i in x:
        # Inspect the components instead of their sum: a vector such as
        # [1, -1] sums to zero without being empty.
        if not any(component != 0 for component in i.vector):
            return True
    return False

def connected_to_natural(x):
    """Split a camelCase-style identifier into space-separated words.

    A space is inserted in front of every upper-case ASCII letter that is
    directly followed by a lower-case ASCII letter, unless that letter is the
    first character of ``x``. Every other character is copied unchanged.

    Args:
        x: The identifier to rewrite.

    Returns:
        The rewritten string.
    """
    pieces = []
    last = len(x) - 1
    for pos, char in enumerate(x):
        # A new word starts on an inner capital followed by a lower-case letter
        starts_word = (
            1 <= pos < last
            and "A" <= char <= "Z"
            and "a" <= x[pos + 1] <= "z"
        )
        pieces.append(" " + char if starts_word else char)
    return "".join(pieces)



from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

# Sentence-embedding model ('all-mpnet-base-v2') used below to encode the labels and comments.
model = SentenceTransformer('all-mpnet-base-v2')

# Retrieve the server info

In [None]:
import json

# Read the server settings from the shared configuration file. The context
# manager closes the file even when the JSON is malformed; the encoding is
# explicit so the result does not depend on the platform default.
with open('./../../config.json', encoding="utf-8") as f:
    # `data` is the parsed JSON document, as a dictionary
    data = json.load(f)

# NOTE(review): the query cells below build their own endpoint URL and do not
# read `url_server` -- confirm whether they are meant to use it.
url_server = data["ServerInfo"]["url"]

# Retrieve the Data of Properties

In [2]:
%%time
sparql = SPARQLWrapper("http://Thibaut:7200/repositories/Catalog", agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
sparql.setReturnFormat('json')
sparql.method = 'GET'

data_props = {}

q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prop ?label ?comment ?ontology
WHERE {
    ?prop rdf:type rdf:Property.
    ?prop <http://graph/origin>  ?ontology.
    ?prop rdfs:label ?label.
    OPTIONAL{?prop rdfs:description ?comment}.
}
"""

sparql.setQuery(q)
response = sparql.queryAndConvert()
for r in response["results"]["bindings"]:
    
    data_props[r["prop"]["value"]] = {"label":r["label"]["value"], "comment":"", "domain":set(), "range":set(), "onto": r["ontology"]["value"]}
    if "comment" in r:
        data_props[r["prop"]["value"]]["comment"] = r["comment"]["value"]
        

Wall time: 10.9 s


In [3]:
%%time

q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prop ?domain
WHERE {
    ?prop rdf:type rdf:Property.
    ?prop rdfs:label ?label.
    ?prop rdfs:domain ?domain.
}"""

sparql.setQuery(q)
response = sparql.queryAndConvert()
for r in response["results"]["bindings"]:
    data_props[r["prop"]["value"]]["domain"].add(r["domain"]["value"])

Wall time: 9.06 s


In [4]:
%%time

q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prop ?range
WHERE {
    ?prop rdf:type rdf:Property.
    ?prop rdfs:label ?label.
    ?prop rdfs:range  ?range.
}"""

sparql.setQuery(q)
response = sparql.queryAndConvert()
for r in response["results"]["bindings"]:
    data_props[r["prop"]["value"]]["range"].add(r["range"]["value"])

Wall time: 6.5 s


# Retrieve the Data of Classes

In [5]:
%%time
sparql = SPARQLWrapper("http://Thibaut:7200/repositories/Catalog", agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
sparql.setReturnFormat('json')
sparql.method = 'GET'

data_classes = {}

q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prop ?label ?comment 
WHERE {
    ?prop rdf:type rdf:Class.
    ?prop rdfs:label ?label.
    OPTIONAL{?prop rdfs:description ?comment}.
}"""

sparql.setQuery(q)
response = sparql.queryAndConvert()
for r in response["results"]["bindings"]:
    
    data_classes[r["prop"]["value"]] = {"label":r["label"]["value"], "comment":""}
    if "comment" in r:
        data_classes[r["prop"]["value"]]["comment"] = r["comment"]["value"]
        

Wall time: 7.48 s


# Compute the similarity between the Classes

In [6]:
%%time

df_classes = pd.DataFrame.from_dict(data_classes, orient="index")

type_label_bert = model.encode(df_classes["label"])
type_comment_bert = model.encode(df_classes["comment"])
df_classes["comment not empty"] = df_classes["comment"].map(lambda x: len(x) != 0)

Wall time: 11min 15s


In [9]:
def compute_similarity_classes(name, queue, dict_shared_sim_classes, df_classes, sim_bert_label, sim_bert_comment):
    """Worker that scores class pairs taken from a shared work queue.

    Every queue item is ``(index, prop_1)`` where ``index`` is the position of
    class ``prop_1`` in ``df_classes.index``. The class is compared with itself
    and with every class located after it, and each score is stored under both
    ``(prop_1, prop_2)`` and ``(prop_2, prop_1)``.

    A score is the label similarity, averaged with the comment similarity when
    both classes have a non-empty comment.

    Args:
        name: Worker identifier, only used in the progress messages.
        queue: Work queue of ``(index, prop_1)`` items; it must support
            ``get(timeout=...)`` and raise ``queue.Empty`` once drained.
        dict_shared_sim_classes: Mapping updated once, at the end, with all the
            scores computed by this worker.
        df_classes: DataFrame indexed by class URI with a boolean
            "comment not empty" column.
        sim_bert_label: Square label-similarity table, read as
            ``sim_bert_label[a][b]`` with positions in ``df_classes``.
        sim_bert_comment: Same layout, for the comments.
    """
    # Local import: the ``queue`` parameter hides the module of the same name
    from queue import Empty

    print(f"Process n°{name} : Launched", flush=True)

    # Hoisted out of the loops: positional views of the index and of the flag column
    class_uris = list(df_classes.index)
    has_comment = df_classes["comment not empty"].tolist()

    dict_local_sim_classes = {}

    while True:
        # A separate empty()/get() pair can block forever when another worker
        # takes the last item in between, so wait briefly and stop on Empty.
        try:
            index, prop_1 = queue.get(timeout=0.1)
        except Empty:
            break

        for j in range(index, len(class_uris)):
            prop_2 = class_uris[j]

            sim = sim_bert_label[index][j]
            nb_sim = 1

            if has_comment[index] and has_comment[j]:
                sim += sim_bert_comment[index][j]
                nb_sim += 1

            sim /= nb_sim

            dict_local_sim_classes[(prop_1, prop_2)] = sim
            dict_local_sim_classes[(prop_2, prop_1)] = sim

    # A single bulk update keeps the traffic with the shared mapping low
    dict_shared_sim_classes.update(dict_local_sim_classes)

    print(f"Process n°{name} : Finished", flush=True)

In [8]:
%%time
dict_sim_classes = {}

sim_bert_label = pd.DataFrame(cosine_similarity(type_label_bert,type_label_bert))
sim_bert_comment = pd.DataFrame(cosine_similarity(type_comment_bert,type_comment_bert))

q = Queue()

for i, prop_1 in enumerate(df_classes.index):
    q.put((i,prop_1))
    
size_queue = q.qsize()

with Manager() as manager:

    processes_to_create = multiprocessing.cpu_count()-1
    processes = list()
    
    dict_shared_sim_classes = manager.dict()
    
    for name in range(processes_to_create):
        x = Process(target=compute_similarity_classes, args=(name, q, dict_shared_sim_classes, df_classes, sim_bert_label, sim_bert_comment))
        processes.append(x)
        x.start()
        
    for index, process in enumerate(processes):
        process.join()

    print(counter)
    
    dict_sim_classes = dict(dict_shared_sim_classes)

KeyboardInterrupt: 

# Compute the similarity between Properties

In [9]:
%%time

dict_sim_props = {}

df_props = pd.DataFrame.from_dict(data_props, orient="index")

df_props = df_props[df_props[["onto"]].apply(lambda x: x["onto"] != "http://wikidata.org" or x.name[:30]=="http://www.wikidata.org/prop/P", axis=1)]

Wall time: 586 ms


In [10]:
%%time 
prop_label_bert = model.encode(df_props["label"])
prop_comment_bert = model.encode(df_props["comment"])

df_props["comment not empty"] = df_props["comment"].map(lambda x: len(x) != 0)
df_props["domain not empty"] = df_props["domain"].map(lambda x: len(x) != 0)
df_props["range not empty"] = df_props["range"].map(lambda x: len(x) != 0)

len(df_props)

KeyboardInterrupt: 

In [None]:
def compute_similarity_property(name, queue, dict_shared_sim_properties, df_props, df_props_1_index, df_props_2_index,
                                sim_label=None, sim_comment=None, sim_classes=None):
    """Worker that scores property pairs across two groups of properties.

    Every property taken from ``queue`` is compared with every property of
    ``df_props_2_index``. A score is the mean of the available components:
    label similarity (always), comment similarity (when both comments are
    non-empty), and the best known class similarity between the two domains
    and between the two ranges (when both sides declare some and at least one
    class pair has a known similarity).

    Args:
        name: Worker identifier, only used in the progress messages.
        queue: Work queue whose items are property URIs (an ``(i, uri)`` tuple
            is also accepted); it must support ``get(timeout=...)`` and raise
            ``queue.Empty`` once drained. When None, every URI of
            ``df_props_1_index`` is processed by this call.
        dict_shared_sim_properties: Mapping updated once, at the end, with
            ``(prop_1, prop_2) -> score``.
        df_props: DataFrame indexed by property URI with the columns
            "comment not empty", "domain not empty", "range not empty",
            "domain" and "range".
        df_props_1_index: URIs of the first group; only read when ``queue`` is None.
        df_props_2_index: URIs of the second group.
        sim_label: Square label-similarity table read as ``sim_label[a][b]``
            with positions in ``df_props``; defaults to the notebook-level
            ``sim_bert_label``.
        sim_comment: Same layout for the comments; defaults to the
            notebook-level ``sim_bert_comment``.
        sim_classes: Mapping ``(class_1, class_2) -> similarity``; defaults to
            the notebook-level ``dict_sim_classes``.
    """
    # Local import: the ``queue`` parameter hides the module of the same name
    from queue import Empty

    # Fall back to the notebook-level tables when the caller does not pass them
    if sim_label is None:
        sim_label = sim_bert_label
    if sim_comment is None:
        sim_comment = sim_bert_comment
    if sim_classes is None:
        sim_classes = dict_sim_classes

    print(f"Process n°{name} : Launched", flush=True)

    # Hoisted out of the loops: URI -> position, and positional views of the columns
    position = {uri: pos for pos, uri in enumerate(df_props.index)}
    has_comment = df_props["comment not empty"].tolist()
    has_domain = df_props["domain not empty"].tolist()
    has_range = df_props["range not empty"].tolist()
    domains = df_props["domain"].tolist()
    ranges = df_props["range"].tolist()

    def pending_properties():
        """Yield the first-group properties this worker has to handle."""
        if queue is None:
            yield from df_props_1_index
            return
        while True:
            # A separate empty()/get() pair can block forever when another
            # worker takes the last item in between, so wait briefly and stop on Empty.
            try:
                item = queue.get(timeout=0.1)
            except Empty:
                return
            yield item[-1] if isinstance(item, tuple) else item

    dict_local_sim_properties = {}

    for prop_1 in pending_properties():
        j = position[prop_1]

        for prop_2 in df_props_2_index:
            k = position[prop_2]

            sim = sim_label[j][k]
            nb_sim = 1

            if has_comment[j] and has_comment[k]:
                sim += sim_comment[j][k]
                nb_sim += 1

            if has_domain[j] and has_domain[k]:
                sim_domain = _best_class_similarity(domains[j], domains[k], sim_classes)
                if sim_domain is not None:
                    sim += sim_domain
                    nb_sim += 1

            if has_range[j] and has_range[k]:
                sim_range = _best_class_similarity(ranges[j], ranges[k], sim_classes)
                # Gate on the range result itself (the earlier code tested sim_domain here)
                if sim_range is not None:
                    sim += sim_range
                    nb_sim += 1

            dict_local_sim_properties[(prop_1, prop_2)] = sim / nb_sim

    # A single bulk update keeps the traffic with the shared mapping low
    dict_shared_sim_properties.update(dict_local_sim_properties)

    print(f"Process n°{name} : Finished", flush=True)


def _best_class_similarity(classes_1, classes_2, sim_classes):
    """Return the highest known similarity over all class pairs, or None if no pair is known."""
    best = None
    for class_1 in classes_1:
        for class_2 in classes_2:
            # Pairs can be missing: DBpedia uses properties as domains, and
            # those have no entry in the class-similarity table.
            score = sim_classes.get((class_1, class_2))
            if score is not None and (best is None or score > best):
                best = score
    return best

In [37]:
%%time

ontos = list(df_props["onto"].value_counts().index)

sim_bert_label = pd.DataFrame(cosine_similarity(prop_label_bert,prop_label_bert))
sim_bert_comment = pd.DataFrame(cosine_similarity(prop_comment_bert,prop_comment_bert))

with Manager() as manager:

    processes_to_create = multiprocessing.cpu_count()-1
    dict_shared_sim_properties = manager.dict()
    
    for i, onto_1 in enumerate(ontos):
        df_props_1_index = df_props[df_props["onto"]==onto_1].index

        for onto_2 in ontos[i+1:]:
            df_props_2_index = df_props[df_props["onto"]==onto_2].index

            print(f"Working on {onto_1} & {onto_2} with {len(df_props_1_index)}x{len(df_props_2_index)}")
            start = time.time()

            for i, prop_1 in enumerate(df_props_1.index):
                q.put((i,prop_1))

            processes = list()

            for name in range(processes_to_create):
                x = Process(target=compute_similarity_property, args=(name, q, dict_shared_sim_properties, df_props_1, df_props_2))
                processes.append(x)
                x.start()

            for index, process in enumerate(processes):
                process.join()

            end = time.time()
            print(end - start)
        print(f"Finished with {onto_1}")
    print("Finished computation -> Copy the dictionary")    
    dict_sim_properties = dict(dict_shared_sim_properties)

Working on wikidata & schema with 10029x1448
1038.732256412506
Wall time: 17min 20s


In [None]:
# Serialise the property similarities: one plain triple per pair, plus one
# quoted-triple statement carrying the score. The context manager closes the
# file even if a write fails.
with open("similarity.ttl", "w", encoding="utf-8") as f:

    print("Finished Copy -> Writing result")    
    # Iterate over dict_sim_properties, which holds the computed scores;
    # dict_sim_props is only ever created empty, so looping over it wrote nothing.
    for key in dict_sim_properties:
        f.write(f"<{key[0]}> <http://graph/simComputed> <{key[1]}>.\n")
        f.write(f"<< <{key[0]}> <http://graph/simComputed> <{key[1]}> >> <http://graph/sim> {dict_sim_properties[key]} .\n")