In [5]:
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, BASIC

import spacy
from spacy.tokens import Doc
import pandas as pd
import numpy as np
import time


import sys
from multiprocessing import Process, Manager, Queue
import multiprocessing

# Load the large English spaCy pipeline; its vocabulary (nlp.vocab) is used
# further down to build Doc objects from pre-tokenised labels and comments.
nlp = spacy.load('en_core_web_lg')

import gensim
import nltk

from tqdm import tqdm

# NLTK's English stop words plus the "</s>" token; preprocess() drops every
# word found in this set.
stopwords = {*nltk.corpus.stopwords.words('english'), "</s>"}

def empty_vector(x):
    """Tell whether any item of ``x`` has a ``vector`` whose components sum to 0.

    Returns True as soon as one such item is found, and False when there is
    none (including when ``x`` is empty).
    """
    return any(sum(item.vector) == 0 for item in x)

def connected_to_natural(x):
    """Split a concatenated identifier into space-separated words.

    A space is inserted before every upper-case ASCII letter that is directly
    followed by a lower-case ASCII letter, e.g. ``"birthPlace"`` becomes
    ``"birth Place"``. The first and the last character of ``x`` never get a
    space in front of them.
    """
    last_pos = len(x) - 1
    pieces = []
    for pos, char in enumerate(x):
        starts_new_word = (
            0 < pos < last_pos
            and "A" <= char <= "Z"
            and "a" <= x[pos + 1] <= "z"
        )
        pieces.append(" " + char if starts_new_word else char)
    return "".join(pieces)

def preprocess(text):
    """Turn a label or comment into a list of tokens.

    The text is first split on camel-case boundaries by connected_to_natural(),
    then tokenised with gensim's simple_preprocess (tokens of 1 to 50
    characters); tokens present in ``stopwords`` are removed.
    """
    spaced_text = connected_to_natural(text)
    tokens = gensim.utils.simple_preprocess(spaced_text, min_len=1, max_len=50)
    return [token for token in tokens if token not in stopwords]

# Retrieve the server info

In [6]:
import json

# Read the SPARQL endpoint URL from the shared configuration file.
# The context manager closes the file even when parsing or the key lookup
# fails, and the explicit encoding avoids depending on the platform default.
with open('./../../config.json', encoding="utf-8") as f:
    # json.load returns the JSON object as a dictionary
    data = json.load(f)

url_server = data["ServerInfo"]["url"]

# Retrieve the data of all the properties

In [8]:
# %%time
# Fetch every rdf:Property with its label, its source ontology and, when
# present, its description. One dictionary entry is created per property URI.
sparql = SPARQLWrapper(url_server)
sparql.setReturnFormat('json')
sparql.method = 'GET'

data_props = {}

# NOTE(review): the RDFS vocabulary defines rdfs:comment, not rdfs:description;
# confirm that rdfs:description is really the predicate used in the store.
q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prop ?label ?comment ?ontology
WHERE {
    ?prop rdf:type rdf:Property.
    ?prop <http://graph/origin>  ?ontology.
    ?prop rdfs:label ?label.
    OPTIONAL{?prop rdfs:description ?comment}.
}"""

sparql.setQuery(q)
response = sparql.queryAndConvert()
for binding in response["results"]["bindings"]:
    prop_uri = binding["prop"]["value"]
    data_props[prop_uri] = {
        "label": binding["label"]["value"],
        # ?comment comes from an OPTIONAL clause, so the key may be missing
        "comment": binding["comment"]["value"] if "comment" in binding else "",
        # domain and range are filled in by the following queries
        "domain": set(),
        "range": set(),
        "onto": binding["ontology"]["value"],
    }
        

In [3]:
# %%time
# Attach the rdfs:domain values to the properties already stored in data_props.

q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prop ?domain
WHERE {
    ?prop rdf:type rdf:Property.
    ?prop rdfs:label ?label.
    ?prop rdfs:domain ?domain.
}"""

sparql.setQuery(q)
response = sparql.queryAndConvert()
for r in response["results"]["bindings"]:
    # Unlike the query that filled data_props, this one does not require
    # <http://graph/origin>, so it can return properties that were never
    # registered; skip them instead of failing with a KeyError.
    prop_entry = data_props.get(r["prop"]["value"])
    if prop_entry is not None:
        prop_entry["domain"].add(r["domain"]["value"])

In [4]:
# %%time
# Attach the rdfs:range values to the properties already stored in data_props.

q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prop ?range
WHERE {
    ?prop rdf:type rdf:Property.
    ?prop rdfs:label ?label.
    ?prop rdfs:range  ?range.
}"""

sparql.setQuery(q)
response = sparql.queryAndConvert()
for r in response["results"]["bindings"]:
    # Unlike the query that filled data_props, this one does not require
    # <http://graph/origin>, so it can return properties that were never
    # registered; skip them instead of failing with a KeyError.
    prop_entry = data_props.get(r["prop"]["value"])
    if prop_entry is not None:
        prop_entry["range"].add(r["range"]["value"])

# Retrieve the data of the Classes

In [5]:
# %%time
# Fetch every rdf:Class with its label and, when present, its description.
# One dictionary entry is created per class URI.
sparql = SPARQLWrapper(url_server)
sparql.setReturnFormat('json')
sparql.method = 'GET'

classes = {}

q = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prop ?label ?comment 
WHERE {
    ?prop rdf:type rdf:Class.
    ?prop rdfs:label ?label.
    OPTIONAL{?prop rdfs:description ?comment}.
}"""

sparql.setQuery(q)
response = sparql.queryAndConvert()
for binding in response["results"]["bindings"]:
    # the query names the class variable ?prop
    class_uri = binding["prop"]["value"]
    classes[class_uri] = {
        "label": binding["label"]["value"],
        # ?comment comes from an OPTIONAL clause, so the key may be missing
        "comment": binding["comment"]["value"] if "comment" in binding else "",
    }
        

# Compute the similarity between the Classes

In [6]:
# One row per class URI, with the label and comment turned into spaCy Docs
# built from the pre-processed tokens.
df_classes = pd.DataFrame.from_dict(classes, orient="index")

for text_col, doc_col in (("label", "label doc"), ("comment", "comment doc")):
    df_classes[doc_col] = df_classes[text_col].map(
        lambda text: Doc(nlp.vocab, words=preprocess(text))
    )

# Flag the classes whose comment still has at least one token after pre-processing.
df_classes["comment not empty"] = df_classes["comment doc"].map(lambda doc: len(doc) > 0)

In [7]:
# Keep only the first 10 classes (positional slice); the similarity
# computation below runs on this reduced frame.
df_classes = df_classes.iloc[:10]

In [8]:
def compute_similarity_classes(name, queue, dict_shared_sim_classes, df_classes, total_length, counter, next_print):
    """Worker that computes pairwise similarities between classes.

    Items ``(index, prop_1)`` are taken from ``queue`` until it is drained.
    For each item, ``prop_1`` is compared with every class located at
    position ``index`` or later in ``df_classes``. The similarity is the mean
    of the label similarity and, when both comments are non-empty, the
    comment similarity. Each value is stored under both ``(prop_1, prop_2)``
    and ``(prop_2, prop_1)``.

    Parameters
    ----------
    name : identifier of the worker, only used in log messages.
    queue : queue of ``(index, prop_1)`` work items.
    dict_shared_sim_classes : mapping updated once, at the end, with all the
        similarities computed by this worker.
    df_classes : DataFrame indexed by class URI with the columns
        ``"label doc"``, ``"comment doc"`` and ``"comment not empty"``.
    total_length : total number of work items, used for the progress ratio.
    counter : object with a ``value`` attribute counting processed items.
    next_print : object with a ``value`` attribute holding the next progress
        threshold (a fraction between 0 and 1).
    """
    # Imported locally because the ``queue`` parameter shadows the module name.
    from queue import Empty

    print(f"Process n°{name} : Launched", flush=True)

    label_docs = df_classes["label doc"]
    comment_docs = df_classes["comment doc"]
    has_comment = df_classes["comment not empty"]

    dict_local_sim_classes = {}

    while True:
        # ``empty()`` followed by a blocking ``get()`` can hang forever when
        # another worker takes the last item in between, so fetch with a
        # timeout and stop once nothing is left.
        try:
            index, prop_1 = queue.get(timeout=0.1)
        except Empty:
            break

        for prop_2 in df_classes[index:].index:

            sim = 0
            nb_sim = 0

            sim += label_docs.loc[prop_1].similarity(label_docs.loc[prop_2])
            nb_sim += 1

            if has_comment.loc[prop_1] and has_comment.loc[prop_2]:
                sim += comment_docs.loc[prop_1].similarity(comment_docs.loc[prop_2])
                nb_sim += 1

            sim /= nb_sim

            dict_local_sim_classes[(prop_1, prop_2)] = sim
            dict_local_sim_classes[(prop_2, prop_1)] = sim

        ## Print the progress every 0.5%. The thresholds are fractions of the
        ## total, so 0.5% is a step of 0.005 (a step of 0.5 would mean 50%).
        ## The counter is updated without a lock, so the figure is approximate
        ## when several workers run.
        counter.value += 1
        if (counter.value/total_length > next_print.value):
            print(round(next_print.value*100, 1), "%", flush=True)
            next_print.value += 0.005

    # Publish everything at once to limit the traffic on the shared mapping.
    dict_shared_sim_classes.update(dict_local_sim_classes)

    print(f"Process n°{name} : Finished", flush=True)

In [None]:
# %%time
dict_sim_classes = {}

# Work queue: one (position, class URI) item per class.
q = Queue()

for i, prop_1 in enumerate(df_classes.index):
    q.put((i,prop_1))

# Exactly one item was queued per class. Counting the index avoids
# Queue.qsize(), which is not implemented on every platform.
size_queue = len(df_classes.index)

with Manager() as manager:

    # Leave one core free, but always start at least one worker: on a
    # single-core machine cpu_count()-1 is 0 and nothing would be computed.
    processes_to_create = max(1, multiprocessing.cpu_count()-1)
    processes = list()

    dict_shared_sim_classes = manager.dict()
    # The first argument of Value is a typecode ("i" = int, "d" = double).
    counter = manager.Value("i", 0)
    next_print = manager.Value("d", 0.0)

    for name in range(processes_to_create):
        x = Process(target=compute_similarity_classes, args=(name, q, dict_shared_sim_classes, df_classes, size_queue, counter, next_print))
        processes.append(x)
        x.start()

    for process in processes:
        process.join()

    # Number of work items processed by the workers.
    print(counter.value)

    # Copy the result out of the manager before it shuts down.
    dict_sim_classes = dict(dict_shared_sim_classes)

In [None]:
# Save the class similarities: one row per (class, class) key of dict_sim_classes.
(
    pd.DataFrame
    .from_dict(dict_sim_classes, orient="index")
    .to_csv("sim_classes.csv")
)

In [None]:
# %%time

# def compute_similarity_classes( index, dict_shared_sim_classes, df_classes):
        
#     prop_1 = df_classes.index[index]

#     for prop_2 in df_classes[index:]:

#         sim = 0
#         nb_sim = 0

#         sim += df_classes["label doc"].loc[prop_1].similarity(df_classes["label doc"].loc[prop_2])
#         nb_sim += 1

#         if df_classes["comment not empty"].loc[prop_1] and df_classes["comment not empty"].loc[prop_2]:
#             sim += df_classes["comment doc"].loc[prop_1].similarity(df_classes["comment doc"].loc[prop_2])
#             nb_sim += 1

#         sim /= nb_sim   

#         dict_shared_sim_classes[(prop_1, prop_2)] = sim
#         dict_shared_sim_classes[(prop_2, prop_1)] = sim

# dict_sim_classes = {}

# # q = Queue()

# # for i, prop_1 in enumerate(df_classes.index):
# #     q.put((i,prop_1))
    
# # size_queue = q.qsize()
            

# with Manager() as manager:
    
#     dict_shared_sim_classes = manager.dict()
    
#     PROCESSES = 2
#     with multiprocessing.Pool(PROCESSES) as pool:
#         params = [(index,dict_shared_sim_classes, df_classes) for index in range(len(df_classes))]
#         print(len(params))
#         results = [pool.apply_async(compute_similarity_classes, p) for p in params]
    
    

# Compute the similarity between the properties

In [44]:
# Build one row per property URI from the data_props dictionary.
df_props = pd.DataFrame.from_dict(data_props, orient="index")

# Number of properties per source ontology.
df_props["onto"].value_counts()

http://wikidata.org                                                   60174
http://schema.org/                                                      803
http://dbpedia.org/ontology/                                            561
https://www.ica.org/standards/RiC/ontology                              485
https://w3id.org/arco/ontology/context-description                      337
                                                                      ...  
http://securitytoolbox.appspot.com/securityMain                           1
http://mex.aksw.org/mex-perf                                              1
http://vocab.data.gov/def/fea                                             1
https://w3id.org/seas/StatisticsOntology                                  1
http://www.semanticweb.org/ontologies/2008/11/OntologySecurity.owl        1
Name: onto, Length: 619, dtype: int64

In [6]:
# %%time

dict_sim_props = {}

# One row per property URI, with the label and comment turned into spaCy Docs
# built from the pre-processed tokens.
df_props = pd.DataFrame.from_dict(data_props, orient="index")

for text_col, doc_col in (("label", "label doc"), ("comment", "comment doc")):
    df_props[doc_col] = df_props[text_col].map(
        lambda text: Doc(nlp.vocab, words=preprocess(text))
    )

# Boolean flags used by the similarity computation to skip missing information.
df_props["comment not empty"] = df_props["comment doc"].map(lambda doc: len(doc) > 0)
for set_col, flag_col in (("domain", "domain not empty"), ("range", "range not empty")):
    df_props[flag_col] = df_props[set_col].map(lambda values: len(values) > 0)

len(df_props)

78890

Wikidata exposes each property under about six URI variants (direct, statement, ...). <br>
Keeping only the main `http://www.wikidata.org/prop/P…` form therefore reduces the number of Wikidata properties and should make the computation faster.

In [16]:
# Keep every non-Wikidata property, and among the Wikidata ones only those
# whose URI starts with the main "prop/P" form.
from_wikidata = df_props["onto"] == "http://wikidata.org"
is_main_form = df_props.index.str.startswith("http://www.wikidata.org/prop/P")
df_props = df_props[~from_wikidata | is_main_form]
len(df_props)

28745

In [17]:
# Number of properties per source ontology after the Wikidata filtering.
df_props["onto"].value_counts()

http://wikidata.org                                                   10029
http://schema.org/                                                      803
http://dbpedia.org/ontology/                                            561
https://www.ica.org/standards/RiC/ontology                              485
https://w3id.org/arco/ontology/context-description                      337
                                                                      ...  
http://securitytoolbox.appspot.com/securityMain                           1
http://mex.aksw.org/mex-perf                                              1
http://vocab.data.gov/def/fea                                             1
https://w3id.org/seas/StatisticsOntology                                  1
http://www.semanticweb.org/ontologies/2008/11/OntologySecurity.owl        1
Name: onto, Length: 619, dtype: int64

In [None]:
def _best_class_similarity(classes_1, classes_2, sim_classes):
    """Return the highest known similarity between a class of ``classes_1`` and a class of ``classes_2``, or None when no pair is in ``sim_classes``."""
    known = [
        sim_classes[(class_1, class_2)]
        for class_1 in classes_1
        for class_2 in classes_2
        if (class_1, class_2) in sim_classes
    ]
    return max(known) if known else None


def compute_similarity_property(name, queue, dict_shared_sim_properties, df_props_1, df_props_2, sim_classes=None):
    """Worker that computes similarities between two sets of properties.

    Items ``(index, prop_1)`` are taken from ``queue`` until it is drained.
    Each ``prop_1`` (a row of ``df_props_1``) is compared with every row of
    ``df_props_2``. The similarity is the mean of the available scores:
    label similarity (always), comment similarity (when both comments are
    non-empty), best domain-class similarity and best range-class similarity
    (each only when both sides have values and at least one class pair has a
    known similarity).

    Parameters
    ----------
    name : identifier of the worker, only used in log messages.
    queue : queue of ``(index, prop_1)`` work items; ``index`` is not used.
    dict_shared_sim_properties : mapping updated once, at the end, with the
        ``(prop_1, prop_2) -> similarity`` pairs computed by this worker.
    df_props_1, df_props_2 : DataFrames indexed by property URI with the
        columns ``"label doc"``, ``"comment doc"``, ``"domain"``, ``"range"``
        and the matching ``"... not empty"`` flags.
    sim_classes : optional ``(class, class) -> similarity`` mapping. When
        omitted, the module-level ``dict_sim_classes`` is used, as before.
    """
    # Imported locally because the ``queue`` parameter shadows the module name.
    from queue import Empty

    if sim_classes is None:
        sim_classes = dict_sim_classes

    print(f"Process n°{name} : Launched", flush=True)

    dict_local_sim_properties = {}

    while True:
        # ``empty()`` followed by a blocking ``get()`` can hang forever when
        # another worker takes the last item in between, so fetch with a
        # timeout and stop once nothing is left.
        try:
            index, prop_1 = queue.get(timeout=0.1)
        except Empty:
            break

        # Everything that only depends on prop_1 is looked up once.
        label_1 = df_props_1["label doc"].loc[prop_1]
        comment_1 = df_props_1["comment doc"].loc[prop_1]
        has_comment_1 = df_props_1["comment not empty"].loc[prop_1]
        has_domain_1 = df_props_1["domain not empty"].loc[prop_1]
        has_range_1 = df_props_1["range not empty"].loc[prop_1]
        domain_1 = df_props_1["domain"].loc[prop_1]
        range_1 = df_props_1["range"].loc[prop_1]

        for prop_2 in df_props_2.index:
            scores = [label_1.similarity(df_props_2["label doc"].loc[prop_2])]

            if has_comment_1 and df_props_2["comment not empty"].loc[prop_2]:
                scores.append(comment_1.similarity(df_props_2["comment doc"].loc[prop_2]))

            if has_domain_1 and df_props_2["domain not empty"].loc[prop_2]:
                sim_domain = _best_class_similarity(domain_1, df_props_2["domain"].loc[prop_2], sim_classes)
                if sim_domain is not None:
                    scores.append(sim_domain)

            if has_range_1 and df_props_2["range not empty"].loc[prop_2]:
                sim_range = _best_class_similarity(range_1, df_props_2["range"].loc[prop_2], sim_classes)
                # The range score is kept based on the range result itself,
                # not on the domain result.
                if sim_range is not None:
                    scores.append(sim_range)

            dict_local_sim_properties[(prop_1, prop_2)] = sum(scores) / len(scores)

    # Publish everything at once to limit the traffic on the shared mapping.
    dict_shared_sim_properties.update(dict_local_sim_properties)

    print(f"Process n°{name} : Finished", flush=True)

In [10]:
# %%time

ontos = list(df_props["onto"].value_counts().index)

q = Queue()

with Manager() as manager:

    processes_to_create = multiprocessing.cpu_count()-1
    dict_shared_sim_properties = manager.dict()
    
    for i, onto_1 in enumerate(ontos[:2]):
        df_props_1 = df_props[df_props["onto"]==onto_1]

        for onto_2 in ontos[i+1:2]:
            df_props_2 = df_props[df_props["onto"]==onto_2]

            print(f"Working on {onto_1} & {onto_2} with {len(df_props_1)}x{len(df_props_2)}")
            start = time.time()

            for i, prop_1 in enumerate(df_props_1.index):
                q.put((i,prop_1))

            processes = list()

            for name in range(processes_to_create):
                x = Process(target=compute_similarity_property, args=(name, q, dict_shared_sim_properties, df_props_1, df_props_2))
                processes.append(x)
                x.start()

            for index, process in enumerate(processes):
                process.join()

            end = time.time()
            print(end - start)
            
        print(f"Finished with {onto_1}")
        
    print("Finished computation -> Copy the dictionary")    
    dict_sim_properties = dict(dict_shared_sim_properties)

Working on wikidata & dbpedia with 10029x3136




3578.836715698242
Working on wikidata & schema with 10029x1448
1670.19851064682
Working on wikidata & foaf with 10029x54
74.8915364742279
Working on wikidata & owl with 10029x44
60.0958411693573
Working on wikidata & w3 with 10029x9
12.567718744277954
Working on wikidata & rdf with 10029x7
9.622103452682495
Working on dbpedia & schema with 3136x1448
478.11180090904236
Working on dbpedia & foaf with 3136x54
20.74883508682251
Working on dbpedia & owl with 3136x44
16.98633885383606
Working on dbpedia & w3 with 3136x9
3.9177470207214355
Working on dbpedia & rdf with 3136x7
2.7023086547851562
Working on schema & foaf with 1448x54
13.512605667114258
Working on schema & owl with 1448x44
8.603918552398682
Working on schema & w3 with 1448x9
1.7357933521270752
Working on schema & rdf with 1448x7
1.3591079711914062
Working on foaf & owl with 54x44
0.39124321937561035
Working on foaf & w3 with 54x9
0.08173680305480957
Working on foaf & rdf with 54x7
0.06370234489440918
Working on owl & w3 with 44x

In [15]:
print("Finished Copy -> Writing result")

# The computed similarities are stored in dict_sim_properties; dict_sim_props
# is only ever initialised to an empty dictionary, so iterating over it
# produced an empty file. The context manager closes the file even if a
# write fails.
with open("similarity.ttl", "w", encoding="utf-8") as f:
    for (prop_1, prop_2), sim in dict_sim_properties.items():
        # Plain triple linking the two properties ...
        f.write(f"<{prop_1}> <http://graph/simComputed> <{prop_2}>.\n")
        # ... and the same triple, quoted, annotated with the similarity value.
        f.write(f"<< <{prop_1}> <http://graph/simComputed> <{prop_2}> >> <http://graph/sim> {sim} .\n")