<a href="https://colab.research.google.com/github/AlbezJelt/compass-aligned-graph-embeddings/blob/main/notebooks/Neighborhood_based_sameAs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install rdflib sparqlwrapper

In [2]:
import json
import os
import re
import time
from multiprocessing import Manager, Process
from multiprocessing.dummy import Pool as ThreadPool
from operator import itemgetter
from pprint import pprint

import requests
from SPARQLWrapper import JSON, SPARQLWrapper
from tqdm.notebook import tqdm

In [3]:
# Read the DBpedia walks: one walk per line, tokens separated by single spaces.
with requests.get('https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/dbpedia_walks_final.txt') as req:
  # Fail fast on an HTTP error instead of parsing an error page as walks.
  req.raise_for_status()
  # Skip blank lines: a trailing newline would otherwise produce a bogus ['']
  # walk whose empty token would later be sent to the SPARQL endpoint.
  walks = [phrase.split(' ') for phrase in req.text.split('\n') if phrase.strip()]

In [4]:
# One manager serves all shared caches used by the lookup functions below:
#   d_dbpedia_sn            - DBpedia entity  -> sameAs neighbourhood (list of URIs)
#   d_wikidata_n            - Wikidata entity -> neighbourhood (list of URIs)
#   dic_dbpedia_to_wikidata - DBpedia entity  -> selected Wikidata id
manager = Manager()
dic_dbpedia_to_wikidata, d_dbpedia_sn, d_wikidata_n = (
    manager.dict() for _ in range(3)
)

In [5]:
def _squeeze_spaces(template: str) -> str:
    """Collapse every run of two or more spaces in ``template`` to one space."""
    return re.sub(' {2,}', ' ', template)


# All templates are filled with ``str.format(entity=...)``; literal SPARQL braces
# are therefore doubled (``{{`` / ``}}``).

# Wikidata URIs that are owl:sameAs the objects reachable in one hop from a
# DBpedia resource, or from the target of its dbo:wikiPageRedirects.
dbpedia_sameas_neighborhood_query = _squeeze_spaces("""
    SELECT DISTINCT * WHERE {{
        {{
            SELECT ?we WHERE {{
                <http://dbpedia.org/resource/{entity}> ?p ?o.
                ?o owl:sameAs ?we.
                FILTER(CONTAINS(STR(?we), \"wikidata.org/entity/Q\"))
            }}
        }}
        UNION
        {{
            SELECT ?we WHERE {{
                <http://dbpedia.org/resource/{entity}> dbo:wikiPageRedirects ?r.
                ?r ?p ?o.
                ?o owl:sameAs ?we.
                FILTER(CONTAINS(STR(?we), \"wikidata.org/entity/Q\"))
            }}
        }}
    }}
""")

# Wikidata URIs that are owl:sameAs a DBpedia resource itself, or the target of
# its dbo:wikiPageRedirects.
toWikidata_query = _squeeze_spaces("""
    SELECT DISTINCT * WHERE {{
        {{
            SELECT ?o WHERE {{
                <http://dbpedia.org/resource/{entity}> dbo:wikiPageRedirects ?r.
                ?r owl:sameAs ?o.
                FILTER(CONTAINS(STR(?o), \"wikidata.org/entity/Q\"))
            }}
        }}
        UNION
        {{
            SELECT ?o WHERE {{
                <http://dbpedia.org/resource/{entity}> owl:sameAs ?o.
                FILTER(CONTAINS(STR(?o), \"wikidata.org/entity/Q\"))
            }}
        }}
    }}
""")

# Objects of any statement about a Wikidata entity whose URI contains
# "wikidata.org/entity/Q".
wikidata_neighborhood_query = _squeeze_spaces("""
    SELECT DISTINCT ?o WHERE {{
        wd:{entity} ?p ?o.
        FILTER(CONTAINS(STR(?o), \"wikidata.org/entity/Q\"))
    }}
""")


In [6]:
def dbpedia_sameas_neighborhood(dbpedia_entity: str):
    """Return the Wikidata URIs linked by ``owl:sameAs`` from a DBpedia entity's neighbours.

    Results are memoised in ``d_dbpedia_sn``; the DBpedia SPARQL endpoint is
    queried only on a cache miss.

    Args:
        dbpedia_entity: Name substituted after ``http://dbpedia.org/resource/``
            in ``dbpedia_sameas_neighborhood_query``.

    Returns:
        The cached list of ``?we`` binding values for the entity.
    """
    if dbpedia_entity in d_dbpedia_sn:
        return d_dbpedia_sn[dbpedia_entity]

    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(dbpedia_sameas_neighborhood_query.format(entity=dbpedia_entity))
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()['results']['bindings']
    d_dbpedia_sn[dbpedia_entity] = [row["we"]["value"] for row in bindings]
    # Hand back the entry as stored in the cache.
    return d_dbpedia_sn[dbpedia_entity]


In [7]:
def wikidata_neighborhood(wikidata_entity: str):
    """Return the Wikidata entity URIs that appear as objects of a Wikidata entity.

    Results are memoised in ``d_wikidata_n``; the Wikidata query service is
    contacted only on a cache miss.

    Args:
        wikidata_entity: Identifier substituted after the ``wd:`` prefix in
            ``wikidata_neighborhood_query``.

    Returns:
        The cached list of ``?o`` binding values for the entity.
    """
    if wikidata_entity in d_wikidata_n:
        return d_wikidata_n[wikidata_entity]

    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    # Prepend an identifying token to the default User-Agent string.
    endpoint.agent = 'AlbezJelt ' + endpoint.agent
    endpoint.setQuery(wikidata_neighborhood_query.format(entity=wikidata_entity))
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()['results']['bindings']
    d_wikidata_n[wikidata_entity] = [row["o"]["value"] for row in bindings]
    # Hand back the entry as stored in the cache.
    return d_wikidata_n[wikidata_entity]


In [8]:
def select_sameas(wikidata_sameas, dbpedia_entity: str) -> str:
    """Pick the Wikidata candidate whose neighbourhood best overlaps the DBpedia one.

    The DBpedia sameAs neighbourhood of ``dbpedia_entity`` is compared with the
    Wikidata neighbourhood of every candidate; the candidate with the largest
    intersection wins (the first one on ties, as ``max`` keeps the first maximum).

    Args:
        wikidata_sameas: Iterable of candidate Wikidata ids (as accepted by
            ``wikidata_neighborhood``).
        dbpedia_entity: DBpedia entity name the candidates are sameAs-linked to.

    Returns:
        The selected candidate id.

    Raises:
        ValueError: If ``wikidata_sameas`` is empty.
    """
    def qid_of(uri: str) -> str:
        # Compare on the bare QID: the DBpedia-side filter accepts any URI that
        # contains "wikidata.org/entity/Q", so the same entity may show up under
        # a scheme/host spelling that differs from the one Wikidata returns.
        found = re.search(r'wikidata\.org/entity/(Q\d+)', uri)
        return found.group(1) if found else uri

    # Find DBpedia neighborhood
    dbpedia_nb = {qid_of(uri) for uri in dbpedia_sameas_neighborhood(dbpedia_entity)}
    # (candidate, number of neighbours shared with the DBpedia entity)
    matches = [
        (we, len({qid_of(uri) for uri in wikidata_neighborhood(we)} & dbpedia_nb))
        for we in wikidata_sameas
    ]
    match = max(matches, key=itemgetter(1))
    tqdm.write(
        f"DBpedia {dbpedia_entity}: selected {match[0]} with {match[1]} matches.")
    return match[0]

In [9]:
def toWikidata(entity: str):
    """Map a DBpedia entity name to a Wikidata QID, caching the answer.

    The DBpedia endpoint is asked for ``owl:sameAs`` links (direct, or via
    ``dbo:wikiPageRedirects``) that point to Wikidata. The outcome is stored in
    ``dic_dbpedia_to_wikidata``:

    * no link      -> the entity is mapped to itself;
    * one link     -> that QID;
    * several      -> the candidate chosen by ``select_sameas``.

    Args:
        entity: Name substituted after ``http://dbpedia.org/resource/`` in
            ``toWikidata_query``.

    Returns:
        The cached mapping for ``entity``.
    """
    if entity not in dic_dbpedia_to_wikidata:
        sparql = SPARQLWrapper("http://dbpedia.org/sparql")
        sparql.setQuery(toWikidata_query.format(entity=entity))
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        wikidata_sameas = []
        for binding in results['results']['bindings']:
            uri = binding["o"]["value"]
            # The query filter accepts any URI containing "wikidata.org/entity/Q",
            # so extract the QID itself instead of stripping one fixed prefix;
            # otherwise a differently spelled URI would be kept whole and later
            # spliced into a ``wd:`` term.
            found = re.search(r'wikidata\.org/entity/(Q\d+)', uri)
            qid = found.group(1) if found else uri.replace(
                "http://www.wikidata.org/entity/", "")
            # Several URI spellings of one entity must count as a single candidate.
            if qid not in wikidata_sameas:
                wikidata_sameas.append(qid)
        if len(wikidata_sameas) == 0:
            dic_dbpedia_to_wikidata[entity] = entity
        elif len(wikidata_sameas) == 1:
            dic_dbpedia_to_wikidata[entity] = wikidata_sameas[0]
        else:
            dic_dbpedia_to_wikidata[entity] = select_sameas(
                wikidata_sameas, entity)
    return dic_dbpedia_to_wikidata[entity]

In [10]:
def toWikidataWalk(walk: list):
    """Translate every token of ``walk`` with ``toWikidata``, preserving order.

    Args:
        walk: Sequence of DBpedia tokens.

    Returns:
        A new list holding the ``toWikidata`` result for each token.
    """
    converted = []
    for token in walk:
        converted.append(toWikidata(token))
    return converted

In [12]:
#some debug code
# Contiguous 500-walk partitions. Slice ends are exclusive, so each partition
# must start where the previous one stopped; starting at 501/1001/... would
# silently drop walks 500, 1000, 1500 and 2000.
walk1 = walks[0:500]
walk2 = walks[500:1000]
walk3 = walks[1000:1500]
walk4 = walks[1500:2000]
walk5 = walks[2000:]

# Warm-start the DBpedia -> Wikidata mapping from a previous run when the dump
# exists; on a fresh run the cache created earlier is kept as is.
if os.path.exists('dic_dbpedia_to_wikidata.json'):
    with open('dic_dbpedia_to_wikidata.json') as json_file:
        dic_dbpedia_to_wikidata = json.load(json_file)


In [13]:
# Convert every DBpedia walk to Wikidata ids with a pool of 4 threads.
# imap yields results in input order; tqdm shows how many walks are done.
with ThreadPool(4) as pool:
    converted_walks = tqdm(pool.imap(toWikidataWalk, walks), total=len(walks))
    res = list(converted_walks)

  0%|          | 0/2501 [00:00<?, ?it/s]

In [15]:
# Write one converted walk per line, tokens separated by single spaces.
with open('wiki_walks_from_dbpedia.txt', 'wt') as f:
  f.writelines(' '.join(w) + '\n' for w in res)

In [24]:
import json

# Persist the three lookup caches as JSON; .copy() yields a plain dict that
# json can serialise.
for filename, cache in (
    ("dic_dbpedia_to_wikidata.json", dic_dbpedia_to_wikidata),
    ("d_dbpedia_sn.json", d_dbpedia_sn),
    ("d_wikidata_n.json", d_wikidata_n),
):
    with open(filename, "w") as outfile:
        json.dump(cache.copy(), outfile)

In [16]:
#compass.txt for creation of a dictionary that maps wikidata entity in their label
!wget https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/wikidata_walks_final.txt
!cat wiki_walks_from_dbpedia.txt wikidata_walks_final.txt >> compass.txt

--2022-02-14 13:32:23--  https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/wikidata_walks_final.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 456262 (446K) [text/plain]
Saving to: ‘wikidata_walks_final.txt.1’


2022-02-14 13:32:23 (12.3 MB/s) - ‘wikidata_walks_final.txt.1’ saved [456262/456262]



In [17]:
# Read compass.txt and collect the distinct tokens of all walks.
with open('compass.txt', 'rt') as f:
  # A set comprehension with local names only: the global `walks` list is left
  # untouched (rebinding it to an exhausted iterator would break a re-run of
  # the conversion cell, which calls len(walks)).
  entities = {entity for line in f for entity in line.replace('\n', '').split(' ')}

#create dictionary for labels
dic_wikidata_labels = manager.dict()

In [21]:
#function that extract labels from set of wikidata elements
#elements with no label have '*NOLABEL*' as label
def toWikidateLabel(entity: str):
  """Return ``(entity, label)`` for a walk token.

  Tokens that are not a bare Wikidata QID (``Q`` followed by digits) are
  returned as their own label without any network access. For a QID the
  Wikidata query service is asked for one English label.

  Args:
      entity: Walk token, e.g. ``"Q42"`` or any non-Wikidata string.

  Returns:
      A 2-tuple ``(entity, label)``; the label is ``'*NOLABEL*'`` when the
      query returns no binding.
  """
  # fullmatch with a raw pattern: no invalid "\d" escape in a plain string, and
  # no accidental match of a token with a trailing newline (which "^...$" with
  # re.match accepts) that would then be spliced into the query.
  if not re.fullmatch(r"Q\d+", entity):
    return entity, entity

  sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
  # Prepend an identifying token to the default User-Agent string.
  sparql.agent = 'AlbezJelt ' + sparql.agent
  sparql.setQuery(f"""
        SELECT ?l WHERE {{
          wd:{entity} rdfs:label ?l.
          FILTER(LANGMATCHES(LANG(?l), "EN"))
          }}
        LIMIT 1
        """)
  sparql.setReturnFormat(JSON)
  results = sparql.query().convert()
  label = [r["l"]["value"] for r in results['results']['bindings']]
  label = label[0] if len(label) > 0 else '*NOLABEL*'
  return entity, label

In [22]:
# Fetch the label of every entity with a pool of 4 threads; each result is an
# (entity, label) pair, so the pairs can be turned into a dict directly.
with ThreadPool(4) as pool:
  label_pairs = tqdm(pool.imap(toWikidateLabel, entities), total=len(entities))
  dic_wikidata_labels = dict(label_pairs)

# Save the entity -> label dictionary as JSON.
with open('wikidata_label_dictionary.json', 'wt') as f:
  json.dump(dic_wikidata_labels, f)

  0%|          | 0/15559 [00:00<?, ?it/s]