<a href="https://colab.research.google.com/github/AlbezJelt/compass-aligned-graph-embeddings/blob/main/notebooks/Neighborhood_based_sameAs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install rdflib sparqlwrapper

In [2]:
import re
from multiprocessing import Manager, Process
from multiprocessing.dummy import Pool as ThreadPool
from operator import itemgetter
from pprint import pprint
import time

import requests
from SPARQLWrapper import JSON, SPARQLWrapper
from tqdm.notebook import tqdm

In [3]:
# Download the DBpedia walks: one walk per line, entities separated by single spaces.
with requests.get('https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/dbpedia_walks_final.txt') as req:
    # Fail loudly on an HTTP error instead of parsing an error page as walks.
    req.raise_for_status()
    # Skip blank lines (e.g. the one a trailing newline would produce): they would
    # otherwise become a walk holding a single empty entity name, which would be
    # sent to the SPARQL endpoints and written out as an empty walk.
    walks = [phrase.split(' ')
             for phrase in req.text.split('\n')
             if phrase.strip()]

In [4]:
# A single multiprocessing Manager hosts the three shared lookup caches:
#   dic_dbpedia_to_wikidata: DBpedia entity -> resolved identifier (filled by toWikidata)
#   d_dbpedia_sn:            DBpedia entity -> list of ?we values (filled by dbpedia_sameas_neighborhood)
#   d_wikidata_n:            Wikidata entity -> list of ?o values (filled by wikidata_neighborhood)
manager = Manager()
dic_dbpedia_to_wikidata, d_dbpedia_sn, d_wikidata_n = (
    manager.dict(),
    manager.dict(),
    manager.dict(),
)

In [5]:
# SPARQL template (filled with str.format, hence the doubled braces; {entity} is
# the only placeholder). It selects every ?we containing "wikidata.org/entity/Q"
# that is owl:sameAs an object ?o of the DBpedia resource {entity}, or an object
# of the resource that {entity} points to through dbo:wikiPageRedirects.
# Runs of two or more spaces (the source indentation) are collapsed to one space.
dbpedia_sameas_neighborhood_query = re.compile(' {2,}').sub(' ', """
    SELECT DISTINCT * WHERE {{
        {{
            SELECT ?we WHERE {{
                <http://dbpedia.org/resource/{entity}> ?p ?o.
                ?o owl:sameAs ?we.
                FILTER(CONTAINS(STR(?we), "wikidata.org/entity/Q"))
            }}
        }}
        UNION
        {{
            SELECT ?we WHERE {{
                <http://dbpedia.org/resource/{entity}> dbo:wikiPageRedirects ?r.
                ?r ?p ?o.
                ?o owl:sameAs ?we.
                FILTER(CONTAINS(STR(?we), "wikidata.org/entity/Q"))
            }}
        }}
    }}
""")

# SPARQL template (filled with str.format, hence the doubled braces; {entity} is
# the only placeholder). It selects every ?o containing "wikidata.org/entity/Q"
# that is owl:sameAs either the DBpedia resource {entity} itself or the resource
# it points to through dbo:wikiPageRedirects.
# Runs of two or more spaces (the source indentation) are collapsed to one space.
toWikidata_query = re.compile(' {2,}').sub(' ', """
    SELECT DISTINCT * WHERE {{
        {{
            SELECT ?o WHERE {{
                <http://dbpedia.org/resource/{entity}> dbo:wikiPageRedirects ?r.
                ?r owl:sameAs ?o.
                FILTER(CONTAINS(STR(?o), "wikidata.org/entity/Q"))
            }}
        }}
        UNION
        {{
            SELECT ?o WHERE {{
                <http://dbpedia.org/resource/{entity}> owl:sameAs ?o.
                FILTER(CONTAINS(STR(?o), "wikidata.org/entity/Q"))
            }}
        }}
    }}
""")

# SPARQL template (filled with str.format, hence the doubled braces; {entity} is
# the only placeholder). It selects every object ?o of wd:{entity} whose string
# form contains "wikidata.org/entity/Q".
# Runs of two or more spaces (the source indentation) are collapsed to one space.
wikidata_neighborhood_query = re.compile(' {2,}').sub(' ', """
    SELECT DISTINCT ?o WHERE {{
        wd:{entity} ?p ?o.
        FILTER(CONTAINS(STR(?o), "wikidata.org/entity/Q"))
    }}
""")


In [6]:
def dbpedia_sameas_neighborhood(dbpedia_entity: str):
    """Return the Wikidata URIs in the sameAs neighbourhood of a DBpedia entity.

    On the first call for a given entity, ``dbpedia_sameas_neighborhood_query``
    is run against the DBpedia SPARQL endpoint and the result is stored in
    ``d_dbpedia_sn``; every call returns the value held in that cache.

    Args:
        dbpedia_entity: The part of the resource IRI that follows
            ``http://dbpedia.org/resource/``.

    Returns:
        The values bound to ``?we`` in the query result.
    """
    # Cache hit: nothing to fetch.
    if dbpedia_entity in d_dbpedia_sn:
        return d_dbpedia_sn[dbpedia_entity]

    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(
        dbpedia_sameas_neighborhood_query.format(entity=dbpedia_entity))
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()

    neighbours = []
    for binding in response['results']['bindings']:
        neighbours.append(binding["we"]["value"])
    d_dbpedia_sn[dbpedia_entity] = neighbours
    return d_dbpedia_sn[dbpedia_entity]


In [7]:
def wikidata_neighborhood(wikidata_entity: str):
    """Return the Wikidata entity URIs that a Wikidata entity links to.

    On the first call for a given entity, ``wikidata_neighborhood_query`` is run
    against the Wikidata query service and the result is stored in
    ``d_wikidata_n``; every call returns the value held in that cache.

    Args:
        wikidata_entity: Identifier placed after the ``wd:`` prefix in the query.

    Returns:
        The values bound to ``?o`` in the query result.
    """
    # Cache hit: nothing to fetch.
    if wikidata_entity in d_wikidata_n:
        return d_wikidata_n[wikidata_entity]

    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    # Prefix the default user agent with a name identifying this client.
    endpoint.agent = 'AlbezJelt ' + endpoint.agent
    endpoint.setQuery(
        wikidata_neighborhood_query.format(entity=wikidata_entity))
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()

    neighbours = []
    for binding in response['results']['bindings']:
        neighbours.append(binding["o"]["value"])
    d_wikidata_n[wikidata_entity] = neighbours
    return d_wikidata_n[wikidata_entity]


In [8]:
def select_sameas(wikidata_sameas, dbpedia_entity: str) -> str:
    """Choose the Wikidata candidate that best matches a DBpedia entity.

    Each candidate is scored by the number of elements its Wikidata
    neighbourhood shares with the DBpedia sameAs neighbourhood of
    ``dbpedia_entity``. The highest score wins; on a tie the candidate that
    comes first in ``wikidata_sameas`` is kept. The choice is reported through
    ``tqdm.write``.

    Args:
        wikidata_sameas: Candidate Wikidata identifiers.
        dbpedia_entity: DBpedia entity the candidates are sameAs-linked to.

    Returns:
        The selected candidate.

    Raises:
        ValueError: If ``wikidata_sameas`` is empty.
    """
    # DBpedia sameAs neighbourhood of the entity
    reference = set(dbpedia_sameas_neighborhood(dbpedia_entity))

    scored = []
    for candidate in wikidata_sameas:
        shared = set(wikidata_neighborhood(candidate)).intersection(reference)
        scored.append((candidate, len(shared)))

    # max() returns the first maximal element, so ties go to the earliest candidate.
    winner, overlap = max(scored, key=lambda pair: pair[1])
    tqdm.write(
        f"DBpedia {dbpedia_entity}: selected {winner} with {overlap} matches.")
    return winner

In [9]:
def toWikidata(entity: str):
    """Map a DBpedia entity to a Wikidata identifier.

    On the first call for a given entity, ``toWikidata_query`` is run against
    the DBpedia SPARQL endpoint and the outcome is stored in
    ``dic_dbpedia_to_wikidata``; every call returns the value held in that cache.

    Args:
        entity: The part of the resource IRI that follows
            ``http://dbpedia.org/resource/``.

    Returns:
        ``entity`` unchanged when the query yields no result, the single
        result when there is exactly one, otherwise the candidate chosen by
        ``select_sameas``. Results have the
        ``http://www.wikidata.org/entity/`` prefix removed.
    """
    # Cache hit: nothing to resolve.
    if entity in dic_dbpedia_to_wikidata:
        return dic_dbpedia_to_wikidata[entity]

    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(toWikidata_query.format(entity=entity))
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()['results']['bindings']
    candidates = [
        binding["o"]["value"].replace("http://www.wikidata.org/entity/", "")
        for binding in bindings
    ]

    if not candidates:
        # No sameAs link found: keep the DBpedia name.
        resolved = entity
    elif len(candidates) == 1:
        resolved = candidates[0]
    else:
        # Several candidates: disambiguate by neighbourhood overlap.
        resolved = select_sameas(candidates, entity)

    dic_dbpedia_to_wikidata[entity] = resolved
    return dic_dbpedia_to_wikidata[entity]

In [10]:
def toWikidataWalk(walk: list):
    """Convert a walk of DBpedia entities by applying ``toWikidata`` to each one.

    Args:
        walk: DBpedia entity names, in walk order.

    Returns:
        A new list with one ``toWikidata`` result per input entity, same order.
    """
    converted = []
    for link in walk:
        converted.append(toWikidata(link))
    return converted

In [11]:
# Convert every DBpedia walk to a Wikidata walk on a pool of 4 worker threads.
# imap yields the converted walks in input order; tqdm shows the progress.
with ThreadPool(4) as pool:
    converted_walks = pool.imap(toWikidataWalk, walks)
    res = list(tqdm(converted_walks, total=len(walks)))

  0%|          | 0/2501 [00:00<?, ?it/s]

DBpedia Meath_(Parliament_of_Ireland_constituency): selected Q183544 with 0 matches.
DBpedia Manhattan: selected Q11299 with 24 matches.
DBpedia Noam_Chomsky: selected Q9049 with 75 matches.
DBpedia Ottoman_Empire: selected Q12560 with 29 matches.
DBpedia Art_Institute_of_Chicago: selected Q239303 with 4 matches.
DBpedia Study_(art): selected Q5078274 with 1 matches.
DBpedia Royal_Dutch_Shell: selected Q154950 with 22 matches.
DBpedia Poet: selected Q49757 with 3 matches.
DBpedia Europe: selected Q46 with 11 matches.
DBpedia HathiTrust: selected Q3128305 with 3 matches.
DBpedia Perfect_fifth: selected Q12372854 with 5 matches.
DBpedia Småland: selected Q199957 with 13 matches.
DBpedia Zürich: selected Q72 with 29 matches.
DBpedia Dolcetto: selected Q1235396 with 1 matches.
DBpedia Aosta_Valley: selected Q1222 with 16 matches.
DBpedia Riposto: selected Q478782 with 7 matches.
DBpedia Harry_Dresden: selected Q2307373 with 20 matches.
DBpedia Polar_bear: selected Q122783 with 2 matches.
D

In [13]:
# Write the converted walks, one per line, entities separated by single spaces.
# The encoding is pinned to UTF-8: entity names can be non-ASCII (e.g. "Zürich"),
# and without an explicit encoding the result depends on the platform locale.
with open('wiki_walks_from_dbpedia.txt', 'wt', encoding='utf-8') as f:
    f.writelines(' '.join(w) + '\n' for w in res)

In [14]:
import json

# Persist the three lookup caches as JSON, one file per cache.
# copy() turns each shared manager dict into a plain dict that json can serialise.
for cache_path, cache in (
    ("dic_dbpedia_to_wikidata.json", dic_dbpedia_to_wikidata),
    ("d_dbpedia_sn.json", d_dbpedia_sn),
    ("d_wikidata_n.json", d_wikidata_n),
):
    with open(cache_path, "w") as outfile:
        json.dump(cache.copy(), outfile)