In [1]:
%%capture
!pip install rdflib sparqlwrapper

In [29]:
import json
import time
from multiprocessing import Process, Manager
from multiprocessing.dummy import Pool as ThreadPool
from operator import itemgetter
from pprint import pprint
from urllib.error import URLError

import requests
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm import tqdm

In [30]:
# Read dbpedia_walks_final: each non-blank line is one walk, split on single spaces
# into DBpedia entity names.
with requests.get('https://raw.githubusercontent.com/AlbezJelt/compass-aligned-graph-embeddings/main/data/dbpedia_walks_final.txt', timeout=60) as req:
  # Fail loudly on a 4xx/5xx response instead of parsing an error page as walks.
  req.raise_for_status()
  # Skip blank lines: split('\n') yields '' for a trailing newline, which would
  # otherwise become the bogus walk [''] and be looked up as an entity.
  walks = [phrase.split(' ') for phrase in req.text.split('\n') if phrase.strip()]

In [31]:
# Create a single multiprocessing Manager for the notebook; it owns the shared
# dict created below.
manager = Manager()
# Manager-backed dict used as a shared cache mapping a DBpedia resource name to
# its Wikidata ID. toWikidata() reads and fills it, storing '?' when DBpedia
# lists no Wikidata owl:sameAs link for the entity.
dic_dbpedia_to_wikidata = manager.dict()

In [32]:
def dbpedia_sameas_neighborhood(dbpedia_entity: str):
    """Return the Wikidata URIs equivalent to the neighbours of a DBpedia resource.

    Queries the public DBpedia SPARQL endpoint for every object ``?o`` linked
    from ``<http://dbpedia.org/resource/{dbpedia_entity}>`` by any predicate and
    collects the ``owl:sameAs`` targets of those objects whose string form
    contains ``wikidata.org/entity``.

    Args:
        dbpedia_entity: Resource name, inserted verbatim after
            ``http://dbpedia.org/resource/``.

    Returns:
        A list with the ``?we`` value of every result binding (duplicates are
        not removed).
    """
    query = f"""
        SELECT ?we WHERE {{
            <http://dbpedia.org/resource/{dbpedia_entity}> ?p ?o.
            ?o owl:sameAs ?we.
            FILTER(CONTAINS(STR(?we), \"wikidata.org/entity\"))
        }}
    """
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)

    bindings = endpoint.query().convert()['results']['bindings']
    uris = []
    for row in bindings:
        uris.append(row["we"]["value"])
    return uris

In [46]:
def wikidata_neighborhood(wikidata_entity: str):
    """Return the Wikidata entity URIs directly linked from a Wikidata item.

    Asks the Wikidata Query Service for every object ``?o`` of a statement
    whose subject is ``wd:{wikidata_entity}`` and keeps the objects whose
    string form contains ``wikidata.org/entity``.

    Args:
        wikidata_entity: Identifier inserted verbatim after the ``wd:`` prefix
            (callers in this notebook pass IDs such as ``Q42``).

    Returns:
        A list with the ``?o`` value of every result binding (duplicates are
        not removed).
    """
    client = SPARQLWrapper("https://query.wikidata.org/sparql")
    client.setQuery(f"""
        SELECT ?o WHERE {{
            wd:{wikidata_entity} ?p ?o.
            FILTER(CONTAINS(STR(?o), \"wikidata.org/entity\"))
        }}
    """)
    client.setReturnFormat(JSON)

    response = client.query().convert()
    neighbours = []
    for binding in response['results']['bindings']:
        neighbours.append(binding["o"]["value"])
    return neighbours

In [75]:
def select_sameas(wikidata_sameas, dbpedia_entity: str) -> str:
    """Pick the Wikidata candidate whose neighbourhood best matches DBpedia's.

    The DBpedia side is the set of URIs returned by
    ``dbpedia_sameas_neighborhood(dbpedia_entity)``. Each candidate is scored
    by how many distinct URIs from ``wikidata_neighborhood(candidate)`` also
    appear in that set.

    Args:
        wikidata_sameas: Iterable of candidate Wikidata identifiers.
        dbpedia_entity: DBpedia resource name the candidates were found for.

    Returns:
        The candidate with the highest overlap; on a tie, the one that comes
        first in ``wikidata_sameas``.

    Raises:
        ValueError: If ``wikidata_sameas`` is empty (raised by ``max``).
    """
    # DBpedia-side neighbourhood, fetched once and reused for every candidate.
    reference = set(dbpedia_sameas_neighborhood(dbpedia_entity))

    scored = []
    for candidate in wikidata_sameas:
        shared = reference.intersection(wikidata_neighborhood(candidate))
        scored.append((len(shared), candidate))

    # max() returns the first maximal item, so earlier candidates win ties.
    _, best_candidate = max(scored, key=itemgetter(0))
    return best_candidate

In [76]:
def _is_transient_http_error(err) -> bool:
    """Return True when ``err`` looks like a temporary failure worth retrying."""
    # HTTPError carries a status code; a plain URLError (connection-level
    # failure) does not.
    code = getattr(err, "code", None)
    return code is None or code == 429 or code >= 500


def _query_wikidata_sameas(entity: str):
    """Return the Wikidata IDs DBpedia lists as owl:sameAs for ``entity``."""
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(f"""
          SELECT ?o
          WHERE {{
              <http://dbpedia.org/resource/{entity}> owl:sameAs ?o .
              FILTER(CONTAINS(STR(?o), \"wikidata.org/entity\"))  
              }}
          """)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # Strip the URI prefix so only the bare identifier (e.g. 'Q42') is kept.
    return [
        r["o"]["value"].replace("http://www.wikidata.org/entity/", "")
        for r in results['results']['bindings']
    ]


def toWikidata(entity: str, retries: int = 3, backoff: float = 1.0):
    """Map a DBpedia resource name to a Wikidata ID, caching the answer.

    The result is looked up in, and stored into, the shared
    ``dic_dbpedia_to_wikidata`` dict. When DBpedia lists no Wikidata
    ``owl:sameAs`` link the entity maps to ``'?'``; with exactly one link that
    ID is used; with several, ``select_sameas`` chooses among them.

    Transient endpoint failures (urllib ``HTTPError`` with status 429 or 5xx,
    or a connection-level ``URLError``) are retried with exponential backoff
    so that one hiccup does not abort a long batch run.

    Args:
        entity: Resource name, inserted verbatim after
            ``http://dbpedia.org/resource/``.
        retries: Maximum number of extra attempts after the first failure.
        backoff: Seconds slept before the first retry; doubled on each
            further retry.

    Returns:
        The Wikidata ID as a string, or ``'?'`` when no link exists.

    Raises:
        urllib.error.URLError: When the failure is not transient or the
            retries are exhausted. Nothing is cached in that case.
    """
    # One lookup instead of `in` followed by `[]`; cached values are never None.
    cached = dic_dbpedia_to_wikidata.get(entity)
    if cached is not None:
        return cached

    attempts = max(retries, 0) + 1
    for attempt in range(attempts):
        try:
            wikidata_sameas = _query_wikidata_sameas(entity)
            if len(wikidata_sameas) == 0:
                resolved = '?'
            elif len(wikidata_sameas) == 1:
                resolved = wikidata_sameas[0]
            else:
                # select_sameas issues further SPARQL queries, so it sits
                # inside the retried section as well.
                resolved = select_sameas(wikidata_sameas, entity)
            break
        except URLError as err:
            if attempt + 1 >= attempts or not _is_transient_http_error(err):
                raise
            time.sleep(backoff * 2 ** attempt)

    # Concurrent callers may resolve the same entity twice; they store the
    # result of the same lookup, so the duplicate write is harmless.
    dic_dbpedia_to_wikidata[entity] = resolved
    return resolved

In [77]:
def toWikidataWalk(walk: list):
    """Translate one walk of DBpedia entities into Wikidata identifiers.

    Args:
        walk: List of DBpedia resource names.

    Returns:
        A new list with ``toWikidata(entity)`` for each entity, in the same
        order as ``walk``.
    """
    translated = []
    for entity in walk:
        translated.append(toWikidata(entity))
    return translated

In [78]:
# Convert every DBpedia walk into a Wikidata walk using 4 worker threads
# (ThreadPool is multiprocessing.dummy.Pool). imap yields results in input
# order, so res[i] corresponds to walks[i]; tqdm reports progress over the walks.
# An exception raised inside toWikidataWalk is re-raised while list() consumes
# the results, which aborts this cell before `res` is assigned.
with ThreadPool(4) as pool:
  res = list(tqdm(pool.imap(toWikidataWalk, walks), total=len(walks)))


  8%|▊         | 189/2501 [22:56<4:40:44,  7.29s/it]  


HTTPError: ignored

In [79]:
# pprint is already imported in the imports cell; it is repeated here so the
# cell also runs on its own.
from pprint import pprint
# Pretty-print a copy of the shared DBpedia -> Wikidata cache to inspect the
# mappings resolved so far ('?' marks entities with no Wikidata link).
pprint(dic_dbpedia_to_wikidata.copy())

{'.eu': 'Q41107',
 '0-4-2': 'Q2806492',
 '1815_eruption_of_Mount_Tambora': 'Q3591483',
 '1904_Summer_Olympics': 'Q8098',
 '1912_United_States_presidential_election': 'Q699289',
 '1932_United_States_presidential_election_in_Missouri': 'Q60772149',
 '1946_Italian_institutional_referendum': 'Q2260521',
 "1953_Giro_d'Italia": 'Q277415',
 '1954_FIBA_World_Championship': 'Q375654',
 '1972_Italian_general_election': 'Q2053456',
 '1973_oil_crisis': 'Q316817',
 "1974_Giro_d'Italia": 'Q745524',
 '1977_Brazilian_Grand_Prix': 'Q54400',
 '1977_South_African_Grand_Prix': 'Q171828',
 '1979_European_Parliament_election': 'Q1376068',
 '1983_Swiss_federal_election': 'Q3586921',
 '1987_World_Touring_Car_Championship': 'Q1314087',
 '1994_Cannes_Film_Festival': 'Q961852',
 "1995_FIVB_Volleyball_Men's_World_Cup": 'Q603729',
 '1995_Japanese_Grand_Prix': 'Q179328',
 '1996_United_States_presidential_election_in_New_York': 'Q7892959',
 '19th_Infantry_Division_Venezia': '?',
 '19th_meridian_east': 'Q2705853',
 '